In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
!pip install torchviz
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision.models as models
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.autograd import Variable
from torchvision.models.vgg import model_urls
from torchviz import make_dot
import torch.nn.functional as F
import random
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda 


# Seed handed to train_test_split further down.
# NOTE(review): the `random` and `torch` generators are not seeded anywhere in the visible cells,
# so teacher forcing, dropout and weight initialisation are not reproducible from this value alone.
random_state=2019

In [None]:
# Load the dataset; later cells read its `tweet` and `target` columns.
# NOTE: this rebinds `data`, shadowing the `torch.utils.data as data` import from the first cell.
data = pd.read_csv('../input/abusive-language/hate_dataset.csv')

In [None]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

# Count every token of every tweet, then build the vocabulary from those counts.
tokenizer = get_tokenizer('basic_english')
counter = Counter(token for tweet in data.tweet for token in tokenizer(tweet))
# The special symbols are registered explicitly rather than being counted as ordinary tokens.
vocab = Vocab(counter, min_freq=1,specials = ['<unk>','<pad>','<start>','<end>'])

In [None]:
def generate_combined(data):
    """Pair every tweet with its label.

    Args:
        data: table-like object exposing equally long ``tweet`` and ``target``
            columns (a pandas DataFrame in this notebook).

    Returns:
        list of ``(tweet, target)`` tuples in row order.
    """
    # Iterate the two columns directly; ``data.iloc[i]`` builds a whole row Series
    # on every access (twice per row in the previous implementation).
    return list(zip(data.tweet, data.target))

# List of (tweet, target) pairs; this is what gets split into train/validation sets below.
combined_data = generate_combined(data)

In [None]:
def text_pipeline(x):
    """Tokenise ``x`` with ``tokenizer`` and look every token up in ``vocab``."""
    return [vocab[token] for token in tokenizer(x)]

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 shuffled split of the (tweet, target) pairs, stratified on the label column and seeded with random_state.
train_data,validation_data = train_test_split(combined_data,shuffle=True,random_state=random_state,stratify=data.target,test_size=0.2)

In [None]:
from torch.utils.data import DataLoader

def collate_batch(batch):
    """Turn a list of ``(text, target)`` samples into padded tensors on ``device``.

    Every text is wrapped as ``'<start> ' + text + ' <end>'``, converted to ids
    with ``text_pipeline`` and right-padded with the ``<pad>`` id up to the
    longest sequence of the batch.

    Args:
        batch: list of ``(text, target)`` pairs.

    Returns:
        Tuple ``(padded_text, target, offsets)``:
            padded_text -- int64 tensor of shape (batch, longest sequence).
            target      -- int64 tensor of shape (batch, 1).
            offsets     -- int64 tensor of shape (batch, 1) holding the unpadded
                           length of each sequence (markers included).
    """
    pad_token = vocab['<pad>']

    # Encode each sample exactly once. The padded width is derived from the
    # encoded sequences themselves, so it can never disagree with the data
    # (previously the text was tokenised three times and the width relied on
    # the tokenizer keeping each marker as exactly one token).
    encoded = [
        torch.tensor(text_pipeline('<start> ' + _text + ' <end>'), dtype=torch.int64)
        for _text, _ in batch
    ]
    lengths = [ids.size(0) for ids in encoded]

    padded_text = torch.full((len(batch), max(lengths)), pad_token, dtype=torch.int64)
    for row, ids in enumerate(encoded):
        padded_text[row, :ids.size(0)] = ids

    target = torch.tensor([_target for _, _target in batch], dtype=torch.long).unsqueeze(1)
    offsets = torch.tensor([[n] for n in lengths])
    return padded_text.to(device), target.to(device), offsets.to(device)

# Batches of 64 samples; collate_batch pads each batch and moves it to `device`.
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_batch)
# NOTE(review): the validation loader is shuffled as well; presumably not required for evaluation — confirm.
validation_dataloader = DataLoader(validation_data, batch_size=64, shuffle=True, collate_fn=collate_batch)

In [None]:
class Encoder(nn.Module):
    """GRU encoder whose initial hidden state is a linear projection of the label.

    Args:
        embedding_dim: size of the vectors produced by ``embedding_layer``.
        encoder_hidden_dim: GRU hidden size.
        embedding_layer: module mapping token ids to vectors.
        n_layers: number of GRU layers. ``forward`` builds an initial state with
            a single leading layer dimension.
        dropout_prob: dropout probability applied to the embeddings and to the
            GRU outputs.
    """

    def __init__(self, embedding_dim, encoder_hidden_dim,embedding_layer,n_layers=1, dropout_prob=0.5):
        super(Encoder, self).__init__()

        self.embedding = embedding_layer
        # Maps the scalar label (last dimension of size 1) to a hidden-state vector.
        self.encoderLabelsTransform = nn.Linear(1, encoder_hidden_dim)
        self.encoder = nn.GRU(
            input_size=embedding_dim,
            hidden_size=encoder_hidden_dim,
            num_layers=n_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, input_batch,label):
        """Encode ``input_batch`` conditioned on ``label``.

        Returns:
            ``(outputs, hidden)`` -- the dropout-regularised GRU outputs and the
            final GRU hidden state.
        """
        token_vectors = self.dropout(self.embedding(input_batch))
        # Add the leading "layers" dimension expected by nn.GRU for its initial state.
        initial_hidden = self.encoderLabelsTransform(label).unsqueeze(0)
        gru_outputs, hidden = self.encoder(token_vectors, initial_hidden)
        return self.dropout(gru_outputs), hidden

In [None]:
class BahdanauDecoder(nn.Module):
    """Single decoding step of a GRU decoder with additive attention.

    Args:
        embedding_dim: size of the vectors produced by ``embedding_layer``.
        hidden_size: hidden size of the GRU (and of the encoder outputs attended over).
        output_size: number of classes scored at each step.
        embedding_layer: module mapping token ids to vectors.
        n_layers: stored on the instance; not used when building the GRU.
        drop_prob: dropout probability applied to the embedded input token.
    """

    def __init__(self, embedding_dim,hidden_size, output_size, embedding_layer, n_layers=1, drop_prob=0.1):
        super(BahdanauDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop_prob = drop_prob

        self.embedding = embedding_layer

        # Attention parameters: projections of the hidden state and of the
        # encoder outputs, plus the vector that reduces each position to a score.
        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight = nn.Parameter(torch.randn(hidden_size,1))
        # Not referenced by forward(); kept so the module's parameters stay the same.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.drop_prob)
        self.gru = nn.GRU(self.hidden_size+embedding_dim, self.hidden_size, batch_first=True)
        self.classifier = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, inputs, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(output, hidden, attn_weights)`` -- softmax-normalised class scores
            with one row per batch element, the new GRU hidden state, and the
            attention weights (softmax over dim 1 of the alignment scores).
        """
        n_batch = encoder_outputs.shape[0]

        # Embed the current input token: one flat vector per batch element.
        embedded = self.embedding(inputs).view(n_batch, -1)
        embedded = self.dropout(embedded)

        # Additive alignment scores.
        projected_hidden = self.fc_hidden(hidden.transpose(0, 1))
        projected_encoder = self.fc_encoder(encoder_outputs)
        energy = torch.tanh(projected_hidden + projected_encoder)
        alignment_scores = energy.bmm(self.weight.repeat(n_batch, 1, 1))

        # Normalise the scores over dim 1 to obtain the attention weights.
        attn_weights = F.softmax(alignment_scores, dim=1)

        # Context vector: attention-weighted sum of the encoder outputs.
        context_vector = torch.bmm(attn_weights.transpose(1, 2), encoder_outputs)

        # The GRU consumes the embedded token concatenated with the context vector.
        gru_input = torch.cat((embedded, context_vector[:, 0]), 1).unsqueeze(1)
        output, hidden = self.gru(gru_input, hidden)

        # Linear classifier over the GRU output, normalised with a softmax.
        output = F.softmax(self.classifier(output).view(n_batch, -1), dim=1)
        return output, hidden, attn_weights

In [None]:
class Decoder(nn.Module):
    """Unrolls a one-step decoder over a whole sequence.

    Args:
        one_step_decoder: module exposing ``hidden_size`` and ``output_size`` and
            callable as ``(inputs, hidden, encoder_outputs) -> (output, hidden, attn)``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, one_step_decoder,device):
        super().__init__()
        self.one_step_decoder = one_step_decoder
        self.decoderLabelsTransform = nn.Linear(1,self.one_step_decoder.hidden_size)
        self.device = device

    def forward(self,encoder_outputs, label, start_token,target=None,lengths=None,teacher_forcing_ratio=1,max_length=20):
        """Decode a batch, optionally with teacher forcing.

        When ``target`` is given, the batch is sorted by decreasing ``lengths``
        so that finished sequences can be dropped from the end of the batch as
        decoding progresses; ``target.shape[1] - 1`` steps are decoded. Without a
        target, ``max_length`` steps are decoded greedily.

        Args:
            encoder_outputs: encoder states, batch first.
            label: tensor with a trailing dimension of size 1; projected to the
                initial hidden state.
            start_token: id fed to the decoder whenever teacher forcing is off
                at the first step.
            target: token ids used for teacher forcing, or ``None``.
            lengths: per-sample lengths with a trailing dimension of size 1
                (required when ``target`` is given).
            teacher_forcing_ratio: probability of feeding the target token at a
                step; forced to 0 when ``target`` is ``None``.
            max_length: number of steps decoded when ``target`` is ``None``.

        Returns:
            ``(outputs, target, label, sort_ind)`` -- outputs has shape
            (batch, steps, output_size); target/label are returned in sorted
            order; ``sort_ind`` is the permutation applied (``None`` without target).
        """
        sort_ind = None
        if target is not None:
            lengths, sort_ind = lengths.squeeze(1).sort(dim=0, descending=True)
            encoder_outputs = encoder_outputs[sort_ind]
            target = target[sort_ind]
            label = label[sort_ind]
        else:
            teacher_forcing_ratio = 0

        batch_size = encoder_outputs.shape[0]
        trg_len = target.shape[1]-1 if target is not None else max_length
        trg_vocab_size = self.one_step_decoder.output_size

        # The initial hidden state is derived from the label, not from the encoder.
        hidden = self.decoderLabelsTransform(label).unsqueeze(0)

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        top1 = (torch.ones(batch_size,dtype=torch.long)*start_token).to(self.device)
        for t in range(trg_len):
            # Number of sequences still being decoded at step t. Computed with a
            # single tensor reduction instead of iterating every element in Python.
            if lengths is not None:
                batch_size_t = int((lengths > t).sum())
            else:
                batch_size_t = batch_size

            teacher_force = random.random() < teacher_forcing_ratio
            if teacher_force:
                inputs = target[:batch_size_t,t].unsqueeze(1)
            else:
                inputs = top1[:batch_size_t].unsqueeze(1)

            output, hidden, _ = self.one_step_decoder(inputs, hidden[:,:batch_size_t], encoder_outputs[:batch_size_t])
            outputs[:batch_size_t,t] = output

            # Greedy choice, used as the next input when teacher forcing is off.
            top1 = output.argmax(1)

        return outputs,target,label,sort_ind

In [None]:
class BeamState(object):
    """Container for one beam-search hypothesis (or one batch of hypotheses)."""

    def __init__(self, word, h, outputs,sentence, nll):
        """
        Args:
            word -- the id of the word characterising the state
            h -- the hidden state associated to that state
            outputs -- the decoder output distributions produced so far
            sentence -- a list of word ids (the past ids plus the current one)
            nll -- the negative log likelihood corresponding to the sentence
        """
        self.word, self.h, self.outputs, self.sentence, self.nll = word, h, outputs, sentence, nll


class BeamSearchDecoder(object):
    """Beam search over the wrapped model's one-step decoder.

    Reads ``max_len``, ``beam_width``, ``vocab``, ``device`` and
    ``decoder`` (with ``one_step_decoder`` and ``decoderLabelsTransform``)
    from the model it is given.
    """

    def __init__(self, styleTransfer):
        self.model = styleTransfer
        self.max_length = self.model.max_len
        self.width = self.model.beam_width

    def _decode(self, tokens, encoder_outputs, h):
        """Advance every batch element by one step.

        Args:
            tokens -- 1-D tensor with the current word id of each batch element
            encoder_outputs -- encoder states passed through to the decoder step
            h -- current hidden state
        Outputs:
            vocabProbs -- full output distribution of the decoder step
            logProbs -- log-probabilities of the ``width`` best words
            indices -- ids of the ``width`` best words
            h -- new hidden state
        """
        currTokens = tokens.unsqueeze(1)
        vocabProbs, h, _ = self.model.decoder.one_step_decoder(currTokens, h, encoder_outputs)
        # Work in log space so that sentence scores are sums instead of vanishing products.
        logProbs = torch.log(vocabProbs)
        logProbs, indices = torch.topk(logProbs, self.width, dim=-1)
        return vocabProbs,logProbs, indices, h

    def _beamDecode(self, encoder_outputs,h0):
        """Decode ``max_length`` words per batch element with beam search.

        Args:
            encoder_outputs -- encoder states, batch first
            h0 -- initial hidden state; its dimension 1 is the batch size
        Returns:
            outputs -- float32 tensor (batch, max_length, vocab) with the decoder
                       distribution of every step along the best hypothesis,
                       on the model's device
            sentences -- int64 CPU tensor (batch, max_length + 1) of word ids,
                         starting with ``<start>``
            word_sentences -- the same sentences as space-joined strings
        """
        start_id = self.model.vocab['<start>']
        batch_size = h0.shape[1]

        # The results are returned as plain ids and detached distributions, so
        # no autograd graph is needed while searching.
        with torch.no_grad():
            go = torch.tensor([start_id] * batch_size, dtype=torch.long).to(self.model.device)
            beam = [BeamState(
                go,
                h0,
                [[] for _ in range(batch_size)],
                [[start_id] for _ in range(batch_size)],
                [0.0] * batch_size)]

            for _ in range(self.max_length):
                candidates = [[] for _ in range(batch_size)]
                for state in beam:
                    vocabProbs, logProbs, indices, h = self._decode(state.word, encoder_outputs, state.h)
                    # Plain Python numbers make cheap, unambiguous sort keys and ids.
                    step_log_probs = logProbs.tolist()
                    step_words = indices.tolist()
                    for b in range(batch_size):
                        for w in range(self.width):
                            word = step_words[b][w]
                            candidates[b].append(
                                BeamState(word,
                                          h[:, b, :],
                                          state.outputs[b] + [vocabProbs[b]],
                                          state.sentence[b] + [word],
                                          state.nll[b] - step_log_probs[b][w]))

                # Allocate a distinct state per beam slot. Re-using one shared
                # object for every slot makes all slots collapse onto the
                # last-ranked candidate and overwrites h0 in place.
                beam = [BeamState(torch.empty_like(go),
                                  torch.empty_like(h0),
                                  [None] * batch_size,
                                  [None] * batch_size,
                                  [0.0] * batch_size)
                        for _ in range(self.width)]
                for b in range(batch_size):
                    # Rank the candidates by cumulative negative log likelihood.
                    # TODO check if performance increases when nll is divided
                    # by the number of words.
                    ranked = sorted(candidates[b], key=lambda k: k.nll)
                    for w in range(self.width):
                        chosen = ranked[w]
                        beam[w].word[b] = chosen.word
                        beam[w].h[:, b, :] = chosen.h
                        beam[w].outputs[b] = chosen.outputs
                        beam[w].sentence[b] = chosen.sentence
                        beam[w].nll[b] = chosen.nll

            best = beam[0]
            sentences = torch.tensor(best.sentence, dtype=torch.long)
            # TODO strip the EOS
            word_sentences = [
                " ".join(self.model.vocab.itos[i] for i in sent)
                for sent in best.sentence]

            # Decoder distributions of every time step along the best hypothesis.
            outputs = torch.stack([torch.stack(steps) for steps in best.outputs]).to(torch.float32)
        return outputs.to(self.model.device), sentences, word_sentences

    def rewriteBatch(self, encoder_outputs,labels,word_id=True):
        """Beam-decode a batch conditioned on ``labels``.

        Returns:
            ``(outputs, sentence_ids)`` when ``word_id`` is true (both on the
            model's device), otherwise ``(word_sentences, None)``.
        """
        h0 = self.model.decoder.decoderLabelsTransform(labels).unsqueeze(0)
        outputs, sentences, word_sentences = self._beamDecode(encoder_outputs, h0)
        if word_id:
            return outputs, sentences.to(self.model.device)
        return word_sentences, None

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence
class Model(nn.Module):
    """Style-transfer model: label-conditioned autoencoder plus a CNN style classifier.

    ``forward`` runs three stages -- reconstruction, conversion to the opposite
    label with beam search, and re-conversion back to the original label -- and,
    while training, performs one optimizer step after each stage.

    NOTE(review): the sequence losses apply ``nn.CrossEntropyLoss`` to decoder
    outputs that ``BahdanauDecoder`` has already passed through a softmax;
    CrossEntropyLoss expects unnormalised scores -- confirm this is intended.
    """

    def __init__(self, encoder, decoder,embedding_layer,embedding_dim,Ks,C,beam_width,max_len,vocab,device):
        """
        Args:
            encoder: module called as ``encoder(inputs, labels)``.
            decoder: module called as ``decoder(encoder_outputs, labels, start_token, targets, lens)``.
            embedding_layer: embedding module stored on the model.
            embedding_dim: accepted for interface compatibility; not used here.
            Ks: heights of the convolution kernels of the classifier.
            C: number of classes predicted by the classifier.
            beam_width: beam width read by ``BeamSearchDecoder``.
            max_len: number of words generated by ``BeamSearchDecoder``.
            vocab: vocabulary; ``len(vocab)`` is the width of the classifier input.
            device: device the loss module is moved to.
        """
        super(Model, self).__init__()
        self.beam_width = beam_width
        self.max_len = max_len
        self.device = device
        self.vocab = vocab
        self.encoder = encoder
        self.decoder = decoder
        self.embedding = embedding_layer
        # Each kernel spans the whole vocabulary axis and K consecutive time steps.
        self.convs = nn.ModuleList([nn.Conv2d(1, 128, (K,len(vocab)),stride=1) for K in Ks])
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(Ks)*128, C)

        # The encoder's embedding is not listed on its own: it is only optimised
        # when it is the same layer object held by the decoder.
        self.optimizer = optim.Adam([{'params':self.encoder.encoderLabelsTransform.parameters()},
                                     {'params':self.encoder.encoder.parameters()},
                                    {'params':self.decoder.parameters()},
                                    {'params':self.convs.parameters(),'lr':0.0005},
                                    {'params':self.fc1.parameters(),'lr':0.0005}])
        self.criterion = nn.CrossEntropyLoss().to(device)

    def _zero_grad(self):
        """Reset the gradients of every optimised parameter."""
        self.optimizer.zero_grad()

    def encoder_decoder(self,inputs,targets,labels1,labels2,lens):
        """Encode with ``labels1`` and decode with ``labels2`` using teacher forcing."""
        # Use the sub-modules owned by this model rather than notebook globals.
        encoder_outputs,_ = self.encoder(inputs,labels1)
        return self.decoder(encoder_outputs,labels2,self.vocab['<start>'],targets,lens)

    def encoder_beam_decoder(self,inputs,targets,labels1,labels2,lens):
        """Encode with ``labels1`` and beam-decode with ``labels2``.

        ``targets`` and ``lens`` are accepted for signature symmetry and ignored.
        """
        encoder_outputs,_ = self.encoder(inputs,labels1)
        return BeamSearchDecoder(self).rewriteBatch(encoder_outputs,labels2)

    def classifier(self,generated_inputs):
        """Score a batch of per-step vocabulary distributions with the CNN classifier."""
        x = [F.leaky_relu(conv(generated_inputs), negative_slope=0.01).squeeze(3)
            for conv in self.convs]
        # Max-pool every feature map over its whole time axis.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]

        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.fc1(x)
        return x

    def _backprop_and_step(self, *losses, retain_last_graph=True):
        """Back-propagate ``losses``, clip the gradients, step and reset them.

        Relies on the notebook-level ``clip_gradient`` function and ``grad_clip``
        value being defined before training starts.
        """
        last = len(losses) - 1
        for position, loss in enumerate(losses):
            loss.backward(retain_graph=retain_last_graph or position < last)
        clip_gradient(self.optimizer,grad_clip)
        self.optimizer.step()
        self._zero_grad()

    def forward(self,inputs,labels,lens):
        """Run the three stages and, while training, optimise after each one.

        Optimisation only happens when the module is in training mode and
        autograd is enabled, so the method can be called under ``model.eval()``
        and ``torch.no_grad()`` for evaluation.

        Args:
            inputs: padded token ids, batch first.
            labels: float labels with a trailing dimension of size 1.
            lens: sequence lengths with a trailing dimension of size 1.

        Returns:
            ``(autoencoder_outputs, autoencoder_classifier_outputs,
            style_converted_classifier_outputs, reconverted_outputs,
            reconverted_classifier_outputs, sort_ind)``; every output is ordered
            by ``sort_ind``, the length-descending permutation of the batch.
        """
        optimise = self.training and torch.is_grad_enabled()

        # Stage 1: autoencoder (decode back to the same label).
        autoencoder_outputs,inputs,labels,sort_ind = self.encoder_decoder(inputs,inputs,labels,labels,lens)
        autoencoder_outputs = self.dropout(autoencoder_outputs)
        lens = lens[sort_ind]
        class_targets = labels.long().squeeze(1)
        # The <start> position is never predicted, hence lens - 1 target tokens.
        packed_lens = (lens-1).squeeze(1).to(torch.device('cpu'))

        autoencoder_classifier_outputs = self.classifier(autoencoder_outputs.unsqueeze(1))
        trgs = pack_padded_sequence(inputs[:,1:], packed_lens, batch_first=True)
        if optimise:
            autoencoder_scores = pack_padded_sequence(autoencoder_outputs, packed_lens, batch_first=True)
            loss1 = self.criterion(autoencoder_scores.data,trgs.data)
            loss2 = self.criterion(autoencoder_classifier_outputs,class_targets)
            self._backprop_and_step(loss1, loss2)

        # Stage 2: convert to the opposite label with beam search.
        style_outputs,next_style_inputs = self.encoder_beam_decoder(inputs,None,labels,1-labels,None)
        style_outputs = self.dropout(style_outputs)
        next_style_inputs = next_style_inputs[:,1:]
        style_converted_classifier_outputs = self.classifier(style_outputs.unsqueeze(1))
        if optimise:
            loss3 = self.criterion(style_converted_classifier_outputs,1-class_targets)
            self._backprop_and_step(loss3)

        # Stage 3: convert the generated sentences back to the original label.
        reconverted_outputs,_,_,resort_ind = self.encoder_decoder(next_style_inputs,inputs,1-labels,labels,lens)
        # The decoder sorts by length again and that sort is not guaranteed to
        # keep equal lengths in place; undo its permutation so the rows stay
        # aligned with `inputs`, `labels`, `trgs` and the returned `sort_ind`.
        reconverted_outputs = reconverted_outputs[torch.argsort(resort_ind)]
        reconverted_outputs = self.dropout(reconverted_outputs)
        reconverted_classifier_outputs = self.classifier(reconverted_outputs.unsqueeze(1))
        if optimise:
            reconverted_scores = pack_padded_sequence(reconverted_outputs, packed_lens, batch_first=True)
            loss4 = self.criterion(reconverted_scores.data,trgs.data)
            loss5 = self.criterion(reconverted_classifier_outputs,class_targets)
            self._backprop_and_step(loss4, loss5, retain_last_graph=False)

        return autoencoder_outputs, autoencoder_classifier_outputs,style_converted_classifier_outputs, reconverted_outputs,reconverted_classifier_outputs,sort_ind

    def _all_losses(self,autoencoder_outputs, autoencoder_classifier_outputs,style_converted_classifier_outputs, reconverted_outputs,reconverted_classifier_outputs,targets,labels,lens):
        """Return the five loss tensors shared by ``update_model`` and ``compute_loss``."""
        packed_lens = (lens-1).to(torch.device('cpu'))
        trgs = pack_padded_sequence(targets[:,1:], packed_lens, batch_first=True)
        autoencoder_scores = pack_padded_sequence(autoencoder_outputs, packed_lens, batch_first=True)
        reconverted_scores = pack_padded_sequence(reconverted_outputs, packed_lens, batch_first=True)

        loss1 = self.criterion(autoencoder_scores.data,trgs.data)
        loss2 = self.criterion(autoencoder_classifier_outputs,labels)
        loss3 = self.criterion(style_converted_classifier_outputs,1-labels)
        loss4 = self.criterion(reconverted_scores.data,trgs.data)
        loss5 = self.criterion(reconverted_classifier_outputs,labels)
        return loss1, loss2, loss3, loss4, loss5

    def update_model(self,autoencoder_outputs, autoencoder_classifier_outputs,style_converted_classifier_outputs, reconverted_outputs,reconverted_classifier_outputs,targets,labels,lens):
        """Back-propagate all five losses and take a single optimizer step.

        ``labels`` and ``lens`` are expected as 1-D tensors here. Gradients are
        not reset afterwards.
        """
        losses = self._all_losses(autoencoder_outputs, autoencoder_classifier_outputs,style_converted_classifier_outputs, reconverted_outputs,reconverted_classifier_outputs,targets,labels,lens)
        for loss in losses[:-1]:
            loss.backward(retain_graph=True)
        losses[-1].backward()
        clip_gradient(self.optimizer, grad_clip)
        self.optimizer.step()

    def compute_loss(self,autoencoder_outputs, autoencoder_classifier_outputs,style_converted_classifier_outputs, reconverted_outputs,reconverted_classifier_outputs,targets,labels,lens):
        """Print the five individual losses and return their sum as a float.

        ``labels`` and ``lens`` are expected as 1-D tensors here.
        """
        # Only the scalar values are needed, so no graph is recorded.
        with torch.no_grad():
            losses = self._all_losses(autoencoder_outputs, autoencoder_classifier_outputs,style_converted_classifier_outputs, reconverted_outputs,reconverted_classifier_outputs,targets,labels,lens)
            loss1, loss2, loss3, loss4, loss5 = (loss.item() for loss in losses)
        print(loss1,loss2,loss3,loss4,loss5)
        return loss1+loss2+loss3+loss4+loss5
        

In [None]:
def clip_gradient(optimizer, grad_clip):
    """
    Clamp every available gradient to the interval [-grad_clip, grad_clip], in place.
    :param optimizer: optimizer whose parameters' gradients are clamped
    :param grad_clip: clip value
    """
    all_params = (p for group in optimizer.param_groups for p in group['params'])
    for p in all_params:
        # Parameters that have not received a gradient are left untouched.
        if p.grad is None:
            continue
        p.grad.data.clamp_(min=-grad_clip, max=grad_clip)

In [None]:
# One embedding table (100-dimensional) is shared: the same layer object is handed to the encoder, the decoder step and the model.
embedding_layer = nn.Embedding(len(vocab),100)
encoder = Encoder(100,200,embedding_layer)
one_step_decoder = BahdanauDecoder(100,200,len(vocab),embedding_layer)
decoder = Decoder(one_step_decoder,device)
# Positional arguments after embedding_layer follow Model.__init__: embedding_dim=100, Ks=[1,2,3,4], C=2, beam_width=5, max_len=30.
model = Model(encoder,decoder,embedding_layer,100,[1,2,3,4],2,5,30,vocab,device)
model.to(device)

In [None]:
# model.load_state_dict(torch.load('../input/abusive-language/model.pt'))

In [None]:
def get_content_preservation_score(actual_word_lists, generated_word_lists, embedding_model):
    """Mean cosine similarity between the embeddings of paired word lists.

    Each pair is reduced to its sets of distinct words before embedding. Pairs
    for which the similarity computation raises ``ValueError`` are skipped and
    logged at debug level.

    NOTE(review): ``cosine``, ``get_sentence_embedding``, ``logger`` and
    ``statistics`` are not defined or imported in the visible part of this
    notebook -- confirm they are provided elsewhere before calling this.

    Returns:
        The mean of the collected similarities, or 0 when every pair was skipped.
    """
    similarities = []
    skipped = 0
    for word_list_1, word_list_2 in zip(actual_word_lists, generated_word_lists):
        words_1, words_2 = set(word_list_1), set(word_list_2)
        try:
            # `cosine` is treated as a distance here: similarity = 1 - distance.
            similarities.append(1 - cosine(
                get_sentence_embedding(words_1, embedding_model),
                get_sentence_embedding(words_2, embedding_model)))
        except ValueError:
            skipped += 1
            logger.debug("Skipped lines: {} :-: {}".format(word_list_1, word_list_2))

    logger.debug("{} lines skipped due to errors".format(skipped))
    if not similarities:
        return 0
    return statistics.mean(similarities)

In [None]:
class AverageMeter(object):
    """
    Running statistics of a metric: latest value, weighted sum, weight count and average.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.sum += val * n
        self.count += n
        self.val = val
        self.avg = self.sum / self.count

In [None]:
def accuracy(scores, targets, k,two_d=True):
    """
    Computes top-k accuracy, from predicted and true labels.
    :param scores: scores from the model; the last dimension indexes the classes
    :param targets: true labels, with the same shape as scores minus its last dimension
    :param k: k in top-k accuracy
    :param two_d: True when targets is 2-D (e.g. batch x time), so that the
                  denominator is targets.size(0) * targets.size(1); False for 1-D targets
    :return: top-k accuracy as a percentage
    """

    batch_size = targets.size(0)
    if two_d:
        batch_size *= targets.size(1)
    _, ind = scores.topk(k, -1, True, True)
    # Compare every one of the k candidates with the true label. A target is
    # counted once if it appears anywhere among them (top-k indices are distinct),
    # which also works for k > 1.
    correct = ind.eq(targets.unsqueeze(-1).expand_as(ind))
    correct_total = correct.view(-1).float().sum()  # 0D tensor
    return correct_total.item() * (100.0 / batch_size)

In [None]:
import time
from torch.nn.utils.rnn import pack_padded_sequence


def validate(val_loader, model,print_freq=100):
    """
    Performs one epoch's validation and prints running metrics.
    :param val_loader: DataLoader for validation data, yielding (tokens, label, lengths) batches
    :param model: model called as model(tokens, float_labels, lengths); must expose compute_loss
    :param print_freq: print the running metrics every print_freq batches
    :return: None
    """
    model.eval()  # eval mode (no dropout or batchnorm)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1accs = AverageMeter()
    classifier_accs = AverageMeter()

    start = time.time()

    with torch.no_grad():
        # Batches
        for i, (caps, label,caplens) in enumerate(val_loader):

            # Move to device, if available
            caps = caps.to(device)
            label = label.to(device)
            caplens = caplens.to(device)

            # Forward prop. The model returns its outputs sorted by decreasing
            # length, together with the permutation it applied.
            a,ac,sc,r,rc,sort_ind = model(caps,label.float(),caplens)
            caps_sorted = caps[sort_ind]
            label_sorted = label[sort_ind]
            caplens_sorted = caplens[sort_ind]

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>

            # Calculate loss
            loss = model.compute_loss(a,ac,sc,r,rc,caps_sorted,label_sorted.squeeze(1),caplens_sorted.squeeze(1))
            torch.cuda.empty_cache()

            # Keep track of metrics
            n_tokens = (caplens_sorted.squeeze(1)-1).sum()
            losses.update(loss, n_tokens)
            top1 = (accuracy(a,caps_sorted[:,1:],1) + accuracy(r,caps_sorted[:,1:],1))/2.0
            top1accs.update(top1,n_tokens)
            # The class targets are 1-D, so two_d must be False here (as in the
            # training loop); the default would index a second target dimension.
            class_ac = (accuracy(ac,label_sorted.squeeze(-1),1,False) +accuracy(sc,(1-label_sorted).squeeze(-1),1,False) + accuracy(rc,label_sorted.squeeze(-1),1,False))/3.0
            classifier_accs.update(class_ac,caps.shape[0])
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Classifier Accuract {classifier.val:.3f} ({classifier.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time,loss=losses, classifier=classifier_accs))

                print('Top-1 Accuracy {top1.val:.3f} ({top1.avg:.3f})'.format(top1=top1accs))

In [None]:
print_freq=100

batch_time = AverageMeter()  # forward prop. + back prop. time
data_time = AverageMeter()  # data loading time
losses = AverageMeter()  # loss (per word decoded)
top1accs = AverageMeter()
classifier_accs=AverageMeter()
# Clip value read by the model when it clips gradients during training.
grad_clip = 5.

epochs_since_improvement = 0

start = time.time()
for epoch in range(1):
    model.train()
    for i, (caps, label,caplens) in enumerate(train_dataloader):
        # Time spent waiting for the batch; this meter is printed below and was
        # previously never updated.
        data_time.update(time.time() - start)

        model._zero_grad()

        # Move to device, if available
        caps = caps.to(device)
        label = label.to(device)
        caplens = caplens.to(device)

        # Forward prop. In training mode the model also performs its optimizer
        # steps inside this call; it returns outputs sorted by decreasing length.
        a,ac,sc,r,rc,sort_ind = model(caps,label.float(),caplens)
        caps_sorted = caps[sort_ind]
        label_sorted = label[sort_ind]
        caplens_sorted = caplens[sort_ind]

        # Calculate loss (for reporting only)
        loss = model.compute_loss(a,ac,sc,r,rc,caps_sorted,label_sorted.squeeze(1),caplens_sorted.squeeze(1))
        torch.cuda.empty_cache()

        # Keep track of metrics
        n_tokens = (caplens_sorted.squeeze(1)-1).sum()
        losses.update(loss, n_tokens)
        top1 = (accuracy(a,caps_sorted[:,1:],1) + accuracy(r,caps_sorted[:,1:],1))/2.0
        top1accs.update(top1,n_tokens)
        class_ac = (accuracy(ac,label_sorted.squeeze(-1),1,False) +accuracy(sc,(1-label_sorted).squeeze(-1),1,False) + accuracy(rc,label_sorted.squeeze(-1),1,False))/3.0
        classifier_accs.update(class_ac,caps.shape[0])
        batch_time.update(time.time() - start)

        start = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(train_dataloader),
                                                                          batch_time=batch_time,
                                                                          data_time=data_time, loss=losses))
            print('Top-1 Accuracy {top1.val:.3f} ({top1.avg:.3f})'.format(top1=top1accs))
            print('Classification Accuracy {classifier.val:.3f} ({classifier.avg:.3f})'.format(classifier=classifier_accs))
    validate(validation_dataloader,model,print_freq)

In [None]:
# new_loader = []
# for i,(caps,label,caplens) in enumerate(train_dataloader):
#     ind = caplens.argmax(dim=0)
#     new_loader.append((caps,label,caplens))
#     if i%4==0:
#         break
# embedding_layer = nn.Embedding(len(vocab),100)
# encoder = Encoder(100,200,embedding_layer)
# one_step_decoder = BahdanauDecoder(100,200,len(vocab),embedding_layer)
# decoder = Decoder(one_step_decoder,device)
# model = Model(encoder,decoder,embedding_layer,100,[1,2,3,4],2,5,30,vocab,device)
# model.to(device)
# # for param_group in model.optimizer.param_groups:
# #         param_group['lr'] = 0.0005
# # print(model.optimizer)

In [None]:
# print(new_loader[0][0].shape)

In [None]:
def adjust_learning_rate(optimizer, shrink_factor):
    """
    Multiplies the learning rate of every parameter group by shrink_factor.
    :param optimizer: optimizer whose learning rate must be shrunk.
    :param shrink_factor: factor in interval (0, 1) to multiply learning rate with.
    """

    print("\nDECAYING learning rate.")
    for group in optimizer.param_groups:
        group['lr'] = group['lr'] * shrink_factor
    # Only the first group's rate is reported.
    first_group_lr = optimizer.param_groups[0]['lr']
    print("The new learning rate is %f\n" % (first_group_lr,))

In [None]:
# import time

# print_freq=100

# batch_time = AverageMeter()  # forward prop. + back prop. time
# data_time = AverageMeter()  # data loading time
# losses = AverageMeter()  # loss (per word decoded)
# top1accs = AverageMeter()
# classifier_accs=AverageMeter()
# grad_clip = 5.

# best_loss=200

# epochs_since_improvement = 0

# # adjust_learning_rate(decoder_optimizer,0.8)

# start = time.time()
# for epoch in range(200):
#     if epochs_since_improvement > 0 and epochs_since_improvement % 20 == 0:
#             adjust_learning_rate(model.optimizer, 0.8)
#     model.train()
#     for i, (caps, label,caplens) in enumerate(new_loader):
        
#         model._zero_grad()

#         # Move to device, if available
#         caps = caps.to(device)
#         label = label.to(device)
#         caplens = caplens.to(device)

#         # Forward prop.
#         a,ac,sc,r,rc,sort_ind = model(caps,torch.tensor(label,dtype=torch.float),caplens)
#         caps_sorted = caps[sort_ind]
#         label_sorted = label[sort_ind]
#         caplens_sorted = caplens[sort_ind]
        
        
#         # Calculate loss
#         loss = model.compute_loss(a,ac,sc,r,rc,caps_sorted,label_sorted.squeeze(1),caplens_sorted.squeeze(1))
# #         model.update_model(a,ac,sc,r,rc,caps_sorted,label_sorted.squeeze(1),caplens_sorted.squeeze(1))
# #         print(torch.cuda.memory_summary(device=None, abbreviated=False))
#         torch.cuda.empty_cache()
            

#         # Keep track of metrics
#         losses.update(loss, sum(caplens_sorted.squeeze(1)-1))
# #         top1 = (accuracy(a,caps_sorted[:,1:],1) + accuracy(r,caps_sorted[:,1:],1))/2.0
#         top1accs.update(accuracy(a,caps_sorted[:,1:],1),sum(caplens_sorted.squeeze(1)-1))
# #         class_ac = (accuracy(ac,label_sorted.squeeze(-1),1,False) +accuracy(sc,(1-label_sorted).squeeze(-1),1,False) + accuracy(rc,label_sorted.squeeze(-1),1,False))/3.0
# #         classifier_accs.update(accuracy(ac,(1-label_sorted).squeeze(-1),1,False),caps.shape[0])
#         batch_time.update(time.time() - start)

#         start = time.time()

#         if i % print_freq == 0:
#             print('Epoch: [{0}][{1}/{2}]\t'
#                   'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
#                   'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
#                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(new_loader),
#                                                                           batch_time=batch_time,
#                                                                           data_time=data_time, loss=losses))
#             print('Top-1 Accuracy {top1.val:.3f} ({top1.avg:.3f})'.format(top1=top1accs))
#             print('Classification Accuracy {classifier.val:.3f} ({classifier.avg:.3f})'.format(classifier=classifier_accs))
# #     validate(validation_dataloader,model,print_freq)
#         if loss<best_loss:
#             best_loss=loss
#             epochs_since_improvement=0
#         else:
#             epochs_since_improvement+=1

In [None]:
class BeamSearchDecoder1(object):
    """Batched beam-search decoder wrapped around a style-transfer model.

    The wrapped model must expose ``max_len``, ``beam_width``, ``vocab``
    (indexable by token, with an ``itos`` list), ``device`` and a ``decoder``
    providing ``one_step_decoder`` and ``decoderLabelsTransform``.
    """

    def __init__(self, styleTransfer):
        """Keep a handle on the model and cache its decoding limits."""
        self.model = styleTransfer
        self.max_length = self.model.max_len
        self.width = self.model.beam_width

    def _decode(self, tokens, encoder_outputs, h):
        """
        Advance the decoder by one step and keep the ``width`` best continuations.

        Args:
            tokens -- tensor with the current token id of every batch element
            encoder_outputs -- forwarded untouched to ``one_step_decoder``
            h -- current decoder hidden state
        Outputs:
            vocabProbs -- first value returned by ``one_step_decoder``
                          (presumably probabilities over the vocabulary -- confirm)
            logProbs -- the ``width`` largest log values along the last dim
            indices -- positions of those values along the last dim
            h -- hidden state returned by ``one_step_decoder``
        """
        # the decoder is fed one token per batch element: (batch,) -> (batch, 1)
        currTokens = tokens.unsqueeze(1)
        vocabProbs, h, _ = self.model.decoder.one_step_decoder(currTokens, h, encoder_outputs)
        # beam search trick: accumulate log-probabilities so scores do not vanish
        logProbs = torch.log(vocabProbs)
        # take the beam_width most probable words
        logProbs, indices = torch.topk(logProbs, self.width, dim=-1)
        return vocabProbs, logProbs, indices, h

    def _assembleState(self, picks):
        """Pack one per-sample candidate per batch element into a fresh batch-level BeamState."""
        return BeamState(
            torch.tensor([p.word for p in picks], dtype=torch.long).to(self.model.device),
            # every candidate holds h[:, b, :]; re-insert the batch axis at dim 1
            torch.stack([p.h for p in picks], dim=1),
            [p.outputs for p in picks],
            [p.sentence for p in picks],
            [p.nll for p in picks])

    def _beamDecode(self, encoder_outputs, h0):
        """
        Run beam search for ``max_length`` steps and return the most probable
        hypothesis of every batch element.

        Args:
            encoder_outputs -- forwarded to every decoder step
            h0 -- the first hidden state (per the original note, of
                  dim = dim_y + dim_z); its dim 1 is taken as the batch size
        Outputs:
            outputs -- float32 tensor on ``model.device`` stacking, per batch
                       element, the decoder output of every step along the
                       best hypothesis
            sentences -- long tensor with the token ids of the best hypothesis,
                         each row starting with ``<start>``
            word_sentences -- the same hypotheses as space-joined strings
            beam -- final list of ``width`` BeamState objects, best first
        """
        batch_size = h0.shape[1]
        start_id = self.model.vocab['<start>']
        go = torch.tensor([start_id] * batch_size, dtype=torch.long).to(self.model.device)
        init_state = BeamState(
            go,
            h0,
            [[] for _ in range(batch_size)],
            [[start_id] for _ in range(batch_size)],
            [0] * batch_size)
        beam = [init_state]
        for _ in range(self.max_length):
            storeBeamLayer = [[] for _ in range(batch_size)]
            for state in beam:
                vocabProbs, logProbs, indices, h = self._decode(state.word, encoder_outputs, state.h)
                for b in range(batch_size):
                    for w in range(self.width):
                        word = int(indices[b, w])
                        storeBeamLayer[b].append(
                            BeamState(word,
                                      h[:, b, :],
                                      state.outputs[b] + [vocabProbs[b]],
                                      # store the plain int id, not a 0-d tensor
                                      state.sentence[b] + [word],
                                      state.nll[b] - logProbs[b, w]))

            # sort candidates by their probability (cumulated nll), per batch element
            # TODO check if performance increase by dividing nll
            # by number of words
            ranked = [sorted(candidates, key=lambda k: float(k.nll))
                      for candidates in storeBeamLayer]
            # Build a NEW state per rank. Filling a list of references to one
            # shared init_state made every rank overwrite the previous one, so
            # beam[0] ended up holding the worst kept hypothesis (and h0 was
            # modified in place).
            beam = [self._assembleState([ranked[b][w] for b in range(batch_size)])
                    for w in range(self.width)]

        # Returning the ids of the most probable sentences' words.
        sentences = torch.tensor(beam[0].sentence, dtype=torch.long)
        word_sentences = [
            [self.model.vocab.itos[i] for i in sent]
            for sent in sentences.tolist()]
        # TODO strip the EOS
        word_sentences = [" ".join(words) for words in word_sentences]

        # Outputs of decoder at every time step, detached from the graph
        outputs = torch.stack([torch.stack(steps) for steps in beam[0].outputs])
        outputs = outputs.detach().to(dtype=torch.float32)

        return outputs.to(self.model.device), sentences, word_sentences, beam

    def rewriteBatch(self, encoder_outputs, labels, word_id=True):
        """
        Decode a batch with beam search, starting from a hidden state derived
        from ``labels`` via ``decoder.decoderLabelsTransform``.

        Args:
            encoder_outputs -- forwarded to every decoder step
            labels -- input of ``decoderLabelsTransform``
            word_id -- selects which pair of results is returned
        Outputs:
            (outputs, sentences) when ``word_id`` is true: per-step decoder
            outputs and token ids, both on ``model.device``;
            (word_sentences, beam) otherwise: decoded strings and the final
            beam states.
        """
        h0 = self.model.decoder.decoderLabelsTransform(labels)
        h0 = h0.unsqueeze(0)
        outputs, sentences, word_sentences, beam = self._beamDecode(encoder_outputs, h0)
        if word_id:
            # sentences is already a long tensor; no need to re-wrap it
            return outputs, sentences.to(self.model.device)
        return word_sentences, beam

In [None]:
# Smoke-test the beam-search decoder on the first batch stored in new_loader.
sentence = new_loader[0][0]
labels = torch.tensor(new_loader[0][1], dtype=torch.float)
# Show the first sequence of that batch as vocabulary tokens.
print([vocab.itos[token_id] for token_id in sentence[0]])
# Alternative input kept for reference: build a one-sentence batch straight from `data`.
# index = 0
# sentence = tokenizer(data.iloc[index].tweet)
# print(sentence,data.iloc[index].target)
# sentence = [vocab[x] for x in sentence]
# sentence = torch.tensor(sentence,dtype=torch.long).to(device)
# sentence = sentence.unsqueeze(0)
# labels = [data.iloc[index].target]
# labels = torch.tensor(labels,dtype=torch.float).unsqueeze(1).to(device)
# print(sentence.shape,labels.shape)
encoder_outputs, _ = encoder(sentence, labels)
beam = BeamSearchDecoder1(model)
# word_id=False selects the (decoded strings, final beam states) pair.
outputs, outputs1 = beam.rewriteBatch(encoder_outputs, labels, word_id=False)
print(outputs[0])
torch.cuda.empty_cache()

In [None]:
# Walk new_loader to the end; afterwards sentence / labels / lens are bound
# to the contents of its final batch.
for i, (caps, labels, caplens) in enumerate(new_loader):
    sentence, lens = caps, caplens

In [None]:
# Print the .shape of the first element of the first batch in new_loader.
print(new_loader[0][0].shape)

In [None]:
# Run the full encoder-decoder on the batch bound to `sentence` / `labels`.
# The float-cast labels fill both the third and the fourth argument; only the
# first of the four returned values is kept.
output, _, _, _ = model.encoder_decoder(
    sentence,
    None,
    torch.tensor(labels, dtype=torch.float),
    torch.tensor(labels, dtype=torch.float),
    None,
)

In [None]:
# Dimensions of the tensor returned by model.encoder_decoder.
print(output.size())

In [None]:
# Five largest values along dim 2 of the model output; the matching indices
# are discarded.
ans, _ = output.topk(k=5, dim=2)
# Display them: a bare print() would only emit an empty line.
print(ans)

In [None]:
# Index of the largest entry along dim 2 of the model output.
fin_output = torch.argmax(output, dim=2)

In [None]:
# Map the argmax ids of the first sequence back to vocabulary tokens.
print([vocab.itos[token_id] for token_id in fin_output[0]])

In [None]:
# Write the model's state dict to model.pt in the current working directory.
torch.save(obj=model.state_dict(), f='model.pt')

In [None]:
# model.load_state_dict(torch.load('model.pt'))

In [None]:
# vocab['quin']