In [78]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))
from collections import Counter
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import math
import torch.nn.functional as F
import torch.utils.data

Downloading movie-corpus to C:\Users\realmeid\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [79]:
corpus_movie_lines = corpus.utterances

In [27]:
print(corpus_movie_lines.id.text)

AttributeError: 'dict' object has no attribute 'id'

In [80]:
lines_dict = {}
for line in corpus_movie_lines:
    lines_dict[line] = corpus_movie_lines[line].text

In [81]:
lines_dict

{'L1045': 'They do not!',
 'L1044': 'They do to!',
 'L985': 'I hope so.',
 'L984': 'She okay?',
 'L925': "Let's go.",
 'L924': 'Wow',
 'L872': "Okay -- you're gonna need to learn how to lie.",
 'L871': 'No',
 'L870': 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869': 'Like my fear of wearing pastels?',
 'L868': 'The "real you".',
 'L867': 'What good stuff?',
 'L866': "I figured you'd get to the good stuff eventually.",
 'L865': 'Thank God!  If I had to hear one more story about your coiffure...',
 'L864': "Me.  This endless ...blonde babble. I'm like, boring myself.",
 'L863': 'What crap?',
 'L862': 'do you listen to this crap?',
 'L861': 'No...',
 'L860': 'Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
 'L699': 'You always been this selfish?',
 'L698': 'But',
 'L697': "Then that's all you had to say.",
 'L696': 'Well, no...',
 'L695': "You never wanted to go out with 'me, did y

In [82]:
import re
def remove_punc(string):
    new_string = re.sub(r'[^\w\s]', '', string)
    return new_string.lower()

In [83]:
max_len = 25
pairs = []
for line in corpus_movie_lines:
    if corpus_movie_lines[line].reply_to is not None:
        qa_pairs = []
        first = corpus_movie_lines[line].reply_to
        second = line
        qa_pairs.append(remove_punc(lines_dict[first]).split()[:max_len])
        qa_pairs.append(remove_punc(lines_dict[second]).split()[:max_len])
        pairs.append(qa_pairs)

In [84]:
len(pairs)

221616

In [85]:
word_freq = Counter()
for pair in pairs :
    word_freq.update(pair[0])
    word_freq.update(pair[1])
min_freq = 5
words = [w for w in word_freq.keys() if word_freq[w] > min_freq]
word_map = {k:v+1 for v,k in enumerate(words)}
word_map['<unk>'] = len(word_map) + 1
word_map['<start>'] = len(word_map) + 1
word_map['<end>'] = len(word_map) + 1
word_map['<pad>'] = 0

with open("WORDMAP_corpus.json", 'w') as j:
    json.dump(word_map,j)

Counter({'they': 13851,
         'do': 28669,
         'to': 90468,
         'not': 25024,
         'she': 10453,
         'okay': 5777,
         'i': 129808,
         'hope': 1173,
         'so': 15426,
         'wow': 346,
         'lets': 2998,
         'go': 11504,
         'no': 26170,
         'youre': 17378,
         'gonna': 4893,
         'need': 4822,
         'learn': 485,
         'how': 13331,
         'lie': 545,
         'im': 29178,
         'kidding': 582,
         'you': 159012,
         'know': 27311,
         'sometimes': 868,
         'just': 19274,
         'become': 339,
         'this': 28165,
         'persona': 4,
         'and': 44290,
         'dont': 31827,
         'quit': 437,
         'the': 108505,
         'real': 2013,
         'like': 17475,
         'my': 24078,
         'fear': 259,
         'of': 41711,
         'wearing': 303,
         'pastels': 2,
         'what': 43004,
         'good': 8741,
         'stuff': 1257,
         'figured': 403,
  

In [86]:
def encode_question(words, word_map):
    enc_c = [word_map.get(word,word_map['<unk>']) for word in words] + [word_map['<pad>']]*(max_len - len(words))
    return enc_c
def encode_reply(words, word_map):
    enc_c = [word_map['<start>']] + [word_map.get(word,word_map['<unk>']) for word in words] +[word_map['<end>']] + [word_map['<pad>']]*(max_len - len(words))
    return enc_c

In [87]:
encode_question(pairs[0][0], word_map)

[1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [88]:
pairs_encoded = []
for pair in pairs:
    ques = encode_question(pair[0], word_map)
    ans = encode_reply(pair[1], word_map)
    pairs_encoded.append([ques,ans])

In [89]:
pairs_encoded[0]

[[1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [18186,
  1,
  2,
  4,
  18187,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

In [134]:
with open("paires_encoded.json", 'w') as j:
    json.dump(pairs_encoded,j)

In [91]:
class Dataset(Dataset):
    def __init__(self):
        self.pairs = json.load(open("paires_encoded.json"))
        self.dataset_size = len(self.pairs)
    def __getitem__(self,item):
        question = torch.LongTensor(self.pairs[item][0])
        reply = torch.LongTensor(self.pairs[item][1])
        
        return question, reply
    def __len__(self):
        return self.dataset_size

In [135]:
train_loader = torch.utils.data.DataLoader(Dataset(),batch_size = 100, shuffle = True, pin_memory = True)

In [94]:
def create_masks(question, reply_input, reply_target):
    
    def subsequent_mask(size):
        mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
        return mask.unsqueeze(0)
    
    question_mask = question!=0
    question_mask = question_mask.to(device)
    question_mask = question_mask.unsqueeze(1).unsqueeze(1)         # (batch_size, 1, 1, max_words)
     
    reply_input_mask = reply_input!=0
    reply_input_mask = reply_input_mask.unsqueeze(1)  # (batch_size, 1, max_words)
    reply_input_mask = reply_input_mask & subsequent_mask(reply_input.size(-1)).type_as(reply_input_mask.data) 
    reply_input_mask = reply_input_mask.unsqueeze(1) # (batch_size, 1, max_words, max_words)
    reply_target_mask = reply_target!=0              # (batch_size, max_words)
    
    return question_mask, reply_input_mask, reply_target_mask

In [112]:
class Embeddings(nn.Module):
    def __init__(self, vocab_size, d_model, max_len =50, num_layers = 6):
        super (Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = self.create_positional_encoding(max_len, self.d_model) #1, max_len, d_model
        self.te = self.create_positional_encoding(num_layers, self.d_model)#1, num_layer, d_model
    def create_positional_encoding(self, max_len, d_model):
        pe = torch.zeros(max_len, d_model).to(device)
        for pos in range(max_len):
            for i in range(0,d_model,2):
                pe[pos,i] = math.sin(pos/(10000**(2*i/d_model)))
                pe[pos,i+1] = math.cos(pos/(10000**(2*(i+1)/d_model)))
        pe = pe.unsqueeze(0)
        return pe
    def forward(self,encoded_words, layer_idx):
        if layer_idx ==0:
            embeddings = self.embed(encoded_words)*math.sqrt(d_model)#give more importance to the embeddings than the position
        embeddings += self.pe[:,:embeddings.size(1)] #pe is expanded to (batch_size,vocab_size, d_model) for the sum
        embeddings += self.te[:,layer_idx,:].unsqueeze(1).repeat(1,embeddings.size(1),1)
        embeddings = self.dropout(embeddings)
        return embeddings
        

In [96]:
class MultiHeadAttention(nn.Module):
    def __init__(self,heads,d_model):
        super (MultiHeadAttention, self).__init__()
        assert d_model%heads == 0
        self.d_k = d_model//heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)
    def forward(self, query, key, value,mask):
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)
        
        query = query.view(query.shape[0], query.shape[1], heads, self.d_k).permute(0,2,1,3)
        key = key.view(key.shape[0], key.shape[1], heads, self.d_k).permute(0,2,1,3)
        value = value.view(value.shape[0], value.shape[1], heads, self.d_k).permute(0,2,1,3)
        
        scores = torch.matmul(query,key.permute(0,1,3,2))/math.sqrt(query.size(-1))
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = F.softmax(scores, dim =-1)
        weights = self.dropout(weights)
        #(batch_size,8, max_words, max_words) dot (batch_size, 8, max_word, 64) = (batch_size, 8, max_word, 64)
        context = torch.matmul(weights,value).permute(0,2,1,3)
        context = context.view(context.shape[0],-1,self.heads*self.d_k)
        interacted = self.concat(context)
        return interacted
        

In [97]:
class FeedForward(nn.Module):

    def __init__(self, d_model, middle_dim = 2048):
        super(FeedForward, self).__init__()
        
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out

In [114]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads):
        super(EncoderLayer, self).__init__() 
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feedforward = FeedForward(d_model)
        self.layernorm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
    def forward(self, embeddings, mask):
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings,mask))
        interacted = self.layernorm(interacted+embeddings)
        feed_forward_out = self.dropout(self.feedforward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

In [115]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads):
        super(DecoderLayer, self).__init__()
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feedforward = FeedForward(d_model)
        self.layernorm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
    def forward(self,embeddings, encoded, src_mask, target_mask):
        query =self.dropout(self.self_multihead(embeddings, embeddings, embeddings,target_mask))
        query = self.layernorm(query+embeddings)
        interacted = self.dropout(self.src_multihead(query, encoded, encoded,src_mask))
        interacted = self.layernorm(interacted+query)
        feed_forward_out = self.dropout(self.feedforward(interacted))
        decoded = self.layernorm(feed_forward_out + interacted)
        return decoded

In [3]:
class Transformer(nn.Module):
    def __init__(self, d_model, heads,num_layers, word_map):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.vocab= len(word_map)
        self.num_layers = num_layers
        self.emb = Embeddings(self.vocab, self.d_model, self.num_layers)
        self.encoder = EncoderLayer(d_model, heads)
        self.decoder = DecoderLayer(d_model, heads)
        self.logit = nn.Linear(d_model, self.vocab)
    def encode(self, src_words, src_mask):
        src_embeddings = self.embed(src_words,i)
        for i in range(self.num_layers):
            src_embeddings = self.embed(src_words,i)
            src_embeddings = self.encoder(src_embeddings, src_mask)
        return src_embeddings
    
    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        
        for i in range(self.num_layers):
            tgt_embeddings = self.embed(target_words,i)
            tgt_embeddings = self.decoder(tgt_embeddings, src_embeddings, src_mask, target_mask)
        return tgt_embeddings
    def forward(self, src_words, src_mask,target_words, target_mask):
        encoded = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, encoded, src_mask)
        out = F.log_softmax(self.logit(decoded))
        return out

NameError: name 'nn' is not defined

In [117]:
class AdamWarmup():
    def __init__(self, model_size, warmup_steps,optimizer):
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0
    def get_lr(self):
        return math.sqrt(1/self.model_size)*min(self.current_step**(-0.5), self.current_step*self.warmup_steps**(-1.5))
    def step(self):
        self.current_step +=1
        lr = get_lr()
        for param in self.optimizer.param_groups:
            param['lr']= lr
        self.lr = lr
        self.optimizer.step()
            

In [128]:
class LossWithLS(nn.Module):
    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        self.criterion = nn.KLDivLoss(size_average= False, reduce=False)
        self.smooth = smooth
        self.size = size #vocab_size
        self.confidence = 1- smooth
    def forward(self,prediction, target, mask):
        """
        prediction of shape: (batch_size, max_words, vocab_size)
        target and mask of shape: (batch_size, max_words)
        """
        prediction = prediction.view(-1,prediction.size(-1))#(batch_size*max_words, vocab_size)
        target = target.view(-1) #(batch_size* max_words)
        mask = mask.float().view(-1)
        labels = prediction.data.clone()
        labels.fill_(self.smooth/(self.size - 1))
        labels.scatter_(1, target.data.unsqueeze(1), self.confidence)
        loss = self.criterion(prediction, labels)#(batch_size*max_words, vocab_size)
        loss = (loss.sum(1)*mask).sum()/mask.sum()
        return loss
        

In [133]:
d_model = 512
heads = 8
num_layers = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10

with open('WORDMAP_corpus.json', 'r') as j:
    word_map = json.load(j)
    
transformer = Transformer(d_model = d_model, heads = heads, num_layers = num_layers, word_map = word_map)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(len(word_map), 0.1)

In [None]:
#Gradient accumulation: when your RAM doesn't support your batch_size
desired_batch =100
tolerated_batch = 50 
acc_step = desired_batch/tolerated_batch

#when accumulating the loss
loss = loss/acc_step
loss.backward()
if ((i + 1)%acc_step ==0):
    optmizer.step()
    optimizer.zero_grad()

In [132]:
def train(train_loader, transformer, criterion, epoch):
    transformer.train()
    sum_loss = 0
    count = 0
    
    for i, (question, reply) in enumerate(train_loader):
        samples = question.shape[0] #batch_size
        reply = reply.to(device)
        question = question.to(device)
        reply_target = reply[:,:-1]
        reply_input = reply[:,1:]
        
        question_mask, reply_input_mask, reply_target_mask = create_masks(question, reply_input, reply_target)
        
        out = transformer(question, question_mask, reply_input, reply_input_mask)
        loss = criterion(out,reply_target, reply_target_mask)
        
        # Backprop
        transformer_optimizer.optimizer.zero_grad()
        loss.backward()
        transformer_optimizer.step()
        
        sum_loss += loss.item() * samples
        count += samples
        
        if i % 100 == 0:
            print("Epoch [{}][{}/{}]\tLoss: {:.3f}".format(epoch, i, len(train_loader), sum_loss/count))

In [None]:
def evaluate(transformer,question, question_mask, max_len,word_map):
    rev_word_map = {v:k for k,v in word_map.items()}
    transformer.eval()
    start_token = word_map['<start>']
    encoded = transformer.encode(question, question_mask)
    words = torch.LongTensor([[start_token]]).to(device)
    
    for step in range(max_len - 1):
        #since the words are being generated, the class and functions can't be used as the size are changing
        size = words.shape[1]
        target_mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
        target_mask = target_mask.to(device).unsqueeze(0).unsqueeze(0)
        decoded = transformer.decode(words, target_mask, encoded, question_mask)
        #decoded (1x1xvocab_size) = batch x max_words x vocab_size
        predictions = decoded[:,-1]
        #predictions= 1 x vocab_size
        _, next_word = torch.max(predictions, dim = 1)
        next_word = next_word.item()
        if next_word == word_map['<end>']:
            break
        words = torch.cat([words, torch.LongTensor([[next_word]]).to(device)], dim = 1)
        
    if words.dim() == 2:
        words = words.squeeze(0)
        words = words.tolist()
        
    sen_idx = [w for w in words if w not in {word_map['<start>']}]
    sentence = ' '.join([rev_word_map[sen_idx[k]] for k in range(len(sen_idx))])
    
    return sentence

In [None]:
for epoch in range(epochs):
    
    train(train_loader, transformer, criterion, epoch)
    
    if epoch % 5 == 0:
        state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
        torch.save(state, 'checkpoint_' + str(epoch) + '.pth.tar')

In [None]:
checkpoint = torch.load('checkpoint.pth.tar')
transformer = checkpoint['transformer']

In [None]:
while(1):
    question = input("Question: ") 
    if question == 'quit':
        break
    max_len = input("Maximum Reply Length: ")
    enc_qus = [word_map.get(word, word_map['<unk>']) for word in question.split()]
    question = torch.LongTensor(enc_qus).to(device).unsqueeze(0)
    question_mask = (question!=0).to(device).unsqueeze(1).unsqueeze(1)  
    sentence = evaluate(transformer, question, question_mask, int(max_len), word_map)
    print(sentence)