In [38]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
import torch.nn.utils.prune
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

In [2]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")

In [3]:
#Data Preprocessing

In [4]:
lines_path = os.path.join("cornell movie-dialogs corpus","movie_lines.txt")
convs_path = os.path.join("cornell movie-dialogs corpus","movie_conversations.txt")

In [5]:
# Visualize some lines
with open(lines_path,'r',encoding='iso-8859-1') as file:
    lines=file.readlines()
for line in lines[:8]:
    print(line.strip())   

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [6]:
#Splitting each line into their respective fields
line_fileds = ['line_ID','character_ID','movie_ID','character','dialogue']
dialogues={}
# Visualize some lines
with open(lines_path,'r',encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        #Extracting fields
        obj = {}
        for i,field in enumerate(line_fileds):
            obj[field]=values[i]
        dialogues[obj['line_ID']]=obj

In [7]:
# Now grouping fileds of lines in movie conversation
conv_filed = ["character_ID_1","character_ID_2","movie_ID","utterance_ID"]
conversations = []
with open(convs_path,'r',encoding='iso-8859-1') as f:
    for line in f:
        values=line.split(" +++$+++ ")
        obj = {}
        for i,field in enumerate(conv_filed):
            obj[field]=values[i]
        lineID = eval(obj["utterance_ID"])
        obj["lines"]=[]
        for ID in lineID:
            obj["lines"].append(dialogues[ID])
        conversations.append(obj)

In [8]:
#Extracting pair of sentences from conversations
qa_pairs = []
for convo in conversations:
    for i in range(len(convo["lines"])-1):
        inline = convo["lines"][i]["dialogue"].strip()
        outline = convo["lines"][i+1]["dialogue"].strip()
        # filtering the wrong samples, if either of the list is empty
        if inline and outline:
            qa_pairs.append([inline,outline])

In [9]:
datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter,"unicode_escape"))

#Wrinting the preprocess data into csv
print('Wrinting the preprocess data into csv')
with open(datafile,'w',encoding="utf-8") as outputfile:
    writer = csv.writer(outputfile,delimiter=delimiter)
    for pair in qa_pairs:
        writer.writerow(pair)
print("Writing the file is successfull")

Wrinting the preprocess data into csv
Writing the file is successfull


In [10]:
# Visualize some lines
with open(datafile,'r') as file:
    lines=file.readlines()
for line in lines[:8]:
    print(line.strip())   

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.	Well, I thought we'd start with pronunciation, if that's okay with you.
Well, I thought we'd start with pronunciation, if that's okay with you.	Not the hacking and gagging and spitting part.  Please.
Not the hacking and gagging and spitting part.  Please.	Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
You're asking me out.  That's so cute. What's your name again?	Forget it.
No, no, it's my fault -- we didn't have a proper introduction ---	Cameron.
Cameron.	The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.	Seems like she could get a date easy enough...
Why?	Unsolved mystery.  She used to be really popular when she started high sch

In [11]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

class vocab:
    def __init__(self,name):
        self.name=name
        self.word2index={}
        self.word2count={}
        self.index2word={PAD_token:"PAD",SOS_token:"SOS",EOS_token:"EOS"}
        self.num_words=3
        
    def addSent(self,sent):
        for word in sent.split(' '):
            self.addWord(word)
            
    def addWord(self,word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word]=1
            self.index2word[self.num_words]=word
            self.num_words +=1
        else:
            self.word2count[word] +=1
            
    #removing the words which have count lower than the threshold value
    def trim(self,min_count):
        keep_words = []
        for k,v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('Keep words  {} / {} = {:.4f}'.format(len(keep_words),len(self.word2index),len(keep_words)/len(self.word2index)))
        # Reinitialize the dictionary
        self.word2index={}
        self.word2count={}
        self.index2word={PAD_token:"PAD",SOS_token:"SOS",EOS_token:"EOS"}
        self.num_words=3
        for word in keep_words:
            self.addWord(word)

In [12]:
# function to convert unicode string into its respective ASCII value
def unicodetoASCII(s):
    return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c)!='Mn')

In [13]:
# cleaning the dataset from noise
def normalizing(s):
    s = unicodetoASCII(s.lower().strip())
    # replacing special characters by whitespace and the special character
    s = re.sub(r"([.!?])",r" \1",s)
    # removing all characters which are not letters
    s = re.sub(r"[^a-zA-Z.!?]+",r" ",s)
    # removing whitespaces
    s = re.sub(r"\s+",r" ",s).strip()
    return s

In [14]:
# now reading the datafile saved
print("Reading and processing the text file")
lines = open(datafile,encoding="utf-8").read().strip().split('\n')
#splitting every line into pairs and normalize
pairs = [[normalizing(s) for s in pair.split('\t')] for pair in lines]
print("Process Successfull")
voc = vocab('cornell movie-dialogs corpus')

Reading and processing the text file
Process Successfull


In [15]:
# Checking the sentences length
ML = 10
def filterpair(p):
    return len(p[0].split()) < ML and len(p[1].split()) < ML

def filterpairs(pairs):
    return [pair for pair in pairs if filterpair(pair)]

In [16]:
pairs = [pair for pair in pairs if len(pair)>1]
print('Total conversations {}'.format(len(pairs)))
pairs = filterpairs(pairs)
print('After filtering, Total conversations {}'.format(len(pairs)))

Total conversations 221282
After filtering, Total conversations 64271


In [17]:
# now creating the vocablary using the pairs
for pair in pairs:
    voc.addSent(pair[0])
    voc.addSent(pair[1])
print("Counted words = {}".format(voc.num_words))
for pair in pairs[:10]:
    print(pair)   

Counted words = 18008
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [18]:
# removing rare words
def trimRareWords(voc,pairs,minc):
    voc.trim(minc)
    # Filtering out pairs with trim words
    keep_pairs=[]
    for pair in pairs:
        inputsent = pair[0]
        outputsent = pair[1]
        ki = True
        ko = True
        for word in inputsent.split(' '):
            if word not in voc.word2index:
                ki = False
                break
        for word in outputsent.split(' '):
            if word not in voc.word2index:
                ko = False
                break
        if ki and ko:
            keep_pairs.append(pair)
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs),len(keep_pairs),len(keep_pairs)/len(pairs)))
    return keep_pairs
            

In [19]:
pairs = trimRareWords(voc,pairs,3)

Keep words  7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165, 0.8272 of total


In [20]:
def indexfromsent(voc,sent):
    return [voc.word2index[word] for word in sent.split(' ')] + [EOS_token]

In [21]:
def zeropadding(l,fv=0):
    return list(itertools.zip_longest(*l,fillvalue=fv))

In [22]:
def binarymatrix(l,val=0):
    m=[]
    for i,seq in enumerate(l):
        m.append([])
        for token in seq:
            if token==PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

In [23]:
def inputVar(l,voc):
    indexes_batch = [indexfromsent(voc,sent) for sent in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeropadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar,lengths

In [24]:
def outputVar(l,voc):
    indexes_batch = [indexfromsent(voc,sent) for sent in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeropadding(indexes_batch)
    mask = binarymatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar,mask, max_target_len

In [25]:
def batch2TrainData(voc,pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(' ')),reverse=True)
    inbatch,outbatch = [],[]
    for pair in pair_batch:
        inbatch.append(pair[0])
        outbatch.append(pair[1])
    inp,length = inputVar(inbatch,voc)
    out,mask,max_len = outputVar(outbatch,voc)
    return inp,length,out,mask,max_len

In [33]:
class EncoderRNN(nn.Module):
    def __init__(self,hidden_size,embedding,n_layers=1,dropout=0):
        super(EncoderRNN,self).__init__()
        self.n_layers=n_layers
        self.hidden_size=hidden_size
        self.embedding=embedding
        self.gru = nn.GRU(hidden_size,hidden_size,n_layers,dropout=(0 if n_layers==1 else dropout), bidirectional=True)
    
    def forward(self,input_seq,input_lengths,hidden=None):
        embedded = self.embedding(input_seq)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded,input_lengths)
        outputs, hidden = self.gru(packed,hidden)
        outputs,_ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:,:,:self.hidden_size]+outputs[:,:,self.hidden_size:]
        return outputs,hidden

In [27]:
class Attn(torch.nn.Module):
    def __init__(self,method,hidden_size):
        super(Attn,self).__init__()
        self.method=method
        self.hidden_size=hidden_size
    
    def dot_score(self,hidden,encoder_output):
        return torch.sum(hidden*encoder_output,dim=2)
    
    def forward(self,hidden,encoder_outputs):
        attn_engeries = self.dot_score(hidden,encoder_outputs)
        attn_engeries=attn_engeries.t()
        return F.softmax(attn_engeries,dim=1).unsqueeze(1)

In [28]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self,attn_model,embedding,hidden_size,output_size,n_layers=1,dropout=0.1):
        super(LuongAttnDecoderRNN,self).__init__()
        self.attn_model = attn_model
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding_dropout = nn.Dropout(self.dropout)
        self.gru = nn.GRU(self.hidden_size,self.hidden_size,self.n_layers,dropout=(0 if self.n_layers==1 else self.dropout))
        self.concat = nn.Linear(self.hidden_size*2,self.hidden_size)
        self.out = nn.Linear(self.hidden_size,self.output_size)
        self.atn = Attn(self.attn_model,self.hidden_size)
        
    def forward(self,input_step,last_hidden,encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output, hidden = self.gru(embedded,last_hidden)
        attn_weights = self.atn(rnn_output,encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0,1))
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output,context),1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output,dim=1)
        return output,hidden

In [35]:
def maskNLLLoss(decoder_out,target,mask):
    nTotal = mask.sum()
    target = target.view(-1,1)
    gathered_tensor = torch.gather(decoder_out,1,target)
    crossEntropy = -torch.log(gathered_tensor)
    loss = crossEntropy.masked_select(mask)
    loss = loss.mean()
    loss = loss.to(device)
    return loss,nTotal.item()

In [40]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=ML):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals



In [41]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [42]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [44]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=ML):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [45]:
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [46]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!


NameError: name 'save_dir' is not defined