In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import time

device = "cpu"

# Part 1: Data Preprocessing

In [2]:
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("data", corpus_name)

In [3]:
# to read or write files use open
# to access file module use os
lines_filepath = os.path.join(corpus, "movie_lines.txt")
conv_filepath = os.path.join(corpus, "movie_conversations.txt")

In [4]:
# visualise some lines
with open(lines_filepath, 'r') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip()) 
# displayOrder: lineid charID movieID charNAME utterance
print()
# visualise some conversations
with open(conv_filepath, 'r') as file:
    conv = file.readlines()
for line in conv[:8]:
    print(line.strip()) 
# displayOrder: char1ID char2ID movieID lineIDs

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']


In [5]:
# Splits each line of the file into a dictionary of fields(lineID, charID, movieID, characterNAME, text)
line_fields = ["lineID", "characterID", "movieID", "character", "text"]
lines = {} # empty dictionary
with open(lines_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        # Extarct fields
        lineObj = {}
        for i, field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj # key is lineID

In [6]:
# lines
list(lines.items())[:2]

[('L1045',
  {'lineID': 'L1045',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'They do not!\n'}),
 ('L1044',
  {'lineID': 'L1044',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': 'They do to!\n'})]

In [7]:
# process the conversations
# Group fields of lines from 'LoadLines' into conversatons based on 'movie_conversations.txt'
conv_fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
conversations = []
with open(conv_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        # Extract fields
        convObj = {}
        for i, field in enumerate(conv_fields):
            convObj[field] = values[i]
#       Convert string resulted from split to list, since ConvObj["utterancesIDs"] == "['L8299', ...]"
        # remember that utterance ID is the LineID
        lineIDs = eval(convObj["utteranceIDs"])
        # Reassamble lines
        convObj["lines"] = []
        for lineID in lineIDs:
            convObj["lines"].append(lines[lineID])
        conversations.append(convObj)

In [8]:
conversations[0]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': [{'lineID': 'L194',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'lineID': 'L195',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'lineID': 'L196',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'lineID': 'L197',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

In [9]:
conversations[0]["lines"][0]["text"].strip() # with strip '/n' is discarded

'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.'

In [10]:
# processing the dataset part 3
# Extract pair of sentences from conversations
# ques ans pair
qa_pairs = []
for conversation in conversations:
    for i in range(len(conversation["lines"]) - 1):
        inputLine = conversation["lines"][i]["text"].strip()
        targetLine = conversation["lines"][i+1]["text"].strip()
        # filter wrong results
        if inputLine and targetLine:
            qa_pairs.append([inputLine, targetLine])

In [11]:
qa_pairs[2]

['Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]

###### Formatted lines seperated by tab

In [12]:
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# write new csv file note that it is tab seperated not comma seperated
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter = delimiter, lineterminator='\n')
    for pair in qa_pairs:
        writer.writerow(pair)
print("Done writing to file")


Writing newly formatted file...
Done writing to file


In [13]:
# visualise some lines
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
with open(datafile, 'rb') as file:
    lines = file.readlines()
for line in lines[:10]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\n"
b'Why?\tUnsolved myster

This Class keeps mapping of words to ```indexes```

In [14]:
# Processing the words
PAD_token = 0 # used for padding short sentences
SOS_token = 1 # Start of sent token
EOS_token = 2 # End of Sent token

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count SOS, EOS, PAD
        
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    # add index(unique value) for each word
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words # just opp of 3rd line
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1
            
    # remove words that don't pass min count
    def trim(self, min_count):
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
                
        # print words that are kept
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)))
        # Reinitailize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count SOS, EOS, PAD

        for word in keep_words:
            self.addWord(word)

In [15]:
sen = "hi ali bin arshad"
sen.split()

['hi', 'ali', 'bin', 'arshad']

In [16]:
# turn unicode string to plain ASCII
# nfd: normal form decomposed
# mn: normalized
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c)!= 'Mn')

In [17]:
''.join(['a','l','i'])

'ali'

In [18]:
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    # re.sub = Subsitute any .!? by a whitespace + the character
    # r is to not consider \1 as a character(r to escape a backslash)
    s = re.sub(r"([.!?])", r" \1", s)
    # remove any char that is not a sequence of lower or upper character
    # + means one or more
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # remove seq of whitespace charactersc
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [None]:
# testing the funcion
normalizeString("aa123bcd!s's    ad?")

'aa bcd !s s ad ?'

In [None]:
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

# Read query/response pairs and return voc object
def readVocs(datafile, corpus_name):
    # Read the file and split into lines
    print("Reading and processing file.....Plaese Wait!")
    lines = open(datafile, encoding='utf-8').read().strip().split('\n') # conversatons are splitted by \n
    # Split every line into pair and normalize
    pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines] # for each char(represented by pair) we are gonna normalize
    print("Done Reading!")
    voc = Vocabulary(corpus_name)
    return voc, pairs
    


In [None]:
# a = [i for i in range(0,10)]
# a

In [None]:
# lines[0]
# lines[0].split('\t')

In [None]:
# len(pairs)

In [None]:
# pairs[0][0].split()

In [None]:
# Returns True if boths sentences in a pair 'p' are under the MAX_LENGTH threshold
MAX_LENGTH = 10 # Max sentence length to consider
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
    # if True keep the pair else ignore
# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
#      Word 2 index word conversion occurs here
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)


Start preparing training data ...
Reading and processing file.....Plaese Wait!
Done Reading!
Read 221282 sentence pairs
Trimmed to 64271 sentence pairs
Counting words...
Counted words: 18008

pairs:
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [None]:
# We don't want our model to have words which appeared less than 3 times
MIN_COUNT = 3

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words less than MIN_COUNT from voc
    voc.trim(MIN_COUNT) # Function from class voc
    # filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sequence = pair[0]
        output_sequence = pair[1]
        keep_input = True
        keep_output = True
        # Check input Sequence
        for word in input_sequence.split(' '):
            if word not in voc.word2index: # voc.word2index contains filtered words
                keep_input = False
                break
        # now check output sequence
        for word in output_sequence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break
                
        # only keep words that are not trimmed in in or out seq
        if keep_input and keep_output:
            keep_pairs.append(pair)
            
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs
        
# trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165, 0.8272 of total


# Data preparation

![title](img/batch_Sentence.png)

we need to be able to index our batch along time, and across all sequences in the batch. Therefore, we transpose our input batch shape to (max_length, batch_size), so that indexing across the first dimension returns a time step across all sentences in the batch. We handle this transpose implicitly in the zeroPadding function.

![title](img/batch_sent2.png)

In [None]:
# preparing data part1
# returns converted word 2 index
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')]+ [EOS_token]

In [None]:
pairs[1][0]

'you have my word . as a gentleman'

In [None]:
# Testing function
indexesFromSentence(voc,pairs[1][0])
# 2 at the end of output represents EOS_token

[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [None]:
# Define some samples for testing
inp = []
out = []
# print(pairs[1])
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]

In [None]:
a = ['A', 'B', 'C', 'D', 'E']
b = [1, 2, 3]
print(list(zip(a,b)))
list(itertools.zip_longest(a,b))

[('A', 1), ('B', 2), ('C', 3)]


[('A', 1), ('B', 2), ('C', 3), ('D', None), ('E', None)]

In [None]:
# Preparing data for model part 2
a = [[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]


In [None]:
list(itertools.zip_longest(*a, fillvalue = 0))

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [None]:
def zeroPadding(l, fillValue = 0):
    return list(itertools.zip_longest(*l, fillvalue=fillValue))

In [None]:
leng = [len(ind) for ind in indexes]
max(leng)

10

In [None]:
# Test the function
test_result = zeroPadding(indexes)
print(len(test_result))
test_result

10


[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [None]:
# data modelling part 3
# l contains the index value of each word
# remember that pad+token is 0
def binaryMatrix(l, value=0):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

In [None]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

inputVar function converts sentences to tensors

In [None]:
#  Returns padded input tensor and a tensor of lengths for each or the sequences in the batch
# l is going to be questions not replies, i.e. only what the first char said
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

similar to inputVar but in binaryMaskTensor every element that is a PAD_TOKEN is 0 and all others are 1

In [None]:
# returns padded target sequence tensor, padding mask and max target len
# this one is for all replies
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [None]:
pairs[20000]

['well well well . huh ?', 'yeah .']

###### We use mini batches to speed up the process

In [None]:
# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    # Sorts the ques in descending order as shown in img above
    pair_batch.sort(key=lambda x:len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    # assert len(inp) == lengyh[0]
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

In [None]:
# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for i in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input variable:")
print(input_variable)
print("lengths: ", lengths)
print("target_variable:")
print(target_variable)
print("mask:")
print(mask)
print("max_target_len:", max_target_len)


input variable:
tensor([[   7,  410, 1118,  318,  318],
        [ 293,   25,    4,    4,    4],
        [ 117,  351,    4,    2,    2],
        [ 380,  380,    4,    0,    0],
        [2798,  699,    2,    0,    0],
        [  83,    6,    0,    0,    0],
        [  21,    2,    0,    0,    0],
        [   4,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths:  tensor([9, 7, 5, 3, 3])
target_variable:
tensor([[   4,   25,  995,  318,    7],
        [   4,  770,  219,   40,   92],
        [6575,  117,    6,   83,   67],
        [  25,   41,    2,  329,   12],
        [ 148,    4,    0,  318,  125],
        [ 380,    2,    0,   40,    4],
        [  62,    0,    0,   53,    2],
        [  96,    0,    0, 6925,    0],
        [ 159,    0,    0,    6,    0],
        [   2,    0,    0,    2,    0]])
mask:
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 0, 1, 1],
        [1, 1, 0, 1, 1],
        [1, 0, 

# Building The model

GRUs(gatted recurrent unit) are better than LSTMs

!['title'](img/rnnstonn.png)

!['title'](img/definingModels.png)

In [None]:
# encoder class inherited from nn.Module
# hidden size: how many RNN cells are there in hidden layer
# embedding: converts index to dense vector of values
# size of embedding = number of input features
# seq is the timestep
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        
        # the input_size & hidden_size params are both set to 'hidden_size'
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
    
    # overridden forward function of class nn.Module
    def forward(self, input_seq, input_lengths, hidden=None):
        # input_seq: batch of input sentences; shape=(max_length, batch_size)
        # input_lengths: list of sentence lengths corresponding to each sentence in the batch
        # hidden state of shape:(n_layers times num_directions(2 in this case), batch_size, hidden_size)
        
        # convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through  GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional gru outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output from final hidden state
        # outputs: (timesteps, batch, hidden_size)
        return outputs, hidden
        # outputs: the output features h_t from the last layer of GRU, for each timestep(sum of bidirectional outputs)
        # hidden: hidden state for the last timestep, of shape=(n_layers x mum_directions, batch_size, hidden_size)
    

!['title'](img/forwardExp.png)

In [None]:
# Understanding Pack Padded Sequence

!["title"](img/understandingPackedPaddedSeq.png)

In [None]:
# decoder is built with attention mechanism

!['title'](img/attentionMechanism.png)

!['title'](img/attentionOp.png)

!['title'](img/attentionImplementation.png)

In [None]:
# Luong attention layer
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size): # method can be dot, general or concat
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))
        
    def dot_score(self, hidden, encoder_output): # hidden is the decoder output at certain timestep
     # Element wise multiplication of current target state with the encoder output and sum them
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)
    
    def forward(self, hidden, encoder_outputs):
        # hidden of shape:(1,batch_size, hidden_size)
        # encoder output shape: (max_length, batch_size, hidden_size)
        # comment 3:(1, batch_size, hidden_size) * (max_length, batch_size, hidden_size) = (max_length, batch_size, hidden_size)
        
        # calculate the attention weights(energies)
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs) # (max_length, batch_size)
        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()
        # Return the softmax normalized probability scores(with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1) # (batch_size, 1, max_length)
    # softmax: sum of entire row is 1
    

In [None]:
# understanding of comment 3
# summing across dim=2 means we're summing all the cols of each row
a = torch.randn(5,3,7)
print(a)
torch.sum(a, dim=2)

tensor([[[ 6.9640e-02,  2.1571e-01, -1.2635e+00, -3.6372e-01, -7.5780e-01,
           2.4598e-01,  1.0155e+00],
         [-4.5015e-02,  1.5900e+00,  4.4714e-01, -6.8561e-01,  1.6686e-01,
          -1.7394e-01, -3.0888e-01],
         [ 4.9842e-01, -1.3865e+00, -9.7843e-02,  4.4360e-01, -1.1801e+00,
           9.3310e-01, -2.9521e-01]],

        [[ 4.4978e-01, -2.0821e-03, -2.3229e+00, -3.8986e-01,  5.7675e-01,
           2.0282e+00, -4.9625e-01],
         [ 5.4959e-01,  6.8952e-01, -4.6726e-01, -7.1014e-01, -8.5396e-01,
           1.4278e-01, -4.8880e-01],
         [-1.4950e+00,  1.2706e-01,  1.2325e-02,  5.3121e-01, -4.4726e-01,
          -3.4532e-01,  1.8732e+00]],

        [[ 1.1064e+00, -2.4615e+00, -2.8076e-01, -1.0520e+00,  5.5258e-01,
          -4.4130e-01, -8.8252e-01],
         [ 1.8633e+00,  4.0903e-01, -3.3262e-02,  1.6605e+00, -4.5381e-01,
           8.4681e-01,  1.1736e+00],
         [-4.1982e-01,  7.1894e-01, -9.7942e-02, -2.8754e-02, -2.0465e+00,
          -1.3710e-01,  1

tensor([[-0.8382,  0.9905, -1.0845],
        [-0.1564, -1.1383,  0.2562],
        [-3.4592,  5.4662, -0.1637],
        [-2.0183, -0.7168, -0.3677],
        [-1.5930,  3.6495,  3.5803]])

In [None]:
# Designing the Decoder 1
# We are using the attention to build decoder
# note that data is(max_length,batch_size)
# we feed data row by row to every GRU   
# output from encoder is fed to 1st timestep of decoder
# dropout drops random num of neurons in each layer, helps neurons not to be dependent on each other


class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.attn = Attn(attn_model, hidden_size)
        
    def forward(self, input_step, last_hidden, encoder_outputs):
        # input_step: one time step (one word) of input sequence batch; shape=(1, batch_size)
        # last_hidden: final hidden state of encoder GRU; shape=(n_layers x num_directions, batch_size, hidden_state)
        # encoder_outputs: encoder model's output; shape=(seq_len, batch, num_directions * hidden_size)
        # Note: we run this one step (batch of words) at a time
        
        # Get embedding of current input word
        # nn.Embedding as a lookup table where the key is the word index and the value is the corresponding word vector
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # rnn_output of shape = (1, batch, num_directions * hidden_size)
        # hidden of shape = (num_layers * num_directions, batch, hidden_size)
        # hidden is the hidden state of the current time step of GRU
        
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new weighted sum context vector
        # (batch_size, 1m max_lengths) bmm(batch multiplication) with (batch_size, max_length, hidden) = (batch_size, 1, hidden)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted vectors and GRU output
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1) # 1 is the dimencion of concatenation
        # concat_input: (batch_size, hidden_size * 2)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # return output and final hidden state
        return output, hidden
        # output: sofmax normalized tensor giving probabilies of each word being the correct next word in decoded sequencw
        # shape: (batch_size, voc.num_words)
        # hidden: final hidden state of GRU; shape=(n_layers x num_directions, batch_size, hidden_size)
        

# We're done with building the architecture, Let's start Training Code

### Loss Function

Since we are dealing with batches of padded sequences, we cannot simply consider all elements of the tensor when calculating loss. We define maskNLLLoss to calculate our loss based on our decoder’s output tensor, the target tensor, and a binary mask tensor describing the padding of the target tensor. This loss function calculates the average negative log likelihood of the elements that correspond to a 1 in the mask tensor.

In [None]:
# calculate loss only for non 0 elements
# mask is what was returned above ▲
# we calc loss btw decoder output and target

# NLLL: Negative Log Likelihood Loss
# def maskNLLLoss(decoder_out, target, mask):
#     nTotal = mask.sum()# gives how many non zerro elements we have, that we consider
#     target = target.view(-1,1)
#     # decoder_out shape: (batch_size, vocab_size), target_size = (batch_size, 1)
#     gathered_tensor = torch.gather(decoder_out, 1, target)
#     # calc the NLLL
#     crossEntropy = -torch.log(gathered_tensor)
#     # Select the non-zer0 elements
#     loss = crossEntropy.masked_select(mask) # loss is only for non-zere
#     # calc teh mean of loss
#     loss = loss.mean()
# #     loss = loss.to(device) for cuda
#     return loss, nTotal.item()

def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()


#### Teacher forcing:

Adv is that if wrong word is generated, it's not fed into the nn

!['title'](img/teacherForcing.png)

In [None]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

#     # Set device options
#     input_variable = input_variable.to(device)
#     lengths = lengths.to(device)
#     target_variable = target_variable.to(device)
#     mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
#     decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
#             decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place, solve exploding gradient problem
    # parameters are weights
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [None]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

Greedy decoding is used when we are not using teacher forcing

In [None]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [None]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
#     input_batch = input_batch.to(device)
#     lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

## Run Our Model

In [None]:
# Configure models
model_name = 'cb_model'
# attn_model = 'dot'
attn_model = 'general'
# attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
loadFilename = os.path.join("data" ,"save", model_name, "cornell movie-dialogs corpus",
                           '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size), "4000_checkpoint.tar")
#                            '{}_checkpoint.tar'.format(checkpoint_iter))
# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# # Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [None]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")
start = time.time()
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)
end = time.time()
print("-------------TIME TAKEN----------------")
print(end - start)


Building optimizers ...
Starting Training!
Initializing ...
Training...
-------------TIME TAKEN----------------
4.194089651107788


In [None]:
# Run the following to chat

In [None]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

evaluateInput(encoder, decoder, searcher, voc)

> hello
Bot: hello . . . . .
> whats your name?
Error: Encountered unknown word.
> what's your name?
Bot: edward . . . . .
> So boone
Bot: i m not gonna get some things
> where are you
Bot: i m here . . . .
> you are surely funny
Bot: i am . . . .
> thanks alot
Error: Encountered unknown word.
> thanks
Bot: i ll see you again . . .
> bye bye
Bot: bye . . . . .
> make sure to have your car locked
Bot: i m not going to be your wife .
> i love you
Bot: i love you too . . .
