## Building a Chatbot : PyTorch

In [1]:
import torch
import torch.nn as nn  #neural networks
from torch import optim  #optimizer
import torch.nn.functional as F  #relu, softmax, etc.
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

In [2]:
# True if GPU
CUDA = torch.cuda.is_available()
CUDA

True

In [3]:
# transfer data in GPU, or CPU if not GPU
device = torch.device("cuda" if CUDA else "cpu")
device

device(type='cuda')

### Part 1: Data Preprocessing

In [4]:
lines_filepath = os.path.join("cornell movie-dialogs corpus","movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus","movie_conversations.txt")

In [5]:
# visualize some lines
with open(lines_filepath, "r") as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip())
    
# fields: lineID, charID, movieID, charname, line

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [6]:
# split each line into dictionary of fields
line_fields = ["lineID", "charID", "movieID", "charname", "line"]
lines = {}
with open(lines_filepath, "r", encoding="iso-8859-1") as f:  
    """
    UTF-8 is a multibyte encoding that can represent any Unicode character. 
    ISO 8859-1 is a single-byte encoding that can represent the first 256 Unicode characters. 
    Both encode ASCII exactly the same way.    
    """
    for line in f:
        values = line.split(" +++$+++ ")
        # extract fields
        lineObj = {}
        for i,field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj["lineID"]] = lineObj  #each line corresponding to lineID

In [7]:
lines

{'L1045': {'lineID': 'L1045',
  'charID': 'u0',
  'movieID': 'm0',
  'charname': 'BIANCA',
  'line': 'They do not!\n'},
 'L1044': {'lineID': 'L1044',
  'charID': 'u2',
  'movieID': 'm0',
  'charname': 'CAMERON',
  'line': 'They do to!\n'},
 'L985': {'lineID': 'L985',
  'charID': 'u0',
  'movieID': 'm0',
  'charname': 'BIANCA',
  'line': 'I hope so.\n'},
 'L984': {'lineID': 'L984',
  'charID': 'u2',
  'movieID': 'm0',
  'charname': 'CAMERON',
  'line': 'She okay?\n'},
 'L925': {'lineID': 'L925',
  'charID': 'u0',
  'movieID': 'm0',
  'charname': 'BIANCA',
  'line': "Let's go.\n"},
 'L924': {'lineID': 'L924',
  'charID': 'u2',
  'movieID': 'm0',
  'charname': 'CAMERON',
  'line': 'Wow\n'},
 'L872': {'lineID': 'L872',
  'charID': 'u0',
  'movieID': 'm0',
  'charname': 'BIANCA',
  'line': "Okay -- you're gonna need to learn how to lie.\n"},
 'L871': {'lineID': 'L871',
  'charID': 'u2',
  'movieID': 'm0',
  'charname': 'CAMERON',
  'line': 'No\n'},
 'L870': {'lineID': 'L870',
  'charID': 'u

### Part 2: Processing the Dataset

In [8]:
# group fields from 'loadlines' into conversation based on movie_conversations.txt
conv_fields = ["char1ID", "char1ID", "movieID", "utteranceIDs"]
conversations = []
with open(conv_filepath, "r", encoding="iso-8859-1") as f:  
    for line in f:
        values = line.split(" +++$+++ ")
        # extract fields
        convObj = {}
        for i,field in enumerate(conv_fields):
            convObj[field] = values[i]
        # convert string results from split to list
        lineIds = eval(convObj["utteranceIDs"])
        # reassemble lines
        convObj["lines"] = []
        for lineId in lineIds:
            convObj["lines"].append(lines[lineId])            
        conversations.append(convObj)

In [9]:
conversations[0]

{'char1ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': [{'lineID': 'L194',
   'charID': 'u0',
   'movieID': 'm0',
   'charname': 'BIANCA',
   'line': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'lineID': 'L195',
   'charID': 'u2',
   'movieID': 'm0',
   'charname': 'CAMERON',
   'line': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'lineID': 'L196',
   'charID': 'u0',
   'movieID': 'm0',
   'charname': 'BIANCA',
   'line': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'lineID': 'L197',
   'charID': 'u2',
   'movieID': 'm0',
   'charname': 'CAMERON',
   'line': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

### Part 3: Processing the Dataset

In [10]:
# extract pairs of sentences from conversations
qa_pairs = []
for conversation in conversations:
    # iterate over all lines of conversations
    for i in range(len(conversation["lines"]) - 1): 
        inputLine = conversation["lines"][i]["line"].strip()
        targetLine = conversation["lines"][i+1]["line"].strip()
        # filter wrong samples (if one of lists is empty)
        if inputLine and targetLine:
            qa_pairs.append([inputLine, targetLine])

In [11]:
qa_pairs

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.'],
 ['Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does."],
 ["The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
  'Seems like she could get a date easy enough...'],
 [

In [12]:
len(qa_pairs)

221282

### Part 4: Processing the Dataset

In [13]:
# define path to new file
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
delimiter = "\t" 
# unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

In [14]:
# write new csv file
print("\nWriting newly formatted file...")
with open(datafile, "w", encoding="utf-8") as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter) 
    # seprarated qa pair by tab
    for pair in qa_pairs:
        writer.writerow(pair)
        # separated conversations by new line 
print("Done writing to file")


Writing newly formatted file...
Done writing to file


In [15]:
# visualize some lines
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
with open(datafile, "rb") as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tU

### Processing the Words

In [33]:
PAD_token = 0  #used for padding short sentences 
SOS_token = 1  #start-of-sentence token
EOS_token = 2  #end-of-sentence token

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:"PAD", SOS_token:"SOS", EOS_token:"EOS"}
        self.num_words = 3  #count SOS, EOS, PAD
    
    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)
    
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1
            
    # remove words below a certain count threshold
    def trim(self, min_count):
        keep_words = []
        for k,v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
                
        print("keep_words {} / {} = {:.4f}".format(len(keep_words), 
                                                   len(self.word2index),
                                                   len(keep_words) / 
                                                  len(self.word2index)))
        # reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:"PAD", SOS_token:"SOS", EOS_token:"EOS"}
        self.num_words = 3  #count SOS, EOS, PAD
        
        for word in keep_words:
            self.addWord(word)

In [17]:
# turn a unicode string to plain ASCII
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")

In [18]:
# test the function
unicodeToAscii("Montréal,Françoise....")

'Montreal,Francoise....'

In [19]:
# lowercase, trim white spaces, lines... etc, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    # replace any .!? by a whitespace + the character -> '!' = ' !'. \1 means the first bracketed group -> [,!?].
    # r is to not consider \1 as a character (r to escape a backslash). + means one or more
    s = re.sub(r"([.!?])", r" \1", s)
    # remove any character that is not a sequence of lower or upper case letters
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # remove a sequence of whitespace characters
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [20]:
normalizeString("aa123aa!s's  dd?")

'aa aa !s s dd ?'

### Processing the Text - Part 2

In [35]:
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
# read the file ad split into lines
print("Reading and processing file... Please wait")
lines = open(datafile, encoding="utf-8").read().strip().split("\n")
# split every line into pairs and normalize 
pairs = [[normalizeString(s) for s in pair.split("\t")] for pair in lines]
print("Done Reading!")
voc = Vocabulary("cornell movie-dialogs corpus")

Reading and processing file... Please wait
Done Reading!


In [23]:
lines[0].split("\t")

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you."]

In [24]:
pairs

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 [''],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 [''],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?'],
 [''],
 ['you re asking me out . that s so cute . what s your name again ?',
  'forget it .'],
 [''],
 ['no no it s my fault we didn t have a proper introduction', 'cameron .'],
 [''],
 ['cameron .',
  'the thing is cameron i m at the mercy of a particularly hideous breed of loser . my sister . i can t date until she does .'],
 [''],
 ['the thing is cameron i m at the mercy of a particularly hideous breed of loser . my sister . i can t date until she does .',
  'seems like she coul

In [25]:
len(pairs)

442563

### Filtering the Text

In [26]:
# return true if both sentences in a pair 'p' are under the MAX_LENGTH threshold
MAX_LENGTH = 10  #maximum sentence length (max words)
def filterPair(p):
    # input sequences need to preserve the last word for EOS token
    return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH

In [27]:
# filter pair using filterPair(pair)
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [28]:
pairs = [pair for pair in pairs if len(pair)>1]
print("There are {} pairs/conversations in the dataset".format(len(pairs)))
pairs = filterPairs(pairs)
print("After filtering, there are {} pairs/conversations".format(len(pairs)))

There are 221282 pairs/conversations in the dataset
After filtering, there are 64271 pairs/conversations


### Getting Rid of Rare Words

In [41]:
#loop through each pair of and add the question and reply sentence to the vocabulary
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted words:", voc.num_words)
for pair in pairs[:10]:
    print(pair)

Counted words: 42989
['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .', 'well i thought we d start with pronunciation if that s okay with you .']
['']
['well i thought we d start with pronunciation if that s okay with you .', 'not the hacking and gagging and spitting part . please .']
['']
['not the hacking and gagging and spitting part . please .', 'okay . . . then how bout we try out some french cuisine . saturday ? night ?']
['']
['you re asking me out . that s so cute . what s your name again ?', 'forget it .']
['']
['no no it s my fault we didn t have a proper introduction', 'cameron .']
['']


In [None]:
MIN_COUNT = 3  #minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # trim words used under the MIN_COUNT from the voc
#     voc.trim(MIN_COUNT)
    # filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # check input sentence
        for word in input_sentence.split(" "):
            if word not in voc.word2index:
                keep_input = False
                break
        # check output sentence
        for word in output_sentence.split(" "):
            if word not in voc.word2index:
                keep_output = False
                break
        
        # only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)
            
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs

# trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

### Preparing the Data - Part 1

In [47]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(" ")] + [EOS_token]

In [48]:
# test the function
indexesFromSentence(voc, pairs[1][0])

[39, 2]

In [51]:
# define some samples for testing
inp = []
out = []
i = 0
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes

['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .', '', 'well i thought we d start with pronunciation if that s okay with you .', '', 'not the hacking and gagging and spitting part . please .', '', 'you re asking me out . that s so cute . what s your name again ?', '', 'no no it s my fault we didn t have a proper introduction', '']
10


[[3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  25,
  2],
 [39, 2],
 [27, 28, 29, 4, 30, 31, 32, 33, 34, 35, 36, 37, 32, 38, 25, 2],
 [39, 2],
 [40, 23, 41, 11, 42, 11, 43, 44, 25, 45, 25, 2],
 [39, 2],
 [38, 46, 47, 48, 49, 25, 35, 36, 50, 51, 25, 52, 36, 53, 54, 26, 8, 2],
 [39, 2],
 [55, 55, 56, 36, 57, 58, 4, 59, 60, 61, 62, 63, 64, 2],
 [39, 2]]

### Understanding the Zip Function

In [52]:
a = ["A","B","C"]
b = [1,2,3]
list(zip(a,b))

[('A', 1), ('B', 2), ('C', 3)]

In [53]:
a = ["A","B","C","D","E"]
b = [1,2,3]
list(itertools.zip_longest(a,b))

[('A', 1), ('B', 2), ('C', 3), ('D', None), ('E', None)]

### Preparing the Data - Part 2

In [55]:
a = [[3, 4, 5, 2],
 [39, 35, 2],
 [27, 28, 29, 4, 30, 31, 35, 36, 37, 32, 38, 25, 2],
 [40, 23, 41, 11, 42, 11, 43, 44, 25, 45, 25, 2],
 [38, 46, 47, 48, 49, 21, 25, 52, 36, 53, 54, 26, 8, 2],
 [55, 55, 56, 36, 57, 58, 4, 59, 60, 61, 62, 63, 64, 2],
 [39, 2]]
list(itertools.zip_longest(*a, fillvalue="x"))

[(3, 39, 27, 40, 38, 55, 39),
 (4, 35, 28, 23, 46, 55, 2),
 (5, 2, 29, 41, 47, 56, 'x'),
 (2, 'x', 4, 11, 48, 36, 'x'),
 ('x', 'x', 30, 42, 49, 57, 'x'),
 ('x', 'x', 31, 11, 21, 58, 'x'),
 ('x', 'x', 35, 43, 25, 4, 'x'),
 ('x', 'x', 36, 44, 52, 59, 'x'),
 ('x', 'x', 37, 25, 36, 60, 'x'),
 ('x', 'x', 32, 45, 53, 61, 'x'),
 ('x', 'x', 38, 25, 54, 62, 'x'),
 ('x', 'x', 25, 2, 26, 63, 'x'),
 ('x', 'x', 2, 'x', 8, 64, 'x'),
 ('x', 'x', 'x', 'x', 2, 2, 'x')]

In [57]:
def zeroPadding(l, fillvalue=0):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

In [59]:
leng = [len(ind) for ind in indexes]
max(leng)

26

In [60]:
leng

[26, 2, 16, 2, 12, 2, 18, 2, 14, 2]

In [65]:
# test the function
test_result = zeroPadding(indexes)
print(len(test_result))  #max length is now the number of rows or the batch size
test_result

26


[(3, 39, 27, 39, 40, 39, 38, 39, 55, 39),
 (4, 2, 28, 2, 23, 2, 46, 2, 55, 2),
 (5, 0, 29, 0, 41, 0, 47, 0, 56, 0),
 (6, 0, 4, 0, 11, 0, 48, 0, 36, 0),
 (7, 0, 30, 0, 42, 0, 49, 0, 57, 0),
 (8, 0, 31, 0, 11, 0, 25, 0, 58, 0),
 (9, 0, 32, 0, 43, 0, 35, 0, 4, 0),
 (10, 0, 33, 0, 44, 0, 36, 0, 59, 0),
 (11, 0, 34, 0, 25, 0, 50, 0, 60, 0),
 (12, 0, 35, 0, 45, 0, 51, 0, 61, 0),
 (13, 0, 36, 0, 25, 0, 25, 0, 62, 0),
 (14, 0, 37, 0, 2, 0, 52, 0, 63, 0),
 (15, 0, 32, 0, 0, 0, 36, 0, 64, 0),
 (16, 0, 38, 0, 0, 0, 53, 0, 2, 0),
 (17, 0, 25, 0, 0, 0, 54, 0, 0, 0),
 (18, 0, 2, 0, 0, 0, 26, 0, 0, 0),
 (19, 0, 0, 0, 0, 0, 8, 0, 0, 0),
 (20, 0, 0, 0, 0, 0, 2, 0, 0, 0),
 (21, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 (22, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 (23, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 (24, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 (25, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 (26, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 (25, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 (2, 0, 0, 0, 0, 0, 0, 0, 0, 0)]

### Preparing the Data - Part 3

In [66]:
def binaryMatrix(l, value=0):
    m = []
    for i,seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

In [67]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [69]:
# return padded input sequence tensor and as well as a tensor of lengths for each of the sequances in the batch
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

In [70]:
# return padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

### Preparing the Data - Part 4

In [74]:
# return all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    # sort the questions in descending length
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    # assert len(inp) == lengths[0]
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

In [75]:
# example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:\n", input_variable)
print("lengths:", lengths)
print("target_variable:\n", target_variable)
print("mask:\n", mask)
print("max_target_len:", max_target_len)

input_variable:
 tensor([[34128,    39,    39,    39,    39],
        [   25,     2,     2,     2,     2],
        [   25,     0,     0,     0,     0],
        [   25,     0,     0,     0,     0],
        [    2,     0,     0,     0,     0]])
lengths: tensor([5, 2, 2, 2, 2])
target_variable:
 tensor([[34128,    39,    39,    39,    39],
        [   25,     2,     2,     2,     2],
        [   25,     0,     0,     0,     0],
        [   25,     0,     0,     0,     0],
        [    2,     0,     0,     0,     0]])
mask:
 tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0]], dtype=torch.uint8)
max_target_len: 5
