In [7]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
# Metrics
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
# Text box
import ipywidgets as widgets
from IPython.display import display

# Loading Data

In [8]:
movie_lines_path = 'movie_lines.txt'
movie_conversations_path = 'movie_conversations.txt'

In [9]:
with open(movie_lines_path, encoding='iso-8859-1', errors='ignore') as my_file:
    all_lines = {}
    for line in my_file:
        split = line.split(' +++$+++ ')
        linemp = {}
        fields = ["lineID", "characterID", "movieID", "character", "text"]
        count = 0
        for field in (fields):
                linemp[field] = split[count]
                count +=1
        all_lines[linemp['lineID']] = linemp        
        


In [10]:
with open(movie_conversations_path, encoding='iso-8859-1', errors='ignore') as my_file:
    conv = []
    for line in my_file:
        split = line.split(' +++$+++ ')
        obj = {}
        fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
        count = 0 
        for field in fields:
            obj[field] = split[count]
            count +=1
        ID = re.compile('L[0-9]+').findall(obj['utteranceIDs'])
        lines = []
        
        for id_ in ID:
            lines.append(all_lines[id_])
        obj['line'] = lines
        conv.append(obj)

In [11]:
all_lines["L985"]

{'lineID': 'L985',
 'characterID': 'u0',
 'movieID': 'm0',
 'character': 'BIANCA',
 'text': 'I hope so.\n'}

In [12]:
conv[10]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L367', 'L368']\n",
 'line': [{'lineID': 'L367',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': 'How do you get your hair to look like that?\n'},
  {'lineID': 'L368',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': "Eber's Deep Conditioner every two days. And I never, ever use a blowdryer without the diffuser attachment.\n"}]}

# Matching Data

In [31]:
pairs = []
for convrtsation in conv:
        for i in range(len(convrtsation['line'])):
            try:
                question = convrtsation['line'][i]['text'].strip()
                answer = convrtsation['line'][i+1]['text'].strip()
            except:
                pass
            if(question and answer):
                pairs.append([question, answer])

In [32]:
len(pairs)

304309

In [33]:
for i in range (10):
    print(pairs[i])

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."]
["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.']
['Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
["Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?", "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
["You're asking me out.  That's so cute. What's your name again?", 'Forget it.']
['Forget it.', 'Forget it.']
["No, no, it's my fault -- we didn't have a proper introduction ---", 'Cameron.']
['Cameron.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does."]
["The thing is, Cameron --

In [378]:
class Vocab:
    def __init__(self):
        self.enum = {"PAD_token" : 0, "SOS_token" : 1, "EOS_token":2, "UNK":3}
        self.count = {}
        self.index = {}
        self.wordcount = 4
        self.min_freq = 10
    def addSentence(self,sentence):
        for word in sentence.split(' '):
            if word not in self.enum:
                if(word in self.count.keys()):
                    self.count[word] += 1
                    if(self.count[word] >= self.min_freq):
                        self.enum[word] = self.wordcount
                        self.index[self.wordcount] = word
                        self.wordcount += 1
                else:
                    self.count[word] = 1
            else:
                #print("Word already Added")
                self.count[word] += 1
    def __len__(self):
        return self.wordcount
        
                
            
            
                
    ### This will be the class that handles the bag of words.
    

In [379]:
PAD_token = 0
SOS_token = 1
EOS_token = 2
UNK = 3

In [351]:
voc = Vocab()

In [352]:
import numpy as np

In [353]:
shape = np.array(pairs)
shape.shape

(304309, 2)

In [354]:
pairs[1][1]

'Not the hacking and gagging and spitting part.  Please.'

In [355]:
voc.count

{}

In [356]:
voc.enum

{'PAD_token': 0, 'SOS_token': 1, 'EOS_token': 2}

In [357]:
import nltk
from nltk.corpus import stopwords

In [358]:

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\beand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [359]:
def clean_String(string):
    lower_string = string.lower()
    no_number_string = re.sub(r'\d+','',lower_string)
    no_punc_string = re.sub(r'[^\w\s]','', no_number_string) 
    no_wspace_string = no_punc_string.strip()
    
    words = no_wspace_string.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [360]:
for i in pairs:
    for j in i:
        cleaned = clean_String(j)
        voc.addSentence(cleaned)

In [361]:
len(voc)

55643

# Make Batches

In [362]:
batch_size = 3

In [363]:
import random

In [371]:
voc.enum

{'PAD_token': 0,
 'SOS_token': 1,
 'EOS_token': 2,
 'well': 3,
 'thought': 4,
 'wed': 5,
 'start': 6,
 'pronunciation': 7,
 'thats': 8,
 'okay': 9,
 'hacking': 10,
 'gagging': 11,
 'spitting': 12,
 'part': 13,
 'please': 14,
 'bout': 15,
 'try': 16,
 'french': 17,
 'cuisine': 18,
 'saturday': 19,
 'night': 20,
 'forget': 21,
 'cameron': 22,
 'thing': 23,
 'im': 24,
 'mercy': 25,
 'particularly': 26,
 'hideous': 27,
 'breed': 28,
 'loser': 29,
 'sister': 30,
 'cant': 31,
 'date': 32,
 'seems': 33,
 'like': 34,
 'could': 35,
 'get': 36,
 'easy': 37,
 'enough': 38,
 'unsolved': 39,
 'mystery': 40,
 'used': 41,
 'really': 42,
 'popular': 43,
 'started': 44,
 'high': 45,
 'school': 46,
 'got': 47,
 'sick': 48,
 'something': 49,
 'shame': 50,
 'let': 51,
 'see': 52,
 'youre': 53,
 'right': 54,
 'ready': 55,
 'quiz': 56,
 'want': 57,
 'know': 58,
 'head': 59,
 'dont': 60,
 'say': 61,
 'though': 62,
 'useful': 63,
 'things': 64,
 'good': 65,
 'stores': 66,
 'much': 67,
 'champagne': 68,
 'cost

In [374]:
voc.enum["judgment"]

999

In [393]:
import itertools

In [424]:
def EnumerateSentance(voc , sentence):
    sentence = clean_String(sentence)
    words = sentence.split()
    output = []
    for word in words:
        try:
            output.append(voc.enum[word])
        except:
            output.append(UNK)
    output.append(EOS_token)
       
    return output

def Mask(string, value=PAD_token):
    mask = []
    for i, seq in enumerate(l):
        for token in seq:
            if token == PAD_token:
                mask[i].append(PAD_token)
            else:
                mask[i].append(1)
        m.append([])
    return m

def inputVar(l,voc):
    indexes_batch = [EnumerateSentance(voc , sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = list(itertools.zip_longest(*indexes_batch , fillvalue = PAD_token))
    padVar = torch.LongTensor(padList)
    return padVar , lengths

def outputVar(l ,voc):
    indexes_batch = [EnumerateSentance(voc , sentence) for sentence in l]
    max_length = max([len(indexes) for indexes in indexes_batch])
    padList = list(itertools.zip_longest(*indexes_batch , fillvalue = PAD_token))
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar , mask , max_length

def batch2TrainData(voc , pair_batch):
    input_batch , output_batch = [] , []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp , lengths = inputVar(input_batch , voc)
    output , mask , max_target_len = outputVar(output_batch , voc)
    return inp , lengths , output , mask , max_target_len


In [425]:
p = pairs[:batch_size]

In [426]:
batches = batch2TrainData(voc, p)

In [427]:
batches

(tensor([[  298,     3,    10],
         [ 2388,     4,    11],
         [    3,     5,    12],
         [    3,     6,    13],
         [17571,     7,    14],
         [13625,     8,     2],
         [  625,     9,     0],
         [42443,     2,     0],
         [  578,     0,     0],
         [  552,     0,     0],
         [21541,     0,     0],
         [    2,     0,     0]]),
 tensor([12,  8,  6]),
 tensor([[ 3, 10,  9],
         [ 4, 11, 15],
         [ 5, 12, 16],
         [ 6, 13, 17],
         [ 7, 14, 18],
         [ 8,  2, 19],
         [ 9,  0, 20],
         [ 2,  0,  2]]),
 tensor([[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True],
         [ True, False,  True],
         [ True, False,  True]]),
 8)