In [1]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
# Metrics
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
# Text box
import ipywidgets as widgets
from IPython.display import display

# Loading Data

In [2]:
movie_lines_path = 'movie_lines.txt'
movie_conversations_path = 'movie_conversations.txt'

In [3]:
with open(movie_lines_path, encoding='iso-8859-1', errors='ignore') as my_file:
    all_lines = {}
    for line in my_file:
        split = line.split(' +++$+++ ')
        linemp = {}
        fields = ["lineID", "characterID", "movieID", "character", "text"]
        count = 0
        for field in (fields):
                linemp[field] = split[count]
                count +=1
        all_lines[linemp['lineID']] = linemp        
        


In [4]:
with open(movie_conversations_path, encoding='iso-8859-1', errors='ignore') as my_file:
    conv = []
    for line in my_file:
        split = line.split(' +++$+++ ')
        obj = {}
        fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
        count = 0 
        for field in fields:
            obj[field] = split[count]
            count +=1
        ID = re.compile('L[0-9]+').findall(obj['utteranceIDs'])
        lines = []
        
        for id_ in ID:
            lines.append(all_lines[id_])
        obj['line'] = lines
        conv.append(obj)

In [5]:
all_lines["L985"]

{'lineID': 'L985',
 'characterID': 'u0',
 'movieID': 'm0',
 'character': 'BIANCA',
 'text': 'I hope so.\n'}

In [6]:
conv[10]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L367', 'L368']\n",
 'line': [{'lineID': 'L367',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': 'How do you get your hair to look like that?\n'},
  {'lineID': 'L368',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': "Eber's Deep Conditioner every two days. And I never, ever use a blowdryer without the diffuser attachment.\n"}]}

# Matching Data

In [7]:
pairs = []
for convrtsation in conv:
        for i in range(len(convrtsation['line'])):
            try:
                question = convrtsation['line'][i]['text'].strip()
                answer = convrtsation['line'][i+1]['text'].strip()
            except:
                pass
            if(question and answer):
                pairs.append([question, answer])

In [8]:
len(pairs)

304309

In [9]:
for i in range (10):
    print(pairs[i])

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."]
["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.']
['Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
["Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?", "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
["You're asking me out.  That's so cute. What's your name again?", 'Forget it.']
['Forget it.', 'Forget it.']
["No, no, it's my fault -- we didn't have a proper introduction ---", 'Cameron.']
['Cameron.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does."]
["The thing is, Cameron --

In [10]:
class Vocab:
    def __init__(self):
        self.enum = {"PAD_token" : 0, "SOS_token" : 1, "EOS_token":2, "UNK":3}
        self.count = {}
        self.index = {}
        self.wordcount = 4
        self.min_freq = 10
    def addSentence(self,sentence):
        for word in sentence.split(' '):
            if word not in self.enum:
                if(word in self.count.keys()):
                    self.count[word] += 1
                    if(self.count[word] >= self.min_freq):
                        self.enum[word] = self.wordcount
                        self.index[self.wordcount] = word
                        self.wordcount += 1
                else:
                    self.count[word] = 1
            else:
                #print("Word already Added")
                self.count[word] += 1
    def __len__(self):
        return self.wordcount    
                
    ### This will be the class that handles the bag of words.
    

In [11]:
PAD_token = 0
SOS_token = 1
EOS_token = 2
UNK = 3

In [12]:
voc = Vocab()

In [13]:
import numpy as np

In [14]:
shape = np.array(pairs)
shape.shape

(304309, 2)

In [15]:
pairs[1][1]

'Not the hacking and gagging and spitting part.  Please.'

In [16]:
voc.count

{}

In [17]:
voc.enum

{'PAD_token': 0, 'SOS_token': 1, 'EOS_token': 2, 'UNK': 3}

In [18]:
import nltk
from nltk.corpus import stopwords

In [19]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\beand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
def clean_String(string):
    lower_string = string.lower()
    no_number_string = re.sub(r'\d+','',lower_string)
    no_punc_string = re.sub(r'[^\w\s]','', no_number_string) 
    no_wspace_string = no_punc_string.strip()
    
    words = no_wspace_string.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [21]:
for i in pairs:
    for j in i:
        cleaned = clean_String(j)
        voc.addSentence(cleaned)

In [22]:
len(voc)

17148

# Make Batches

In [23]:
batch_size = 3

In [24]:
import random

In [25]:
voc.enum["judgment"]

5337

In [26]:
import itertools

In [27]:
def EnumerateSentance(voc , sentence):
    sentence = clean_String(sentence)
    words = sentence.split()
    output = []
    for word in words:
        try:
            output.append(voc.enum[word])
        except:
            output.append(UNK)
    output.append(EOS_token)
    return output

def out(string ,voc):
    batches = []
    for sen in string:
            batches.append(EnumerateSentance(voc,sen))
    max_l = 0
    for sen in batches:
        if(len(sen) > max_l):
            max_l = len(sen)
    padList = list(itertools.zip_longest(*batches , fillvalue = PAD_token))
    mask = Mask(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar , mask , max_l


def inps(string,voc):
    indexes_batch = []
    for i in string:
        indexes_batch.append(EnumerateSentance(voc , i))
    length = []
    for i in indexes_batch:
        length.append(len(i))
    #print(length)
    padList = list(itertools.zip_longest(*indexes_batch , fillvalue = PAD_token))
    padVar = torch.LongTensor(padList)
    return padVar , length

def Mask(string):
    mask = []
    for i, seq in enumerate(string):
        k = []
        for token in seq:
            if token == PAD_token:
                k.append(PAD_token)
            else:
                k.append(1)
        mask.append(k)
    return mask


def batchmaker(voc , pair_batch):
    input_batch = [] 
    output_batch = []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inputs , lens = inps(input_batch , voc)
    output , mask , max_l = out(output_batch , voc)
    return inputs , lens , output , mask , max_l


In [28]:
p = pairs[:batch_size]

In [29]:
batches = batchmaker(voc, p)

In [30]:
batches[0]

tensor([[   49,    48,  6417],
        [ 1956,    22, 14876],
        [    3,  1019,  9984],
        [    3,    78,   298],
        [ 6278,     3,   257],
        [ 4358,     5,     2],
        [ 1877,    47,     0],
        [    3,     2,     0],
        [ 1235,     0,     0],
        [  749,     0,     0],
        [    3,     0,     0],
        [    2,     0,     0]])

# Models

In [31]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu(), enforce_sorted=False)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [32]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
            
    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [36]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self , attn_model, embedding,  hidden_size , output_size , n_layers = 1 , dropout = 0.1):
        super(LuongAttnDecoderRNN , self).__init__()
        self.attn_model = attn_model 
        self.hidden_size = hidden_size
        self.output_size =output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding 
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size , hidden_size , n_layers , dropout = 0 , )
        self.concat = nn.Linear(hidden_size*2 , hidden_size)
        self.out = nn.Linear(hidden_size , output_size)
        self.attn = Attn(attn_model , hidden_size)
    def forward(self , input_step , last_hidden , encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output , hidden = self.gru(input_step.int() , last_hidden.int())
        attn_weights = self.attn(rnn_output , encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0,1)) #batch matrix-matrix product of matrices
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output , context) , 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output , dim = 1)
        return output , hidden 




# Testing With previous Batch"

In [37]:
input_size = len(voc)
src_vocab_size = len(voc)
trg_vocab_size = len(voc)
hidden_size = 200 

encoder = Encoder(input_size, hidden_size)

test_input = batches[0]
test_input_lengths = torch.tensor([len(test_input)]) 

# Double checking sequence length:
assert test_input.shape[0] >= torch.max(test_input_lengths), "Declared sequence length exceeds actual length"

# Encoder
output, hidden = encoder(test_input, test_input_lengths)

tensor([[[ 0.4850, -0.4314,  0.1394,  ...,  0.3522,  0.2990, -0.5079]],

        [[ 0.5007, -0.2831, -0.6359,  ...,  0.3976,  0.7162, -0.7220]],

        [[ 0.5200, -0.0534, -0.9816,  ...,  0.0355,  0.3542, -0.9092]],

        ...,

        [[ 0.0643,  0.0987, -0.6060,  ..., -0.0646,  0.2679, -0.3439]],

        [[ 0.1153,  0.0197, -0.6474,  ...,  0.2446, -0.1727, -0.7925]],

        [[-0.2211,  0.0532,  0.0827,  ..., -0.0730, -0.1277, -0.1193]]],
       grad_fn=<AddBackward0>)