In [1]:
import re
import os
import sys
import math
import time
import torch
import random
import numpy as np
import unicodedata
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

In [2]:
# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "Halt...\r\n"
    if progress >= 1:
        progress = 1
        status = "Done...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{:s}] {:.2f}% {:s}".format( ":"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

In [3]:
# Transalation
class ChatbotTrainDataset(Dataset):
    class Voc(object):
        def __init__(self, name):
            self.name = name
            self.word2index = {}
            self.word2count = {}
            self.index2word = {0: "SOS", 1: "EOS", 2:"PAD"}
            self.n_words = 3  # Count SOS and EOS

        def add_sentence(self, sentence):
            for word in sentence.split(' '):
                self.add_word(word)

        def add_word(self, word):
            if word not in self.word2index:
                self.word2index[word] = self.n_words
                self.word2count[word] = 1
                self.index2word[self.n_words] = word
                self.n_words += 1
            else:
                self.word2count[word] += 1
                
    def __init__(self,lang1,lang2,MAX_LENGTH = 10,reverse=False):
        self.MAX_LENGTH = MAX_LENGTH
        self.SOS_token = 0
        self.EOS_token = 1
        self.PAD_token = 2
        self.input_voc,self.output_voc,self.pairs = self.prepare_data(lang1,lang2,reverse)
        input_data = list(map(lambda x:self.indexes_from_sentence(self.input_voc,x[0])+[self.EOS_token],self.pairs))
        output_data = list(map(lambda x:self.indexes_from_sentence(self.input_voc,x[0])+[self.EOS_token],self.pairs))
        self.input_lengths = np.array([len(seq) for seq in input_data])
        self.output_lengths = np.array([len(seq) for seq in output_data])
        self.input_data = self.zeroPadding(input_data,self.PAD_token)
        self.output_data = self.zeroPadding(output_data,self.PAD_token)       
        
    def filter_pair(self, p):
        return len(p[0].split(' ')) < self.MAX_LENGTH and \
               len(p[1].split(' ')) < self.MAX_LENGTH 

    def filter_pairs(self, pairs):
        return [pair for pair in pairs if self.filter_pair(pair)]

    @staticmethod
    def unicode_to_ascii(s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_string(self, s):
        s = self.unicode_to_ascii(s.strip()).lower().strip()        
        s = re.sub("([.!?])", " \1", s)
        s = re.sub("[^a-zA-Z.!?s]+", " ", s)
        return s.strip()

    def read_lang(self, lang1, lang2, reverse=False):
        print("Reading lines...")
        # combine every two lines into pairs and normalize
        with open('../data/interim/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as f:
            content = f.readlines()
        lines = [x.strip() for x in content]
        pairs = [[self.normalize_string(s) for s in line.split('\t')] for line in lines]
        if reverse:
            pairs = [list(reversed(p)) for p in pairs]
            input_voc = self.Voc(lang2)
            output_voc = self.Voc(lang1)
        else:
            input_voc = self.Voc(lang1)
            output_voc = self.Voc(lang2)
        return input_voc, output_voc, pairs

    def indexes_from_sentence(self, voc, sentence):
        return [voc.word2index[word] for word in sentence.split(' ')]
    
    # batch_first: true -> false, i.e. shape: seq_len * batch
    def zeroPadding(self,data, fillvalue):
        pad = len(max(data, key=len))
        return np.array([i + [fillvalue]*(pad-len(i)) for i in data])

    def prepare_data(self, lang1, lang2, reverse=False):
        input_voc, output_voc, pairs = self.read_lang(lang1, lang2, reverse)
        print("Read %s sentence pairs" % len(pairs))
        pairs = self.filter_pairs(pairs)
        print("Trimmed to %s sentence pairs" % len(pairs))
        print("Counting words...")
        for pair in pairs:
            input_voc.add_sentence(pair[0])
            output_voc.add_sentence(pair[1])
        print("Counted words:")
        print(input_voc.name, input_voc.n_words)
        print(output_voc.name, output_voc.n_words)
        return input_voc,output_voc,pairs
        
    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.input_data[idx],self.input_lengths[idx],self.output_data[idx],self.output_lengths[idx]

In [3]:
# Conversation
import pandas as pd
class ChatbotTrainDataset(Dataset):
    class Voc(object):
        def __init__(self, name):
            self.name = name
            self.word2index = {}
            self.word2count = {}
            self.index2word = {0: "SOS", 1: "EOS", 2:"PAD"}
            self.n_words = 3  # Count SOS and EOS

        def add_sentence(self, sentence):
            for word in sentence.split(' '):
                self.add_word(word)

        def add_word(self, word):
            if word not in self.word2index:
                self.word2index[word] = self.n_words
                self.word2count[word] = 1
                self.index2word[self.n_words] = word
                self.n_words += 1
            else:
                self.word2count[word] += 1
                
    def __init__(self,lang1,lang2,MAX_LENGTH = 100,reverse=False):
        self.MAX_LENGTH = MAX_LENGTH
        self.SOS_token = 0
        self.EOS_token = 1
        self.PAD_token = 2
        self.input_voc,self.output_voc,self.pairs = self.prepare_data(lang1,lang2,reverse)
        input_data = list(map(lambda x:self.indexes_from_sentence(self.input_voc,x[0])+[self.EOS_token],self.pairs))
        output_data = list(map(lambda x:self.indexes_from_sentence(self.input_voc,x[0])+[self.EOS_token],self.pairs))
        self.input_lengths = np.array([len(seq) for seq in input_data])
        self.output_lengths = np.array([len(seq) for seq in output_data])
        self.input_data = self.zeroPadding(input_data,self.PAD_token)
        self.output_data = self.zeroPadding(output_data,self.PAD_token)       
        
    def filter_pair(self, p):
        return len(p[0].split(' ')) < self.MAX_LENGTH and \
               len(p[1].split(' ')) < self.MAX_LENGTH 

    def filter_pairs(self, pairs):
        return [pair for pair in pairs if self.filter_pair(pair)]

    @staticmethod
    def unicode_to_ascii(s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_string(self, s):
        s = self.unicode_to_ascii(s.strip()).lower().strip()        
        s = re.sub(r"\n", "",  s)
        s = re.sub(r"[-()]", "", s)
        s = re.sub(r"\.", " .", s)
        s = re.sub(r"\!", " !", s)
        s = re.sub(r"\?", " ?", s)
        s = re.sub(r"\,", " ,", s)
        s = re.sub(r"i'm", "i am", s)
        s = re.sub(r"he's", "he is", s)
        s = re.sub(r"she's", "she is", s)
        s = re.sub(r"it's", "it is", s)
        s = re.sub(r"that's", "that is", s)
        s = re.sub(r"what's", "that is", s)
        s = re.sub(r"\'ll", " will", s)
        s = re.sub(r"\'re", " are", s)
        s = re.sub(r"won't", "will not", s)
        s = re.sub(r"can't", "cannot", s)
        s = re.sub(r"n't", " not", s)
        s = re.sub(r"n'", "ng", s)
        s = re.sub(r"ohh", "oh", s)
        s = re.sub(r"ohhh", "oh", s)
        s = re.sub(r"ohhhh", "oh", s)
        s = re.sub(r"ohhhhh", "oh", s)
        s = re.sub(r"ohhhhhh", "oh", s)
        s = re.sub(r"ahh", "ah", s)
        return s.strip()

    def read_lang(self, lang1, lang2, reverse=False):
        print("Reading lines...")
        # combine every two lines into pairs and normalize
        lines = pd.read_csv("../data/processed/All-seasons.csv").Line
        lines = iter(lines)
        pairs = [[self.normalize_string(line),self.normalize_string(next(lines))] for line in lines]
        if reverse:
            pairs = [list(reversed(p)) for p in pairs]
            input_voc = self.Voc(lang2)
            output_voc = self.Voc(lang1)
        else:
            input_voc = self.Voc(lang1)
            output_voc = self.Voc(lang2)
        return input_voc, output_voc, pairs

    def indexes_from_sentence(self, voc, sentence):
        return [voc.word2index[word] for word in sentence.split(' ')]
    
    # batch_first: true -> false, i.e. shape: seq_len * batch
    def zeroPadding(self,data, fillvalue):
        pad = len(max(data, key=len))
        return np.array([i + [fillvalue]*(pad-len(i)) for i in data])

    def prepare_data(self, lang1, lang2, reverse=False):
        input_voc, output_voc, pairs = self.read_lang(lang1, lang2, reverse)
        print("Read %s sentence pairs" % len(pairs))
        pairs = self.filter_pairs(pairs)
        print("Trimmed to %s sentence pairs" % len(pairs))
        print("Counting words...")
        for pair in pairs:
            input_voc.add_sentence(pair[0])
            output_voc.add_sentence(pair[1])
        print("Counted words:")
        print(input_voc.name, input_voc.n_words)
        print(output_voc.name, output_voc.n_words)
        return input_voc,output_voc,pairs
        
    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.input_data[idx],self.input_lengths[idx],self.output_data[idx],self.output_lengths[idx]

In [4]:
train_dataset = ChatbotTrainDataset('eng', 'fra')

Reading lines...
Read 35448 sentence pairs
Trimmed to 35052 sentence pairs
Counting words...
Counted words:
eng 21290
fra 21153


In [5]:
train_dataloader = DataLoader(train_dataset,batch_size=50,shuffle=True)

In [8]:
len(iter(train_dataloader))

702

In [10]:
list(DataLoader(None,batch_size=100))

TypeError: object of type 'NoneType' has no len()

In [7]:
class DynamicEncoderRNN(nn.Module):
    def __init__(self, input_size, embed_size,hidden_size, n_layers=1):
        super(DynamicEncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size,n_layers,bidirectional=True)

    def forward(self,input_seqs, input_lens, hidden=None):
        batch_size = input_seqs.size(0)
        input_lens,sort_idx = input_lens.sort(dim=0, descending=True)
        input_seqs = input_seqs[sort_idx]
        input_seqs = input_seqs.transpose(0, 1)# [T,B,E]        
        embedded = self.embedding(input_seqs)        
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lens)
        outputs, hidden = self.gru(packed,hidden)
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :,self.hidden_size:]
        _,unsort_idx= sort_idx.sort(dim=0)
        outputs = outputs.transpose(0, 1)[unsort_idx].transpose(0, 1).contiguous()
        hidden = hidden.transpose(0, 1)[unsort_idx].transpose(0, 1).contiguous()
        return outputs, hidden

    def init_hidden(self,batch_size):        
        result = Variable(torch.zeros(2,batch_size, self.hidden_size))
        return result

In [8]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.normal_(mean=0, std=stdv)

    def forward(self, hidden, encoder_outputs):
        '''
        :param hidden: 
            previous hidden state of the decoder, in shape (layers*directions,B,H)
        :param encoder_outputs:
            encoder outputs from Encoder, in shape (T,B,H)
        :return
            attention energies in shape (B,T)
        '''
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)
        H = hidden.repeat(max_len,1,1).transpose(0,1)
        encoder_outputs = encoder_outputs.transpose(0,1) # [B*T*H]
        attn_energies = self.score(H,encoder_outputs) # compute attention score
        return F.softmax(attn_energies).unsqueeze(1) # normalize with softmax

    def score(self, hidden, encoder_outputs):
        energy = F.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2))) # [B*T*2H]->[B*T*H]
        energy = energy.transpose(2,1) # [B*H*T]
        v = self.v.repeat(encoder_outputs.data.shape[0],1).unsqueeze(1) #[B*1*H]
        energy = torch.bmm(v,energy) # [B*1*T]
        return energy.squeeze(1) #[B*T]

In [9]:
class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, embed_size, output_size, n_layers=1, dropout_p=0.1):
        super(BahdanauAttnDecoderRNN, self).__init__()
        # Define parameters
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        # Define layers
        self.embedding = nn.Embedding(output_size, embed_size)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = Attn('concat', hidden_size)
        self.gru = nn.GRU(hidden_size + embed_size, hidden_size, n_layers, dropout=dropout_p,bidirectional=True)
        self.attn_combine = nn.Linear(hidden_size + embed_size, hidden_size)
        self.out = nn.Linear(2*hidden_size, output_size)

    def forward(self, word_input, last_hidden, encoder_outputs):
        '''
        :param word_input:
            word input for current time step, in shape (B)
        :param last_hidden:
            last hidden stat of the decoder, in shape (layers*direction*B*H)
        :param encoder_outputs:
            encoder outputs in shape (T*B*H)
        :return
            decoder output
        Note: we run this one step at a time i.e. you should use a outer loop 
            to process the whole sequence
        Tip(update):
        EncoderRNN may be bidirectional or have multiple layers, so the shape of hidden states can be 
        different from that of DecoderRNN
        You may have to manually guarantee that they have the same dimension outside this function,
        e.g, select the encoder hidden state of the foward/backward pass.
        '''
        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, word_input.size(0), -1) # (1,B,V)
        word_embedded = self.dropout(word_embedded)
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,V)
        context = context.transpose(0, 1)  # (1,B,V)
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat((word_embedded, context), 2)
#         rnn_input = self.attn_combine(rnn_input) # use it in case your size of rnn_input is different
        output, hidden = self.gru(rnn_input, last_hidden)
        output = output.squeeze(0)  # (1,B,V)->(B,V)
        # context = context.squeeze(0)
        # update: "context" input before final layer can be problematic.
        # output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        output = F.log_softmax(self.out(output))
        # Return final output, hidden state
        return output, hidden

In [22]:
embed_size = 64
hidden_size = 64
learning_rate = 0.01
encoder = DynamicEncoderRNN(train_dataset.input_voc.n_words,embed_size,hidden_size)
decoder = BahdanauAttnDecoderRNN(hidden_size,embed_size,train_dataset.output_voc.n_words)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
encoder_optimizer_exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(encoder_optimizer, step_size=5, gamma=0.1)
decoder_optimizer_exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(decoder_optimizer, step_size=5, gamma=0.1)
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    criterion = criterion.cuda()

In [251]:
encoder = torch.load('../models/encoder.pt')
decoder = torch.load('../models/decoder.pt')

In [217]:
def evaluate():
    total_loss = torch.zeros(1)
    total_accuracy = torch.zeros(1)
    encoder.eval()
    decoder.eval()
    for batch_idx,(input_seqs, input_lens,output_seqs, output_lens) in enumerate(train_dataloader):
        input_seqs = input_seqs.long()
        output_seqs = output_seqs.long()
        encoder_hidden = encoder.init_hidden(len(input_lens))
        word_input = torch.Tensor([0]*len(output_lens)).long()
        output_seqs = output_seqs.transpose(0,1) # B,T => T,B
        
        if torch.cuda.is_available():
            input_seqs = input_seqs.cuda()
            input_lens = input_lens.cuda()
            output_seqs = output_seqs.cuda()
            output_lens = output_lens.cuda()
            encoder_hidden = encoder_hidden.cuda()
            word_input = word_input.cuda()
            total_loss = total_loss.cuda()
            total_accuracy = total_accuracy.cuda()
        encoder_outputs,last_hidden = encoder(input_seqs,input_lens,encoder_hidden)
        for seq in output_seqs:
            word_output,last_hidden = decoder(word_input,last_hidden,encoder_outputs)
            word_input = seq
            topv, topi = word_output.topk(1)
            total_loss += criterion(word_output,word_input).item()
            total_accuracy += topi.eq(word_input.view_as(topi)).sum().item()
        max_length,batch_size = output_seqs.size()
        total_loss /= max_length*batch_size
        total_accuracy /= max_length*batch_size
#     # print statistics
    print("-"*50,"Test Accuracy:",round(total_accuracy.item()*100,4),'Test Loss:',round(total_loss.item(),4))

In [11]:
evaluate()



-------------------------------------------------- Test Accuracy: 0.0 Test Loss: 0.0254


In [23]:
def train():
    completed = 0
    total_loss = torch.zeros(1)
    total_accuracy = torch.zeros(1)
    encoder.train()
    decoder.train()
    encoder_optimizer_exp_lr_scheduler.step()
    decoder_optimizer_exp_lr_scheduler.step()
    for batch_idx,(input_seqs, input_lens,output_seqs, output_lens) in enumerate(train_dataloader):
        
        input_seqs = input_seqs.long()
        output_seqs = output_seqs.long()
        encoder_hidden = encoder.init_hidden(len(input_lens))
        word_input = torch.Tensor([0]*len(output_lens)).long()
        output_seqs = output_seqs.transpose(0,1) # B,T => T,B
        batch_loss = torch.zeros(1)
        batch_accuracy = torch.zeros(1)
        
        if torch.cuda.is_available():
            input_seqs = input_seqs.cuda()
            input_lens = input_lens.cuda()
            output_seqs = output_seqs.cuda()
            output_lens = output_lens.cuda()
            encoder_hidden = encoder_hidden.cuda()
            word_input = word_input.cuda()
            batch_loss = batch_loss.cuda()
            total_loss = total_loss.cuda()
            batch_accuracy = batch_accuracy.cuda()
            total_accuracy = total_accuracy.cuda()
            
        encoder_outputs,last_hidden = encoder(input_seqs,input_lens,encoder_hidden)
        for seq in output_seqs:
            word_output,last_hidden = decoder(word_input,last_hidden,encoder_outputs)
            word_input = seq
            topv, topi = word_output.topk(1)
            batch_loss += criterion(word_output,word_input)
            batch_accuracy += topi.eq(word_input.view_as(topi)).sum().item()
            
        # backward + optimize
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        batch_loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        max_length,batch_size = output_seqs.size()
        batch_loss /= max_length*batch_size
        batch_accuracy /= max_length*batch_size
        total_loss += batch_loss     
        total_accuracy += batch_accuracy
        
        # print statistics
        completed += batch_size
        update_progress(completed/len(train_dataset))
#         print("Batch Accuracy:",round(batch_accuracy.item()*100,4),'Batch Loss:',round(batch_loss.item(),4))
    print("-"*50,"Train Accuracy:",round(total_accuracy.item()*100/(batch_idx+1),4),'Train Loss:',round(total_loss.item()/(batch_idx+1),4))

In [24]:
for _ in range(10):train()



Percent: [::::::::::] 100.00% Done...
-------------------------------------------------- Train Accuracy: 89.8521 Train Loss: 0.0145
Percent: [::::::::::] 100.00% Done...
-------------------------------------------------- Train Accuracy: 92.7176 Train Loss: 0.0096
Percent: [::::::::::] 100.00% Done...
-------------------------------------------------- Train Accuracy: 93.716 Train Loss: 0.0068
Percent: [::::::::::] 100.00% Done...
-------------------------------------------------- Train Accuracy: 94.6324 Train Loss: 0.0054
Percent: [::::::::::] 100.00% Done...
-------------------------------------------------- Train Accuracy: 94.8875 Train Loss: 0.0052
Percent: [::::::::::] 100.00% Done...
-------------------------------------------------- Train Accuracy: 96.8899 Train Loss: 0.003
Percent: [::::::::::] 100.00% Done...
-------------------------------------------------- Train Accuracy: 97.8106 Train Loss: 0.002
Percent: [::::::::::] 100.00% Done...
-----------------------------------------

In [23]:
torch.save(encoder,'../models/encoder.pt')
torch.save(decoder,'../models/decoder.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [14]:
def predict_from_loader():
    test_pred=torch.LongTensor()
    encoder.eval()
    decoder.eval()
    for batch_idx,(input_seqs, input_lens,output_seqs, output_lens) in enumerate(train_dataloader):
        input_seqs = input_seqs.long()
        encoder_hidden = encoder.init_hidden(len(input_lens))
        word_input = torch.Tensor([0]*len(input_lens)).long()
        batch_pred = torch.LongTensor()
        if torch.cuda.is_available():
            input_seqs = input_seqs.cuda()
            input_lens = input_lens.cuda()
            encoder_hidden = encoder_hidden.cuda()
            word_input = word_input.cuda()
            batch_pred = batch_pred.cuda()
            test_pred = test_pred.cuda()
        encoder_outputs,last_hidden = encoder(input_seqs,input_lens,encoder_hidden)
        for _ in range(train_dataset.MAX_LENGTH):
            word_output,last_hidden = decoder(word_input,last_hidden,encoder_outputs)
            topv, topi = word_output.topk(1)
            word_input = topi.squeeze(1)
            batch_pred = torch.cat((batch_pred,topi),dim=1)
#             if (word_input == 1).sum() == input_seqs.size(0):
#                 break
        test_pred = torch.cat((test_pred,batch_pred),dim=0)
    return test_pred

In [28]:
def predict_from_data(input_seq,input_len):
    input_seq,input_len = input_seq.view(1,-1),input_len.view(1)
    pred = torch.LongTensor()
    encoder.eval()
    decoder.eval()
    input_seq = input_seq.long()
    encoder_hidden = encoder.init_hidden(len(input_len))
    word_input = torch.Tensor([0]*len(input_len)).long()
    if torch.cuda.is_available():
        input_seq = input_seq.cuda()
        input_len = input_len.cuda()
        encoder_hidden = encoder_hidden.cuda()
        word_input = word_input.cuda()
        pred = pred.cuda()
    encoder_outputs,last_hidden = encoder(input_seq,input_len,encoder_hidden)
    for _ in range(train_dataset.MAX_LENGTH):
        word_output,last_hidden = decoder(word_input,last_hidden,encoder_outputs)
        topv, topi = word_output.topk(1)
        word_input = topi.squeeze(1)
        pred = torch.cat((pred,topi),dim=1)
    return pred

In [26]:
def predict_from_sentence(line):
    line = train_dataset.normalize_string(line)
    input_seq = train_dataset.indexes_from_sentence(train_dataset.input_voc,line)+[train_dataset.EOS_token]
    input_len = torch.Tensor([len(input_seq)]).int()
    input_seq = input_seq + [train_dataset.PAD_token]*(train_dataset.MAX_LENGTH-len(input_seq))
    input_seq = torch.Tensor(input_seq).long()
    pred = predict_from_data(input_seq,input_len)
    input_seq = input_seq.squeeze(0)
    pred = pred.squeeze(0)
    # Print Input
    decoded_words = []
    for index in input_seq:
        decoded_words.append(train_dataset.input_voc.index2word[index.item()])
        if index.item() == train_dataset.EOS_token:
            break
    print("Input:"," ".join(decoded_words))
    # Print Predict
    decoded_words = []
    for index in pred:
        decoded_words.append(train_dataset.output_voc.index2word[index.item()])
        if index.item() == train_dataset.EOS_token:
            break
    print("Predict:"," ".join(decoded_words))

In [25]:
def evaluate_randomly(index):
    input_seq,input_len,output_seq,_ = list(train_dataloader)[index//train_dataloader.batch_size]
    idx = index%train_dataloader.batch_size
    input_seq,input_len,output_seq = input_seq[idx].view(1,-1),input_len[idx].view(1),output_seq[idx]
    predict_from_data(input_seq,input_len)
    pred = torch.LongTensor()
    encoder.eval()
    decoder.eval()
    input_seq = input_seq.long()
    encoder_hidden = encoder.init_hidden(len(input_len))
    word_input = torch.Tensor([0]*len(input_len)).long()
    if torch.cuda.is_available():
        input_seq = input_seq.cuda()
        input_len = input_len.cuda()
        encoder_hidden = encoder_hidden.cuda()
        word_input = word_input.cuda()
        pred = pred.cuda()
    encoder_outputs,last_hidden = encoder(input_seq,input_len,encoder_hidden)
    for _ in range(train_dataset.MAX_LENGTH):
        word_output,last_hidden = decoder(word_input,last_hidden,encoder_outputs)
        topv, topi = word_output.topk(1)
        word_input = topi.squeeze(1)
        pred = torch.cat((pred,topi),dim=1)
        
    input_seq = input_seq.squeeze(0)
    pred = pred.squeeze(0)
    # Print Input
    decoded_words = []
    for index in input_seq:
        decoded_words.append(train_dataset.input_voc.index2word[index.item()])
        if index.item() == train_dataset.EOS_token:
            break
    print("Input:"," ".join(decoded_words))
    # Print Target
    decoded_words = []
    for index in output_seq:
        decoded_words.append(train_dataset.output_voc.index2word[index.item()])
        if index.item() == train_dataset.EOS_token:
            break
    print("Target:"," ".join(decoded_words))
    # Print Predict
    decoded_words = []
    for index in pred:
        decoded_words.append(train_dataset.output_voc.index2word[index.item()])
        if index.item() == train_dataset.EOS_token:
            break
    print("Predict:"," ".join(decoded_words))

In [143]:
prediction = predict()



In [29]:
evaluate_randomly(1100)



Input: well no , ah i just thought it was one of your other personalities trying to give me a EOS
Target: mean guys ? town will just then friends so jesus  wants na sight" broflovskis draw alain welcome meaning EOS
Predict: mean guys ? town will just then friends so jesus jesus  wants na sight" broflovskis draw alain welcome meaning EOS


In [37]:
predict_from_sentence("why")



Input: why EOS
Predict: son EOS


## Corpus fromatted dataset

In [145]:
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("../data/interim", corpus_name)

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "movie_lines.txt"))

b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


In [146]:
# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
            lines[lineObj['lineID']] = lineObj
    return lines


# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]
            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
            lineIds = eval(convObj["utteranceIDs"])
            # Reassemble lines
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
    return conversations


# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

In [149]:
import codecs
import csv
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
                                  lines, MOVIE_CONVERSATIONS_FIELDS)

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)


Processing corpus...

Loading conversations...

Writing newly formatted file...

Sample lines from file:
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. 

In [168]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [209]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    print(pairs[:50])
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    try:
        return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
    except:
        return False

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .', 'well i thought we d start with pronunciation if that s okay with you .'], [''], ['well i thought we d start with pronunciation if that s okay with you .', 'not the hacking and gagging and spitting part . please .'], [''], ['not the hacking and gagging and spitting part . please .', 'okay . . . then how bout we try out some french cuisine . saturday ? night ?'], [''], ['you re asking me out . that s so cute . what s your name again ?', 'forget it .'], [''], ['no no it s my fault we didn t have a proper introduction', 'cameron .'], [''], ['cameron .', 'the thing is cameron i m at the mercy of a particularly hideous breed of loser . my sister . i can t date until she does .'], [''], ['the thing is cameron i m at the mercy of a particularly hideous breed of loser . my sister . i can t date until she do