# Requirements

You will need [PyTorch](http://pytorch.org/) to build and train the models, and [matplotlib](https://matplotlib.org/) for plotting training and visualizing attention outputs later.

In [1]:
from __future__ import unicode_literals, print_function, division
from collections import Counter
from io import open
import unicodedata
import string
import re
import random
import time

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import numpy as np
import torch.optim.lr_scheduler as lr
from random import shuffle
import math

USE_CUDA = torch.cuda.is_available()

In [2]:
#with open("/Users/carolineroper/Documents/School/Natural Language Processing/Char-NMT/data/train.txt", "r", encoding="utf8") as f:
with open("/Users/bilals01/Documents/NLP/project/Char-NMT/data/train.txt", "r", encoding="utf8") as f:
    
    DE_seq = []
    EN_seq = []
    
    for i, line in enumerate(f):
        
        line = line.split('<JOIN>')
        DE_seq.append(line[0])
        EN_seq.append(line[1])

### Indexing words

We'll need a unique index per word to use as the inputs and targets of the networks later. To keep track of all this we will use a helper class called `Lang` which has word &rarr; index (`word2index`) and index &rarr; word (`index2word`) dictionaries, as well as a count of each word `word2count` to use to later replace rare words.

### Reading and decoding files

The files are all in Unicode, to simplify we will turn Unicode characters to ASCII, make everything lowercase, and trim most punctuation.

In [3]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = s.replace(" <EOS>", "")
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s) #separates punctuation from the word
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) #strips anything that isn't a character of punctuation
    return s

In [4]:
def readLangs(seq1, seq2):
    print("Reading lines...")
    
    seq1 = [normalizeString(s) for s in seq1]
    seq2 = [normalizeString(s) for s in seq2]
    
    pairs = list(map(list, zip(seq1, seq2)))

    return pairs #, vocab1, vocab2

In [5]:
pairs = readLangs(DE_seq, EN_seq)

Reading lines...


### Filtering sentences

Since there are a *lot* of example sentences and we want to train something quickly, we'll trim the data set to only relatively short and simple sentences. Here the maximum length is 10 words (that includes punctuation) and we're filtering to sentences that translate to the form "I am" or "He is" etc. (accounting for apostrophes being removed).

In [6]:
#will ultimately remove this and set MAX_LENGTH to either 50 like the paper or the longest sentence in corpus
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH  \
        and p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

pairs = filterPairs(pairs)

In [7]:
def find_vocabulary(seq, vocab_size):
    counter = Counter()
    for sentence in seq:
        counter.update(sentence.split())
    vocabulary = [count[0] for count in counter.most_common(vocab_size)]
    return vocabulary



def find_all_vocabulary(seq):
    counter = Counter()
    for sentence in seq:
        counter.update(sentence.split())
    vocabulary = [count[0] for count in counter.most_common(100000000000000)]
    return vocabulary

In [8]:
vocab_size = 10000
input_vocab = find_vocabulary([pair[0] for pair in pairs], vocab_size)
output_vocab = find_vocabulary([pair[1] for pair in pairs], vocab_size)

input_a_w = find_all_vocabulary([pair[0] for pair in pairs])
output_a_w = find_all_vocabulary([pair[1] for pair in pairs])

In [9]:
UNK_token = 0
SOS_token = 1
EOS_token = 2
PAD_token = 3

class Lang:
    def __init__(self, name, allwords, vocab):
        self.name = name
        self.vocab = vocab
        self.allwords = allwords

        self.word2index = {}
        self.word2count = {}
        self.index2word_vocab = dict(zip(range(4,len(vocab)+4), vocab))
        self.index2word_allwords = dict(zip(range(4,len(allwords)+4), allwords))

        self.index2word_vocab[UNK_token] = "<UNK>"
        self.index2word_vocab[SOS_token] = "<SOS>"
        self.index2word_vocab[EOS_token] = "<EOS>"
        self.index2word_vocab[PAD_token] = "<PAD>"
        
        
        self.word2index_vocab = {v: k for k, v in self.index2word_vocab.items()} 
        self.word2index_allwords = {v: k for k, v in self.index2word_allwords.items()} 

        self.n_words = len(self.index2word_vocab)
        self.n_words_all = len(self.index2word_allwords)

In [10]:
input_lang = Lang('ger',input_a_w, input_vocab)
output_lang = Lang('eng', output_a_w,output_vocab)

The full process for preparing the data is:

* Read text file and split into lines, split lines into pairs
* Normalize text, filter by length and content
* Make word lists from sentences in pairs

In [34]:
def indexesFromSentence(lang, sentence, seq_len):
    
    type(sentence)
    
    #returns 0 if not found in word2index
    indexes0 = [Counter(lang.word2index_vocab)[word] for word in sentence.split(' ')]
    indexes1 = [Counter(lang.word2index_allwords)[word] for word in sentence.split(' ')]

    ind1=[]#regular indices
    ind2=[]#indices of the exotic words

    for ind in range(0, len(indexes0)): 
        ind1.append(indexes0[ind])
        if (ind1[ind]==0):
            ind2.append(indexes1[ind])
        else: 
            ind2.append(-1) #fill the non exotic with -1

        
    #adds EOS token at EOS
    ind1.extend([EOS_token])
    ind2.extend([EOS_token]) #do we need that, not sure if we care if the char matrix has EOS and SOS

    #trims to the seq len
    ind1 = ind1[0:(min(seq_len, len(ind1)))]
    ind2 = ind2[0:(min(seq_len, len(ind2)))] 


    #pads if needed
    ind1.extend([3] * (seq_len - len(ind1)))
    ind2.extend([3] * (seq_len - len(ind2)))

    return (ind1, ind2)

def variableFromSentence(lang, sentence, seq_len):
    (ind1, ind2) = indexesFromSentence(lang, sentence, seq_len)
    result1 = Variable(torch.LongTensor(ind1).view(-1, 1))
    result2 = Variable(torch.LongTensor(ind2).view(-1, 1))

    if USE_CUDA:
        return (result1.cuda(), result2.cuda())
    else:
        return (result1, result2)

def variablesFromPair(pair, seq_len):
    (input_variable1, input_variable2)  = variableFromSentence(input_lang, pair[0], seq_len)
    (target_variable1, target_variable2) = variableFromSentence(output_lang, pair[1], seq_len)
    return (input_variable1, target_variable1, input_variable2, target_variable2 )

In [35]:
import string
char_vocab = list(string.printable)

UNK_char = 0
SOS_char = 1
EOS_char = 2
PAD_char = 3

char_dict=dict(zip(range(4,len(char_vocab)+4), char_vocab)) 
char2index_counter={v: k for k, v in char_dict.items()}

def indexesFromWord(word,seq_len):
    indexes = [Counter(char2index_counter)[chars] for chars in list(word)]
    indexes.extend([EOS_char])
    indexes = indexes[0:(min(seq_len, len(indexes)))]
    indexes.extend([3] * (seq_len - len(indexes)))
    return indexes

#print(indexesFromWord("dghfgf",10))

In [36]:
batch_pairs = pairs[0:10]
batch_input = [pair[0] for pair in pairs]
batch_target = [pair[1] for pair in pairs]
batch_vars = [variablesFromPair(pair, seq_len=8) for pair in batch_pairs]
    

batch_vars

[(Variable containing:
     77
      6
      5
    179
     62
     12
    561
      4
  [torch.LongTensor of size 8x1], Variable containing:
      6
     10
    109
     21
    146
    532
     33
      4
  [torch.LongTensor of size 8x1], Variable containing:
     -1
     -1
     -1
     -1
     -1
     -1
     -1
     -1
  [torch.LongTensor of size 8x1], Variable containing:
     -1
     -1
     -1
     -1
     -1
     -1
     -1
     -1
  [torch.LongTensor of size 8x1]), (Variable containing:
     16
      6
    562
      4
      2
      3
      3
      3
  [torch.LongTensor of size 8x1], Variable containing:
      6
      5
    533
    534
      4
      2
      3
      3
  [torch.LongTensor of size 8x1], Variable containing:
     -1
     -1
     -1
     -1
      2
      3
      3
      3
  [torch.LongTensor of size 8x1], Variable containing:
     -1
     -1
     -1
     -1
     -1
      2
      3
      3
  [torch.LongTensor of size 8x1]), (Variable containing:
      7
      9
    1

# Building the models

## The Encoder

<img src="images/encoder-network.png" style="float: right" />

The encoder of a seq2seq network is a RNN that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word.

In [None]:
class CharacterEncoderRNN(nn.Module):
    #different batch size?
    #self.batch_size
    #different self.input_size?
    
    #takes a word or batch of words
    #loops through each of the characters or batch of characters at one index position
    #f

In [15]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers, batch_size, uniform_init=False, dropout_p=0.2):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.dropout_p = dropout_p
        self.embedding = nn.Embedding(input_size, hidden_size) #the input size is the number of words
        self.batch_size = batch_size
        self.lstm = nn.LSTM(hidden_size, hidden_size, dropout = dropout_p)
        self.dropout = nn.Dropout(self.dropout_p)
        
        #used in paper but isn't working well on the subsample of data
        if uniform_init == True:
            self.init_weights()
        
    def forward(self, x, hidden):
        
        seq_len = len(x)
        
        embedded = self.embedding(x).view(seq_len, self.batch_size, -1) #this will be 1x128x500
        output = self.dropout(embedded)
        for i in range(self.n_layers):
            output, hidden = self.lstm(output, hidden)
        return output, hidden
    
    #also return indexes_where_unknown
    
    def init_weights(self):
        initrange = 0.1
        self.lstm.weight_ih_l0.data.uniform_(-initrange, initrange)
        self.lstm.weight_hh_l0.data.uniform_(-initrange, initrange)
        self.lstm.bias_ih_l0.data.uniform_(-initrange, initrange)
        self.lstm.bias_hh_l0.data.uniform_(-initrange, initrange)
        
    def init_hidden(self):
        #the paper says to use "uniform initialization of parameters in [−0.1,0.1]: does that include these?"
        h0 = Variable(torch.zeros(1, 1, self.hidden_size)) #I think maybe one of the 1's should be the batch_size?
        c0 = Variable(torch.zeros(1, 1, self.hidden_size))
        if USE_CUDA:
            return h0.cuda(), c0.cuda()
        else:
            return h0, c0

### Luong Attention

In [132]:
class Attn(nn.Module):
    def __init__(self, hidden_size, batch_size, method = 'general'):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.other = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        seq_len = len(encoder_outputs)

        attn_energies = Variable(torch.zeros(self.batch_size, seq_len)) # B x 1 x S
        if USE_CUDA: attn_energies = attn_energies.cuda()

        # Calculate energies for each encoder output
        for i in range(seq_len):
            attn_energies[:,i] = self.score(hidden, encoder_outputs[i])

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        # I confirmed that this computes row-wise, which is what we need
        after_softmax = F.softmax(attn_energies)
        return after_softmax.unsqueeze(0)
    
    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
            energy = torch.diag(torch.matmul(hidden, torch.transpose(encoder_output,0,1)))
            #print ("energy", energy)
            return energy
        
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = torch.diag(torch.matmul(hidden, torch.transpose(energy,0,1)))
            return energy

Now we can build a decoder that plugs this Attn module in after the RNN to calculate attention weights, and apply those weights to the encoder outputs to get a context vector.

In [133]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers, batch_size, uniform_init = False, dropout_p=0.2):
        super(AttnDecoderRNN, self).__init__()
        
        # Keep parameters for reference
        self.attn = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.n_layers = n_layers
        self.lstm = nn.LSTM(hidden_size*2, hidden_size, dropout = dropout_p)
        self.out = nn.Linear(hidden_size*2, output_size)
        
        if uniform_init == True:
            self.init_weights()
    
    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        
        # Get the embedding of the current input word (last output word)
        
        hidden = last_hidden
        word_embedded = self.embedding(word_input).view(1, self.batch_size, -1) # S=1 x B x N
        output = word_embedded

        #Combine embedded input word and last context, run through LSTM
        
        for i in range(self.n_layers):
            output = torch.cat((output, last_context.unsqueeze(0)), 2)
            output, hidden = self.lstm(output, hidden)

        # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
        attn_weights = self.attn(word_embedded.squeeze(0), encoder_outputs)
        context = torch.bmm(attn_weights.transpose(0, 1), encoder_outputs.transpose(0, 1)) # B x 1 x N
        
        # Final output layer (next word prediction) using the hidden state and context vector
        output = output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1) # B x S=1 x N -> B x N
        output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        
        #here is where we'd want to do any type of character-level thing
        #ignore the "separate path" for now
        #
        
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, context, hidden, attn_weights
    
    def init_weights(self):
        initrange = 0.1
        self.lstm.weight_ih_l0.data.uniform_(-initrange, initrange)
        self.lstm.weight_hh_l0.data.uniform_(-initrange, initrange)
        self.lstm.bias_ih_l0.data.uniform_(-initrange, initrange)
        self.lstm.bias_hh_l0.data.uniform_(-initrange, initrange)

In [None]:
class AttnDecoderCharLevelRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers, batch_size, uniform_init = False, dropout_p=0.2):
        super(AttnDecoderRNN, self).__init__()
        
        # Keep parameters for reference
        self.attn = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.n_layers = n_layers
        self.lstm = nn.LSTM(hidden_size*2, hidden_size, dropout = dropout_p)
        self.out = nn.Linear(hidden_size*2, output_size)
        
        if uniform_init == True:
            self.init_weights()
    
    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        
        # Get the embedding of the current input word (last output word)
        
        hidden = last_hidden
        word_embedded = self.embedding(word_input).view(1, self.batch_size, -1) # S=1 x B x N
        output = word_embedded

        #Combine embedded input word and last context, run through LSTM
        
        for i in range(self.n_layers):
            output = torch.cat((output, last_context.unsqueeze(0)), 2)
            output, hidden = self.lstm(output, hidden)

        # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
        attn_weights = self.attn(word_embedded.squeeze(0), encoder_outputs)
        context = torch.bmm(attn_weights.transpose(0, 1), encoder_outputs.transpose(0, 1)) # B x 1 x N
        
        # Final output layer (next word prediction) using the hidden state and context vector
        output = output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1) # B x S=1 x N -> B x N
        output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, context, hidden, attn_weights
    
    def init_weights(self):
        initrange = 0.1
        self.lstm.weight_ih_l0.data.uniform_(-initrange, initrange)
        self.lstm.weight_hh_l0.data.uniform_(-initrange, initrange)
        self.lstm.bias_ih_l0.data.uniform_(-initrange, initrange)
        self.lstm.bias_hh_l0.data.uniform_(-initrange, initrange)

# Training

## Defining a training iteration

To train we first run the input sentence through the encoder word by word, and keep track of every output and the latest hidden state. Next the decoder is given the last hidden state of the decoder as its first hidden state, and the `<SOS>` token as its first input. From there we iterate to predict a next token from the decoder.

In [3]:
hidden_size = 500 #will be 1024 in final version
BATCH_SIZE = 64 #will be 128 in final version
N_LAYERS = 4
SEQ_LENGTH = MAX_LENGTH
learning_rate = 1 #will be 1 in final version, descends by scheduler

attn_model = Attn(hidden_size, BATCH_SIZE)
# Initialize models
encoder = EncoderRNN(input_lang.n_words, hidden_size, \
                     n_layers=N_LAYERS, batch_size = BATCH_SIZE)
decoder = AttnDecoderRNN(attn_model, hidden_size, output_lang.n_words,\
                         n_layers = N_LAYERS, batch_size=BATCH_SIZE) 

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Initialize optimizers and criterion
# commenting out for now because the paper's parameters didn't work well when I tried them)

#encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) #should be optim.SGD according to the paper
#decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

#scheduler1 = lr.LambdaLR(encoder_optimizer, lr_lambda=custom_schedule)
#scheduler2 = lr.LambdaLR(decoder_optimizer, lr_lambda=custom_schedule)

criterion = nn.NLLLoss()

###these parameters are nothing like the paper but seemed to work best, at least with less data
learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

NameError: name 'MAX_LENGTH' is not defined

In [2]:
# Zero gradients of both optimizers
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
loss = 0 # Added onto for each word

# Get size of input and target sentences
input_length = input_variable.size()[0]
target_length = target_variable.size()[0]

# Run words through encoder
encoder_hidden = encoder.init_hidden()

########## Need the final "encoder outputs" to incorporate both the wl & cl encodings?
encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

# Prepare input and output variables
# One SOS token for each sentence, so length is batch size
decoder_input = Variable(torch.LongTensor([[SOS_token]*batch_size]))
decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size))
decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder
if USE_CUDA:
    decoder_input = decoder_input.cuda()
    decoder_context = decoder_context.cuda()

for di in range(target_length):
    decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)

    loss += criterion(decoder_output, target_variable[di])

    # Get most likely word index (highest value) from output
    topv, topi = decoder_output.data.topk(1)

    ni = topi.squeeze(1)

    decoder_input = Variable(ni) # Chosen word is next input
    if USE_CUDA: decoder_input = decoder_input.cuda()
        
    #Find the entries where ni = 0 - these are the <unks>
    #When it produces an <>

# Backpropagation
loss.backward()
torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
encoder_optimizer.step()
decoder_optimizer.step()

return loss.data[0] / target_length

NameError: name 'encoder_optimizer' is not defined

In [None]:
#maybe the data loader needs a "sentence index" that is fixed (not dependent on batch)
#so you receive a bach and a vector of indeces that corresponds to the index in the pair itself

In [103]:
clip = 5.0

def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer,\
          criterion, batch_size, max_length):

    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    
    for ei in range(input_length):
        #input a column of word indexes, some high (exotic), some low (non-exotic)
        #input_var_for_word_level = input with high indexes replaced with 0's
        #run the input_var_for_word_level through the encoder
        encoder_output, encoder_hidden, indexes_where_unknown = encoder(
            input_variable[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0][0] #will probably need to change these subscripts
        #input_var_for_char_level = rare words represented at the character level
        #index_to_reunite with_word_level
        #create char-level matrix
        #use some time of special token as a placeholder to represent where the
        #input word is within the vocabulary
        char_encoder_output, char_encoder_hidden = character_encoder_goes_here
        
        
        #replace the word-level with char-level where needed
        encoder_outputs[ei][indexes_where_unknown] = char_encoder_output
    
    # Prepare input and output variables
    # One SOS token for each sentence, so length is batch size
    decoder_input = Variable(torch.LongTensor([[SOS_token]*batch_size]))
    decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size))
    decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()
        
    for di in range(target_length):
        #possible I'll need to move this inside the AttnDecoderRNN so that
        #the torch autogradient thing works
        decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
        
        loss += criterion(decoder_output, target_variable[di])
        
        # Get most likely word index (highest value) from output
        topv, topi = decoder_output.data.topk(1)
        
        ni = topi.squeeze(1)
        
        

        decoder_input = Variable(ni) # Chosen word is next input
        if USE_CUDA: decoder_input = decoder_input.cuda()

        #once the decoder output is produced - can see if there are <UNKs> = 0
        #wherever there's a 0, seek to replace the 0 with a string of characters similar to the 
        #target[di]
        #so find the indexes where the decoded outputs = 0
        #grab the attributes that the decoder needs in order to function at those indexes
        #(might not actually need all of these depending on the architecture of the decoder):
            #to avoid dynamic batch size, put placeholder special character where
            #word is in vocabulary
            #decoder_output[indexes resulted in 0], 
            #decoder_context[indexes resulted in 0],
            #decoder_hidden[indexes resulted in 0], 
            #decoder_attention[indexes resulted in 0]
        #pass to the character-level decoder, find the most likely character at each position in the sequence
        #replace the decoder output at those indexes with the character-level sequence
            #evaluate the character-level loss between target_variable[di] (represented via character indexes) &
            #the output of the decoder
            #I think it'll have its own cl_loss.backward() to perfect it
        #the actual loss is then the weighed average of these two things - the reason the losses are connected
        #is because the word-level piece is also giving output to the character level piece
        #so you should think of the character-level loss as a component of the word-level loss
         
    # Backpropagation
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / target_length

Finally helper functions to print time elapsed and estimated time remaining, given the current time and progress.

## Running training

With everything in place we can actually initialize a network and start training.

To start, we initialize models, optimizers, and a loss function (criterion).

In [104]:
def custom_schedule(epoch):
    if (epoch < 4):
        return (1)
    elif epoch == 4:
        return (.5)
    elif epoch >= 4:
        return (.5**(1+(epoch - 3)))

In [310]:
hidden_size = 500 #will be 1024 in final version
BATCH_SIZE = 64 #will be 128 in final version
N_LAYERS = 4
SEQ_LENGTH = MAX_LENGTH
learning_rate = 1 #will be 1 in final version, descends by scheduler

attn_model = Attn(hidden_size, BATCH_SIZE)
# Initialize models
encoder = EncoderRNN(input_lang.n_words, hidden_size, \
                     n_layers=N_LAYERS, batch_size = BATCH_SIZE)
decoder = AttnDecoderRNN(attn_model, hidden_size, output_lang.n_words,\
                         n_layers = N_LAYERS, batch_size=BATCH_SIZE) 

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Initialize optimizers and criterion
# commenting out for now because the paper's parameters didn't work well when I tried them)

#encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) #should be optim.SGD according to the paper
#decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

#scheduler1 = lr.LambdaLR(encoder_optimizer, lr_lambda=custom_schedule)
#scheduler2 = lr.LambdaLR(decoder_optimizer, lr_lambda=custom_schedule)

criterion = nn.NLLLoss()

###these parameters are nothing like the paper but seemed to work best, at least with less data
learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)


Then set up variables for plotting and tracking progress:

In [311]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [1]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, seq_length):
        """
        @param data_list: list of IMDBDatum
        """
        self.data_list = data_list
        self.seq_length = seq_length
        
    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        return indexesFromSentence(input_lang, self.data_list[key][0], seq_len = self.seq_length), \
    indexesFromSentence(output_lang, self.data_list[key][1], seq_len = self.seq_length)

In [316]:
dataset = TranslationDataset(pairs, SEQ_LENGTH)

train_loader = torch.utils.data.DataLoader(dataset=dataset, 
                                       batch_size=BATCH_SIZE,
                                       shuffle=True)

In [328]:
# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
print_loss_num = 0 # Reset every plot_every

In [329]:
for epoch in range(1,n_epochs+1):
    for i, (input_tensor, target_tensor, char_level_input, char_level_target(?)) in enumerate(train_loader):
        if input_tensor.size()[0]==BATCH_SIZE:
            #this is needed because the loader puts the remainder datum in a too-small batch
            input_tensor = input_tensor.squeeze(1).transpose(0,1)
            target_tensor = target_tensor.squeeze(1).transpose(0,1)
            input_variable = Variable(input_tensor)
            target_variable = Variable(target_tensor)
            if USE_CUDA:
                input_variable = input_variable.cuda()
                target_variable = target_variable.cuda()

            loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer,\
                         criterion, BATCH_SIZE, SEQ_LENGTH)

            # Keep track of loss
            print_loss_total += loss
            print_loss_num += 1
        else:
            pass

    print_loss_avg = print_loss_total / print_loss_num
    print_loss_total = 0
    print_loss_num = 0
    print (time_since(start, epoch/n_epochs), print_loss_avg)       

1m 7s (- 1m 7s) 2.8404785633087157
2m 14s (- 0m 0s) 1.358345284461975


# Evaluating the network

Evaluation is mostly the same as training, but there are no targets. Instead we always feed the decoder's predictions back to itself. Every time it predicts a word, we add it to the output string. If it predicts the EOS token we stop there. We also store the decoder's attention outputs for each step to display later.

We can evaluate random sentences from the training set and print out the input, target, and output to make some subjective quality judgements:

In [127]:
def evaluate(pairs, seq_length = SEQ_LENGTH, batch_size = BATCH_SIZE, max_length=MAX_LENGTH):
    
    # get a random batch
    shuffle(pairs)
    batch_pairs = pairs[0:batch_size]
    batch_input = [pair[0] for pair in pairs]
    batch_target = [pair[1] for pair in pairs]
    batch_vars = [variablesFromPair(pair, seq_len=SEQ_LENGTH) for pair in batch_pairs]
    
    # stack that batch
    inputs = [variable[0].data for variable in batch_vars]
    input_variable = Variable(torch.stack(inputs, 1).squeeze(), volatile=False)
    targets = [variable[1].data for variable in batch_vars]
    target_variable = Variable(torch.stack(targets, 1).squeeze(), volatile=False)
    
    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    
    # Prepare input and output variables
    # One SOS token for each stence, so length is batch size
    decoder_input = Variable(torch.LongTensor([[SOS_token]*batch_size]))
    decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size))
    decoder_hidden = encoder_hidden
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()
    
    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)
    
    # Run through decoder
    
    for di in range(max_length):
        decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)

        topv, topi = decoder_output.data.topk(1)
        ni = topi.squeeze(1)
        
        batch_decoded_words = []
        for word_index in ni.numpy():
            batch_decoded_words.append(output_lang.index2word[word_index])
        
        decoded_words.append(batch_decoded_words)
            
        decoder_input = Variable(ni) # Chosen word is next input
        if USE_CUDA: decoder_input = decoder_input.cuda()
            
    decoded_words_matrix = np.array(decoded_words).reshape((SEQ_LENGTH, BATCH_SIZE))
        
    decoded_sentences = []

    for column in range(decoded_words_matrix.shape[1]):
        decoded_sentence = ' '.join(list(decoded_words_matrix[:, column]))
        decoded_sentences.append(decoded_sentence)
    
    return batch_input, batch_target, decoded_sentences, decoder_attentions[:di+1, :len(encoder_outputs)]

In [128]:
example_input, example_target, decoded_sentences, decoder_attentions = evaluate(pairs)

In [129]:
for i in range(20):
    print ("input:", example_input[i])
    print ("target:", example_target[i])
    print ("translation:", decoded_sentences[i])
    print ("...")

input: sie sind durch und durch naturlich .
target: they re completely natural .
machine translation: they re very imaginative . <EOS> <PAD> <PAD> <PAD> <PAD>
...
input: es tut mir so leid . nein nein .
target: i m so sorry . no no .
machine translation: i m so sorry to say no . <EOS> <PAD>
...
input: sie werden von den anderen menschen beobachtet .
target: they are observed by the other people 
machine translation: they are observed buried tuxedos other people . <EOS> <PAD>
...
input: wir sind noch nicht fertig mit dem video .
target: we re not finished with the video yet .
machine translation: we re not finished with with video yet . <EOS>
...
input: im einschatzen von wahrscheinlichkeiten sind wir nicht gut .
target: we are not good at reasoning with uncertainty .
machine translation: we are seeing a at reasoning with uncertainty . <EOS>
...
input: sie stellen tatsachlich urin her .
target: they are actually making urine .
machine translation: they are doing making urine . <EOS> <PA