In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()

In [2]:
SOS_token = 1
EOS_token = 2

# 2000 word vocabulary
# 50 dimensional embeddings

In [176]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

# Load dataset

In [3]:
def load_dataset(language):
    with open('data/train.{}.txt'.format(language), 'r') as data:
        sentences = [sentence.rstrip('\n').split(' ') for sentence in data]
    with open('data/vocab.{}.txt'.format(language), 'r') as vocab:
        vocab = {word.rstrip('\n'): index for index, word in enumerate(vocab)}
    return sentences, vocab

en_sentences, en_vocab = load_dataset('en')
vi_sentences, vi_vocab = load_dataset('vi')

dataset = list(zip(en_sentences, vi_sentences))

In [215]:
def load_test_sets(in_language, out_language):
    with open('data/test.{}.txt'.format(in_language), 'r') as data:
        in_sentences = [sentence.rstrip('\n').split(' ') for sentence in data]
    with open('data/test.{}.txt'.format(out_language), 'r') as data:
        out_sentences = [sentence.rstrip('\n').split(' ') for sentence in data]
    return in_sentences, out_sentences

en_test_sentences, vi_test_sentences = load_test_sets('en', 'vi')

# Process dataset

In [6]:
def word_from_dict(word, lan_dict):
    if word in lan_dict:
        return lan_dict[word]
    else:
        return lan_dict['<unk>']

def process_sentences(sentences, vocab, translate_to=False):
    X = list()
    Xoh = list()
    for index, sentence in enumerate(sentences):
        if translate_to:
            index_sentence = [SOS_token] + [word_from_dict(word, vocab) for word in sentence] + [EOS_token]
        else:
            index_sentence = [word_from_dict(word, vocab) for word in sentence] + [EOS_token]
        a = np.array(index_sentence)
        b = np.zeros((a.size, len(vocab)))
        b[np.arange(a.size),a] = 1
        X.append(index_sentence)
        Xoh.append(b)
    X = np.array([np.array(Xi) for Xi in X])
    Xoh = np.array([Xohi for Xohi in Xoh])
    return X, Xoh

In [7]:
X, Xoh = process_sentences(en_sentences, en_vocab)

In [8]:
Y, Yoh = process_sentences(vi_sentences, vi_vocab, translate_to=True)

In [216]:
X_test, Xoh_test = process_sentences(en_test_sentences, en_vocab)

In [217]:
Y_test, Yoh_test = process_sentences(vi_test_sentences, vi_vocab, translate_to=True)

# Build model

In [196]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True, batch_first=True)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        result = Variable(torch.zeros(2, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

In [197]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 3, 1)

    def forward(self, decoder_hidden, encoder_output):
        decoder_hidden_expanded = decoder_hidden.expand(1, encoder_output.size()[1], decoder_hidden.size()[2])
        input_vector = torch.cat((decoder_hidden_expanded, encoder_output), 2)
        output = torch.matmul(input_vector, self.attn.weight.t())
        attn_weights = F.softmax(output, dim=1)
        permuted_encoder_output = encoder_output.permute(0, 2, 1)
        input_context = torch.bmm(permuted_encoder_output, attn_weights).view(1,1,-1)
        return input_context

In [198]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size * 3, self.hidden_size, batch_first=True)
        self.out = nn.Linear(self.hidden_size, input_size)

    def forward(self, input_context, hidden, word):
        embedded = self.embedding(word)
        output = torch.cat((embedded, input_context), 2)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        output = F.log_softmax(output, dim=2)
        return output, hidden

In [210]:
# Define global params for training
LEARNING_RATE = 0.01
EPOCHS = 30

In [211]:
def train_step(in_sentence, out_sentence, model_sections, optimizing_params):
    # pass input sentence through encoder
    encoder_hidden = encoder.initHidden()
    input = Variable(torch.LongTensor(in_sentence).view(1,-1))
    input = input.cuda() if use_cuda else input
    encoder_output, encoder_hidden = model_sections['encoder'](input, encoder_hidden)
    
    # initialize decoder hidden layer with final encoder hidden layer
    decoder_hidden = encoder_hidden[0].clone().view(1,1,-1)
    
    # initialize loss to 0
    loss = 0 
    
    # pass encoder output through attention + decoder
    for word_index in out_sentence:
        decoder_input = Variable(torch.LongTensor([[int(word_index)]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        input_context = model_sections['attention'](decoder_hidden, encoder_output) 
        decoder_output, decoder_hidden = model_sections['decoder'](input_context, decoder_hidden, decoder_input)
        expected = Variable(torch.LongTensor([int(word_index)]))
        expected = expected.cuda() if use_cuda else expected
        loss += optimizing_params['loss_function'](decoder_output.view(1,-1), expected)
    loss.backward()
    optimizing_params['enc_optimizer'].step()
    optimizing_params['att_optimizer'].step()
    optimizing_params['dec_optimizer'].step()
    
    return loss.data[0] / len(out_sentence)

In [212]:
def train(in_sentences, out_sentences, model_sections, optimizing_params):
    losses = list()
    current_loss = 0
    for epoch_num in range(EPOCHS):
        for in_sentence, out_sentence in zip(in_sentences[:30], out_sentences[:30]):
            current_loss += train_step(in_sentence, out_sentence, model_sections, optimizing_params)
        print('Epoch: {} | Loss: {}'.format(epoch_num + 1, current_loss))
        losses.append(current_loss)
        current_loss = 0
    return losses

In [213]:
# Instantiate model and optimizer
encoder = EncoderRNN(len(en_vocab), 100)
attention = Attention(100)
decoder = DecoderRNN(len(vi_vocab), 100)
criterion = nn.NLLLoss()
encoder_optimizer = optim.SGD(encoder.parameters(), lr=LEARNING_RATE)
attention_optimizer = optim.SGD(attention.parameters(), lr=LEARNING_RATE)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=LEARNING_RATE)

# save model and optimizer params to dict
model_sections = {'encoder': encoder, 
                  'attention': attention, 
                  'decoder': decoder,
                 }

optimizing_params = {'loss_function': criterion, 
                     'enc_optimizer': encoder_optimizer,
                     'att_optimizer': attention_optimizer,
                     'dec_optimizer': decoder_optimizer,
                    }

In [214]:
losses = train(X, Y, model_sections, optimizing_params)

Epoch: 1 | Loss: 192.2237575328946
Epoch: 2 | Loss: 109.65815759343785
Epoch: 3 | Loss: 255.93483719010203
Epoch: 4 | Loss: 573.7265294935715
Epoch: 5 | Loss: 1023.0489503364831
Epoch: 6 | Loss: 1159.9805471117816
Epoch: 7 | Loss: 1144.9182745916385
Epoch: 8 | Loss: 1045.813236705913
Epoch: 9 | Loss: 817.5437194021795
Epoch: 10 | Loss: 638.8699334556007
Epoch: 11 | Loss: 594.9702977528832
Epoch: 12 | Loss: 471.60360606809814
Epoch: 13 | Loss: 328.63357963127095
Epoch: 14 | Loss: 257.50031677433867
Epoch: 15 | Loss: 152.35547300270804
Epoch: 16 | Loss: 87.00339600361606
Epoch: 17 | Loss: 95.36094167671493
Epoch: 18 | Loss: 57.62134524842298
Epoch: 19 | Loss: 37.042083944381375
Epoch: 20 | Loss: 31.702114102346304
Epoch: 21 | Loss: 72.26683934254542
Epoch: 22 | Loss: 47.800909760497476
Epoch: 23 | Loss: 40.842412355926086
Epoch: 24 | Loss: 186.15513121095313
Epoch: 25 | Loss: 174.78181063586388
Epoch: 26 | Loss: 126.12793209924305
Epoch: 27 | Loss: 87.46511186845575
Epoch: 28 | Loss: 94.

In [204]:
showPlot(losses)