In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Preparing Data

We will be translating from French to English. This task is an example of sequence to sequence (seq2seq). Seq2seq can be more challenging than classification, since the output is of variable length (and typically different from the length of the input.


In [4]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True,
            batch_first= True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True,
            batch_first=True)

Next, we download and load the train, validation and test data. 

The dataset we'll be using is the [Multi30k dataset](https://github.com/multi30k/dataset). This is a dataset with ~30,000 parallel English, German and French sentences, each with ~12 words per sentence. 

`exts` specifies which languages to use as the source and target (source goes first) and `fields` specifies which field to use for the source and target.

In [6]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [7]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [8]:
TRG.vocab.itos[0],TRG.vocab.itos[1],TRG.vocab.itos[2]

('<unk>', '<pad>', '<sos>')

In [9]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [10]:
# for i, batch in enumerate(train_iterator):
#     src = batch.src
#     trg = batch.trg

batch = next(iter(train_iterator))
print(batch.src)
print(batch.trg)
print("batch.src.size: ",batch.src.size())
print("batch.trg.size: ",batch.trg.size())

tensor([[   2,    4, 2550,  ...,    1,    1,    1],
        [   2,    4,  123,  ...,    1,    1,    1],
        [   2,    4,    0,  ...,    1,    1,    1],
        ...,
        [   2,    4,    0,  ...,    1,    1,    1],
        [   2,    4,  338,  ...,    1,    1,    1],
        [   2,    4,  713,  ...,    1,    1,    1]], device='cuda:0')
tensor([[  2,   9, 136,  ...,   1,   1,   1],
        [  2,   4,  26,  ...,   1,   1,   1],
        [  2,   4,  53,  ...,   1,   1,   1],
        ...,
        [  2,   4,  70,  ...,   1,   1,   1],
        [  2,  19, 119,  ...,   1,   1,   1],
        [  2,   4,  64,  ...,   1,   1,   1]], device='cuda:0')
batch.src.size:  torch.Size([64, 25])
batch.trg.size:  torch.Size([64, 29])


## Our Model


### Encoders & Decoders
The model in itself consists in an encoder and a decoder

![Seq2seq model](assets/seq2seq.png)
**Note: We are going from German to French**
<center><i>Diagram from Smerity's <a href="https://smerity.com/articles/2016/google_nmt_arch.html">Peeking into the neural network architecture used for Google's Neural Machine Translation</a></i></center>

The encoder is a recurrent neural net and we feed it our input sentence, producing an output (that we discard for now) and a hidden state.  A **hidden state** is the activations that come out of an RNN.

That hidden state is then given to the decoder (an other RNN) which uses it in conjunction with the outputs it predicts to get produce the translation. We loop until the decoder produces a padding token (or at 30 iterations to make sure it's not an infinite loop at the beginning of training). 

In [161]:
class Seq2SeqRNN(nn.Module):
    def __init__(self, src_vocab_size, target_vocab_size, 
                    hidden_size, output_seq_len, 
                    num_layers=2, bos_idx=0, pad_idx=1):
        super().__init__()
        
        emb_enc = nn.Embedding(src_vocab_size,300)
        emb_dec = nn.Embedding(target_vocab_size,300)
        
        self.num_layers,self.hidden_size,self.output_seq_len = num_layers,hidden_size,output_seq_len
        self.bos_idx,self.pad_idx = bos_idx,pad_idx #0,1
        self.em_sz_enc = emb_enc.embedding_dim #300
        self.em_sz_dec = emb_dec.embedding_dim #300
        self.voc_sz_dec = emb_dec.num_embeddings #5893 Vocabulary Size for decoder
                 
        #Encoder
        self.emb_enc = emb_enc
        self.emb_enc_drop = nn.Dropout(0.15)
        self.gru_enc = nn.GRU(input_size=self.em_sz_enc, hidden_size=hidden_size, num_layers=num_layers,
                              dropout=0.25, batch_first=True)
        self.out_enc = nn.Linear(hidden_size, self.em_sz_dec, bias=False) #256, 300
       
        #Decoder
        self.emb_dec = emb_dec
        self.gru_dec = nn.GRU(input_size=self.em_sz_dec, hidden_size=self.em_sz_dec, num_layers=num_layers,
                              dropout=0.1, batch_first=True)
        self.out_drop = nn.Dropout(0.35)
        self.out_dec = nn.Linear(self.em_sz_dec, self.voc_sz_dec) #300, 8144
        
        self.out_dec.weight.data = self.emb_dec.weight.data

    def encoder(self, bs, inp): #bs:64 , inp.size: 64,x
        # Dimension of embedding is 300
        # h = (num_layers (2)*num_directions (1), batch  (64), hidden_size (256))
        h = self.initHidden(bs) # h.size: 2,64,256
        emb = self.emb_enc(inp) #emb.size: 64,x,300
        emb = self.emb_enc_drop(emb) #emb.size: 64,x,300
        _, h = self.gru_enc(emb, h) #h.size: 2,64,256
        
        # h(2,64,245) *  out_enc(256,300) = 2,64,300
        h = self.out_enc(h) #h.size: 2,64,300
        return h
    
    def decoder(self, dec_inp, h): #dec_inp.size: [64], h.size: 2,64,300 
        emb = self.emb_dec(dec_inp).unsqueeze(1) #emb.size: 64, 1, 300 
        outp, h = self.gru_dec(emb, h) #outp.size: [64, 1, 300], h.size: 2, 64, 300
        outp = self.out_dec(self.out_drop(outp[:,0])) #outp.size: 64, 5893
        return h, outp
        
    def forward(self, inp):
        bs, sl = inp.size()
        #batch_size(bs) = 64
        #seq_length(sl) = x variable
        
        h = self.encoder(bs, inp) #batch_size: 64, inp: x variable
        #h.size: [2, 64, 300]
        dec_inp = inp.new_zeros(bs).long() + self.bos_idx #[64] zeros
        res = []
        for i in range(self.output_seq_len): #self.output_seq_len: 30
            h, outp = self.decoder(dec_inp, h) #dec_inp:64, #h = [batch_size: 64, inp: x variable]
            # outp.size: [batch_size:64, emb_decoder:5893]
            ## h.size: (num_layers (2)*num_directions (1), batch  (64), hidden_size (300))
            #tensor.max(input, dim=0, keepdim=False, out=None):Returns the maximum value of all elements in the input tensor.
            dec_inp = outp.max(1)[1]
            res.append(outp)
            '''
            The all() function returns True if all items in an iterable(list, tuple, dictionary, etc.) are true, otherwise 
            it returns False.If the iterable object is empty, the all() function also returns True.
            '''
            if (dec_inp==self.pad_idx).all(): break #break if you detect pad_idx
        return torch.stack(res, dim=1)
    
    def initHidden(self, bs): return one_param(self).new_zeros(self.num_layers, bs, self.hidden_size)

In [162]:
def one_param(m: nn.Module): 
    "Return the first parameter of `m`."
    return next(m.parameters())

In [163]:
model = Seq2SeqRNN(len(SRC.vocab), len(TRG.vocab), 256, 30)

In [164]:
model = model.cuda()

In [165]:
a = model(batch.src)

In [119]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,881,889 trainable parameters


In [23]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2SeqRNN(
  (emb_enc): Embedding(7855, 300)
  (emb_enc_drop): Dropout(p=0.15, inplace=False)
  (gru_enc): GRU(300, 256, num_layers=2, batch_first=True, dropout=0.25)
  (out_enc): Linear(in_features=256, out_features=300, bias=False)
  (emb_dec): Embedding(5893, 300)
  (gru_dec): GRU(300, 300, num_layers=2, batch_first=True, dropout=0.1)
  (out_drop): Dropout(p=0.35, inplace=False)
  (out_dec): Linear(in_features=300, out_features=5893, bias=True)
)

In [24]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [25]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg


        optimizer.zero_grad()
        
        output = model(src)
        
        pad_idx=1
        targ_len = trg.size()[1]

        out_len = output.size()[1]
        
        '''
        Our Taget and output has to be the same in sequence length. We have hard-coded input to be sequence length of 30.
        If our target length is more than our output (targ_len>out_len): We need to add paading into our output length
        If our target length is less than our output (out_len>targ_len): We need to add padding into our target length. 
        
        '''
        if targ_len>out_len:
            output  = nn.functional.pad(output,  (0,0,0,targ_len-out_len,0,0), value=pad_idx)
        if out_len>targ_len:
            trg = nn.functional.pad(trg, (0,out_len-targ_len,0,0), value=pad_idx)

        output_dim = output.shape[-1]
        
        output = output[:,1:,:]
        output = output.contiguous().view(-1, output_dim)
        
        trg = trg[:,1:]
        trg = trg.contiguous().view(-1)


        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        

        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src)

            pad_idx=1
            targ_len = trg.size()[1]

            out_len = output.size()[1]

            if targ_len>out_len:
                output  = nn.functional.pad(output,  (0,0,0,targ_len-out_len,0,0), value=pad_idx)
            if out_len>targ_len:
                trg = nn.functional.pad(trg, (0,out_len-targ_len,0,0), value=pad_idx)

            output_dim = output.shape[-1]
            
            output = output[:,1:,:].contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [26]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 20s
	Train Loss: 5.090 | Train PPL: 162.456
	 Val. Loss: 4.562 |  Val. PPL:  95.778
Epoch: 02 | Time: 0m 20s
	Train Loss: 4.491 | Train PPL:  89.172
	 Val. Loss: 4.070 |  Val. PPL:  58.533
Epoch: 03 | Time: 0m 20s
	Train Loss: 4.134 | Train PPL:  62.450
	 Val. Loss: 3.774 |  Val. PPL:  43.557
Epoch: 04 | Time: 0m 20s
	Train Loss: 3.884 | Train PPL:  48.597
	 Val. Loss: 3.628 |  Val. PPL:  37.635
Epoch: 05 | Time: 0m 20s
	Train Loss: 3.698 | Train PPL:  40.382
	 Val. Loss: 3.507 |  Val. PPL:  33.339
Epoch: 06 | Time: 0m 20s
	Train Loss: 3.541 | Train PPL:  34.486
	 Val. Loss: 3.415 |  Val. PPL:  30.416
Epoch: 07 | Time: 0m 20s
	Train Loss: 3.404 | Train PPL:  30.085
	 Val. Loss: 3.348 |  Val. PPL:  28.452
Epoch: 08 | Time: 0m 21s
	Train Loss: 3.281 | Train PPL:  26.614
	 Val. Loss: 3.308 |  Val. PPL:  27.321
Epoch: 09 | Time: 0m 20s
	Train Loss: 3.169 | Train PPL:  23.795
	 Val. Loss: 3.272 |  Val. PPL:  26.358
Epoch: 10 | Time: 0m 20s
	Train Loss: 3.073 | Train PPL

In [None]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')