https://github.com/keon/seq2seq/
https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
https://pytorch.org/docs/stable/nn.html#torch.nn.GRU

In [1]:
import torch
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
import spacy

In [2]:
Multi30k.download('.data')

'.data/multi30k/'

In [3]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [4]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

In [6]:
train, valid, test = TranslationDataset.splits(      
  path = '.data/multi30k',  
  exts = ['.de', '.en'],   
  fields = [('src', SRC), ('trg', TRG)],
  train = 'train', 
  validation = 'val', 
  test = 'test2016')

In [7]:
SRC.build_vocab(train.src, min_freq=2)
TRG.build_vocab(train.trg, min_freq=2)

In [8]:
BATCH_SIZE = 128

In [9]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test), batch_size=BATCH_SIZE, repeat=False)

- https://discuss.pytorch.org/t/how-can-i-know-which-part-of-h-n-of-bidirectional-rnn-is-for-backward-process/3883/4
- https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
- https://discuss.pytorch.org/t/get-forward-and-backward-output-seperately-from-bidirectional-rnn/2523

In [10]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout, bidirectional):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, hid_dim, bidirectional=bidirectional)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, hidden=None):
        
        #src = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [sent len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded, hidden)
                
        #outputs = [sent len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #outputs are stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #outputs[-1,  :, :self.hid_dim] is the last of the forwards RNN
        #outputs[ 0,  :, self.hid_dim:] is the last of the backwards RNN
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #therefore:
        assert (torch.cat((outputs[-1,:,:self.hid_dim], outputs[0,:,self.hid_dim:]),dim=1) == torch.cat((hidden[-2,:,:], hidden[-1,:,:]),dim=1)).all()
        
        #we sum, but can take the mean or pass through a linear layer
        outputs = outputs[:, :, :self.hid_dim] + outputs[:, :, self.hid_dim:]
        hidden = hidden[-2,:,:] + hidden[-1,:,:]
        
        #outputs = [sent len, batch size, hid dim]
        #hidden = [batch size, hid dim]
        
        return outputs, hidden

In [11]:
import torch.nn.functional as F

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))
        
        self.rnn = nn.GRU(hid_dim + emb_dim, hid_dim)
        
        self.out = nn.Linear(hid_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def attention(self, hidden, encoder_outputs):
        
        #hidden = [batch size, hid dim]
        #encoder_outputs = [src sent len, batch size, hid dim]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src sent len, hid dim]
        #encoder_outputs = [batch size, src sent len, hid dim]
        
        energy = self.attn(torch.cat((hidden, encoder_outputs),dim=2))
        
        #energy = [batch size, src sent len, hid dim]
        
        energy = energy.permute(0, 2, 1)
        
        #energy = [batch size, hid dim, src sent len]
        
        #v = [hid dim]
        
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        
        #v = [batch size, 1, hid_dim]
                
        energy = torch.bmm(v, energy).squeeze(1)
        
        #energy = [batch size, src len]
        
        return F.softmax(energy, dim=1).unsqueeze(1)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [bsz]
        #hidden = [batch size, hid dim]
        #encoder_outputs = [src sent len, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, bsz]
        #hidden = [batch size, hid dim]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, bsz, emb dim]
        
        a = self.attention(hidden, encoder_outputs)
        
        #a = [bsz, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [bsz, src sent len, hid dim]
        
        context = torch.bmm(a, encoder_outputs)
        
        #context = [bsz, 1, hid dim]
        
        context = context.permute(1, 0, 2)
        
        #context = [1, bsz, hid dim]
        
        rnn_input = torch.cat((embedded, context), dim=2)
        
        #rnn_input = [1, bsz, hid dim + emb dim]
              
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #o = [1, bsz, hid dim]
        #h = [n layers, bsz, hid dim]
        
        output = output.squeeze(0)
        context = context.squeeze(0)
        
        output = self.out(torch.cat((output, context), dim=1))
        
        #output = [bsz, output dim]
        
        return output, hidden.squeeze(0)

In [12]:
OUTPUT_DIM = len(TRG.vocab)
INPUT_DIM = len(SRC.vocab)
EMB_DIM = 256
HID_DIM = 512
DROPOUT = 0.5
BIDIRECTIONAL = True

enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, DROPOUT, BIDIRECTIONAL)
dec = Decoder(INPUT_DIM, EMB_DIM, HID_DIM, DROPOUT)

In [13]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        
        #src = [sent len, batch size]
        #trg = [sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards summed together
        #hidden is the final hidden state of the input sequence, back and forwards summed together
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        output = trg[0,:]
        
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)

        return outputs

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [15]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [16]:
pad_idx = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [17]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [18]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [19]:
import math
import os
N_EPOCHS = 25
CLIP = 10

best_valid_loss = float('inf')

if not os.path.isdir('.save'):
    os.makedirs('.save')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '.save/tut3_model.pt')
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')

  return Variable(arr, volatile=not train)


| Epoch: 01 | Train Loss: 4.588 | Train PPL:  98.301 | Val. Loss: 3.706 | Val. PPL:  40.685 |
| Epoch: 02 | Train Loss: 3.593 | Train PPL:  36.336 | Val. Loss: 3.226 | Val. PPL:  25.182 |
| Epoch: 03 | Train Loss: 3.151 | Train PPL:  23.353 | Val. Loss: 2.960 | Val. PPL:  19.304 |
| Epoch: 04 | Train Loss: 2.858 | Train PPL:  17.421 | Val. Loss: 2.858 | Val. PPL:  17.433 |
| Epoch: 05 | Train Loss: 2.653 | Train PPL:  14.201 | Val. Loss: 2.768 | Val. PPL:  15.923 |
| Epoch: 06 | Train Loss: 2.456 | Train PPL:  11.657 | Val. Loss: 2.805 | Val. PPL:  16.528 |
| Epoch: 07 | Train Loss: 2.323 | Train PPL:  10.208 | Val. Loss: 2.713 | Val. PPL:  15.068 |
| Epoch: 08 | Train Loss: 2.191 | Train PPL:   8.944 | Val. Loss: 2.897 | Val. PPL:  18.113 |
| Epoch: 09 | Train Loss: 2.082 | Train PPL:   8.019 | Val. Loss: 2.698 | Val. PPL:  14.852 |
| Epoch: 10 | Train Loss: 2.005 | Train PPL:   7.424 | Val. Loss: 2.811 | Val. PPL:  16.619 |
| Epoch: 11 | Train Loss: 1.921 | Train PPL:   6.826 | Val. 

In [20]:
model.load_state_dict(torch.load('.save/tut3_model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}')

  return Variable(arr, volatile=not train)


| Test Loss: 2.628 | Test PPL:  13.841
