https://github.com/keon/seq2seq
https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM

In [1]:
import torch
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
import spacy

In [2]:
Multi30k.download('.data')

'.data/multi30k/'

In [3]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [4]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

In [6]:
train, valid, test = TranslationDataset.splits(      
  path = '.data/multi30k',  
  exts = ['.de', '.en'],   
  fields = [('src', SRC), ('trg', TRG)],
  train = 'train', 
  validation = 'val', 
  test = 'test2016')

In [7]:
SRC.build_vocab(train.src, min_freq=2)
TRG.build_vocab(train.trg, min_freq=2)

In [8]:
BATCH_SIZE = 128

In [9]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test), batch_size=BATCH_SIZE, repeat=False)

- https://discuss.pytorch.org/t/how-can-i-know-which-part-of-h-n-of-bidirectional-rnn-is-for-backward-process/3883/4
- https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
- https://discuss.pytorch.org/t/get-forward-and-backward-output-seperately-from-bidirectional-rnn/2523

In [10]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [sent len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the last layer
        
        return hidden, cell

In [11]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        
        self.out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [bsz]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, bsz]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, bsz, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #outputs = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #sent len and n directions will always be 1, therefore:
        #outputs = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        output = output.squeeze(0)
        
        return self.out(output), hidden, cell

In [12]:
OUTPUT_DIM = len(TRG.vocab)
INPUT_DIM = len(SRC.vocab)
EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec = Decoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)

In [13]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        
        #src = [sent len, batch size]
        #trg = [sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        output = trg[0,:]
        
        for t in range(1, max_len):
            output, hidden, cell = self.decoder(output, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)

        return outputs

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [15]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [16]:
pad_idx = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [17]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [18]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg)

            loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [20]:
import math
import os
N_EPOCHS = 25
CLIP = 10

best_valid_loss = float('inf')

if not os.path.isdir('.save'):
    os.makedirs('.save')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '.save/tut1_model.pt')
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')

  return Variable(arr, volatile=not train)


| Epoch: 01 | Train Loss: 4.550 | Train PPL:  94.617 | Val. Loss: 4.176 | Val. PPL:  65.102 |
| Epoch: 02 | Train Loss: 4.245 | Train PPL:  69.752 | Val. Loss: 4.029 | Val. PPL:  56.180 |
| Epoch: 03 | Train Loss: 4.047 | Train PPL:  57.211 | Val. Loss: 3.726 | Val. PPL:  41.504 |
| Epoch: 04 | Train Loss: 3.862 | Train PPL:  47.565 | Val. Loss: 3.597 | Val. PPL:  36.480 |
| Epoch: 05 | Train Loss: 3.711 | Train PPL:  40.908 | Val. Loss: 3.524 | Val. PPL:  33.913 |
| Epoch: 06 | Train Loss: 3.594 | Train PPL:  36.362 | Val. Loss: 3.356 | Val. PPL:  28.672 |
| Epoch: 07 | Train Loss: 3.491 | Train PPL:  32.807 | Val. Loss: 3.284 | Val. PPL:  26.682 |
| Epoch: 08 | Train Loss: 3.384 | Train PPL:  29.493 | Val. Loss: 3.166 | Val. PPL:  23.714 |
| Epoch: 09 | Train Loss: 3.301 | Train PPL:  27.128 | Val. Loss: 3.327 | Val. PPL:  27.866 |
| Epoch: 10 | Train Loss: 3.219 | Train PPL:  24.994 | Val. Loss: 3.117 | Val. PPL:  22.572 |
| Epoch: 11 | Train Loss: 3.112 | Train PPL:  22.471 | Val. 

In [22]:
model.load_state_dict(torch.load('.save/tut1_model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}')

  return Variable(arr, volatile=not train)


| Test Loss: 2.792 | Test PPL:  16.314
