https://github.com/keon/seq2seq/blob/master/train.py
https://pytorch.org/docs/stable/nn.html#torch.nn.GRU
- https://discuss.pytorch.org/t/how-can-i-know-which-part-of-h-n-of-bidirectional-rnn-is-for-backward-process/3883/4
- https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
- https://discuss.pytorch.org/t/get-forward-and-backward-output-seperately-from-bidirectional-rnn/2523

In [1]:
import torch
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
import spacy

In [2]:
Multi30k.download('.data')

'.data/multi30k/'

In [3]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [4]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

In [6]:
train, valid, test = TranslationDataset.splits(      
  path = '.data/multi30k',  
  exts = ['.de', '.en'],   
  fields = [('src', SRC), ('trg', TRG)],
  train = 'train', 
  validation = 'val', 
  test = 'test2016')

In [7]:
SRC.build_vocab(train.src)
TRG.build_vocab(train.trg)

In [8]:
BATCH_SIZE = 32

In [9]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test), batch_size=BATCH_SIZE, repeat=False)

- https://discuss.pytorch.org/t/how-can-i-know-which-part-of-h-n-of-bidirectional-rnn-is-for-backward-process/3883/4
- https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
- https://discuss.pytorch.org/t/get-forward-and-backward-output-seperately-from-bidirectional-rnn/2523

In [10]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, bidirectional=bidirectional, dropout=dropout)

        self.out = nn.Linear(2 * hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, hidden=None):
        
        #src = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [sent len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded, hidden)
        
        #outputs = [sent len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #outputs are stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #outputs[-1,  :, :self.hid_dim] is the last of the forwards RNN
        #outputs[ 0,  :, self.hid_dim:] is the last of the backwards RNN
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #therefore:
        assert (torch.cat((outputs[-1,:,:self.hid_dim], outputs[0,:,self.hid_dim:]),dim=1) == torch.cat((hidden[-2,:,:], hidden[-1,:,:]),dim=1)).all()
                
        return self.out(torch.cat((hidden[-2,:,:], hidden[-1,:,:]),dim=1))

In [11]:
INPUT_DIM = len(SRC.vocab)
EMB_DIM = 256
HID_DIM = 512
ENC_N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [12]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)
        
        self.out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden):
        
        #input = [bsz]
        #hidden = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, bsz]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, bsz, emb dim]
                
        output, hidden = self.rnn(embedded, hidden)
        
        #outputs = [sent len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #sent len and num directions will always be 1, therefore:
        #outputs = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        
        output = output.squeeze(0)
        
        return self.out(output), hidden

In [13]:
OUTPUT_DIM = len(TRG.vocab)
EMB_DIM = 256
HID_DIM = 512
DEC_N_LAYERS = 2
DROPOUT = 0.5

dec = Decoder(INPUT_DIM, EMB_DIM, HID_DIM, DEC_N_LAYERS, DROPOUT)

In [14]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        
        #src = [sent len, batch size]
        #trg = [sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        #need to unsqueeze to add a 'n layers' dimension
        hidden = self.encoder(src).unsqueeze(0).repeat(self.decoder.n_layers, 1, 1)
        
        #first input to the decoder is the <sos> tokens
        output = trg[0,:]
        
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)

        return outputs

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [16]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=TRG.vocab.stoi['<pad>'])

In [17]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [18]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg)

            loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [19]:
import math
N_EPOCHS = 25
CLIP = 10

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train PPL: {math.exp(train_loss):6.3f}, Val. Loss: {valid_loss:.3f}, Val. PPL: {math.exp(valid_loss):6.3f}')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 4.405, Train PPL: 81.861, Val. Loss: 3.741, Val. PPL: 42.151
Epoch: 02, Train Loss: 3.657, Train PPL: 38.741, Val. Loss: 3.457, Val. PPL: 31.710
Epoch: 03, Train Loss: 3.342, Train PPL: 28.289, Val. Loss: 3.395, Val. PPL: 29.827
Epoch: 04, Train Loss: 3.137, Train PPL: 23.027, Val. Loss: 3.333, Val. PPL: 28.030
Epoch: 05, Train Loss: 3.003, Train PPL: 20.154, Val. Loss: 3.465, Val. PPL: 31.984
Epoch: 06, Train Loss: 2.891, Train PPL: 18.018, Val. Loss: 3.443, Val. PPL: 31.284
Epoch: 07, Train Loss: 2.817, Train PPL: 16.723, Val. Loss: 3.409, Val. PPL: 30.236
Epoch: 08, Train Loss: 2.740, Train PPL: 15.490, Val. Loss: 3.483, Val. PPL: 32.572
Epoch: 09, Train Loss: 2.706, Train PPL: 14.970, Val. Loss: 3.488, Val. PPL: 32.711
Epoch: 10, Train Loss: 2.664, Train PPL: 14.350, Val. Loss: 3.497, Val. PPL: 33.003
Epoch: 11, Train Loss: 2.635, Train PPL: 13.940, Val. Loss: 3.573, Val. PPL: 35.629
Epoch: 12, Train Loss: 2.603, Train PPL: 13.504, Val. Loss: 3.640, Val. PPL:

KeyboardInterrupt: 