## Simple Encoder Decoder Architecture for MT

In [None]:
# bleu score needs
!pip install torchtext==0.6.0

# spacy language model loads
!python -m spacy download en
!python -m spacy download de

Collecting torchtext==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |█████                           | 10kB 20.0MB/s eta 0:00:01[K     |██████████▏                     | 20kB 23.9MB/s eta 0:00:01[K     |███████████████▎                | 30kB 15.4MB/s eta 0:00:01[K     |████████████████████▍           | 40kB 11.0MB/s eta 0:00:01[K     |█████████████████████████▌      | 51kB 8.1MB/s eta 0:00:01[K     |██████████████████████████████▋ | 61kB 9.4MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 6.0MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 8.2MB/s 
Installing collected packages: sentencepiece, torchtext
  Fo

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import torchtext

import numpy as np
import spacy
import random
import math
import time

In [None]:
# For deterministic results set seed

SEED = 555

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

**Preprocessing the data**:
Tokenization, adding start/end tokens and lower case, data split. Creation of iterators

In [None]:
# Load Spacy language models for tokenizing

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [None]:
def tokenize_de(text):
    # Tokenizes German text from a string into a list of strings (tokens) and reverses it
    # As source seq is fed in reverse order in basic enc_dec
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    # Tokenizes English text from a string into a list of strings (tokens)
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [None]:
# Field() of TorchText: Does tokenization according to fx, appends start and end tokens and lowers() case

SRC = Field(tokenize = tokenize_de, init_token = '<sos>', eos_token = '<eos>', lower = True)
TRG = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)

In [None]:
# Split data. Assigns source as German, target as English

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 606kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 173kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 165kB/s]


In [None]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [None]:
# Build lang vocabulary. Discard words which occur less than x(min_freq) times

SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [None]:
# Use Cuda if available

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
BATCH_SIZE = 128

# Create iterators to get a batch of seq ip/op. Iterator automatically handles padding sequences to same length

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), batch_size = BATCH_SIZE, device = device)

**s2s Model**

Encoder:
Takes the ip seq, applies an embedding and then passes it on to a RNN (LSTM).

In [None]:
# Encoder

class Encoder(nn.Module):
  def __init__(self, input_dim, hid_dim, embed_dim, n_layers, dropout):
    super().__init__()
    self.embed = nn.Embedding(input_dim, embed_dim)
    self.rnn = nn.LSTM(embed_dim, hid_dim, num_layers = 2, dropout = dropout)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, src):
    embedded = self.dropout(self.embed(src)) # [seq_len, bs, embed_dim]
    outputs, (hidden, cell) = self.rnn(embedded)
    return hidden, cell


In [None]:
# Decoder

class Decoder(nn.Module):
  def __init__(self, output_dim, hid_dim, embed_dim, n_layers, dropout):
    super().__init__()
    self.op_dim = output_dim
    self.embed = nn.Embedding(output_dim, embed_dim)
    self.rnn = nn.LSTM(embed_dim, hid_dim, num_layers = 2, dropout = dropout)
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(hid_dim, output_dim)
    self.verb = True
  
  def forward(self, tgt, fin_hs, fin_cs):
    if self.verb:
      print(f'Target shape before unsqueeze: {tgt.shape}\n')
    
    tgt = tgt.unsqueeze(0) # [1, bs]
    embedded = self.dropout(self.embed(tgt)) # [1, bs, embed_dim]
    outputs, (hidden, cell) = self.rnn(embedded, (fin_hs, fin_cs))
    
    # outputs: [seq_len, bs, n_dir*hid_dim] -> [1, bs, hid_dim]
    # hidden, cell: [n_layers*seq_len, bs, hid_dim] -> [2, bs, hid_dim]

    if self.verb:
      print(f'Target shape: {tgt.shape}\nEmbed shape: {embedded.shape}\nEnc op shape: {outputs.shape}\n')
    self.verb = False

    pred = self.fc(outputs.squeeze(0)) # [bs, output_dim]
    return pred, hidden, cell

Target shape before unsqueeze: torch.Size([128])

Target shape: torch.Size([1, 128])
Embed shape: torch.Size([1, 128, 512])
Enc op shape: torch.Size([1, 128, 256])

In [None]:
# S2S

class seq2seq(nn.Module):
  def __init__(self, enc, dec, dev):
    super().__init__()

    self.enc = enc
    self.dec = dec
    self.dev = dev
    # assumption: hidden dim of enc and dec is same! No. of layers in rnn of enc and dec is same. Else think of how to sned the fin_hs and fin_cs from enc to dec
  
  def forward(self, src, tgt, teacher_force_ratio=0.6):

    # Init the op tensor for storing predicted op
    tgt_len = tgt.shape[0]
    bs = tgt.shape[1]
    output_dim = self.dec.op_dim
    pred_seq = torch.zeros(tgt_len, bs, output_dim).to(self.dev)

    enc_hs, enc_cs = self.enc(src)

    # Decoder ip
    ip = tgt[0]
    prev_hs, prev_cs = enc_hs, enc_cs

    for x in range(1, tgt_len):
      pred, hidden, cell = self.dec(ip, prev_hs, prev_cs)
      prev_hs = hidden
      prev_cs = cell
      pred_seq[x] = pred

      teacher_force = random.random() < teacher_force_ratio

      # get the highest predicted token from our predictions
      top1 = pred.argmax(1) 
            
      # if teacher forcing, use actual next token as next input if not, use predicted token
      ip = tgt[x] if teacher_force else top1
    
    return pred_seq

**Training**

In [None]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = seq2seq(enc, dec, device).to(device)

In [None]:
# Init weights: initialize all weights from a uniform distribution between -0.08 and +0.08

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

seq2seq(
  (enc): Encoder(
    (embed): Embedding(7855, 512)
    (rnn): LSTM(512, 256, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (dec): Decoder(
    (embed): Embedding(5893, 512)
    (rnn): LSTM(512, 256, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=256, out_features=5893, bias=True)
  )
)

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 11,183,109 trainable parameters


In [None]:
# Set optimizer and loss fx
# ignore the loss whenever the target token is a padding token.

optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
# Begin train

def train(model, iterator, optimizer, criterion, clip):
  # sets mode to train
  model.train()

  epoch_loss = 0

  for i, batch in enumerate(iterator):
    # pull the src, tgt
    src = batch.src
    tgt = batch.trg

    # zero the grad calculated from last batch
    optimizer.zero_grad()

    # send to model
    op = model(src, tgt)

    output_dim = op.shape[-1]
    op = op[1:].view(-1, output_dim)
    tgt = tgt[1:].view(-1)

    loss = criterion(op, tgt)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()
    epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [None]:
# Begin eval

def evaluate(model, iterator, criterion):
  model.eval()
  epoch_loss = 0

  # No grad calculation in eval
  with torch.no_grad():
    for i, batch in enumerate(iterator):
      src = batch.src
      tgt = batch.trg

      # call for forward() with teacher_force = 0
      op = model(src, tgt, 0)

      output_dim = op.shape[-1]
      op = op[1:].view(-1, output_dim)
      tgt = tgt[1:].view(-1)

      loss = criterion(op, tgt)
      epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
# record times

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
# Begin training actually

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time = time.time()

  train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss = evaluate(model, valid_iterator, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'enc_dec-model.pt')

  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\tValid Loss: {valid_loss:.3f} | Valid PPL: {math.exp(valid_loss):7.3f}')

Target shape before unsqueeze: torch.Size([128])

Target shape: torch.Size([1, 128])
Embed shape: torch.Size([1, 128, 512])
Enc op shape: torch.Size([1, 128, 256])

Epoch: 01 | Time: 0m 29s
	Train Loss: 5.169 | Train PPL: 175.676
	Valid Loss: 4.883 | Valid PPL: 132.030
Epoch: 02 | Time: 0m 29s
	Train Loss: 4.496 | Train PPL:  89.681
	Valid Loss: 4.846 | Valid PPL: 127.187
Epoch: 03 | Time: 0m 29s
	Train Loss: 4.218 | Train PPL:  67.906
	Valid Loss: 4.799 | Valid PPL: 121.443
Epoch: 04 | Time: 0m 30s
	Train Loss: 4.035 | Train PPL:  56.538
	Valid Loss: 4.737 | Valid PPL: 114.064
Epoch: 05 | Time: 0m 30s
	Train Loss: 3.917 | Train PPL:  50.244
	Valid Loss: 4.581 | Valid PPL:  97.565
Epoch: 06 | Time: 0m 29s
	Train Loss: 3.791 | Train PPL:  44.307
	Valid Loss: 4.722 | Valid PPL: 112.419
Epoch: 07 | Time: 0m 29s
	Train Loss: 3.656 | Train PPL:  38.694
	Valid Loss: 4.482 | Valid PPL:  88.403
Epoch: 08 | Time: 0m 29s
	Train Loss: 3.555 | Train PPL:  34.979
	Valid Loss: 4.376 | Valid PPL:  79

**Test & Eval**

In [None]:
# When testing, directly load trained model and run
# Load trained model, test set

trained_model = 'enc_dec-model.pt'

model.load_state_dict(torch.load(trained_model))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.216 | Test PPL:  67.787 |


In [None]:
def translate(sentence, src_field, tgt_field, model, device, max_len = 50):
  # eval mode
  model.eval()

  # tokenize src if it's a string
  if isinstance(sentence, str):
    nlp = spacy.load('de')
    tokens = [token.text.lower() for token in nlp(sentence)]
  else:
    tokens = [token.lower() for token in sentence]

  # Add <sos>, <eos>
  tokens = [src_field.init_token] + tokens + [src_field.eos_token]
  # Numericalize seq
  src_indexes = [src_field.vocab.stoi[token] for token in tokens]
  # make tensor of src seq
  src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

  with torch.no_grad():
    hs, cs = model.enc(src_tensor)
  tgt_indexes = [tgt_field.vocab.stoi[tgt_field.init_token]]
  for i in range(max_len):
    # each time set the last token of tgt_tensor as input to decoder. Here no teacher_force
    # 1st time its <sos>, then prev predicted tok by decoder
    tgt_tensor = torch.LongTensor([tgt_indexes[-1]]).to(device)

    with torch.no_grad():
      output, hs, cs = model.dec(tgt_tensor, hs, cs)
    pred_token = output.argmax(1).item()
    tgt_indexes.append(pred_token)

    if pred_token == tgt_field.vocab.stoi[tgt_field.eos_token]:
      break
    
  # Get back the predicted tgt tokens
  tgt_tokens = [tgt_field.vocab.itos[i] for i in tgt_indexes]
  # cut off <sos>
  return tgt_tokens[1:]

In [None]:
example_idx = 25

src = vars(train_data.examples[example_idx])['src']
tgt = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'actual translation = {tgt}')
translation = translate(src, SRC, TRG, model, device)
print(f'predicted translation = {translation}')

src = ['.', 'straßenszene', 'einer', 'gemälde', 'ein', 'betrachtet', 'und', 'gehweg', 'belebten', 'einem', 'auf', 'steht', 'mantel', 'blauen', 'einem', 'in', 'person', 'eine']
actual translation = ['a', 'person', 'dressed', 'in', 'a', 'blue', 'coat', 'is', 'standing', 'in', 'on', 'a', 'busy', 'sidewalk', ',', 'studying', 'painting', 'of', 'a', 'street', 'scene', '.']
predicted translation = ['a', 'person', 'in', 'a', 'blue', 'shirt', 'is', 'sitting', 'on', 'a', 'sidewalk', 'with', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']


In [None]:
example_idx = 10

src = vars(train_data.examples[example_idx])['src']
tgt = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'actual translation = {tgt}')
translation = translate(src, SRC, TRG, model, device)
print(f'predicted translation = {translation}')

src = ['.', 'springen', 'nacheinander', 'die', ',', 'mädchen', 'fünf', 'mit', 'ballettklasse', 'eine']
actual translation = ['a', 'ballet', 'class', 'of', 'five', 'girls', 'jumping', 'in', 'sequence', '.']
predicted translation = ['a', 'group', 'of', 'children', 'in', 'in', 'a', '.', '.', '<eos>']


In [None]:
example_idx = 29

src = vars(train_data.examples[example_idx])['src']
tgt = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'actual translation = {tgt}')
translation = translate(src, SRC, TRG, model, device)
print(f'predicted translation = {translation}')

src = ['.', 'lächelt', 'und', 'an', 'etwas', 'blickt', 'jacke', 'schwarz-gelben', 'einer', 'in', 'mann', 'junger', 'ein']
actual translation = ['a', 'young', 'man', 'in', 'a', 'black', 'and', 'yellow', 'jacket', 'is', 'gazing', 'at', 'something', 'and', 'smiling', '.']
predicted translation = ['a', 'young', 'man', 'in', 'a', 'black', 'shirt', 'and', 'holding', 'a', '<unk>', 'of', 'her', '.', '.', '<eos>']


**From some examples, seems like the model is able to transalte the first few words of the sentence, but later parts are completely missed**

In [None]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
  trgs = []
  pred_trgs = []

  for datum in data:
    src = vars(datum)['src']
    trg = vars(datum)['trg']
    pred_trg = translate(src, src_field, trg_field, model, device, max_len)
    
    # cut off <eos>
    pred_trg = pred_trg[:-1]
    
    pred_trgs.append(pred_trg)
    trgs.append([trg])
      
  return bleu_score(pred_trgs, trgs)

In [None]:
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score w simple enc_dec = {bleu_score*100:.2f}')

BLEU score w simple enc_dec = 8.52
