## NMT with attention (jointly learning to align & translate)

In [77]:
# bleu score needs
!pip install torchtext==0.6.0

# spacy language model loads
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import torchtext

import numpy as np
import spacy
import random
import math
import time

In [79]:
# For deterministic results set seed

SEED = 555

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [80]:
# Load Spacy language models for tokenizing

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [81]:
def tokenize_de(text):
    # Tokenizes German text from a string into a list of strings (tokens) and reverses it
    # As source seq is fed in reverse order in basic enc_dec
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    # Tokenizes English text from a string into a list of strings (tokens)
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [82]:
# Field() of TorchText: Does tokenization according to fx, appends start and end tokens and lowers() case.
# include length will give a tuple for batch.src: (batch of numericalized source sentence as a tensor, non-padded lengths of each source sentence within the batch)

SRC = Field(tokenize = tokenize_de, init_token = '<sos>', eos_token = '<eos>', lower = True, include_lengths = True)
TRG = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)

In [83]:
# Split data. Assigns source as German, target as English

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

In [84]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [85]:
# Build lang vocabulary. Discard words which occur less than x(min_freq) times

SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [86]:
# Use Cuda if available

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [87]:
BATCH_SIZE = 128

# Create iterators to get a batch of seq ip/op. Iterator automatically handles padding sequences to same length

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), batch_size = BATCH_SIZE, sort_within_batch = True, sort_key = lambda x : len(x.src), device = device)

In [88]:
class Encoder(nn.Module):
  def __init__(self, input_dim, embed_dim, enc_hid_dim, dec_hid_dim, dropout):
    super().__init__()
    self.embed = nn.Embedding(input_dim, embed_dim)
    self.rnn = nn.GRU(embed_dim, enc_hid_dim, bidirectional = True, dropout = dropout)
    self.fc = nn.Linear(2*enc_hid_dim, dec_hid_dim)
    self.dropout = nn.Dropout(dropout)
    self.verbose = True
  
  def forward(self, src, src_len):
    if self.verbose:
      print(f'Src shape: {src.shape}\n')
    # src: [src_len, bs]
    # src_len: [bs]
    embedded = self.dropout(self.embed(src))
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.cpu())

    # hs is from the final non-padded element in the sequence not packed. op is packed.
    packed_output, hs = self.rnn(packed_embedded)
    if self.verbose:
      print(f'Enc hs shape: {hs.shape}\n')
    # op: [seq_len, bs, n_dir*enc_hid_dim]
    # hs: [n_lay*n_dir, bs, enc_hid_dim]

    outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_output)
    # all hidden states obtained when the input is a pad token are all zeros

    if self.verbose:
      print(f'Unpacked Enc Op shape: {outputs.shape}\n')

    hidden = torch.tanh(self.fc(torch.cat((hs[-2,:,:], hs[-1,:,:]), dim = 1)))
    self.verbose = False
    return outputs, hidden

In [89]:
# mask to not pay attention to hs formed when ip is pad token

class Attention(nn.Module):
  def __init__(self, enc_hid_dim, dec_hid_dim):
    super().__init__()
    self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
    self.v = nn.Linear(dec_hid_dim, 1, bias = False)

  def forward(self, hidden, encoder_outputs, mask):
    batch_size = encoder_outputs.shape[1]
    src_len = encoder_outputs.shape[0]
    
    #repeat decoder hidden state src_len times
    hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
    encoder_outputs = encoder_outputs.permute(1, 0, 2)
    
    #hidden = [batch size, src len, dec hid dim]
    #encoder_outputs = [batch size, src len, enc hid dim * 2]
    
    energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
    #energy = [batch size, src len, dec hid dim]

    attention = self.v(energy).squeeze(2)
    #attention = [batch size, src len]
    
    attention = attention.masked_fill(mask == 0, -1e10)
    return F.softmax(attention, dim = 1)

In [90]:
class Decoder(nn.Module):
  def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
    super().__init__()
    self.output_dim = output_dim
    self.attention = attention
    self.embedding = nn.Embedding(output_dim, emb_dim)
    self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
    self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, input, hidden, encoder_outputs, mask):  
      input = input.unsqueeze(0)
      #input = [1, batch size]
      
      embedded = self.dropout(self.embedding(input))
      #embedded = [1, batch size, emb dim]
      
      a = self.attention(hidden, encoder_outputs, mask)
      #a = [batch size, src len]
      
      a = a.unsqueeze(1)
      #a = [batch size, 1, src len]
      
      encoder_outputs = encoder_outputs.permute(1, 0, 2)
      #encoder_outputs = [batch size, src len, enc hid dim * 2]
      
      weighted = torch.bmm(a, encoder_outputs)
      #weighted = [batch size, 1, enc hid dim * 2]
      
      weighted = weighted.permute(1, 0, 2)
      #weighted = [1, batch size, enc hid dim * 2]
      
      rnn_input = torch.cat((embedded, weighted), dim = 2)
      #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
          
      output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
      
      embedded = embedded.squeeze(0)
      output = output.squeeze(0)
      weighted = weighted.squeeze(0)
      
      prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
      
      #prediction = [batch size, output dim]
      return prediction, hidden.squeeze(0), a.squeeze(1)

In [91]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, src_pad_idx, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_pad_idx = src_pad_idx
    self.device = device
      
  def create_mask(self, src):
    mask = (src != self.src_pad_idx).permute(1, 0)
    return mask
      
  def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):                
    batch_size = src.shape[1]
    trg_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim
    
    # tensor to store decoder outputs
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

    encoder_outputs, hidden = self.encoder(src, src_len)
            
    #first input to the decoder is the <sos> tokens
    input = trg[0,:]
    
    mask = self.create_mask(src)
    for t in range(1, trg_len):
      output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask)
      outputs[t] = output
      
      teacher_force = random.random() < teacher_forcing_ratio
      top1 = output.argmax(1) 
      input = trg[t] if teacher_force else top1
        
    return outputs

In [92]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)

  "num_layers={}".format(dropout, num_layers))


In [93]:
def init_weights(m):
  for name, param in m.named_parameters():
    if 'weight' in name:
      nn.init.normal_(param.data, mean=0, std=0.01)
    else:
      nn.init.constant_(param.data, 0)
            
model.apply(init_weights)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,518,917 trainable parameters


In [94]:
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [95]:
# Begin train

def train(model, iterator, optimizer, criterion, clip):
  # sets mode to train
  model.train()

  epoch_loss = 0

  for i, batch in enumerate(iterator):
    # pull the src, tgt
    src, src_len = batch.src
    tgt = batch.trg

    # zero the grad calculated from last batch
    optimizer.zero_grad()

    # send to model
    op = model(src, src_len, tgt)

    output_dim = op.shape[-1]
    op = op[1:].view(-1, output_dim)
    tgt = tgt[1:].view(-1)

    loss = criterion(op, tgt)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()
    epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [96]:
def evaluate(model, iterator, criterion):
  model.eval()
  epoch_loss = 0
  
  with torch.no_grad():
    for i, batch in enumerate(iterator):
      src, src_len = batch.src
      trg = batch.trg

      output = model(src, src_len, trg, 0) 
      output_dim = output.shape[-1]
      
      output = output[1:].view(-1, output_dim)
      trg = trg[1:].view(-1)

      loss = criterion(output, trg)
      epoch_loss += loss.item()
      
  return epoch_loss / len(iterator)

In [97]:
# record times

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [98]:
# Begin training actually

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time = time.time()

  train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss = evaluate(model, valid_iterator, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'enc_dec-model.pt')

  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\tValid Loss: {valid_loss:.3f} | Valid PPL: {math.exp(valid_loss):7.3f}')

Src shape: torch.Size([21, 128])

Enc hs shape: torch.Size([2, 128, 512])

Unpacked Enc Op shape: torch.Size([21, 128, 1024])

Epoch: 01 | Time: 0m 44s
	Train Loss: 5.068 | Train PPL: 158.886
	Valid Loss: 4.706 | Valid PPL: 110.658
Epoch: 02 | Time: 0m 46s
	Train Loss: 3.983 | Train PPL:  53.653
	Valid Loss: 4.081 | Valid PPL:  59.191
Epoch: 03 | Time: 0m 47s
	Train Loss: 3.237 | Train PPL:  25.451
	Valid Loss: 3.588 | Valid PPL:  36.152
Epoch: 04 | Time: 0m 47s
	Train Loss: 2.752 | Train PPL:  15.675
	Valid Loss: 3.405 | Valid PPL:  30.117
Epoch: 05 | Time: 0m 47s
	Train Loss: 2.420 | Train PPL:  11.248
	Valid Loss: 3.269 | Valid PPL:  26.274
Epoch: 06 | Time: 0m 46s
	Train Loss: 2.135 | Train PPL:   8.460
	Valid Loss: 3.184 | Valid PPL:  24.141
Epoch: 07 | Time: 0m 46s
	Train Loss: 1.932 | Train PPL:   6.903
	Valid Loss: 3.206 | Valid PPL:  24.671
Epoch: 08 | Time: 0m 46s
	Train Loss: 1.733 | Train PPL:   5.659
	Valid Loss: 3.313 | Valid PPL:  27.471
Epoch: 09 | Time: 0m 46s
	Train L

In [101]:
# When testing, directly load trained model and run
# Load trained model, test set

trained_model = 'enc_dec-model.pt'

model.load_state_dict(torch.load(trained_model))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.206 | Test PPL:  24.684 |


In [110]:
def translate(sentence, src_field, tgt_field, model, device, max_len = 50):
  # eval mode
  model.eval()

  # tokenize src if it's a string
  if isinstance(sentence, str):
    nlp = spacy.load('de')
    tokens = [token.text.lower() for token in nlp(sentence)]
  else:
    tokens = [token.lower() for token in sentence]

  # Add <sos>, <eos>
  tokens = [src_field.init_token] + tokens + [src_field.eos_token]
  # Numericalize seq
  src_indexes = [src_field.vocab.stoi[token] for token in tokens]
  # make tensor of src seq
  src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
  src_len = torch.LongTensor([len(src_indexes)]).to(device)

  with torch.no_grad():
    op, hs = model.encoder(src_tensor, src_len)
  
  mask = model.create_mask(src_tensor)
  tgt_indexes = [tgt_field.vocab.stoi[tgt_field.init_token]]
  attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)

  for i in range(max_len):
    # each time set the last token of tgt_tensor as input to decoder. Here no teacher_force
    # 1st time its <sos>, then prev predicted tok by decoder
    tgt_tensor = torch.LongTensor([tgt_indexes[-1]]).to(device)

    with torch.no_grad():
      output, hs, attention = model.decoder(tgt_tensor, hs, op, mask)

    attentions[i] = attention
    pred_token = output.argmax(1).item()
    tgt_indexes.append(pred_token)

    if pred_token == tgt_field.vocab.stoi[tgt_field.eos_token]:
      break
    
  # Get back the predicted tgt tokens
  tgt_tokens = [tgt_field.vocab.itos[i] for i in tgt_indexes]
  # cut off <sos>
  return tgt_tokens[1:], attentions[:len(tgt_tokens)-1]

In [111]:
example_idx = 25

src = vars(train_data.examples[example_idx])['src']
tgt = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'actual translation = {tgt}')
translation, _ = translate(src, SRC, TRG, model, device)
print(f'predicted translation = {translation}')

src = ['.', 'straßenszene', 'einer', 'gemälde', 'ein', 'betrachtet', 'und', 'gehweg', 'belebten', 'einem', 'auf', 'steht', 'mantel', 'blauen', 'einem', 'in', 'person', 'eine']
actual translation = ['a', 'person', 'dressed', 'in', 'a', 'blue', 'coat', 'is', 'standing', 'in', 'on', 'a', 'busy', 'sidewalk', ',', 'studying', 'painting', 'of', 'a', 'street', 'scene', '.']
predicted translation = ['a', 'person', 'in', 'a', 'blue', 'coat', 'is', 'standing', 'on', 'a', 'sidewalk', 'sidewalk', 'looking', 'at', 'a', 'painting', 'a', 'a', 'street', '.', '<eos>']


In [112]:
example_idx = 10

src = vars(train_data.examples[example_idx])['src']
tgt = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'actual translation = {tgt}')
translation, _ = translate(src, SRC, TRG, model, device)
print(f'predicted translation = {translation}')

src = ['.', 'springen', 'nacheinander', 'die', ',', 'mädchen', 'fünf', 'mit', 'ballettklasse', 'eine']
actual translation = ['a', 'ballet', 'class', 'of', 'five', 'girls', 'jumping', 'in', 'sequence', '.']
predicted translation = ['a', 'class', 'of', 'five', 'five', 'girls', 'jump', 'in', 'sequence', '.', '<eos>']


In [113]:
example_idx = 29

src = vars(train_data.examples[example_idx])['src']
tgt = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'actual translation = {tgt}')
translation, _ = translate(src, SRC, TRG, model, device)
print(f'predicted translation = {translation}')

src = ['.', 'lächelt', 'und', 'an', 'etwas', 'blickt', 'jacke', 'schwarz-gelben', 'einer', 'in', 'mann', 'junger', 'ein']
actual translation = ['a', 'young', 'man', 'in', 'a', 'black', 'and', 'yellow', 'jacket', 'is', 'gazing', 'at', 'something', 'and', 'smiling', '.']
predicted translation = ['a', 'young', 'man', 'in', 'a', 'black', 'and', 'white', 'jacket', 'looks', 'looking', 'at', 'something', 'and', 'smiles', '.', '<eos>']


In [114]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
  trgs = []
  pred_trgs = []

  for datum in data:
    src = vars(datum)['src']
    trg = vars(datum)['trg']
    pred_trg, _ = translate(src, src_field, trg_field, model, device, max_len)
    
    # cut off <eos>
    pred_trg = pred_trg[:-1]
    
    pred_trgs.append(pred_trg)
    trgs.append([trg])
      
  return bleu_score(pred_trgs, trgs)

In [115]:
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score w attention enc_dec = {bleu_score*100:.2f}')

BLEU score w attention enc_dec = 29.03
