##Neural Machine Translation by Jointly Learning to Align and Translate



In [1]:
%%bash
python -m spacy download en
python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py): started
  Building wheel for de-core-news-sm (setup.py): finished with status 'done'
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp37-none-any.whl size=14907055 sha256=4fd0b571fc2539082ac3f4aaf71a38c19711b995e4452a99adc16f473ff3eda1
  Stored in directory: /tmp/pip-ephem-wheel-cache-lwms1wye/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List
import torch

In [3]:
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [4]:
# Create source and target language tokenizer. Make sure to install the dependencies.

token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en')

In [5]:
token_transform[TGT_LANGUAGE]('This is my notebook')

['This', 'is', 'my', 'notebook']

In [6]:
# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

In [7]:
#yield_tokens(train_iter, TGT_LANGUAGE)

In [8]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

## Building SRC & TRG language Vocabulary

In [9]:
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  # Training data Iterator 
  train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
  # Create torchtext's Vocab object 
  vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 1.63MB/s]


In [10]:
# Set UNK_IDX as the default index. This index is returned when the token is not found. 
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary. 
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [11]:
vocab_transform[TGT_LANGUAGE](['this', 'is'])

[595, 11]

In [12]:
vocab_transform[SRC_LANGUAGE](['es', 'de'])

[302, 7281]

In [13]:
# Set UNK_IDX as the default index. This index is returned when the token is not found. 
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary. 
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

## Can also Use
```
src_vocab_pipeline = lambda x: vocab_transform[SRC_LANGUAGE](x)
tgt_vocab_pipeline = lambda x: vocab_transform[TGT_LANGUAGE](x)
```

In [14]:
vocab_transform[TGT_LANGUAGE](['this', 'is', 'my', 'game'])

[595, 11, 2228, 138]

In [15]:
# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]), 
                      torch.tensor(token_ids), 
                      torch.tensor([EOS_IDX])))

In [16]:
tensor_transform(vocab_transform[TGT_LANGUAGE](['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '\n']))

tensor([   2,   20,   26,   16, 1170,  809,   18,   58,   85,  337, 1340,    6,
           5,    3])

## Test collate func

In [17]:
tensor_transform(vocab_transform[TGT_LANGUAGE](['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '\n']))

tensor([   2,   20,   26,   16, 1170,  809,   18,   58,   85,  337, 1340,    6,
           5,    3])

In [18]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    tgt_batch, src_batch = [], []
    for _src, _tgt in batch:
      _tgt = token_transform[TGT_LANGUAGE](_tgt)
      _src = token_transform[SRC_LANGUAGE](_src)

      _tgt_tok = vocab_transform[TGT_LANGUAGE](_tgt)
      _src_tok = vocab_transform[SRC_LANGUAGE](_src)

      _tgt_tok_app = tensor_transform(_tgt_tok)
      _src_tok_app = tensor_transform(_src_tok)

      tgt_batch.append(_tgt_tok_app)
      src_batch.append(_src_tok_app)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)

    return src_batch, tgt_batch

## Test the data loader

In [19]:
from torch.utils.data import DataLoader

BATCH_SIZE = 5

train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

for src, tgt in train_dataloader:
  print(src.shape)
  print(tgt.shape)
  break
  #tgt = tgt.to(device)

torch.Size([18, 5])
torch.Size([18, 5])


In [20]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor
import torch

In [21]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
                
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden

In [22]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

In [23]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs)
                
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden.squeeze(0)

In [24]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [25]:
INPUT_DIM = len(vocab_transform[SRC_LANGUAGE])
OUTPUT_DIM = len(vocab_transform[TGT_LANGUAGE])
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

BATCH_SIZE = 128

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [26]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19206, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(10840, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=10840, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [27]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 33,561,176 trainable parameters


In [28]:
optimizer = optim.Adam(model.parameters())

In [29]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [30]:
######################################################################
# Let's define training and evaluation loop that will be called for each 
# epoch.
#

from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    
    for src, tgt in train_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)


        optimizer.zero_grad()
        output = model(src, tgt)
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        loss.backward()
        clip = 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        output = model(src, tgt)
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        losses += loss.item()

    return losses / len(val_dataloader)


In [31]:
from timeit import default_timer as timer
NUM_EPOCHS = 15

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer)
    end_time = timer()
    val_loss = evaluate(model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 277kB/s]


Epoch: 1, Train loss: 4.838, Val loss: 4.299, Epoch time = 118.008s
Epoch: 2, Train loss: 3.945, Val loss: 3.719, Epoch time = 120.287s
Epoch: 3, Train loss: 3.408, Val loss: 3.256, Epoch time = 120.521s
Epoch: 4, Train loss: 2.874, Val loss: 3.027, Epoch time = 121.015s
Epoch: 5, Train loss: 2.476, Val loss: 2.685, Epoch time = 120.845s
Epoch: 6, Train loss: 2.122, Val loss: 2.777, Epoch time = 120.821s
Epoch: 7, Train loss: 1.865, Val loss: 2.632, Epoch time = 120.937s
Epoch: 8, Train loss: 1.641, Val loss: 2.503, Epoch time = 120.667s
Epoch: 9, Train loss: 1.494, Val loss: 2.681, Epoch time = 120.807s
Epoch: 10, Train loss: 1.354, Val loss: 2.769, Epoch time = 120.731s
Epoch: 11, Train loss: 1.234, Val loss: 2.731, Epoch time = 120.625s
Epoch: 12, Train loss: 1.149, Val loss: 2.672, Epoch time = 120.617s
Epoch: 13, Train loss: 1.043, Val loss: 2.654, Epoch time = 120.792s
Epoch: 14, Train loss: 0.978, Val loss: 2.682, Epoch time = 120.764s
Epoch: 15, Train loss: 0.880, Val loss: 2.7

### Making Inference

In [32]:
# take a list and create a subset from until '<eos>'
def create_sentence(l):
  len_sentence = 0

  for i in range(len(l)):
    if l[i] == '<eos>':
      break

    len_sentence+=1

  sentence = ' '.join(l[:len_sentence])

  return sentence



In [33]:
model.eval()
losses = 0

val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

for src, tgt in val_dataloader:
  src = src.to(device)
  tgt = tgt.to(device)

  output = model(src, tgt)
  output = output[1:]
  tgt = tgt[1:]

  v = output.argmax(2)

  for k in range(10):
    j = random.randint(0,BATCH_SIZE-1)

    result = vocab_transform[TGT_LANGUAGE].lookup_tokens(v[:,j].tolist())
    sourcetxt = vocab_transform[SRC_LANGUAGE].lookup_tokens(src[1:,j].tolist())
    acttgt = vocab_transform[TGT_LANGUAGE].lookup_tokens(tgt[:,j].tolist())

    print('Input Text: ', create_sentence(sourcetxt).rstrip('\n'))
    print('Predicted Text  : ', create_sentence(result).rstrip('\n'))
    print('Actual Output Text: ', create_sentence(acttgt).rstrip('\n'))
    print('*'*40)

  break

Input Text:  Eine Frau sitzt bei ihrem <unk> auf einem Freiluftmarkt . 
Predicted Text  :  A woman is at at her stall at an open air market . . 
Actual Output Text:  A woman is sitting by her dried flower display at an outside market . 
****************************************
Input Text:  Eine Frau in einem schwarzen Kleid schiebt einen Wagen mit <unk> einen gepflasterten Fußgängerweg entlang . 
Predicted Text  :  A woman in a black dress pushing a cart with a cart down it a path . 
Actual Output Text:  A woman in a black dress is pushing a cart with <unk> down a paved walkway . 
****************************************
Input Text:  Ein Traktor bewegt Erde für den Bau einer Stützmauer . 
Predicted Text  :  A tractor dirt dirt dirt from a a wall outside . 
 . 
Actual Output Text:  A tractor is moving dirt to help build up a retaining wall . 
****************************************
Input Text:  Eine Gruppe von Menschen sammelt sich um einen Mann in einem Anzug und einen kleinen Jungen 