## Encoder Decoder Architecture with Attention - German to English

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import DataLoader, Dataset
print(torch.__version__)
print(torch.cuda.is_available())
print('Using', torch.cuda.get_device_name()) if torch.cuda.is_available() else print('Using cpu')

2.1.0+cu118
True
Using Tesla T4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = 'drive/MyDrive/Neural Machine Translation/Multi30K/'

def read_sentences_from_file(file_path):
    sentences = []
    with open(file_path, 'r') as file:
        for line in file:
            sentence = line.strip()
            if sentence:
                sentences.append(sentence)
    return sentences

df_train = pd.DataFrame({'SRC':read_sentences_from_file(path+'train.de'),
                   'TGT':read_sentences_from_file(path+'train.en')})

df_test = pd.DataFrame({'SRC':read_sentences_from_file(path+'test_2017_flickr.de'),
                   'TGT':read_sentences_from_file(path+'test_2017_flickr.en')})

df_val = pd.DataFrame({'SRC':read_sentences_from_file(path+'val.de'),
                   'TGT':read_sentences_from_file(path+'val.en')})

df_train.head()

Unnamed: 0,SRC,TGT
0,Zwei junge weiße Männer sind im Freien in der ...,"Two young, White males are outside near many b..."
1,Mehrere Männer mit Schutzhelmen bedienen ein A...,Several men in hard hats are operating a giant...
2,Ein kleines Mädchen klettert in ein Spielhaus ...,A little girl climbing into a wooden playhouse.
3,Ein Mann in einem blauen Hemd steht auf einer ...,A man in a blue shirt is standing on a ladder ...
4,Zwei Männer stehen am Herd und bereiten Essen zu.,Two men are at the stove preparing food.


In [4]:
class NMTDataset(Dataset):
    def __init__(self, df):
        self.src = list(df['SRC'])
        self.tgt = list(df['TGT'])
    def __len__(self):
        return len(self.src)
    def __getitem__(self, idx):
        idx = np.array(idx)
        src_text = np.array(self.src)[idx]
        tgt_text = np.array(self.tgt)[idx]
        return src_text, tgt_text

In [5]:
train_dataset = NMTDataset(df_train)
test_dataset = NMTDataset(df_test)
val_dataset = NMTDataset(df_val)
print(f'Train size: {len(train_dataset)} Test size: {len(test_dataset)} Val size: {len(val_dataset)}')

Train size: 29000 Test size: 1000 Val size: 1014


In [1]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm
# # !python -m spacy download fr_core_news_sm

In [7]:
class Vocab():
    def __init__(self, dataset, lang='en', max_size=float('inf'), min_freq=2):
        self.dataset = dataset # expects a list
        self.max_size = max_size
        if lang=='en':
            self.spacy_lang = spacy.load('en_core_web_sm')
        elif lang=='de':
            self.spacy_lang = spacy.load('de_core_news_sm')
        elif lang=='fr':
            self.spacy_lang = spacy.load('fr_core_news_sm')
        else:
            raise Exception('Language not supported')
        self.min_freq = min_freq
        self.itos = {0:'<START>', 1:'<END>', 2:'<PAD>', 3:'<UNK>'}
        self.stoi = {v:k for k, v in self.itos.items()}

    def __len__(self):
        return len(self.itos)

    def tokenize(self, text):
        return [token.text.lower() for token in self.spacy_lang.tokenizer(str(text))]

    def numericalize(self, text):
        tokens = self.tokenize(text)
        idxs = []
        for token in tokens:
            if self.stoi.get(token):
                idxs.append(self.stoi[token])
            else:
                idxs.append(self.stoi['<UNK>'])
        return idxs

    def idx_to_token(self, numericalized):
        return [self.itos[num] for num in numericalized]

    def build_vocab(self):
        freqs = Counter()
        idx = len(self.itos)
        for i in range(len(self.dataset)):
            sentence = self.dataset[i]
            for w in self.tokenize(sentence):
                freqs[w] += 1
        for w, _ in freqs.most_common():
            if freqs[w] >= self.min_freq:
                self.itos[idx] = w
                self.stoi[w] = idx
                idx += 1

                if idx == self.max_size:
                    break

In [8]:
src_vocab = Vocab(train_dataset.src, lang='de', max_size=20000, min_freq=2)
tgt_vocab = Vocab(train_dataset.tgt, lang='en', max_size=20000, min_freq=2)

src_vocab.build_vocab()
tgt_vocab.build_vocab()

print('Source vocab size:', len(src_vocab))
print('Target vocab size:', len(tgt_vocab))

Source vocab size: 7853
Target vocab size: 5893


In [9]:
from torch.nn.utils.rnn import pad_sequence

def numericalize_and_pad_text(batch, padding_idx, src_vocab, tgt_vocab, batch_first=False, max_seq_len=512):
    batch_src, batch_tgt = [], []
    for tupl in batch:
        src_text, tgt_text = tupl
        numericalized_src = [src_vocab.stoi['<START>']] + src_vocab.numericalize(src_text)[:max_seq_len] \
                            + [src_vocab.stoi['<END>']] # truncate at max_seq_len
        numericalized_tgt = [tgt_vocab.stoi['<START>']] + tgt_vocab.numericalize(tgt_text)[:max_seq_len] + [tgt_vocab.stoi['<END>']]
        batch_src.append(torch.tensor(numericalized_src)) # pad_sequence expects a list of tensors
        batch_tgt.append(torch.tensor(numericalized_tgt))
    batch_src = pad_sequence(batch_src, batch_first=batch_first, padding_value=padding_idx)
    batch_tgt = pad_sequence(batch_tgt, batch_first=batch_first, padding_value=padding_idx)

    return batch_src.to(device), batch_tgt.to(device)

In [10]:
from functools import partial

batch_size = 128
max_seq_len = 512
collate_fn = partial(numericalize_and_pad_text,
                     src_vocab=src_vocab,
                     tgt_vocab=tgt_vocab,
                     padding_idx=tgt_vocab.stoi["<PAD>"],
                     max_seq_len = max_seq_len,
                     batch_first=False)

trainloader = DataLoader(dataset=train_dataset, shuffle=True,
                        batch_size=batch_size, collate_fn=collate_fn)
testloader = DataLoader(dataset=test_dataset, shuffle=False,
                        batch_size=batch_size, collate_fn=collate_fn)
valloader = DataLoader(dataset=val_dataset, shuffle=False,
                        batch_size=batch_size, collate_fn=collate_fn)

In [11]:
for i, batch in enumerate(trainloader):
    print(batch[0].shape, batch[1].shape)
    if i==4:
        break

torch.Size([33, 128]) torch.Size([31, 128])
torch.Size([26, 128]) torch.Size([28, 128])
torch.Size([26, 128]) torch.Size([29, 128])
torch.Size([33, 128]) torch.Size([32, 128])
torch.Size([27, 128]) torch.Size([25, 128])


In [93]:
class Encoder(nn.Module):
    def __init__(self, len_vocab_src, emb_dim, enc_hidden_dim, dec_hidden_dim):
        super(Encoder, self).__init__()

        self.len_vocab_src = len_vocab_src
        self.embeddings = nn.Embedding(num_embeddings=len_vocab_src, embedding_dim=emb_dim)
        self.gru_layers = nn.GRU(emb_dim, enc_hidden_dim, num_layers=1, batch_first=False, bidirectional=True) # can't apply dropout on last layer
        self.fc = nn.Linear(2*enc_hidden_dim, dec_hidden_dim)

    def forward(self, src):
        src_embedding = nn.Dropout(0.5)(self.embeddings(src)) # (seq_len, batch_size) ->  (seq_len, batch_size, emb_dim)

        # top_layer_hidden_states = (seq_len, batch_size, 2*enc_hidden_dim)
        top_layer_hidden_states, hT = self.gru_layers(src_embedding) # hT = (2*num_layers, batch_size, enc_hidden_dim) [no. of directions=2]
        hT_for, hT_back = hT[-2, :, :], hT[-1, :, :]
        hT = torch.tanh(self.fc(torch.cat((hT_for, hT_back), dim=1))) # hT = (batch size, dec_hidden_dim)
        return top_layer_hidden_states, hT

In [94]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super(Attention, self).__init__()

        self.alignment = nn.Linear(2*enc_hidden_dim + dec_hidden_dim, dec_hidden_dim)
        self.score = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, top_layer_hidden_states, hT):
        src_seq_len, batch_size, _ = top_layer_hidden_states.shape
        hT = hT.unsqueeze(1).repeat(1, src_seq_len, 1) # hT = (batch size, src_seq_len, dec_hidden_dim)
        top_layer_hidden_states = top_layer_hidden_states.permute(1, 0, 2) # top_layer_hidden_states = (batch_size, src_seq_len, 2*enc_hidden_dim)
        aligned = torch.tanh(self.alignment(torch.cat((hT, top_layer_hidden_states), dim = 2))) # aligned = (batch_size, src_seq_len, dec_hidden_dim)
        attention = self.score(aligned).squeeze(2)  # attention = (batch_size, src_seq_len)
        return F.softmax(attention, dim = 1)

In [106]:
class Decoder(nn.Module):
    def __init__(self, len_vocab_tgt, emb_dim, enc_hidden_dim, dec_hidden_dim, attention):
        super(Decoder, self).__init__()

        self.len_vocab_tgt = len_vocab_tgt
        self.attention = attention
        self.embeddings = nn.Embedding(num_embeddings=len_vocab_tgt, embedding_dim=emb_dim)
        self.gru_layers = nn.GRU(2*enc_hidden_dim + emb_dim, dec_hidden_dim, batch_first=False) # hidden_dim and num_layers should match that of encoder
        self.fc = nn.Linear(2*enc_hidden_dim + dec_hidden_dim + emb_dim, len_vocab_tgt)

    def forward(self, tgt, ht, top_layer_hidden_states):
        tgt = tgt.unsqueeze(0) # (seq_len=1, batch_size)
        tgt_embedding = nn.Dropout(0.5)(self.embeddings(tgt)) # (seq_len=1, batch_size) ->  (seq_len=1, batch_size, emb_dim)
        a =  self.attention(top_layer_hidden_states, ht) # a = (batch_size, src_seq_len)
        a = a.unsqueeze(1) # a = (batch_size, 1, src_seq_len)
        top_layer_hidden_states = top_layer_hidden_states.permute(1, 0, 2) # top_layer_hidden_states = (batch_size, src_seq_len, 2*enc_hidden_dim)
        weighted_hidden_states = torch.bmm(a, top_layer_hidden_states) # weighted_hidden_states = (batch_size, 1, 2*enc_hidden_dim)
        weighted_hidden_states = weighted_hidden_states.permute(1, 0, 2) # weighted_hidden_states = (1, batch_size, 2*enc_hidden_dim)
        inp = torch.cat((tgt_embedding, weighted_hidden_states), dim=2) # inp = (seq_len=1, batch_size, 2*enc_hidden_dim+emb_dim)
        # top_layer_hidden_states = (seq_len=1, batch_size, dec_hidden_dim*1) [no. of directions=1]
        top_layer_hidden_states, ht = self.gru_layers(inp, ht.unsqueeze(0)) # ht = (1*1, batch_size, dec_hidden_dim) [no. of directions=1, num_layers=1]
        ht = ht.squeeze(0) # ht = (batch_size, dec_hidden_dim)
        fc_input = torch.cat((top_layer_hidden_states.squeeze(0), weighted_hidden_states.squeeze(0), tgt_embedding.squeeze(0)), dim=1)
        out = self.fc(fc_input)
        return out, ht

In [113]:
class Model(nn.Module):
    def __init__(self, encoder, decoder):
        super(Model, self).__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, tfratio):
        len_vocab_tgt = self.decoder.len_vocab_tgt
        tgt_seq_len, batch_size = tgt.shape # tgt = (tgt_seq_len, batch_size)
        decoder_outputs = torch.zeros(tgt_seq_len, batch_size, len_vocab_tgt).to(device)
        top_layer_hidden_states, ht = self.encoder(src)

        dec_input = tgt[0, :] # <START> token

        # Sequentially generating decoder output
        for i in range(1, tgt_seq_len):
            out, ht = self.decoder(dec_input, ht, top_layer_hidden_states) # out = top layer hidden states of decoder
            decoder_outputs[i] = out
            teacher_force = np.random.random() < tfratio # True if we do teacher forcing
            pred_token = out.argmax(dim=1)
            dec_input = tgt[i] if teacher_force else pred_token

        return decoder_outputs

In [114]:
encoder = Encoder(len_vocab_src=len(src_vocab), emb_dim=256, enc_hidden_dim=512, dec_hidden_dim=512)
attention = Attention(enc_hidden_dim=512, dec_hidden_dim=512)
decoder = Decoder(len_vocab_tgt=len(tgt_vocab), emb_dim=256, enc_hidden_dim=512, dec_hidden_dim=512, attention=attention)

model = Model(encoder, decoder).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(ignore_index = tgt_vocab.stoi['<PAD>'])

In [115]:
def weight_initialization(model):
    for name, param in model.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

model.apply(weight_initialization)

Model(
  (encoder): Encoder(
    (embeddings): Embedding(7853, 256)
    (gru_layers): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (alignment): Linear(in_features=1536, out_features=512, bias=True)
      (score): Linear(in_features=512, out_features=1, bias=False)
    )
    (embeddings): Embedding(5893, 256)
    (gru_layers): GRU(1280, 512)
    (fc): Linear(in_features=1792, out_features=5893, bias=True)
  )
)

In [116]:
print(f'Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

Trainable parameters: 20518405


In [118]:
def train():
    losses = []
    model.train()

    for batch in trainloader:
        src, tgt = batch
        optimizer.zero_grad()
        dec_out = model(src, tgt, tfratio=0.5)
        dec_out = dec_out[1:].view(-1, model.decoder.len_vocab_tgt) # first index of decoder_outputs is just zeros/isn't being used
        tgt = tgt[1:].view(-1) # first index of tgt is just start token
        loss = loss_fn(dec_out, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # gradient clipping
        optimizer.step()
        losses.append(loss.item())

    return sum(losses) / len(trainloader)

In [119]:
def evaluate(loader):
    losses = []
    model.eval()

    with torch.no_grad():
        for batch in loader:
            src, tgt = batch
            dec_out = model(src, tgt, tfratio=0) # no teacher forcing in testing
            dec_out = dec_out[1:].view(-1, model.decoder.len_vocab_tgt) # first index of decoder_outputs is just zeros/isn't being used
            tgt = tgt[1:].view(-1) # first index of tgt is just start token
            loss = loss_fn(dec_out, tgt)
            losses.append(loss.item())

    return sum(losses) / len(loader)

In [120]:
import time

def time_epoch(start, end):
    diff = end - start
    mins = int(diff / 60)
    secs = int(diff - (mins * 60))
    return mins, secs

In [121]:
from IPython import display
import math
res = pd.DataFrame(columns=['Epoch', 'Train Loss', 'Train Perplexity', 'Val Loss', 'Val Perplexity', 'Time'])
display.display(res)

best_val_loss = float('inf')
for epoch in range(10):

    start = time.time()
    train_loss = train()
    val_loss = evaluate(valloader)
    end = time.time()
    mins, secs = time_epoch(start, end)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), './drive/MyDrive/Neural Machine Translation/EncDecAttn_de2en.pth')

    display.clear_output(wait=True)
    res.loc[len(res)] = [epoch+1, f'{train_loss:.3f}', f'{math.exp(train_loss):.2f}', f'{val_loss:.3f}', f'{math.exp(val_loss):.2f}', f'{mins}min {secs}s']
    display.display(res)

Unnamed: 0,Epoch,Train Loss,Train Perplexity,Val Loss,Val Perplexity,Time
0,1,5.077,160.24,4.829,125.09,11min 7s
1,2,4.104,60.56,4.25,70.09,11min 2s
2,3,3.406,30.14,3.77,43.38,11min 52s
3,4,2.9,18.17,3.47,32.15,11min 58s
4,5,2.492,12.08,3.45,31.51,11min 26s
5,6,2.212,9.13,3.308,27.32,11min 29s
6,7,1.962,7.11,3.357,28.7,10min 58s
7,8,1.741,5.71,3.442,31.25,11min 12s


KeyboardInterrupt: ignored

In [None]:
model.load_state_dict(torch.load('./drive/MyDrive/Neural Machine Translation/EncDecAttn_de2en.pth'))

<All keys matched successfully>

In [184]:
import math
test_loss = evaluate(testloader)
print(f'Test Loss: {test_loss} | Test Perplexity: {math.exp(test_loss)}')

Test Loss: 3.847332000732422 | Test Perplexity: 46.86785287813471


In [187]:
from torchtext.data.metrics import bleu_score
def reverse_numericalize(tensor, vocab):
    lst = tensor.detach().tolist()
    lst =  [num for num in lst if num not in [0, 1, 2, 3]]
    return vocab.idx_to_token(lst)

def total_bleu_score(model, testloader, src_vocab, tgt_vocab):
    model.eval()
    all_references = []
    all_candidates = []

    with torch.no_grad():
        for batch in testloader:
            src, tgt = batch
            dec_out = model(src, tgt, tfratio=0)
            tgt_pred = dec_out.argmax(dim=2)

            tgt_sentences = [reverse_numericalize(tgt[:, i], tgt_vocab) for i in range(tgt.size(1))]
            # Reverse numericalize translated tensors
            translated_sentences = [reverse_numericalize(tgt_pred[:, i], tgt_vocab) for i in range(tgt_pred.size(1))]

            all_references.extend(tgt_sentences)
            all_candidates.extend(translated_sentences)

    score = bleu_score(all_candidates, all_references, max_n=1, weights=[1])
    return score

score = total_bleu_score(model, testloader, src_vocab, tgt_vocab)
print(f'Total BLEU Score: {score * 100:.2f}')

Total BLEU Score: 14.86
