## Encoder Decoder Architecture with Attention - English to German

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import DataLoader, Dataset
print(torch.__version__)
print(torch.cuda.is_available())
print('Using', torch.cuda.get_device_name())

2.0.1+cu118
True
Using Tesla T4


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = 'drive/MyDrive/Multi30K/'

def read_sentences_from_file(file_path):
    sentences = []
    with open(file_path, 'r') as file:
        for line in file:
            sentence = line.strip()
            if sentence:
                sentences.append(sentence)
    return sentences

df_train = pd.DataFrame({'SRC':read_sentences_from_file(path+'train.en'),
                   'TGT':read_sentences_from_file(path+'train.de')})

df_test = pd.DataFrame({'SRC':read_sentences_from_file(path+'test_2017_flickr.en'),
                   'TGT':read_sentences_from_file(path+'test_2017_flickr.de')})

df_val = pd.DataFrame({'SRC':read_sentences_from_file(path+'val.en'),
                   'TGT':read_sentences_from_file(path+'val.de')})

df_train.head()

Unnamed: 0,SRC,TGT
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...
3,A man in a blue shirt is standing on a ladder ...,Ein Mann in einem blauen Hemd steht auf einer ...
4,Two men are at the stove preparing food.,Zwei Männer stehen am Herd und bereiten Essen zu.


In [None]:
class NMTDataset(Dataset):
    def __init__(self, df):
        self.src = list(df['SRC'])
        self.tgt = list(df['TGT'])
    def __len__(self):
        return len(self.src)
    def __getitem__(self, idx):
        idx = np.array(idx)
        src_text = np.array(self.src)[idx]
        tgt_text = np.array(self.tgt)[idx]
        return src_text, tgt_text

In [None]:
train_dataset = NMTDataset(df_train)
test_dataset = NMTDataset(df_test)
val_dataset = NMTDataset(df_val)
print(f'Train size: {len(train_dataset)} Test size: {len(test_dataset)} Val size: {len(val_dataset)}')

Train size: 29000 Test size: 1000 Val size: 1014


In [1]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm

In [None]:
class Vocab():
    def __init__(self, dataset, lang='en', max_size=float('inf'), min_freq=2):
        self.dataset = dataset # expects a list
        self.max_size = max_size
        if lang=='en':
            self.spacy_lang = spacy.load('en_core_web_sm')
        elif lang=='de':
            self.spacy_lang = spacy.load('de_core_news_sm')
        else:
            raise Exception('Language not supported')
        self.min_freq = min_freq
        self.itos = {0:'<START>', 1:'<END>', 2:'<PAD>', 3:'<UNK>'}
        self.stoi = {v:k for k, v in self.itos.items()}

    def __len__(self):
        return len(self.itos)

    def tokenize(self, text):
        return [token.text.lower() for token in self.spacy_lang.tokenizer(str(text))]

    def numericalize(self, text):
        tokens = self.tokenize(text)
        idxs = []
        for token in tokens:
            if self.stoi.get(token):
                idxs.append(self.stoi[token])
            else:
                idxs.append(self.stoi['<UNK>'])
        return idxs

    def idx_to_token(self, numericalized):
        return [self.itos[num] for num in numericalized]

    def build_vocab(self):
        freqs = Counter()
        idx = len(self.itos)
        for i in range(len(self.dataset)):
            sentence = self.dataset[i]
            for w in self.tokenize(sentence):
                freqs[w] += 1
        for w, _ in freqs.most_common():
            if freqs[w] >= self.min_freq:
                self.itos[idx] = w
                self.stoi[w] = idx
                idx += 1

                if idx == self.max_size:
                    break

In [None]:
src_vocab = Vocab(train_dataset.src, lang='en', max_size=20000, min_freq=2)
tgt_vocab = Vocab(train_dataset.tgt, lang='de', max_size=20000, min_freq=2)

src_vocab.build_vocab()
tgt_vocab.build_vocab()

print('Source vocab size:', len(src_vocab))
print('Target vocab size:', len(tgt_vocab))

Source vocab size: 5893
Target vocab size: 7853


In [None]:
from torch.nn.utils.rnn import pad_sequence

def numericalize_and_pad_text(batch, padding_idx, src_vocab, tgt_vocab, batch_first=False, max_seq_len=512):
    batch_src, batch_tgt = [], []
    for tupl in batch:
        src_text, tgt_text = tupl
        numericalized_src = [src_vocab.stoi['<START>']] + src_vocab.numericalize(src_text)[:max_seq_len][::-1] \
                            + [src_vocab.stoi['<END>']] # truncate at max_seq_len # reverse src
        numericalized_tgt = [tgt_vocab.stoi['<START>']] + tgt_vocab.numericalize(tgt_text)[:max_seq_len] + [tgt_vocab.stoi['<END>']]
        batch_src.append(torch.tensor(numericalized_src)) # pad_sequence expects a list of tensors
        batch_tgt.append(torch.tensor(numericalized_tgt))
    batch_src = pad_sequence(batch_src, batch_first=batch_first, padding_value=padding_idx)
    batch_tgt = pad_sequence(batch_tgt, batch_first=batch_first, padding_value=padding_idx)

    return batch_src.to(device), batch_tgt.to(device)

In [None]:
from functools import partial

batch_size = 128
max_seq_len = 64
collate_fn = partial(numericalize_and_pad_text,
                     src_vocab=src_vocab,
                     tgt_vocab=tgt_vocab,
                     padding_idx=tgt_vocab.stoi["<PAD>"],
                     max_seq_len = max_seq_len,
                     batch_first=False)

trainloader = DataLoader(dataset=train_dataset, shuffle=True,
                        batch_size=batch_size, collate_fn=collate_fn)
testloader = DataLoader(dataset=test_dataset, shuffle=False,
                        batch_size=batch_size, collate_fn=collate_fn)
valloader = DataLoader(dataset=val_dataset, shuffle=False,
                        batch_size=batch_size, collate_fn=collate_fn)

In [None]:
for i, batch in enumerate(trainloader):
    print(batch[0].shape, batch[1].shape)
    if i==4:
        break

torch.Size([31, 128]) torch.Size([30, 128])
torch.Size([29, 128]) torch.Size([32, 128])
torch.Size([32, 128]) torch.Size([27, 128])
torch.Size([30, 128]) torch.Size([25, 128])
torch.Size([28, 128]) torch.Size([33, 128])


In [None]:
class Encoder(nn.Module):
    def __init__(self, len_vocab_src, emb_dim, hidden_dim):
        super(Encoder, self).__init__()

        self.len_vocab_src = len_vocab_src
        self.embeddings = nn.Embedding(num_embeddings=len_vocab_src, embedding_dim=emb_dim)
        self.lstm_layers = nn.LSTM(emb_dim, hidden_dim, num_layers=2, batch_first=False, dropout=0.5) # can't apply dropout on last layer

    def forward(self, src):
        src_embedding = nn.Dropout(0.5)(self.embeddings(src)) # (seq_len, batch_size) ->  (seq_len, batch_size, emb_dim)
        top_layer_hidden_states, (hT, cT) = self.lstm_layers(src_embedding) # hT = (1*num_layers, batch_size, hidden_dim) [no. of directions=1]

        return hT, cT

In [None]:
class Decoder(nn.Module):
    def __init__(self, len_vocab_tgt, emb_dim, hidden_dim):
        super(Decoder, self).__init__()

        self.len_vocab_tgt = len_vocab_tgt
        self.embeddings = nn.Embedding(num_embeddings=len_vocab_tgt, embedding_dim=emb_dim)
        self.lstm_layers = nn.LSTM(emb_dim, hidden_dim, num_layers=2, batch_first=False, dropout=0.5) # hidden_dim and num_layers should match that of encoder
        self.fc = nn.Linear(hidden_dim, len_vocab_tgt)

    def forward(self, tgt, ht, ct):
        tgt = tgt.unsqueeze(0) # (seq_len=1, batch_size)
        tgt_embedding = nn.Dropout(0.5)(self.embeddings(tgt)) # (seq_len=1, batch_size) ->  (seq_len=1, batch_size, emb_dim)
        # top_layer_hidden_states = (seq_len=1, batch_size, hidden_dim*no. of directions)
        top_layer_hidden_states, (ht, ct) = self.lstm_layers(tgt_embedding, (ht, ct)) # ht = (1*num_layers, batch_size, hidden_dim) [no. of directions=1]
        out = self.fc(top_layer_hidden_states.squeeze(0))
        return out, ht, ct

In [None]:
class Model(nn.Module):
    def __init__(self, encoder, decoder):
        super(Model, self).__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, tfratio):
        len_vocab_tgt = self.decoder.len_vocab_tgt
        tgt_seq_len, batch_size = tgt.shape # tgt = (tgt_seq_len, batch_size)
        decoder_outputs = torch.zeros(tgt_seq_len, batch_size, len_vocab_tgt).to(device)

        ht, ct = self.encoder(src)
        dec_input = tgt[0, :] # <START> token

        # Sequentially generating decoder output
        for i in range(1, tgt_seq_len):
            out, ht, ct = self.decoder(dec_input, ht, ct) # out = top layer hidden states of decoder
            decoder_outputs[i] = out
            teacher_force = np.random.random() < tfratio # True if we do teacher forcing
            pred_token = out.argmax(dim=1)
            dec_input = tgt[i] if teacher_force else pred_token

        return decoder_outputs

In [None]:
encoder = Encoder(len_vocab_src=len(src_vocab), emb_dim=256, hidden_dim=512)
decoder = Decoder(len_vocab_tgt=len(tgt_vocab), emb_dim=256, hidden_dim=512)

model = Model(encoder, decoder).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(ignore_index = tgt_vocab.stoi['<PAD>'])

In [None]:
def weight_initialization(model):
    for name, param in model.named_parameters():
        nn.init.uniform_(param.data, low=-0.08, high=0.08)

model.apply(weight_initialization)


Model(
  (encoder): Encoder(
    (embeddings): Embedding(5893, 256)
    (lstm_layers): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embeddings): Embedding(7853, 256)
    (lstm_layers): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=7853, bias=True)
  )
)

In [None]:
print(f'Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

In [None]:
def train():
    losses = []
    model.train()

    for batch in trainloader:
        src, tgt = batch
        optimizer.zero_grad()
        dec_out = model(src, tgt, tfratio=0.5)
        dec_out = dec_out[1:].view(-1, model.decoder.len_vocab_tgt) # first index of decoder_outputs is just zeros/isn't being used
        tgt = tgt[1:].view(-1) # first index of tgt is just start token
        loss = loss_fn(dec_out, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # gradient clipping
        optimizer.step()
        losses.append(loss.item())

    return sum(losses) / len(trainloader)

In [None]:
def evaluate(loader):
    losses = []
    model.eval()

    with torch.no_grad():
        for batch in loader:
            src, tgt = batch
            dec_out = model(src, tgt, tfratio=0) # no teacher forcing in testing
            dec_out = dec_out[1:].view(-1, model.decoder.len_vocab_tgt) # first index of decoder_outputs is just zeros/isn't being used
            tgt = tgt[1:].view(-1) # first index of tgt is just start token
            loss = loss_fn(dec_out, tgt)
            losses.append(loss.item())

    return sum(losses) / len(loader)

In [None]:
import time

def time_epoch(start, end):
    diff = end - start
    mins = int(diff / 60)
    secs = int(diff - (mins * 60))
    return mins, secs

In [None]:
from IPython import display
import math
res = pd.DataFrame(columns=['Epoch', 'Train Loss', 'Train Perplexity', 'Val Loss', 'Val Perplexity', 'Time'])
display.display(res)

best_val_loss = float('inf')
for epoch in range(10):

    start = time.time()
    train_loss = train()
    val_loss = evaluate(valloader)
    end = time.time()
    mins, secs = time_epoch(start, end)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), './drive/MyDrive/seq2seq_1_en2de.pth')

    display.clear_output(wait=True)
    res.loc[len(res)] = [epoch+1, f'{train_loss:.3f}', f'{math.exp(train_loss):.2f}', f'{val_loss:.3f}', f'{math.exp(val_loss):.2f}', f'{mins}min {secs}s']
    display.display(res)

Unnamed: 0,Epoch,Train Loss,Train Perplexity,Val Loss,Val Perplexity,Time
0,1,5.175,176.77,5.158,173.81,10min 28s
1,2,4.578,97.27,4.826,124.7,10min 22s
2,3,4.224,68.28,4.654,104.96,10min 18s
3,4,3.963,52.63,4.536,93.28,10min 10s
4,5,3.798,44.59,4.416,82.72,10min 12s
5,6,3.621,37.37,4.214,67.66,10min 13s
6,7,3.44,31.18,4.205,67.04,10min 7s
7,8,3.318,27.61,4.059,57.92,10min 8s
8,9,3.162,23.61,4.019,55.65,10min 5s
9,10,3.018,20.46,3.93,50.91,10min 6s


In [None]:
model.load_state_dict(torch.load('./drive/MyDrive/seq2seq_1_en2de.pth'))

<All keys matched successfully>

In [None]:
import math
test_loss = evaluate(testloader)
print(f'Test Loss: {test_loss} | Test Perplexity: {math.exp(test_loss)}')

Test Loss: 4.305474281311035 | Test Perplexity: 74.10435342944503
