**Neural Machine Translation with Various Sequence Models**


- In this project, I perform neural machine translation with recurrent neural networks and attention-based models on the Multi30k dataset, which contains German–English sentence pairs.
- To this end, I implemented the necessary network components (e.g. LSTMCell, multi-head attention) using the `nn.Module` class. I then experimented with these network architectures to obtain a better Bilingual Evaluation Understudy (BLEU) score on the test set.









---
# Mounting gdrive.

In [None]:

# Mount Google Drive at /gdrive (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/gdrive')

---
# Setting up the `root` directory properly.

In [None]:

# Project folder on the mounted drive; the results directory is created beneath it.
root = '/gdrive/MyDrive/project_NLP'

---
# Installing libraries.


In [None]:
!pip install torchtext==0.6.0
!pip install spacy
!python -m spacy download en
!python -m spacy download de

---
# Basic settings

## Importing libraries

In [None]:
import os
import numpy as np
import time
from pathlib import Path
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.optim import SGD
import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.utils import get_tokenizer
from torchtext import data
from torchtext.data.metrics import bleu_score
import spacy
from spacy.symbols import ORTH
import math
import random
import tqdm.notebook as tq
import copy

## Hyperparameters

In [None]:
# Basic settings
# Seed the CPU and CUDA random number generators.
torch.manual_seed(470)
torch.cuda.manual_seed(470)

#!pip install easydict
from easydict import EasyDict as edict

# Shared hyperparameter container read by the model classes and the training loop.
args = edict()
args.batch_size = 32
# Number of stacked LSTM layers, embedding size and LSTM hidden size.
args.nlayers = 2
args.ninp = 256
args.nhid = 256 #512


# Maximum gradient norm passed to clip_grad_norm_ in run_epoch.
args.clip = 1
# Adam learning rate for the non-Transformer models.
args.lr_lstm = 0.001
args.dropout = 0.2
# Width of the projection space inside the additive Attention module.
args.nhid_attn = 256
args.epochs = 20

##### Transformer
# Model width, number of attention heads and number of encoder/decoder layers.
args.nhid_tran = 256
args.nhead = 8
args.nlayers_transformer = 6
# Dropout rates for attention weights, residual branches and embeddings.
args.attn_pdrop = 0.1
args.resid_pdrop = 0.1
args.embd_pdrop = 0.1
# Inner width of the position-wise feed-forward blocks.
args.nff = 4 * args.nhid_tran


args.lr_transformer = 0.0001 #1.0
# NOTE(review): betas is defined here but is not passed to the Adam optimizer
# created in run_experiment.
args.betas = (0.9, 0.98)

args.gpu = True


# Use the first CUDA device only when one is available and args.gpu is set.
device = 'cuda:0' if torch.cuda.is_available() and args.gpu else 'cpu'

# Checkpoints are written to <root>/results; the directory is created if missing.
result_dir = Path(root) / 'results'
result_dir.mkdir(parents=True, exist_ok=True)

---
# Utility functions


In [None]:
def word_ids_to_sentence(id_tensor, vocab, join=' '):
    """Convert a 2-D batch of word ids into words.

    The first two axes of ``id_tensor`` are swapped and the result is
    flattened, so the ids are read column by column of the input.

    Args:
        id_tensor: A 2-D ``torch.Tensor`` of any integer dtype on any device,
            or a 2-D ``numpy.ndarray`` of integer ids.
        vocab: Object whose ``itos`` sequence maps an id to its token string.
        join: Separator used to join the tokens. Pass ``None`` to get the
            list of tokens instead of a single string.

    Returns:
        A string of tokens joined by ``join``, or a list of tokens when
        ``join`` is ``None``.

    Raises:
        TypeError: If ``id_tensor`` is neither a tensor nor an ndarray.
    """
    if torch.is_tensor(id_tensor):
        # torch.is_tensor accepts CUDA and non-int64 tensors, which the
        # narrower torch.LongTensor check rejected.
        ids = id_tensor.transpose(0, 1).contiguous().view(-1).tolist()
    elif isinstance(id_tensor, np.ndarray):
        ids = id_tensor.transpose().reshape(-1).tolist()
    else:
        raise TypeError(
            f"id_tensor must be a torch.Tensor or numpy.ndarray, got {type(id_tensor).__name__}"
        )
    batch = [vocab.itos[ind] for ind in ids]  # denumericalize
    if join is None:
        return batch
    return join.join(batch)

# Extracts bias and non-bias parameters from a model.
def get_parameters(model, bias=False):
    """Yield the parameters of ``model`` belonging to one of two groups.

    Args:
        model: The ``nn.Module`` to inspect.
        bias: When True, yield only the bias vectors of ``nn.Linear`` layers.
            When False, yield the ``nn.Linear`` weights plus every parameter
            owned directly by any other module.

    Yields:
        ``nn.Parameter`` objects, each at most once per call.
    """
    seen = set()  # ids already yielded, so shared (tied) parameters are not repeated
    for m in model.modules():
        if isinstance(m, nn.Linear):
            # A Linear layer built with bias=False has bias None; skip it.
            candidates = [m.bias] if bias else [m.weight]
        elif not bias:
            # recurse=False: child modules are visited by model.modules()
            # themselves, so recursing here would repeat their parameters.
            candidates = m.parameters(recurse=False)
        else:
            candidates = []
        for param in candidates:
            if param is None or id(param) in seen:
                continue
            seen.add(id(param))
            yield param

def run_epoch(epoch, model, optimizer, is_train=True, data_iter=None):
    """Run one pass over a data iterator and report loss, accuracy and perplexity.

    Args:
        epoch: Epoch index; only used in the printed summary.
        model: Model called as ``model(x, y, length)``.
        optimizer: Optimizer stepped after each batch when ``is_train`` is true.
        is_train: Selects train mode with parameter updates, or eval mode.
        data_iter: Batches to iterate. Defaults to the global ``train_iter``
            (training) or ``valid_iter`` (otherwise).

    Returns:
        The loss averaged over all non-padding target tokens.
    """
    if data_iter is None:
        data_iter = train_iter if is_train else valid_iter

    if not is_train:
        model.eval()
    else:
        model.train()

    # The Transformer takes batch-first tensors; the LSTM models do not.
    batch_first = isinstance(model, Transformer)
    loss_sum, hits, seen = 0, 0, 0

    for batch in data_iter:
        src, trg, src_len = sort_batch(batch.src.to(device), batch.trg.to(device))
        # Predictions are scored against the target shifted by one position.
        gold = trg[1:]
        if batch_first:
            src, trg = src.transpose(0, 1), trg.transpose(0, 1)
            gold = gold.transpose(0, 1)

        logits = model(src, trg, src_len)
        loss = criterion(logits.reshape(-1, trg_ntoken), gold.reshape(-1))

        not_pad = gold != pad_id
        batch_targets = not_pad.long().sum().item()
        seen += batch_targets
        hits += (logits.argmax(-1) == gold)[not_pad].long().sum().item()

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()

        # Weight the batch loss by its token count so the final mean is per token.
        loss_sum += loss.item() * batch_targets

    loss_sum /= seen
    print("Epoch", epoch, 'Train' if is_train else 'Valid',
          "Loss", np.mean(loss_sum),
          "Acc", hits / seen,
          "PPL", np.exp(loss_sum))
    return loss_sum

def word_ids_to_sentence_(ids, vocab):
    """Map ids to tokens via ``vocab.itos``, stopping before the first ``eos_id``."""
    words = []
    for token_id in ids:
        if token_id == eos_id:
            return words
        words.append(vocab.itos[token_id])
    return words

def run_translation(model, data_iter, max_len=100, mode='best'):
    """Translate every batch of ``data_iter`` without teacher forcing and print BLEU.

    The checkpoint selected by ``mode`` is loaded into ``model`` when it
    exists. Up to five example translations are printed, followed by the
    corpus BLEU score.

    Args:
        model: Model called as
            ``model(x, y, length, max_len=..., teacher_forcing=False)``.
        data_iter: Iterable of batches exposing ``src`` and ``trg``.
        max_len: Decoding length passed to the model.
        mode: Checkpoint tag passed to ``load_model``.
    """
    with torch.no_grad():
        model.eval()
        load_model(model, mode)
        is_transformer = isinstance(model, Transformer)
        src_list = []
        gt_list = []
        pred_list = []
        for batch in data_iter:
            x, y, length = sort_batch(batch.src.to(device), batch.trg.to(device))
            if is_transformer:
                # The Transformer takes and returns batch-first tensors.
                x, y = x.transpose(0, 1), y.transpose(0, 1)
            pred = model(x, y, length, max_len=max_len, teacher_forcing=False)
            pred_token = pred.argmax(-1)
            if not is_transformer:
                # The LSTM models are time-major; make every row one sentence.
                pred_token = pred_token.transpose(0, 1).cpu().numpy()
                y = y.transpose(0, 1).cpu().numpy()
                x = x.transpose(0, 1).cpu().numpy()
            # pred_token : batch_size x max_len
            for x_, y_, pred_ in zip(x, y, pred_token):
                # [1:] drops the leading <sos> token of source and reference.
                src_list.append(word_ids_to_sentence_(x_[1:], SRC.vocab))
                gt_list.append([word_ids_to_sentence_(y_[1:], TRG.vocab)])
                pred_list.append(word_ids_to_sentence_(pred_, TRG.vocab))

        # Bound the example count by what was translated, so a small
        # iterator cannot raise IndexError.
        for i in range(min(5, len(pred_list))):
            print(f"--------- Translation Example {i+1} ---------")
            print("SRC :", ' '.join(src_list[i]))
            print("TRG :", ' '.join(gt_list[i][0]))
            print("PRED:", ' '.join(pred_list[i]))
        print()
        print("BLEU:", bleu_score(pred_list, gt_list))



def save_model(model, mode="last"):
    """Write the model's state dict to ``result_dir/<ClassName>_<mode>.ckpt``."""
    ckpt_path = result_dir / f'{type(model).__name__}_{mode}.ckpt'
    torch.save(model.state_dict(), ckpt_path)

def load_model(model, mode="last"):
    """Load ``result_dir/<ClassName>_<mode>.ckpt`` into ``model`` if the file exists.

    A missing checkpoint is not an error: the model is left untouched.
    """
    ckpt_path = result_dir / f'{type(model).__name__}_{mode}.ckpt'
    if os.path.exists(ckpt_path):
        # Deserialize onto the CPU so a checkpoint saved on a GPU also loads
        # on a CPU-only machine; load_state_dict then copies the values into
        # the model's own parameters on whatever device they live.
        model.load_state_dict(torch.load(ckpt_path, map_location='cpu'))

def sort_batch(X, y, lengths=None):
    """Reorder the columns of ``X`` and ``y`` by descending length.

    Args:
        X: Tensor whose dimension 1 indexes the batch.
        y: Tensor reordered along dimension 1 with the same permutation.
        lengths: Per-column lengths. When omitted they are computed as the
            number of entries in each column of ``X`` that differ from
            ``pad_id_src``.

    Returns:
        Tuple ``(X_sorted, y_sorted, sorted_lengths)``.
    """
    if lengths is None:
        lengths = (X != pad_id_src).long().sum(0)
    sorted_lengths, order = torch.sort(lengths, dim=0, descending=True)
    return X.index_select(1, order), y.index_select(1, order), sorted_lengths

def init_weights(m):
    """Re-initialise every parameter of ``m`` uniformly in [-0.08, 0.08]."""
    for _, weight in m.named_parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)

---
# Define `DataLoader` for training & validation set


In [None]:
# Manually fix Multi30K download link, since the original server is down. (https://github.com/pytorch/text/issues/1756)
# The three archives are, in order, the training, validation and test data.
Multi30k.urls = [
    "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz",
    "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz",
    "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt16_task1_test.tar.gz",
]

In [None]:
# Source (German) field: spaCy tokenisation, lower-casing, <sos>/<eos> markers.
SRC = Field(tokenize = "spacy",
            tokenizer_language="de_core_news_sm",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

# Target (English) field, configured the same way.
TRG = Field(tokenize = "spacy",
            tokenizer_language="en_core_web_sm",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                    fields = (SRC, TRG), test='test')
# Vocabularies are built from the training split only, with a minimum frequency of 2.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

# Vocabulary sizes, used for the embedding and output layers of the models.
src_ntoken = len(SRC.vocab.stoi)
trg_ntoken = len(TRG.vocab.stoi)

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = args.batch_size,
    device = device)

In [None]:
pad_id_trg = TRG.vocab.stoi[TRG.pad_token]
pad_id_src = SRC.vocab.stoi[SRC.pad_token]
# NOTE(review): pad_id comes from the source vocabulary but is also used to
# mask target tokens (criterion below, run_epoch). This presumes both
# vocabularies give <pad> the same index — confirm.
pad_id = pad_id_src
eos_id = TRG.vocab.stoi[TRG.eos_token]
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

# Inspect one sorted training batch.
for batch in train_iter:
    src, trg, length_src = sort_batch(batch.src, batch.trg)
    print(length_src)
    print(src, src.shape)
    print(trg, trg.shape)
    break

# Print the sentence pair stored in column 1 of that batch.
print("##### EXAMPLE #####")
print("SRC: ", word_ids_to_sentence(src[:, 1:2].long().cpu(), SRC.vocab))
print("TRG: ", word_ids_to_sentence(trg[:, 1:2].long().cpu(), TRG.vocab))

print("SRC vocab size", len(SRC.vocab.stoi))
print("TRG vocab size", len(TRG.vocab.stoi))
print("Vocab", list(SRC.vocab.stoi.items())[:10])

---
#  networks



##  Implementing LSTM

### (a)  Implementing LSTMCell
- LSTMCell is a single unit constructing LSTM. It gets current input(`x`) and previous state (which is composed of hidden state `hx` and cell state `cx`) as inputs and returns the state for the next time step (`hy` and `cy`). There are four switch variables to handle information flows through time.

In [None]:
class LSTMCell(nn.Module):
    """Single LSTM step: maps an input and the previous state to the next state."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Each projection produces the pre-activations of all four gates at once.
        self.linear_input = nn.Linear(input_size, 4 * hidden_size)
        self.linear_hidden = nn.Linear(hidden_size, 4 * hidden_size)

    def forward(self, x, state):
        """Advance the cell by one time step.

        Args:
            x: Input for this step; gates are split along dimension 1.
            state: Tuple ``(hx, cx)`` of previous hidden and cell state.

        Returns:
            ``(hy, (hy, cy))``: the new hidden state as the step output,
            plus the new ``(hidden, cell)`` state tuple.
        """
        h_prev, c_prev = state
        pre_act = self.linear_input(x) + self.linear_hidden(h_prev)

        # Chunk order: forget, input, output, candidate.
        forget_pre, input_pre, output_pre, cand_pre = pre_act.chunk(4, dim=1)
        forget_gate = torch.sigmoid(forget_pre)
        input_gate = torch.sigmoid(input_pre)
        output_gate = torch.sigmoid(output_pre)
        candidate = torch.tanh(cand_pre)

        c_next = forget_gate * c_prev + input_gate * candidate
        h_next = output_gate * torch.tanh(c_next)
        return h_next, (h_next, c_next)

### (b)  Implementing LSTM
- LSTMLayer is a single layer composed of sequential LSTMCells. While LSTMCell handles a single input, LSTMLayer gets a sequence as an input and processes it in an autoregressive manner.
 `states` now contain multiple `state`s where each state becomes an initial state for a different level of LSTMLayers. Also, each output of an LSTMLayer is fed into the next layer of LSTMLayer as an input.
As a result, LSTM returns an `output` tensor of size (L,B,nhid) and `output_states`, a list with one entry per LSTMLayer. Each entry is a `(hidden, cell)` tuple of tensors of size (B,nhid). Here L, B, and nhid are the maximum sentence length within a batch (equal to `x.size(0)`), the batch size, and the dimension of the hidden states, respectively.

In [None]:
class LSTMLayer(nn.Module):
    """Runs one LSTMCell over every time step of a sequence."""

    def __init__(self, *cell_args):
        super().__init__()
        self.cell = LSTMCell(*cell_args)

    def forward(self, x, state, length_x=None):
        """Process a sequence step by step.

        Args:
            x: Input sequence, iterated along dimension 0.
            state: Initial ``(hidden, cell)`` state for the cell.
            length_x: Optional per-sequence lengths, which must be sorted in
                descending order. When given, the returned state holds, for
                each sequence, the state at its own last step.

        Returns:
            ``(outputs, state)``: the step outputs stacked along dimension 0,
            and either the state after the last step or the per-sequence
            final states described above.
        """
        steps = x.unbind(0)
        assert (length_x is None) or torch.all(length_x == length_x.sort(descending=True)[0])

        step_outputs = []
        final_h, final_c = [], []
        for step, x_t in enumerate(steps, start=1):
            out, state = self.cell(x_t, state)
            step_outputs.append(out)
            if length_x is None:
                continue
            ends_here = step == length_x
            if torch.any(ends_here):
                # Prepend: sequences ending later are longer, and lengths are
                # sorted in descending order, so they belong first.
                final_h.insert(0, state[0][ends_here])
                final_c.insert(0, state[1][ends_here])

        if length_x is not None:
            state = (torch.cat(final_h, dim=0), torch.cat(final_c, dim=0))
        return torch.stack(step_outputs), state


class LSTM(nn.Module):
    """Stack of LSTMLayers with dropout between consecutive layers."""

    def __init__(self, ninp, nhid, nlayers, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # Only the first layer reads the input width; the rest read nhid.
        self.layers = nn.ModuleList(
            [LSTMLayer(ninp if depth == 0 else nhid, nhid) for depth in range(nlayers)]
        )

    def forward(self, x, states, length_x=None):
        """Run the input through every layer in turn.

        Args:
            x: Input sequence for the first layer.
            states: One initial state per layer, indexed by layer position.
            length_x: Optional sequence lengths forwarded to every layer.

        Returns:
            ``(output, output_states)``: the output of the last layer and the
            list of states returned by each layer.
        """
        top = len(self.layers) - 1
        collected = []
        hidden = x
        for depth, layer in enumerate(self.layers):
            hidden, layer_state = layer(hidden, states[depth], length_x=length_x)
            collected.append(layer_state)
            # No dropout after the last layer.
            if depth < top:
                hidden = self.dropout(hidden)
        return hidden, collected

### (c)  Implementing LSTMEncoder
LSTMEncoder encodes a sequence of tokens into the context vector. It first embeds a tokenized sequence using the embedding layer followed by dropout layer, and then LSTM computes `output` and `context_vector`.


In [None]:
class LSTMEncoder(nn.Module):
    """Embeds source tokens, applies dropout and encodes them with an LSTM stack."""

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(src_ntoken, args.ninp, padding_idx=pad_id)
        self.dropout = nn.Dropout(args.dropout)
        self.lstm = LSTM(args.ninp, args.nhid, args.nlayers, args.dropout)

    def forward(self, x, states, length_x=None):
        """Encode a batch of source token ids.

        Args:
            x: Source token ids.
            states: Initial per-layer states for the LSTM stack.
            length_x: Optional source lengths forwarded to the LSTM stack.

        Returns:
            ``(output, context_vector)`` exactly as returned by the LSTM stack.
        """
        token_vectors = self.dropout(self.embed(x))
        return self.lstm(token_vectors, states, length_x)

### (d)  Implementing LSTMDecoder
LSTMDecoder gets a single token as an input to predict the next token. Similar to LSTMEncoder, it first embeds a given input (usually a predicted token from last time step) using embedding layer followed by dropout layer, and then LSTM computes `output` and `output_states`.

In [None]:
class LSTMDecoder(nn.Module):
    """Embeds target tokens, runs the LSTM stack and projects to vocabulary logits."""

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(trg_ntoken, args.ninp, padding_idx=pad_id)
        self.lstm = LSTM(args.ninp, args.nhid, args.nlayers, args.dropout)
        self.fc_out = nn.Linear(args.nhid, trg_ntoken)
        self.dropout = nn.Dropout(args.dropout)
        # Weight tying: the output projection reuses the embedding matrix.
        self.fc_out.weight = self.embed.weight

    def forward(self, x, states):
        """Decode the given input tokens.

        Args:
            x: Target-side input token ids.
            states: Per-layer states to start the LSTM stack from.

        Returns:
            ``(output, output_states)``: vocabulary logits and the updated
            per-layer states.
        """
        token_vectors = self.dropout(self.embed(x))
        hidden, next_states = self.lstm(token_vectors, states)
        return self.fc_out(hidden), next_states

### (e)  Implementing LSTMSeq2Seq
LSTMSeq2Seq is a complete model for neural machine translation. It starts with LSTMEncoder encoding a given tokenized sequence into the context vector. LSTMDecoder then decodes the context vector step by step. As mentioned in the description of LSTMDecoder, each input to the decoder is the token predicted at the previous decoding step. In the training stage, however, one noisy prediction from the previous step can mess up all of the following predictions, so teacher forcing is used in the training stage. Teacher forcing makes LSTMDecoder always take the ground-truth token as input instead of the one predicted at the previous step. LSTMDecoder is fed the ground-truth token if `teacher_forcing` is True (the case for the training stage), and the token predicted at the last time step otherwise (the case for inference). All sentences start with the `<sos>` token, so the first input token to LSTMDecoder should always be `<sos>`.

In [None]:
class LSTMSeq2Seq(nn.Module):
    """LSTM encoder-decoder translation model."""

    def __init__(self):
        super(LSTMSeq2Seq, self).__init__()
        self.encoder = LSTMEncoder()
        self.decoder = LSTMDecoder()

    def _get_init_states(self, x):
        """Build zero ``(hidden, cell)`` states for every encoder layer.

        Layer count and hidden size are read from the encoder itself rather
        than from global configuration, so the states always match the
        module that consumes them. The batch size is ``x.size(1)``.
        """
        layers = self.encoder.lstm.layers
        nhid = layers[0].cell.hidden_size
        batch_size = x.size(1)
        return [
            (torch.zeros(batch_size, nhid, device=x.device),
             torch.zeros(batch_size, nhid, device=x.device))
            for _ in layers
        ]

    def forward(self, x, y, length, max_len=None, teacher_forcing=True):
        """Encode ``x`` and decode the target one step at a time.

        Args:
            x: Source token ids; dimension 1 is the batch.
            y: Target token ids; ``y[0:1]`` is the first decoder input.
            length: Source lengths forwarded to the encoder.
            max_len: Number of target positions to decode; defaults to
                ``y.size(0)``.
            teacher_forcing: When True, the next decoder input is the
                ground-truth token ``y[t]``; otherwise it is the argmax of
                the previous step's logits.

        Returns:
            Logits of shape ``(trg_len - 1, batch, vocab)``.
        """
        # The encoder's final per-layer states seed the decoder.
        _, dec_states = self.encoder(x, self._get_init_states(x), length)

        trg_len = y.size(0) if max_len is None else max_len
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(trg_len - 1, y.size(1), vocab_size, device=x.device)

        dec_input = y[0:1]  # Shape: (1, B)
        for t in range(1, trg_len):
            dec_output, dec_states = self.decoder(dec_input, dec_states)
            outputs[t - 1] = dec_output
            if teacher_forcing:
                dec_input = y[t:t + 1]
            else:
                dec_input = dec_output.argmax(-1)
        return outputs

## Implementing LSTM  with Attention




### (a) Implementing Attention
Here, I implemented an attention module to augment vanilla LSTM. This attention module (also known as the Bahdanau attention, or the Additive attention) will combine the output of LSTMEncoder and the current decoder's state input to determine which part of the encoder output to focus on this particular time step.

In [None]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs."""

    def __init__(self):
        super().__init__()
        self.nhid_enc = args.nhid
        self.nhid_dec = args.nhid
        self.W1 = nn.Linear(self.nhid_enc, args.nhid_attn)
        self.W2 = nn.Linear(self.nhid_dec, args.nhid_attn)
        self.W3 = nn.Linear(args.nhid_attn, 1)

    def forward(self, x, enc_o, dec_h, length_enc=None):
        """Compute the attention-weighted sum of the encoder outputs.

        Args:
            x: Unused by this implementation.
            enc_o: Encoder outputs, a 3-D tensor whose dimension 0 is time.
            dec_h: Decoder hidden state that is scored against every encoder step.
            length_enc: Optional source lengths; steps at or beyond a
                sequence's length receive zero attention weight.

        Returns:
            Context tensor with a leading dimension of size 1.
        """
        src_len, _, _ = enc_o.size()
        energy = torch.tanh(self.W1(enc_o) + self.W2(dec_h).unsqueeze(0))
        scores = self.W3(energy).squeeze(-1)

        if length_enc is not None:
            steps = torch.arange(src_len, device=enc_o.device)
            is_padding = steps[:, None] >= length_enc[None, :]
            scores = scores.masked_fill(is_padding, float('-inf'))

        # Normalise over the time dimension.
        weights = F.softmax(scores, dim=0)  # Shape: (L, B)
        return (weights.unsqueeze(-1) * enc_o).sum(dim=0, keepdim=True)

### (b) Implementing LSTMAttnDecoder
LSTMAttnDecoder is an extension of LSTMDecoder from above, with an extra attention layer. The difference is that the attention values are computed first then passed to the decoder, instead of directly feeding the previous decoder step's output to the decoder.

In [None]:
class LSTMAttnDecoder(nn.Module):
    """LSTM decoder whose input is the token embedding joined with an attention context."""

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(trg_ntoken, args.ninp, padding_idx=pad_id)
        # The LSTM input is the embedding concatenated with the context vector.
        self.lstm = LSTM(args.ninp + args.nhid, args.nhid, args.nlayers, args.dropout)
        self.fc_out = nn.Linear(args.nhid, trg_ntoken)
        self.dropout = nn.Dropout(args.dropout)
        self.attn = Attention()
        # Weight tying: the output projection reuses the embedding matrix.
        self.fc_out.weight = self.embed.weight

    def forward(self, x, enc_o, states, length_enc=None):
        """Decode the given input tokens while attending over the encoder outputs.

        Args:
            x: Target-side input token ids.
            enc_o: Encoder outputs to attend over.
            states: Per-layer decoder states; the hidden state of the first
                layer is the attention query.
            length_enc: Optional source lengths used to mask the attention.

        Returns:
            ``(dec_output, output_states)``: vocabulary logits and the
            updated per-layer states.
        """
        token_vectors = self.dropout(self.embed(x))  # Shape: (1, B, ninp)
        query = states[0][0]
        context = self.attn(token_vectors, enc_o, query, length_enc)
        hidden, next_states = self.lstm(torch.cat([token_vectors, context], dim=-1), states)
        return self.fc_out(hidden), next_states

### (c) Implementing LSTMAttnSeq2Seq
LSTMAttnSeq2Seq is a complete model for neural machine translation, with an attention layer added. The encoder part of this section is identical to the previous LSTMSeq2Seq model. Teacher forcing is used in the training stage to mitigate noisy predictions. However, the decoder part has changed to the LSTMAttnDecoder module. I modified the previous implementation of the LSTMSeq2Seq model accordingly.


In [None]:
class LSTMAttnSeq2Seq(nn.Module):
    """LSTM encoder-decoder translation model with attention in the decoder."""

    def __init__(self):
        super(LSTMAttnSeq2Seq, self).__init__()
        self.encoder = LSTMEncoder()
        self.decoder = LSTMAttnDecoder()

    def _get_init_states(self, x):
        """Build zero ``(hidden, cell)`` states for every encoder layer.

        Layer count and hidden size are read from the encoder itself rather
        than from global configuration, so the states always match the
        module that consumes them. The batch size is ``x.size(1)``.
        """
        layers = self.encoder.lstm.layers
        nhid = layers[0].cell.hidden_size
        batch_size = x.size(1)
        return [
            (torch.zeros(batch_size, nhid, device=x.device),
             torch.zeros(batch_size, nhid, device=x.device))
            for _ in layers
        ]

    def forward(self, x, y, length, max_len=None, teacher_forcing=True):
        """Encode ``x`` and decode the target step by step with attention.

        Args:
            x: Source token ids; dimension 1 is the batch.
            y: Target token ids; ``y[0:1]`` is the first decoder input.
            length: Source lengths, used by the encoder and to mask attention.
            max_len: Number of target positions to decode; defaults to
                ``y.size(0)``.
            teacher_forcing: When True, the next decoder input is the
                ground-truth token ``y[t]``; otherwise it is the argmax of
                the previous step's logits.

        Returns:
            Logits of shape ``(trg_len - 1, batch, vocab)``.
        """
        # The encoder outputs feed the attention; its final states seed the decoder.
        enc_output, dec_states = self.encoder(x, self._get_init_states(x), length)

        trg_len = y.size(0) if max_len is None else max_len
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(trg_len - 1, x.size(1), vocab_size, device=x.device)

        dec_input = y[0:1]  # Shape: (1, B)
        for t in range(1, trg_len):
            dec_output, dec_states = self.decoder(dec_input, enc_output, dec_states, length)
            outputs[t - 1] = dec_output
            if teacher_forcing:
                dec_input = y[t:t + 1]
            else:
                dec_input = dec_output.argmax(-1)
        return outputs



##  Implementing Transformer




### (a) Implementing MaskedMultiheadAttention
In this module, I implement a single layer of multi-head attention, which will be the key building block of the Transformer model. Each query, key, value input will first pass through a feed-forward network, then scaled dot-product attention is performed. Additionally, there's an optional mask layer inside the scaled dot-product attention applied only in the decoder stage of the Transformer, to prevent the model from being able to see future inputs.


### (b) Implementing TransformerEncLayer
This module is a single layer of the Transformer encoder, containing a layer of masked multi-head attention and a feed-forward network with dropout and skip connection. Both attention and feed-forward layer have skip connections and are preceded by LayerNorm. I stack this layer multiple times to create the full version of the encoder. Since attention is performed in a self-attention manner, I pass the same values to the query, key, and value inputs of the MaskedMultiheadAttention module.


In [None]:
MAX_LEN = 100
class MaskedMultiheadAttention(nn.Module):
    """
    A vanilla multi-head masked attention layer with a projection at the end.

    Two kinds of masking are applied to the attention scores before the softmax:

    * a causal (lower-triangular) mask, when the layer is built with
      ``mask=True``, so a query position cannot attend to later key positions;
    * an optional mask passed to ``forward``, in which zero/False entries
      mark key positions that must be ignored.
    """
    def __init__(self, mask=False):
        super(MaskedMultiheadAttention, self).__init__()
        assert args.nhid_tran % args.nhead == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(args.nhid_tran, args.nhid_tran)
        self.query = nn.Linear(args.nhid_tran, args.nhid_tran)
        self.value = nn.Linear(args.nhid_tran, args.nhid_tran)
        # regularization
        self.attn_drop = nn.Dropout(args.attn_pdrop)
        # output projection
        self.proj = nn.Linear(args.nhid_tran, args.nhid_tran)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.causal = bool(mask)
        if mask:
            self.register_buffer("mask", torch.tril(torch.ones(MAX_LEN, MAX_LEN)))
        self.nhead = args.nhead
        self.d_k = args.nhid_tran // args.nhead

    def forward(self, q, k, v, mask=None):
        """Compute multi-head scaled dot-product attention.

        Args:
            q: Queries of shape ``(B, T_q, nhid)``.
            k: Keys of shape ``(B, T, nhid)``.
            v: Values of shape ``(B, T, nhid)``.
            mask: Optional mask of shape ``(B, T)`` or ``(B, T_q, T)``;
                entries equal to zero/False are excluded from the attention.

        Returns:
            Tensor of shape ``(B, T_q, nhid)``.

        Note:
            A query whose keys are all masked gets NaN weights from the
            softmax; callers must leave at least one key visible per query.
        """
        B, T_q, _ = q.size()
        _, T, _ = k.size()

        # Split the projections into heads: (B, nhead, T, d_k).
        q_proj = self.query(q).view(B, T_q, self.nhead, self.d_k).transpose(1, 2)
        k_proj = self.key(k).view(B, T, self.nhead, self.d_k).transpose(1, 2)
        v_proj = self.value(v).view(B, T, self.nhead, self.d_k).transpose(1, 2)

        scores = torch.matmul(q_proj, k_proj.transpose(-2, -1)) / (self.d_k ** 0.5)

        if self.causal:
            if T_q <= self.mask.size(0) and T <= self.mask.size(1):
                causal = self.mask[:T_q, :T]
            else:
                # Sequence longer than the pre-built buffer: build the mask on the fly.
                causal = torch.tril(torch.ones(T_q, T, device=scores.device))
            scores = scores.masked_fill(causal == 0, float('-inf'))

        if mask is not None:
            # Broadcast the mask over heads (and over queries for a 2-D mask).
            if mask.dim() == 2:
                mask = mask.unsqueeze(1).unsqueeze(2)
            elif mask.dim() == 3:
                mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.attn_drop(attn_weights)
        attn_output = torch.matmul(attn_weights, v_proj)

        # Merge the heads back into a single feature dimension.
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T_q, -1)
        outputs = self.proj(attn_output)

        return outputs

In [None]:
class TransformerEncLayer(nn.Module):
    """Pre-LayerNorm encoder layer: self-attention and feed-forward, each with a residual."""

    def __init__(self):
        super().__init__()
        width = args.nhid_tran
        self.ln1 = nn.LayerNorm(width)
        self.ln2 = nn.LayerNorm(width)
        self.attn = MaskedMultiheadAttention()
        self.dropout1 = nn.Dropout(args.resid_pdrop)
        self.dropout2 = nn.Dropout(args.resid_pdrop)
        self.ff = nn.Sequential(
            nn.Linear(width, args.nff),
            nn.ReLU(),
            nn.Linear(args.nff, width),
        )

    def forward(self, x, mask=None):
        """Apply the layer.

        Args:
            x: Input features.
            mask: Optional mask forwarded to the attention module.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention: the normalised input serves as query, key and value.
        normed = self.ln1(x)
        x = x + self.dropout1(self.attn(normed, normed, normed, mask))
        # Position-wise feed-forward branch.
        x = x + self.dropout2(self.ff(self.ln2(x)))
        return x

### (c) Implementing TransformerDecLayer
This module is a single layer of the Transformer decoder. The module contains two masked multi-head attentions and a feed-forward network, all with a skip connection and a preceding LayerNorm. The first attention is identical to the encoder's attention. However, the second attention is a cross-attention: that is, the key and value inputs of this layer would be the encoded words from the **source** sentence, given as `enc_o`.



In [None]:
class TransformerDecLayer(nn.Module):
    """Pre-LayerNorm decoder layer: self-attention, cross-attention and feed-forward."""

    def __init__(self):
        super().__init__()
        width = args.nhid_tran
        self.ln1 = nn.LayerNorm(width)
        self.ln2 = nn.LayerNorm(width)
        self.ln3 = nn.LayerNorm(width)
        self.dropout1 = nn.Dropout(args.resid_pdrop)
        self.dropout2 = nn.Dropout(args.resid_pdrop)
        self.dropout3 = nn.Dropout(args.resid_pdrop)
        self.attn1 = MaskedMultiheadAttention(mask=True)  # self-attention
        self.attn2 = MaskedMultiheadAttention()  # tgt to src attention
        self.ff = nn.Sequential(
            nn.Linear(width, args.nff),
            nn.ReLU(),
            nn.Linear(args.nff, width),
        )

    def forward(self, x, enc_o, enc_mask=None):
        """Apply the layer.

        Args:
            x: Target-side features.
            enc_o: Encoder outputs, used as keys and values of the cross-attention.
            enc_mask: Optional mask forwarded to the cross-attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention over the target sequence.
        normed = self.ln1(x)
        x = x + self.dropout1(self.attn1(normed, normed, normed))
        # Cross-attention: target queries against the encoder outputs.
        x = x + self.dropout2(self.attn2(self.ln2(x), enc_o, enc_o, enc_mask))
        # Position-wise feed-forward branch.
        x = x + self.dropout3(self.ff(self.ln3(x)))
        return x

### (d) Implementing TransformerEncoder
In this module, I first embed the input tokens, apply positional encoding, then pass the result through multiple layers of TransformerEncLayer, and conclude with a LayerNorm.


In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch-first sequence."""

    def __init__(self, max_len=4096):
        super().__init__()
        width = args.nhid_tran
        positions = np.arange(max_len).reshape(-1, 1)
        pair_index = np.arange(width // 2)
        scale = np.power(10000, 2 * pair_index / width)

        # Even feature columns hold the sines, odd columns the cosines.
        table = np.zeros((max_len, width))
        table[:, 0::2] = np.sin(positions / scale)
        table[:, 1::2] = np.cos(positions / scale)

        self.register_buffer('pe', torch.from_numpy(table).float())

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions."""
        seq_len = x.shape[1]
        return x + self.pe[:seq_len]

class TransformerEncoder(nn.Module):
    """Token embedding, positional encoding, a stack of encoder layers and a final LayerNorm."""

    def __init__(self):
        super().__init__()
        # input embedding stem
        self.tok_emb = nn.Embedding(src_ntoken, args.nhid_tran)
        self.pos_enc = PositionalEncoding()
        self.dropout = nn.Dropout(args.embd_pdrop)
        # transformer
        self.transform = nn.ModuleList(
            [TransformerEncLayer() for _ in range(args.nlayers_transformer)]
        )
        # final normalisation
        self.ln_f = nn.LayerNorm(args.nhid_tran)

    def forward(self, x, mask):
        """Encode source token ids.

        Args:
            x: Source token ids.
            mask: Mask forwarded to every encoder layer.

        Returns:
            The normalised output of the last encoder layer.
        """
        hidden = self.dropout(self.pos_enc(self.tok_emb(x)))
        for layer in self.transform:
            hidden = layer(hidden, mask)
        return self.ln_f(hidden)

### (e) Implementing TransformerDecoder
What TransformerDecoder does is pretty much identical to TransformerEncoder.


In [None]:
class TransformerDecoder(nn.Module):
    """Token embedding, positional encoding, a stack of decoder layers and a tied output projection."""

    def __init__(self):
        super().__init__()
        self.tok_emb = nn.Embedding(trg_ntoken, args.nhid_tran)
        self.pos_enc = PositionalEncoding()
        self.dropout = nn.Dropout(args.embd_pdrop)
        self.transform = nn.ModuleList(
            [TransformerDecLayer() for _ in range(args.nlayers_transformer)]
        )
        self.ln_f = nn.LayerNorm(args.nhid_tran)
        self.lin_out = nn.Linear(args.nhid_tran, trg_ntoken)
        # Weight tying: the output projection reuses the embedding matrix.
        self.lin_out.weight = self.tok_emb.weight

    def forward(self, x, enc_o, enc_mask):
        """Decode target token ids against the encoder outputs.

        Args:
            x: Target-side input token ids.
            enc_o: Encoder outputs.
            enc_mask: Mask forwarded to every decoder layer.

        Returns:
            Vocabulary logits, scaled by ``1 / sqrt(embedding_dim)``.
        """
        hidden = self.dropout(self.pos_enc(self.tok_emb(x)))
        for layer in self.transform:
            hidden = layer(hidden, enc_o, enc_mask)
        logits = self.lin_out(self.ln_f(hidden))
        scale = 1 / (self.tok_emb.embedding_dim ** 0.5)
        return logits * scale

### (f) Implementing Transformer
Finally, let's combine everything to construct the full Transformer model. A mask is created according to the `length_x` parameter, and the inputs are passed through TransformerEncoder to obtain the encoder output. If the model is in training mode (`self.training == True`) and teacher forcing is enabled, the decoder is run exactly once over the ground-truth target to predict every next word. Otherwise, the decoder is run `max_len - 1` times, one token per step, to create a sequence of `max_len` tokens (including the first one). The first token fed to the decoder is always the first token of `y`.


In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer operating on batch-first token id tensors."""

    def __init__(self):
        super().__init__()
        self.encoder = TransformerEncoder()
        self.decoder = TransformerDecoder()

    def forward(self, x, y, length_x, max_len=None, teacher_forcing=True):
        """Translate a batch.

        Args:
            x: Source token ids, batch first.
            y: Target token ids, batch first; ``y[:, :1]`` starts the decoding.
            length_x: Source lengths, used to build the encoder mask.
            max_len: Target length for step-by-step decoding; defaults to
                ``y.size(1)``.
            teacher_forcing: Only honoured in training mode, see below.

        Returns:
            Vocabulary logits. In training mode with teacher forcing, the
            decoder is run once on ``y[:, :-1]``. In every other case
            (including eval mode with ``teacher_forcing=True``) the target is
            generated greedily for ``max_len - 1`` steps.
        """
        # True for real source positions, False for padding.
        positions = torch.arange(x.size(1), device=x.device)
        enc_mask = positions.expand(len(length_x), -1) < length_x.unsqueeze(1)
        enc_o = self.encoder(x, mask=enc_mask)

        if max_len is None:
            max_len = y.size(1)

        if self.training and teacher_forcing:
            return self.decoder(y[:, :-1], enc_o, enc_mask)

        generated = y[:, :1]  # Shape: (B, 1)
        step_logits = []
        for _ in range(max_len - 1):
            # Re-decode the whole prefix and keep the logits of its last position.
            last = self.decoder(generated, enc_o, enc_mask)[:, -1:]
            step_logits.append(last)
            generated = torch.cat([generated, last.argmax(-1)], dim=1)

        return torch.cat(step_logits, dim=1)  # Shape: (B, max_len-1, trg_ntoken)

## Running Experiment



In [None]:
def run_experiment(model):
    """Train ``model`` for ``args.epochs`` epochs, validating after each one.

    A 'last' checkpoint is written after every epoch, and a 'best'
    checkpoint whenever the validation loss improves. The learning rate is
    multiplied by 0.25 once the validation loss has stopped improving for
    more than one epoch.
    """
    # The Transformer uses its own learning rate; every other model uses the LSTM one.
    lr = args.lr_transformer if isinstance(model, Transformer) else args.lr_lstm
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # The remaining scheduler options are left at their defaults. Passing
    # them explicitly added nothing, and `verbose` is flagged as deprecated
    # by recent PyTorch releases.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.25, patience=1)

    best_val_loss = np.inf
    for epoch in tq.tqdm(range(args.epochs)):
        run_epoch(epoch, model, optimizer, is_train=True)
        with torch.no_grad():
            val_loss = run_epoch(epoch, model, None, is_train=False)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_model(model, 'best')
        save_model(model)
        scheduler.step(val_loss)

## Training and Validating models

**Obtained BLEU Score**
- LSTMSeq2Seq: 0.05169190386007824
- LSTMAttnSeq2Seq: 0.05828504336025121
- Transformer: 0.35190386023641020

In [None]:
# Train the plain LSTM seq2seq model, then translate the test set.
lstm_model = LSTMSeq2Seq().to(device)
# Re-initialise every parameter uniformly in [-0.08, 0.08].
lstm_model.apply(init_weights)
run_experiment(lstm_model)
# run_translation loads the 'best' checkpoint (if present) before decoding.
run_translation(lstm_model, test_iter, max_len=100)
print('')

In [None]:
# Train the attention-augmented LSTM model, then translate the test set.
attn_model = LSTMAttnSeq2Seq().to(device)
# Re-initialise every parameter uniformly in [-0.08, 0.08].
attn_model.apply(init_weights)
run_experiment(attn_model)
# run_translation loads the 'best' checkpoint (if present) before decoding.
run_translation(attn_model, test_iter, max_len=100)
print('')

In [None]:
# Train the Transformer, then translate the test set. Unlike the LSTM models,
# init_weights is not applied here, so the layers keep their default initialisation.
transformer_model = Transformer().to(device)
run_experiment(transformer_model)
# run_translation loads the 'best' checkpoint (if present) before decoding.
run_translation(transformer_model, test_iter, max_len=100)
print('')