# Session 10 - Transformers Review

- Train the same code, but on different data.
- If you have n-classes, your accuracy MUST be more than 4 * 100 / n.
- Submit the Github link, that includes your notebook with training logs, and proper readme file.

 



## INSTALLATIONS AND IMPORTS

In [None]:
%%bash 

python -m spacy download en_core_web_sm
python -m spacy download nl_core_news_sm
python -m spacy download en
python -m spacy download nl

In [None]:
!pip install torchtext

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, TabularDataset, BucketIterator  
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# from torchtext.datasets import IWSLT2017
from typing import Iterable, List

import spacy
import numpy as np

import random
import math
import time
import pandas as pd

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## GET DATA

Download data from the following website:
https://www.manythings.org/anki/

Dataset Link:
https://www.manythings.org/anki/nld-eng.zip

- download data saved in drive folder
- unzip the data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import shutil
source='/content/drive/MyDrive/Courses/TSAI/END3.0/Session10/data/nld-eng.zip'
destination='/content/nld-eng.zip'
shutil.copyfile(source, destination)

'/content/nld-eng.zip'

In [7]:
!unzip '/content/nld-eng.zip'

Archive:  /content/nld-eng.zip
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: _about.txt              
replace nld.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: nld.txt                 


## USE PANDAS TO PARSE DATA FROM TXT FILE

In [8]:
df=pd.read_csv('/content/nld.txt',sep="\t",header=None)
df.columns=['TRG','SRC','hh']
df=df[['TRG','SRC']]
df.head()

Unnamed: 0,TRG,SRC
0,Go.,Lopen!
1,Go.,Vooruit.
2,Hi.,Hoi.
3,Hi.,Hé!
4,Hi.,Hai!


## SPLIT DATA and BUILD VOCAB FROM DATA

- save data to csv files
- initiate SRC AND TRG VOCAB FIELDS
- build SRC AND TRG VOCAB


In [12]:
# Creating a dataframe with 75%
# values of original dataframe
train_data = df.sample(frac = 0.75)
 
# Creating dataframe with
# rest of the 25% values
test_data = df.drop(train_data.index)

import os
if not os.path.exists("/content/data/"):
    os.makedirs("/content/data/")

train_data.to_csv("/content/data/single_train_nl_data.csv")
test_data.to_csv("/content/data/single_test_nl_data.csv")

spacy_nl = spacy.load('nl_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_nl(text):
    """
    Tokenizes German text from a string into a list of strings
    """
    return [tok.text for tok in spacy_fr.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

SRC = Field(tokenize = tokenize_nl, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

fields={'SRC':('SRC',SRC),'TRG':('TRG',TRG)}

train_data, test_data=TabularDataset.splits(
                                    path='data',
                                    train='single_train_nl_data.csv',#this is the traing file
                                    test='single_test_nl_data.csv',##this is the test file
                                    format='csv',
                                    fields=fields)## need to put zero if only one data is being returned https://github.com/pytorch/text/issues/474

SRC.build_vocab(train_data, min_freq = 1)
TRG.build_vocab(test_data, min_freq = 1)

print("SRC Vocab Len = ", len(SRC.vocab))
print("TRG Vocab Len = ", len(TRG.vocab))

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), 
     batch_size = 64,sort=False,
     device = device)

for i,batch in enumerate(train_iterator):
    print(batch.SRC)
    print(batch.TRG)
    break

SRC Vocab Len =  11894
TRG Vocab Len =  5478
tensor([[   2,   13,   48,  ...,    1,    1,    1],
        [   2,   11, 2400,  ...,    1,    1,    1],
        [   2,    6,   26,  ...,    1,    1,    1],
        ...,
        [   2,  311,   36,  ...,    1,    1,    1],
        [   2,    5,   76,  ...,    1,    1,    1],
        [   2,   28,    7,  ...,    1,    1,    1]], device='cuda:0')
tensor([[   2,    5,  101,  ...,    1,    1,    1],
        [   2, 2332,   11,  ...,    1,    1,    1],
        [   2,    7,   20,  ...,    1,    1,    1],
        ...,
        [   2,  123,    7,  ...,    1,    1,    1],
        [   2,    5,   14,  ...,    1,    1,    1],
        [   2,   37,   11,  ...,    1,    1,    1]], device='cuda:0')


## ENCODER AND ENCODER LAYER CLASS

In [13]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

In [14]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len] 
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

## MULTI HEAD ATTENTION CLASS

In [15]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

In [16]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

## DECODE AND DECODER LAYER CLASS

In [17]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

In [18]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

## Seq2Seq NETWORK CLASS

In [19]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

## INITIATE AND ASSIGN ENCODER AND DECODER

In [20]:
BATCH_SIZE = 64
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [21]:
print("INPUT_DIM = ", INPUT_DIM) # 13077
print("OUTPUT_DIM = ", OUTPUT_DIM) # 9353

INPUT_DIM =  11894
OUTPUT_DIM =  5478


In [22]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [23]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,859,942 trainable parameters


In [24]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)


model.apply(initialize_weights);

In [25]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [27]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

## TRAIN_EPOCH AND EVALUATE METHODS

In [28]:
def train_epoch(model, optimizer):
    model.train()
    losses = 0

    count=0
    for i, batch in enumerate(train_iterator):
        src = batch.SRC.to(device)
        tgt = batch.TRG.to(device)


        optimizer.zero_grad()
        output, _ = model(src, tgt[:,:-1]) #[:,:-1])

        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:,1:].contiguous().view(-1)


        loss = loss_fn(output, tgt)
        loss.backward()
        clip = 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        losses += loss.item()
        count+=1
    return losses / count


def evaluate(model):
    model.eval()
    losses = 0
    count=0
    for i,batch in enumerate(test_iterator):
        src = batch.SRC.to(device)
        tgt = batch.TRG.to(device)[:,:-1]

        output, _ = model(src, tgt[:,:-1])
            
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
        
        output_dim = output.shape[-1]
        
        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:,1:].contiguous().view(-1)

        # output = output[1:].view(-1, output.shape[-1])
        # tgt = tgt[1:].reshape(-1)
        loss = loss_fn(output, tgt)
        losses += loss.item()
        count+=1

    return losses / count


## START TRAINING

In [29]:
from timeit import default_timer as timer
NUM_EPOCHS = 10
train_losses = []
val_losses = []

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer)
    end_time = timer()
    val_loss = evaluate(model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Train PPL: {math.exp(train_loss):7.3f} | Val loss: {val_loss:.3f}, Val. PPL: {math.exp(val_loss):7.3f} | "f"Epoch time = {(end_time - start_time):.3f}s"))
    train_losses.append(train_loss)
    val_losses.append(val_loss)


Epoch: 1, Train loss: 3.141, Train PPL:  23.130 | Val loss: 2.146, Val. PPL:   8.551 | Epoch time = 24.834s
Epoch: 2, Train loss: 1.742, Train PPL:   5.707 | Val loss: 1.519, Val. PPL:   4.569 | Epoch time = 24.912s
Epoch: 3, Train loss: 1.218, Train PPL:   3.379 | Val loss: 1.307, Val. PPL:   3.696 | Epoch time = 24.655s
Epoch: 4, Train loss: 0.944, Train PPL:   2.570 | Val loss: 1.215, Val. PPL:   3.369 | Epoch time = 24.672s
Epoch: 5, Train loss: 0.778, Train PPL:   2.177 | Val loss: 1.178, Val. PPL:   3.248 | Epoch time = 24.738s
Epoch: 6, Train loss: 0.665, Train PPL:   1.944 | Val loss: 1.147, Val. PPL:   3.148 | Epoch time = 24.708s
Epoch: 7, Train loss: 0.580, Train PPL:   1.787 | Val loss: 1.161, Val. PPL:   3.192 | Epoch time = 24.536s
Epoch: 8, Train loss: 0.520, Train PPL:   1.682 | Val loss: 1.168, Val. PPL:   3.217 | Epoch time = 24.902s
Epoch: 9, Train loss: 0.468, Train PPL:   1.597 | Val loss: 1.195, Val. PPL:   3.303 | Epoch time = 24.688s
Epoch: 10, Train loss: 0.430

## START EVALUATION

In [31]:
test_loss = evaluate(model)
print((f"Test loss: {test_loss:.3f}, Test. PPL: {math.exp(test_loss):7.3f}"))

Test loss: 1.218, Test. PPL:   3.381


## DEFINE TRANSLATE SENTENCE METHOD

In [41]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()

    if isinstance(sentence, str):
        nlp = spacy.load('nl_core_news_sm')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        # if pred_token == trg_field[eos_token]:
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    # trg_tokens = [trg_field.vocab.get_itos()[i] for i in trg_indexes]
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

## DEFINE DISPLAY ATTNETION METHOD

In [42]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def display_attention(sentence, translation, attention, n_heads = 8, n_rows = 4, n_cols = 2):
    
    assert n_rows * n_cols == n_heads
    
    fig = plt.figure(figsize=(15,25))
    
    for i in range(n_heads):
        
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        
        _attention = attention.squeeze(0)[i].cpu().detach().numpy()

        cax = ax.matshow(_attention, cmap='bone')

        ax.tick_params(labelsize=12)
        ax.set_xticklabels(['']+['<sos>']+[t.lower() for t in sentence]+['<eos>'], 
                           rotation=45)
        ax.set_yticklabels(['']+translation)

        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

## START TRANSLATING SENTENCES

In [53]:
print("*"*40)
print("Dutch to English Translations:-")
print("*"*40)

for x,y in [(10,20), (900, 910), (2000, 2010)]:
  print("."*40)
  for example_idx in range(x,y):

      src = vars(test_data.examples[example_idx])['SRC']
      trg = vars(test_data.examples[example_idx])['TRG']


      print("Dutch Sentence: src = ",' '.join(src))
      print("Target English Sentence trg =" ,' '.join(trg))

      translation, attention = translate_sentence(src, SRC, TRG, model, device)
      print("predicted trg =" ,' '.join(translation))
      print("*"*40)

****************************************
Dutch to English Translations:-
****************************************
........................................
Dutch Sentence: src =  rustig , rustig !
Target English Sentence trg = relax .
predicted trg = relax . <eos>
****************************************
Dutch Sentence: src =  lachen .
Target English Sentence trg = smile .
predicted trg = laugh . <eos>
****************************************
Dutch Sentence: src =  bedankt !
Target English Sentence trg = cheers !
predicted trg = thanks ! <eos>
****************************************
Dutch Sentence: src =  ik heb hem !
Target English Sentence trg = got it !
predicted trg = i have him ! <eos>
****************************************
Dutch Sentence: src =  ik ben oké .
Target English Sentence trg = i 'm ok .
predicted trg = i 'm ok . <eos>
****************************************
Dutch Sentence: src =  dat kan niet !
Target English Sentence trg = no way !
predicted trg = that ca n't be ! 