In [None]:
import torch
import torch.nn as nn
import numpy as np
import math

In [None]:
# Select the compute device once: the GPU when PyTorch can see one, else the CPU.
# The torch.device object is created up front so that the CUDA query helpers
# below receive a real device. (Previously `device` was still a plain string at
# that point, so `device.index` resolved to the bound method `str.index`.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if device.type == "cuda":
    print(f"Device name: {torch.cuda.get_device_name(device)}")
    print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3} GB")

Using device: cuda
Device name: NVIDIA GeForce RTX 4080
Device memory: 15.99169921875 GB


In [None]:
class InputEmbeddings(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(d_model).

    Args:
        d_model: size of each embedding vector.
        vocab_size: number of rows in the embedding table.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Map token ids (batch, seq_len) to scaled vectors (batch, seq_len, d_model)."""
        # The sqrt(d_model) factor follows the scaling described in the paper.
        scale = math.sqrt(self.d_model)
        token_vectors = self.embedding(x)
        return token_vectors * scale


"""
    PE(pos,2i) = sin(pos/10000^(2i/d_model))
    PE(pos,2i+1) = cos(pos/10000^(2i/d_model))
"""
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to embeddings, then applies dropout.

  The table is precomputed once for `seq_len` positions:
      PE(pos, 2i)   = sin(pos * 10000^(-2i / d_model))
      PE(pos, 2i+1) = cos(pos * 10000^(-2i / d_model))
  and stored as the non-trainable buffer `pe` of shape (1, seq_len, d_model).

  Args:
    d_model: width of each embedding vector (even or odd).
    seq_len: number of positions to precompute.
    dropout: dropout probability applied after adding the encoding.
  """

  def __init__(self, d_model, seq_len, dropout):
    # nn.Module must be initialised before sub-modules or buffers are assigned.
    super().__init__()
    self.d_model = d_model
    self.seq_len = seq_len
    self.dropout = nn.Dropout(dropout)

    # Encoding table, one row per position: (seq_len, d_model)
    pe = torch.zeros(seq_len, d_model)
    # Column vector of positions: (seq_len, 1)
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    # 10000^(-2i/d_model), evaluated in log space as exp(2i * (-ln(10000) / d_model)).
    # One frequency per even column index: shape (ceil(d_model / 2),)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term  # (seq_len, ceil(d_model / 2))

    # Sine fills the even columns.
    pe[:, 0::2] = torch.sin(angles)
    # Cosine fills the odd columns. With an odd d_model there is one fewer odd
    # column than even columns, so the last frequency is dropped to keep shapes equal.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    # Leading batch dimension so the table broadcasts over the batch: (1, seq_len, d_model)
    pe = pe.unsqueeze(0)
    # A buffer is saved with the module and moved by .to(device) but is not a parameter.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Add the encodings for the first x.shape[1] positions to x and apply dropout.

    x is indexed as (batch, seq_len, d_model); the result has the same shape.
    """
    # Buffers carry no gradient, so no explicit requires_grad_(False) is needed.
    x = x + self.pe[:, :x.shape[1], :]
    return self.dropout(x)

class LayerNormalization(nn.Module):
    """Normalises the last dimension, then applies a learnable scale and shift.

    Args:
        features: size of the last dimension (length of gamma and beta).
        eps: constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, features, eps=10**-6):
        super().__init__()
        self.eps = eps
        # Learnable scale starts at one and learnable shift at zero.
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return gamma * (x - mean) / (std + eps) + beta over the last dimension."""
        # keepdim=True keeps a trailing axis of size 1 so the statistics
        # broadcast against x.
        centered = x - x.mean(dim=-1, keepdim=True)
        # eps is added to the standard deviation itself (not to the variance).
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.gamma * (centered / spread) + self.beta

In [None]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with bias-free linear projections.

    Args:
        d_model: embedding width; must be divisible by `h`.
        h: number of attention heads.
        dropout: dropout probability applied to the attention weights.

    After each forward pass the attention weights of that call are kept on
    `self.attention_scores` (e.g. for visualisation).
    """

    def __init__(self, d_model, h, dropout):
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"

        # Width of the slice each head works on.
        self.d_k = d_model // h
        # Wq, Wk, Wv and Wo, registered in this order.
        for projection_name in ("w_q", "w_k", "w_v", "w_o"):
            setattr(self, projection_name, nn.Linear(d_model, d_model, bias=False))
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Positions where `mask == 0` get a score of -1e9 before the softmax.
        `mask` and `dropout` may each be None to skip that step.

        Returns:
            (weighted values, attention weights). The weights are returned
            after dropout has been applied to them.
        """
        head_dim = query.shape[-1]
        # (..., seq_len, d_k) @ (..., d_k, seq_len) -> (..., seq_len, seq_len)
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            # In-place fill: masked positions end up with ~0 weight after softmax.
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = torch.softmax(attention_scores, dim=-1)

        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (..., seq_len, seq_len) @ (..., seq_len, d_k) -> (..., seq_len, d_k)
        weighted_values = torch.matmul(attention_scores, value)
        return weighted_values, attention_scores

    def forward(self, q, k, v, mask):
        """Project q/k/v, attend per head, merge the heads and apply Wo.

        Inputs are indexed as (batch, seq_len, d_model); the output has the
        shape (batch, query seq_len, d_model).
        """

        def split_heads(projected):
            # (batch, seq_len, d_model) -> (batch, seq_len, h, d_k) -> (batch, h, seq_len, d_k)
            batch_size, length = projected.shape[0], projected.shape[1]
            return projected.view(batch_size, length, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.w_q(q))
        key = split_heads(self.w_k(k))
        value = split_heads(self.w_v(v))

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k) -> (batch, seq_len, d_model)
        # .contiguous() is required because view() needs contiguous memory
        # after the transpose.
        batch_size = x.shape[0]
        merged = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.w_o(merged)

In [None]:
class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear.

    Args:
        d_model: input and output width.
        d_ff: width of the hidden layer (2048 in the paper).
        dropout: dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)   # W1 and b1 (nn.Linear has a bias by default)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)   # W2 and b2
        self.gelu = nn.GELU()                      # GELU is used here in place of ReLU

    def forward(self, x):
        """(batch, seq_len, d_model) -> (batch, seq_len, d_ff) -> (batch, seq_len, d_model)."""
        hidden = self.linear_1(x)
        hidden = self.gelu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

## NOTE: ReLU has been replaced with GELU as the activation function

class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: x + dropout(sublayer(norm(x))).

    The input is normalised *before* it is handed to the sublayer.

    Args:
        features: size of the last dimension, forwarded to LayerNormalization.
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self, features, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        """Apply `sublayer` (any callable taking one tensor) on the normalised input and add x back."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a residual connection.

    Args:
        features: size of the last dimension, forwarded to each ResidualConnection.
        self_attention_block: attention module called as (q, k, v, mask).
        feed_forward_block: module applied to the attention output.
        dropout: dropout probability used by both residual connections.
    """

    def __init__(self, features, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps self-attention, index 1 wraps the feed-forward block.
        residuals = [ResidualConnection(features, dropout) for _ in range(2)]
        self.residual_connections = nn.ModuleList(residuals)

    def forward(self, x, src_mask):
        """Run x through both sublayers; `src_mask` is passed to self-attention."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            # Query, key and value are all the same (normalised) tensor.
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)

class Encoder(nn.Module):
    """Applies a sequence of encoder layers followed by a final LayerNormalization.

    Args:
        features: size of the last dimension, forwarded to LayerNormalization.
        layers: modules each called as layer(x, mask).
    """

    def __init__(self, features, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        """Feed x through every layer in order (same mask each time), then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention, feed-forward.

    Each of the three sublayers sits inside its own residual connection.

    Args:
        features: size of the last dimension, forwarded to each ResidualConnection.
        self_attention_block: attention over the decoder input itself.
        cross_attention_block: attention whose keys/values are the encoder output.
        feed_forward_block: module applied after both attention steps.
        dropout: dropout probability used by the residual connections.
    """

    def __init__(self, features, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0: self-attention, 1: cross-attention, 2: feed-forward.
        residuals = [ResidualConnection(features, dropout) for _ in range(3)]
        self.residual_connections = nn.ModuleList(residuals)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run x through the three sublayers.

        `tgt_mask` is passed to decoder self-attention. `src_mask` is passed
        to cross-attention, because there the keys and values are
        `encoder_output`, so the scores being masked range over source positions.
        """
        self_attn_residual, cross_attn_residual, feed_forward_residual = self.residual_connections

        def masked_self_attention(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attention(normed):
            # Query comes from the decoder; key and value come from the encoder.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        x = self_attn_residual(x, masked_self_attention)
        x = cross_attn_residual(x, cross_attention)
        return feed_forward_residual(x, self.feed_forward_block)

class Decoder(nn.Module):
    """Applies a sequence of decoder layers followed by a final LayerNormalization.

    Args:
        features: size of the last dimension, forwarded to LayerNormalization.
        layers: modules each called as layer(x, encoder_output, src_mask, tgt_mask).
    """

    def __init__(self, features, layers):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed x through every layer in order, then normalise the result."""
        hidden = x
        for block in self.layers:
            # Every layer receives the same encoder output and the same masks.
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

In [None]:
class ProjectionLayer(nn.Module):
    """Linear projection from the model dimension onto the vocabulary.

    Args:
        d_model: width of the incoming vectors.
        vocab_size: number of output scores per position.
    """

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """(batch, seq_len, d_model) -> (batch, seq_len, vocab_size).

        Returns the raw output of the linear layer; no softmax is applied here.
        """
        return self.proj(x)

class Transformer(nn.Module):
    """Container wiring embeddings, positional encodings, encoder, decoder and projection.

    No `forward` is defined; callers use `encode`, `decode` and `project`
    as separate steps.
    """

    def __init__(self, encoder, decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer):
        super().__init__()
        # Registration order is kept stable because it determines parameter order.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source ids, add positional encodings and run the encoder."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed the target ids, add positional encodings and run the decoder."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Pass x through the projection layer."""
        return self.projection_layer(x)

In [None]:
def build_transformer(src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len, d_model=512, N=6, h=8, dropout=0.1, d_ff=2048)->Transformer:
    """Assemble a Transformer and Xavier-initialise its weight matrices.

    Args:
        src_vocab_size, tgt_vocab_size: embedding table sizes; the target size
            is also the output width of the projection layer.
        src_seq_len, tgt_seq_len: number of positions precomputed by each
            positional encoding.
        d_model: model width.
        N: number of encoder blocks and number of decoder blocks.
        h: attention heads per attention block.
        dropout: dropout probability passed to every sub-module that takes one.
        d_ff: hidden width of each feed-forward block.

    Returns:
        The constructed Transformer. Every parameter with more than one
        dimension has been re-initialised with nn.init.xavier_uniform_.
    """

    def new_attention():
        return MultiHeadAttentionBlock(d_model, h, dropout)

    def new_feed_forward():
        return FeedForwardBlock(d_model, d_ff, dropout)

    # Token embeddings, then positional encodings, for each side.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder blocks: self-attention + feed-forward.
    encoder_blocks = [
        EncoderBlock(d_model, new_attention(), new_feed_forward(), dropout)
        for _ in range(N)
    ]

    # N decoder blocks: self-attention + cross-attention + feed-forward.
    decoder_blocks = [
        DecoderBlock(d_model, new_attention(), new_attention(), new_feed_forward(), dropout)
        for _ in range(N)
    ]

    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only tensors with more than one dimension are re-initialised; 1-D
    # parameters (biases, gamma, beta) keep their constructor values.
    for param in transformer.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return transformer

In [None]:
import re

# Load the raw text of the contractions table.
with open("Downloads/Machine Translation/contractions.txt", "r") as inp_cont:
    contractions = inp_cont.read()

# The file is expected to look like a dict literal, for example:
#
#     contractions = {
#     "ain't": "am not / are not / is not / has not / have not",
#     "aren't": "are not / am not"
#     }
#
# It is turned into `contractions_list`, a list of [key, value] pairs, in
# four steps (innermost call first):
#   1. re.sub(r"(.*{)|(}.*)", '', ...) deletes, on each line, everything up to
#      and including a "{" and everything from a "}" to the end of that line
#      ("." does not match newlines), leaving only the entries.
#   2. re.sub(r"\s+", " ", ...) collapses each run of whitespace (including
#      newlines) into a single space.
#   3. .split(',') cuts the text into one string per entry, e.g.
#          '"ain't": "am not / are not / is not / has not / have not"'
#   4. Each entry has its double quotes removed and is split at ":", e.g.
#          ["ain't", " am not / are not / is not / has not / have not"]
contractions_list = [
    re.sub('["]', '', entry).split(":")
    for entry in re.sub(
        r"\s+", " ", re.sub(r"(.*{)|(}.*)", '', contractions)
    ).split(',')
]

# Build the lookup used when cleaning text:
#   - keys are lower-cased and stripped of surrounding whitespace;
#   - values keep only the text before the first "/", then are lower-cased
#     and stripped, e.g.
#         {"ain't": "am not", "aren't": "are not"}
# Every entry must unpack into exactly two parts (key, value).
contractions_dict = {
    key.lower().strip(): re.sub('/.*', '', value).lower().strip()
    for key, value in contractions_list
}

#print(contractions_list)
#print(contractions_dict)


def remove_sc(_line, lang="en"):
    """Strip special characters from a line of text.

    For "hi" and "en" a language-specific set of symbols is deleted, followed
    by square brackets and the doubled curly quotes. The "en" set additionally
    contains "|" and ":". Any other `lang` returns the line unchanged.
    """
    symbol_patterns = {
        "hi": r'[+\-*/#@%>=;~{}×–`’"()_]',
        "en": r'[+\-*/#@%>=;~{}×–`’"()_|:]',
    }
    symbols = symbol_patterns.get(lang)
    if symbols is None:
        return _line

    without_symbols = re.sub(symbols, "", _line)
    # Second pass: "[" or "]" or "‘‘" or "’’"
    return re.sub(r"(?:(\[)|(\])|(‘‘)|(’’))", '', without_symbols)

def clean_text(_text, lang="en"):
    """Pre-process one sentence before tokenisation.

    "en": remove special characters, re-attach an apostrophe that is preceded
          by whitespace (e.g. "don 't" -> "don't"), then expand contractions
          using `contractions_dict`.
    "hi": remove special characters only.
    Any other `lang` returns the text unchanged.
    """
    if lang not in ("en", "hi"):
        return _text

    _text = remove_sc(_line=_text, lang=lang)
    if lang == "en":
        _text = re.sub(r"\s'(\w)", r"'\1", _text)
        # NOTE(review): each key is used as a regex pattern as-is (no escaping,
        # no word boundaries, case-sensitive) — confirm that is intended.
        for contraction, expansion in contractions_dict.items():
            _text = re.sub(contraction, expansion, _text)
    return _text

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Wraps a translation dataset and yields fixed-length tensors for training.

    Args:
        ds: indexable collection whose items look like
            {'translation': {src_lang: str, tgt_lang: str}}.
        tokenizer_src, tokenizer_tgt: tokenizers exposing `encode(text).ids`;
            `tokenizer_tgt` must also expose `token_to_id`.
        src_lang, tgt_lang: keys into the 'translation' dict.
        seq_len: length every returned sequence is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Special-token ids are read from the target tokenizer and used for the
        # encoder input as well — presumably both tokenizers assign the same
        # ids to these tokens; verify against how the tokenizers are built.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding `count` copies of the pad id."""
        # One torch.full call instead of converting a Python list of
        # one-element tensors element by element on every item.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self, idx):
        """Build the model inputs, masks and label for example `idx`.

        Returns a dict with:
            encoder_input: [SOS] + source ids + [EOS] + padding, (seq_len)
            decoder_input: [SOS] + target ids + padding, (seq_len)
            label:         target ids + [EOS] + padding, (seq_len)
            encoder_mask:  1 where encoder_input is not padding, (1, 1, seq_len)
            decoder_mask:  padding mask combined with causal_mask, (1, seq_len, seq_len)
            src_text, tgt_text: the raw (uncleaned) sentences.

        Raises:
            ValueError: if either sentence does not fit into seq_len.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Text is cleaned before tokenisation.
        enc_input_tokens = self.tokenizer_src.encode(clean_text(src_text, self.src_lang)).ids
        dec_input_tokens = self.tokenizer_tgt.encode(clean_text(tgt_text, self.tgt_lang)).ids

        # The encoder input carries both [SOS] and [EOS] (hence -2); the decoder
        # input carries only [SOS] and the label only [EOS] (hence -1).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative padding count means the sentence does not fit.
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        enc_padding = self._padding(enc_num_padding_tokens)
        dec_padding = self._padding(dec_num_padding_tokens)

        encoder_input = torch.cat([self.sos_token, enc_ids, self.eos_token, enc_padding], dim=0)
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding], dim=0)
        label = torch.cat([dec_ids, self.eos_token, dec_padding], dim=0)

        # Sanity check: every sequence must be exactly seq_len long.
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len)
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def causal_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Entry [0, i, j] is True exactly when j <= i, i.e. position i may look at
    itself and at earlier positions only.
    """
    positions = torch.arange(size)
    rows = positions.unsqueeze(1)     # (size, 1)
    columns = positions.unsqueeze(0)  # (1, size)
    return (columns <= rows).unsqueeze(0)

In [None]:
from pathlib import Path

def get_config():
    """Return a fresh dict of training hyper-parameters, language codes and file-name settings."""
    config = dict(
        batch_size=16,
        num_epochs=16,
        lr=1e-4,
        seq_len=310,
        d_model=512,
        datasource='cfilt/iitb-english-hindi',
        lang_src="en",
        lang_tgt="hi",
        model_folder="weights",
        model_basename="tmodel_",
        preload="latest",
        # "{0}" is filled in with a language code via str.format.
        tokenizer_file="tokenizer_{0}.json",
    )
    return config

def get_weights_file_path(config, epoch: str):
    """Return the path (as str) of the checkpoint for `epoch`.

    The path has the form
    Downloads/Machine Translation/<model_folder>/<model_basename><epoch>.pt
    """
    weights_dir = Path('Downloads/Machine Translation') / f"{config['model_folder']}"
    checkpoint_name = f"{config['model_basename']}{epoch}.pt"
    return str(weights_dir / checkpoint_name)

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    """Return the path (as str) of the newest checkpoint, or None if there is none.

    Checkpoints are the files in Downloads/Machine Translation/<model_folder>
    whose name starts with <model_basename>. "Newest" is the file whose name
    carries the highest numeric epoch right after the basename, so
    "tmodel_10.pt" ranks above "tmodel_9.pt" (a plain lexicographic sort
    would order them the other way round). Files whose suffix is not a number
    rank below all numbered files and are ordered by name among themselves.
    """
    model_folder = f"{config['model_folder']}"
    model_basename = f"{config['model_basename']}"
    # "*" is a wildcard matching any run of characters after the basename.
    weights_files = list(Path(f"Downloads/Machine Translation/{model_folder}").glob(f"{model_basename}*"))

    if len(weights_files) == 0:
        return None

    def _epoch_key(path):
        # path.stem drops the final extension, e.g. "tmodel_07.pt" -> "tmodel_07".
        suffix = path.stem[len(model_basename):]
        try:
            return (1, int(suffix), path.name)
        except ValueError:
            return (0, 0, path.name)

    return str(max(weights_files, key=_epoch_key))

In [None]:
pip install datasets tokenizers

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

import warnings
from tqdm import tqdm
import os
from pathlib import Path

# HuggingFace datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace


def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence one token at a time, always taking the top-scoring token.

    Decoding starts from [SOS] and stops once the sequence holds `max_len`
    tokens or as soon as [EOS] has been appended. `tokenizer_src` is accepted
    but not used here. The indexing assumes a batch of exactly one sentence.

    Returns:
        1-D tensor of token ids, beginning with the [SOS] id.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder runs once; its output is reused at every decoding step.
    memory = model.encode(source, source_mask)
    # Start with just the [SOS] token, using the same dtype as `source`.
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Mask for the tokens generated so far.
        step_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)
        hidden = model.decode(memory, source_mask, generated, step_mask)

        # Only the last position is projected to pick the next token.
        scores = model.project(hidden[:, -1])
        next_word = torch.max(scores, dim=1).indices
        next_token = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        generated = torch.cat([generated, next_token], dim=1)

        if next_word == eos_idx:
            break

    return generated.squeeze(0)

def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, num_examples=3):
    """Greedy-decode a few validation examples and print source, target and prediction.

    Args:
        model: model handed to greedy_decode; it is switched to eval mode and
            left in that mode on return.
        validation_ds: iterable of batches with "encoder_input",
            "encoder_mask", "src_text" and "tgt_text"; batch size must be 1.
        tokenizer_src, tokenizer_tgt: tokenizers; the target tokenizer's
            `decode` turns the generated ids back into text.
        max_len: maximum length of a generated sequence.
        device: device the batch tensors are moved to.
        print_msg: callable that receives each output line.
        num_examples: stop after this many examples have been printed.
    """
    # Local import keeps this function self-contained within the notebook.
    import shutil

    model.eval()  # evaluation mode (e.g. disables dropout)
    count = 0

    # Width of the separator lines. shutil falls back to 80 columns when no
    # terminal is attached (e.g. in a notebook kernel); this replaces spawning
    # `stty size` inside a bare `except:`.
    console_width = shutil.get_terminal_size(fallback=(80, 24)).columns

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # greedy_decode handles a single sentence at a time.
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            # Labels are right-aligned to a width of 12 characters.
            print_msg('-'*console_width)
            print_msg(f"{'SOURCE: ':>12}{source_text}")
            print_msg(f"{'TARGET: ':>12}{target_text}")
            print_msg(f"{'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

def get_all_sentences(ds, lang):
    """Lazily yield the cleaned `lang` sentence of every item in `ds`."""
    yield from (clean_text(example['translation'][lang], lang) for example in ds)

def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for `lang`, training and saving it first if needed.

    The tokenizer file lives at
    Downloads/Machine Translation/<config["tokenizer_file"] formatted with lang>.
    When that file exists it is loaded as-is; otherwise a new tokenizer is
    trained on the cleaned sentences of `ds` and written to that path.
    """
    tokenizer_path = Path(f'Downloads/Machine Translation/{config["tokenizer_file"].format(lang)}')

    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

    if lang == 'en':
        # English text is lower-cased and stripped of accents before splitting.
        english_steps = [normalizers.Lowercase(), normalizers.NFD(), normalizers.StripAccents()]
        tokenizer.normalizer = normalizers.Sequence(english_steps)

    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

def get_ds(config):
    """Load the dataset, build tokenizers and return the data loaders.

    Returns:
        (train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)

    Side effect: prints the longest tokenised source and target sentence
    found in the selected training examples.
    """
    ds_raw = load_dataset(f"{config['datasource']}")

    # Only the first 80000 training examples are used; change the range to
    # train on more or fewer examples.
    train_ds_raw = ds_raw['train'].select(range(80000))
    val_ds_raw = ds_raw['validation']
    test_ds_raw = ds_raw['test']  # looked up here but not used further in this function

    # Alternative kept for reference (not executed): instead of the dataset's
    # own validation split, 10% of ds_raw could be held out with random_split.

    lang_src, lang_tgt = config['lang_src'], config['lang_tgt']

    # Both tokenizers are built from the training split only.
    tokenizer_src = get_or_build_tokenizer(config, train_ds_raw, lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, train_ds_raw, lang_tgt)

    def make_dataset(raw_split):
        return BilingualDataset(raw_split, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])

    train_ds = make_dataset(train_ds_raw)
    val_ds = make_dataset(val_ds_raw)

    def token_count(tokenizer, item, lang):
        return len(tokenizer.encode(clean_text(item['translation'][lang], lang)).ids)

    # Longest tokenised sentence on each side of the training split.
    max_len_src = 0
    max_len_tgt = 0
    for item in train_ds_raw:
        max_len_src = max(max_len_src, token_count(tokenizer_src, item, lang_src))
        max_len_tgt = max(max_len_tgt, token_count(tokenizer_tgt, item, lang_tgt))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Batch size 1 for validation.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build a Transformer from `config`; `seq_len` is used for both the source and the target side."""
    seq_len = config["seq_len"]
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, d_model=config['d_model'])

# Entry point when run as the main module: silence warnings and build the config.
if __name__ == '__main__':
    warnings.filterwarnings("ignore")  # ignore every warning from here on
    config = get_config()  # module-level dict of settings returned by get_config()

In [None]:
# lrate = d_model^(-0.5) * min(step_num^(-0.5), step_num * warmup_steps^(-1.5))
# This corresponds to increasing the learning rate linearly for the first warmup_steps training steps
# and decreasing it thereafter proportionally to the inverse square root of the step number. We used warmup_steps=3.

def rate(step, d_model, factor, warmup):
    """Learning-rate multiplier with warm-up followed by inverse-square-root decay.

    Computes factor * d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5)).
    A step of 0 is treated as 1, because LambdaLR starts counting at 0 and
    zero cannot be raised to a negative power.
    """
    step = step or 1
    decay = step ** (-0.5)
    warmup_ramp = step * warmup ** (-1.5)
    return factor * (d_model ** (-0.5) * min(decay, warmup_ramp))

In [None]:
def train_model(config):
    """
    Train the translation transformer and checkpoint it after every epoch.

    Reads 'model_folder', 'lr', 'preload', 'num_epochs' and 'seq_len' from
    `config` (other entries are consumed by the helpers it calls). When
    `config['preload']` is set, model and optimizer state are restored from
    that checkpoint and training resumes at the following epoch.

    After each epoch the mean training loss is printed, a few validation
    examples are decoded via `run_validation`, and a checkpoint holding the
    epoch number, model state and optimizer state is written.

    Args:
        config: mapping with the training configuration.

    Returns:
        list[float]: mean training loss of every epoch run in this call.
    """
    # Define the device. Build the torch.device up front so the CUDA queries
    # below receive a real device object (a plain string has no usable
    # `.index` attribute -- `str.index` is a method).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    if device.type == 'cuda':
        print(f"Device name: {torch.cuda.get_device_name(device)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3} GB")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

    # Make sure the weights folder exists
    weight_path = f"Downloads/Machine Translation/{config['model_folder']}"
    Path(weight_path).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-8)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5, verbose=True)

    # from torch.optim.lr_scheduler import LambdaLR
    # scheduler = LambdaLR(optimizer=optimizer, lr_lambda=lambda step: rate(step, *example))
    # but we won't be using the proposed scheduler in paper here because we're not training for very large no. of epochs

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    preload = config['preload']
    model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None

    if model_filename:
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on GPU be resumed on a CPU-only machine
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        #scheduler.load_state_dict(state['scheduler_state_dict'])
    else:
        print('No model to preload, starting from scratch')

    # Labels are target-language token ids, so the padding id to ignore must
    # come from the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device) #used LabelSmoothing

    train_loss = []
    for epoch in range(initial_epoch, config['num_epochs']):
        losses = []
        torch.cuda.empty_cache()
        model.train() # training mode (run_validation switches the model to eval)
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")

        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask  = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask  = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output    = model.project(decoder_output) # (B, seq_len, tgt_vocab_size) #this is the o/p of the model

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            # proj_o/p --> (B, seq_len, tgt_vocab_size) --> (B*seq_len, tgt_vocab_size)
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1)) # computing the loss
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            losses.append(loss.item())

            # Backpropagate the loss
            loss.backward()

            # Optional gradient clipping to guard against exploding gradients
            # (currently disabled):
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

        train_mean_loss = sum(losses) / len(losses)
        train_loss.append(train_mean_loss)
        #scheduler.step(train_mean_loss)
        print(f"Epoch #{epoch} Training Loss: {train_mean_loss}")

        # Run validation at the end of every epoch; messages go through
        # tqdm's write so they do not clash with the progress bar
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg))

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            #'scheduler_state_dict': scheduler.state_dict(),
        }, model_filename)

    return train_loss


if __name__ == '__main__':
    # Silence all warnings, load the configuration and start training.
    warnings.filterwarnings("ignore")
    config = get_config()
    train_model(config)

Using device: cuda
Device name: NVIDIA GeForce RTX 4080
Device memory: 15.99169921875 GB
Max length of source sentence: 261
Max length of target sentence: 157
No model to preload, starting from scratch


Processing Epoch 00: 100%|█████████████████████████████████████████████| 5000/5000 [14:05<00:00,  5.91it/s, loss=3.088]


Epoch #0 Training Loss: 4.088024450564385
--------------------------------------------------------------------------------
    SOURCE: The decision regarding the rest of the candidates for the Assembly would only be made after Diwali.
    TARGET: बाकी सीटों पर फैसला दीपावली बाद जिले की बाकी विधानसभा सीटों पर प्रत्याशियों की घोषणा दीपावली के बाद होने की संभावना व्यक्त की जा रही है।
 PREDICTED: के द्वारा स्थिति में है के लिए के लिए है .
--------------------------------------------------------------------------------
    SOURCE: It is clear that the DTO office itself accepts that DLs are being delivered after one month, while the service act specifies that the deadline is seven days.
    TARGET: साफ है कि डीटीओ ऑफिस खुद मान रहा है कि डीएल एक महीने के बाद डिलीवर हो रहे हैं, जबकि राइट टू सर्विस एक्ट में इसकी डेडलाइन सात दिन तय की गई है।
 PREDICTED: आपका सेवा साइट में तक पहले से पहले पहले से पहले है . कि फिर से भी कोशिश करें
-------------------------------------------------------------------

Processing Epoch 01: 100%|█████████████████████████████████████████████| 5000/5000 [14:11<00:00,  5.87it/s, loss=1.550]


Epoch #1 Training Loss: 2.192030510568619
--------------------------------------------------------------------------------
    SOURCE: Harish left his tenth grade studies midway.
    TARGET: हरीश 10वीं की पढ़ाई बीच में छोड़ रखी है।
 PREDICTED: 
--------------------------------------------------------------------------------
    SOURCE: This change has been made after taking into consideration the international fluctuation of prices in both the precious metals.
    TARGET: दोनों कीमती धातुओं के वैश्रि्वक मूल्य में उतार-चढ़ाव के मद्देनजर इसमें बदलाव किया गया है।
 PREDICTED: यह परिवर्तन के बाद में दिखाने के लिए प्रयुक्त था .
--------------------------------------------------------------------------------
    SOURCE: On hearing the sound of a car colliding with a tree, the near-by residents came to the scene and immediately informed the police.
    TARGET: गाड़ी के पेड़ से टकराने की आवाज सुनकर आसपास के लोग मौके पर पहुंच गए तथा उन्होंने पुलिस को सूचित किया।
 PREDICTED: के पहले एक ध्वनि का a

Processing Epoch 02: 100%|█████████████████████████████████████████████| 5000/5000 [14:08<00:00,  5.90it/s, loss=1.442]


Epoch #2 Training Loss: 1.7273695035219192
--------------------------------------------------------------------------------
    SOURCE: By making these beautiful rangolis you can also make your house beautiful.
    TARGET: ये खूबसूरत रंगोलियां बनाकर आप भी अपने घर की खूबसूरती बढ़ा सकते हैं।
 PREDICTED: 
--------------------------------------------------------------------------------
    SOURCE: During this time they discussed the combined activities of the party supporters and the members.
    TARGET: उन्होंने इस दौरान पार्टी समर्थकों व सदस्यों से सांगठनिक गतिविधि पर चर्चा की।
 PREDICTED: इस समय के दौरान दिखाई देने के दौरान दिखाई देता है .
--------------------------------------------------------------------------------
    SOURCE: Only preaching or messages from Rahul-Sonia seem to work for Congress.
    TARGET: राहुल-सोनिया के संदेश या उपदेश ही कांग्रेस के काम आते हैं।
 PREDICTED: केवल संदेश के लिए में
--------------------------------------------------------------------------------


Processing Epoch 03: 100%|█████████████████████████████████████████████| 5000/5000 [13:52<00:00,  6.00it/s, loss=1.560]


Epoch #3 Training Loss: 1.5789985405921936
--------------------------------------------------------------------------------
    SOURCE: The speeding jeep collided with the back of truck.
    TARGET: तेज रफ्तार जीप ट्रक के पिछले हिस्से में जा घुसी।
 PREDICTED: के साथ की
--------------------------------------------------------------------------------
    SOURCE: At the same time he said, and shocked everyone by saying, that until he is called to the excavation site the gold will not be found.
    TARGET: वहीं पर कभी यह कहकर सबको चौंका दिया कि जब तक उन्हें खुदाई स्थल पर नहीं बुलाया जाता तब तक सोना नहीं निकलने वाला है।
 PREDICTED: एक समय में अब भी हो , , , जायेगा , वह एक असुरक्षित है , वह उपयोग में नहीं भेजा जायेगा , तो आपके पास नहीं है .
--------------------------------------------------------------------------------
    SOURCE: The shares of consumer goods, banking, metal, oil, natural gas and power has helped the market to pick up.
    TARGET: बाजार में इस तेजी की अगुवाई कंज्यूमर गुड्स,

Processing Epoch 04: 100%|█████████████████████████████████████████████| 5000/5000 [13:51<00:00,  6.02it/s, loss=1.417]


Epoch #4 Training Loss: 1.5054933074235917
--------------------------------------------------------------------------------
    SOURCE: Jagran correspondent, Agra: The Fatehabad road had a deadly day on Thursday when there were several fatal accidents.
    TARGET: जागरण संवाददाता, आगराः फतेहाबाद रोड पर गुरुवार रात हादसों की शक्ल में मौत मंडराती रही।
 PREDICTED: 
--------------------------------------------------------------------------------
    SOURCE: Utensil seller, Rajesh said that this market has the maximum number of utensil shops.
    TARGET: बर्तन विक्रेता राजेश का कहना था कि इस बजार में सबसे ज्यादा बर्तन की दुकाने हैं।
 PREDICTED: , इस संख्या को पहले जैसा किए जाने के लिए प्रयुक्त है
--------------------------------------------------------------------------------
    SOURCE: This Delegation also met with MPP Wick Dhilon earlier and soon they will also meet with other MPs and MPPs.
    TARGET: इससे पहले डेलिगेशन एमपीपी विक ढिल्लों को भी मिल चुका है और आने वाले दिनों में कई अन्य 

Processing Epoch 05: 100%|█████████████████████████████████████████████| 5000/5000 [13:51<00:00,  6.01it/s, loss=1.437]


Epoch #5 Training Loss: 1.46092484767437
--------------------------------------------------------------------------------
    SOURCE: He has been admitted to Sundernagar civil hospital.
    TARGET: चालक को नागरिक अस्पताल सुंदरनगर में भर्ती करवाया गया है।
 PREDICTED: का तरीका को में है .
--------------------------------------------------------------------------------
    SOURCE: DIET Principal Pradeep Sharma conducted a surprise inspection of schools in the Thanamandi zone to check the midday meals there.
    TARGET: डाइट प्रिंसिपल प्रदीप शर्मा ने थन्ना मंडी जोन के सरकारी स्कूलों का औचक दौरा कर स्कूलों में मिडडे मील की जांच की।
 PREDICTED: 
--------------------------------------------------------------------------------
    SOURCE: In one program, while praising the beauty of the Sultanpur D.M., he said "I am very fortunate that I have become the minister in charge of this zone for a second time.
    TARGET: एक कार्यक्रम में सुल्तानपुर की डीएम की खूबसूरती की तारीफ करते हुए उन्होंने कहा 

Processing Epoch 06: 100%|█████████████████████████████████████████████| 5000/5000 [13:51<00:00,  6.01it/s, loss=1.497]


Epoch #6 Training Loss: 1.4320751959562301
--------------------------------------------------------------------------------
    SOURCE: Former UP Minister, Rajaram Pandey, who was known for his controversial speeches, passed away late Thursday evening following a heart attack.
    TARGET: अपने विवादित बायनों के लिए चर्चित हुए यूपी के पूर्व मंत्री राजाराम पांडे का गुरुवार देर रात हार्ट अटैक पड़ने से निधन हो गया।
 PREDICTED: , , , , , , , की करने के लिए , ,
--------------------------------------------------------------------------------
    SOURCE: This change has been made after taking into consideration the international fluctuation of prices in both the precious metals.
    TARGET: दोनों कीमती धातुओं के वैश्रि्वक मूल्य में उतार-चढ़ाव के मद्देनजर इसमें बदलाव किया गया है।
 PREDICTED: यह परिवर्तन को पश्चात है
--------------------------------------------------------------------------------
    SOURCE: According to the details received, Komal's father had passed away a few years ago, her m

Processing Epoch 07: 100%|█████████████████████████████████████████████| 5000/5000 [13:51<00:00,  6.01it/s, loss=1.562]


Epoch #7 Training Loss: 1.410319785284996
--------------------------------------------------------------------------------
    SOURCE: The investigation was done before Diwali on the 10th of November at different times.
    TARGET: दिवाली के पहले 10 नवंबर को दो अलग-अलग समय पर की गई
 PREDICTED: मेलबाक्स में से शुरू हो गया था .
--------------------------------------------------------------------------------
    SOURCE: Naming is a cognitive exercise.
    TARGET: नामकरण ज्ञानबोधक होता है।
 PREDICTED: एक है .
--------------------------------------------------------------------------------
    SOURCE: Also, in the B.D.O Office, apart from two temporary employees, the B.D.O himself along with his employee were not present.
    TARGET: इसके साथ साथ बीडीओ कार्यालय में दो अस्थायी कर्मचारी को छोड़ कर बीडीओ सहित अन्य कर्मचारी कार्यालय में उपस्थित नहीं थे।
 PREDICTED: सेकेंड में , R संदेश प्राप्त किया जा रहा है , क्रैश से किसी दूसरे सिस्टम सर्वर से . नहीं है .
-------------------------------------

Processing Epoch 08: 100%|█████████████████████████████████████████████| 5000/5000 [13:52<00:00,  6.00it/s, loss=1.347]


Epoch #8 Training Loss: 1.3937690151453017
--------------------------------------------------------------------------------
    SOURCE: Because of this utensil sellers have already decorated their shops.
    TARGET: जिसके चलते बाजार में बर्तन विक्रेताओं की दुकानें सज चुकी हैं।
 PREDICTED: इस पहले से ही मौजूद है .
--------------------------------------------------------------------------------
    SOURCE: The NRI will have to give proof of ownership to the SDM and they will have to tell them their requirements.
    TARGET: एनआरआई को जगह का मालिक होने का प्रमाण एसडीएम को देना होगा और अपनी जरूरत बतानी होगी।
 PREDICTED: पर को के की की के की को
--------------------------------------------------------------------------------
    SOURCE: Parents were angry when the fees were increased once again.
    TARGET: इस बार भी फिर फीस बढ़ाने से अभिभावकों में रोष पाया गया।
 PREDICTED: समय समाप्ति को फिर से कोशिश करें . www . com
--------------------------------------------------------------------------

Processing Epoch 09: 100%|█████████████████████████████████████████████| 5000/5000 [13:55<00:00,  5.98it/s, loss=1.493]


Epoch #9 Training Loss: 1.3803318328142167
--------------------------------------------------------------------------------
    SOURCE: Councillor Anup Sav arrived.
    TARGET: पार्षद अनूप साव पहुंचे।
 PREDICTED: का .
--------------------------------------------------------------------------------
    SOURCE: The people of India have made the country world renowned with their active efforts.
    TARGET: भारत के लोगों ने ही अपने सचेत श्रम से राष्ट्र को विश्वप्रतिष्ठ किया है।
 PREDICTED: का दें के आपके पास कोई को निर्धारित करता है .
--------------------------------------------------------------------------------
    SOURCE: The politicisation of the schemes was done by Congress itself.
    TARGET: योजनाओं का राजनीतिकरण कांग्रेस ने ही किया।
 PREDICTED: छवि की स्लाइड शो की स्थिति के द्वारा लौटाया .
--------------------------------------------------------------------------------


Processing Epoch 10: 100%|█████████████████████████████████████████████| 5000/5000 [13:54<00:00,  5.99it/s, loss=1.369]


Epoch #10 Training Loss: 1.370163449382782
--------------------------------------------------------------------------------
    SOURCE: He said that people must buy Gold, Silver and other metal items on this day as their financial capacity allows.
    TARGET: उन्होंने कहा कि इस दिन आर्थिक क्षमता के मुताबिक सोना, चांदी व अन्य धातु अवश्य खरीदना चाहिए।
 PREDICTED: , पर और के को , के की तरह
--------------------------------------------------------------------------------
    SOURCE: The chief said that developmental work should be carried in keeping with transparency in the government work.
    TARGET: प्रमुख ने कहा कि सरकारी कार्य में पारदर्शिता रखकर विकास कार्य को आगे बढ़ाने की जरूरत है।
 PREDICTED: , कार्य बनाने के लिए चुनते हैं , जो कि एक अंश पर मुद्रित करें .
--------------------------------------------------------------------------------
    SOURCE: All those renewal renewed and valid driving licenses (DL) are ready where photos are donewere submitted by till up to the 30th of Septemb

Processing Epoch 11: 100%|█████████████████████████████████████████████| 5000/5000 [14:10<00:00,  5.88it/s, loss=1.355]


Epoch #11 Training Loss: 1.3608799506425857
--------------------------------------------------------------------------------
    SOURCE: According to the description, the details of the death of Komal only came to light when one of Komal's cousin went to the third floor of their house to do some cleaning.
    TARGET: विवरण के अनुसार कोमल द्वारा मौत को गले लगाने के घटनाक्रम का पता शुक्रवार देर दोपहर को उस समय लगा जब उसकी चचेरी बहन तीसरी मंजिल की छत पर बने कमरे में सफाई करने गई।
 PREDICTED: विवरण की समीक्षा , विवरण को से विवरण की से से का से अधिक से से , का , , , and है .
--------------------------------------------------------------------------------
    SOURCE: Government schemes are run from the accumulated exchequer, from the country's taxpayers.
    TARGET: सरकारी योजनाएं देश के करदाताओं से संचित राजकोष से चलती हैं।
 PREDICTED: से को , , ' s ' से ,
--------------------------------------------------------------------------------
    SOURCE: Chauki in-charge Rejister Pal Singh, who wa

Processing Epoch 12: 100%|█████████████████████████████████████████████| 5000/5000 [13:52<00:00,  6.00it/s, loss=1.334]


Epoch #12 Training Loss: 1.3549497336626053
--------------------------------------------------------------------------------
    SOURCE: If you are still not satisfied and you have reservations about these names, there are other options that are available in the market.
    TARGET: यदि फिर भी बात न बने तो आपको इनके नामों से परहेज है तो चलिए आप के लिए बाजार में और भी विकल्प मौजूद हैं।
 PREDICTED: यदि आप नई परियोजना नहीं हैं तो आप रोस्टर में एकिगा निर्धारित हो तो वे अन्य विकल्प , उदाहरण के लिए चुन रहे हैं .
--------------------------------------------------------------------------------
    SOURCE: The bigger lie is the more effective.
    TARGET: बड़ा झूठ ज्यादा प्रभावकारी होता है।
 PREDICTED: का अधिक है .
--------------------------------------------------------------------------------
    SOURCE: Urmila Malik, Geeta Gupta, Rajbir Singh and Amit Gupta were honoured as jury members.
    TARGET: उर्मिला मलिक, गीता गुप्ता, राजबीर सिंह व अमित गुप्ता को निर्णायक के रूप में सम्मानित किया गया।

Processing Epoch 13:  75%|█████████████████████████████████▊           | 3754/5000 [10:07<03:21,  6.18it/s, loss=1.323]


KeyboardInterrupt: 

In [None]:
def beam_search_decode(model, beam_size, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """
    Decode one source sequence with beam search.

    Every hypothesis is a (token tensor of shape (1, length), cumulative
    log-probability) pair. At each step every unfinished hypothesis is extended
    with its `beam_size` most likely next tokens, and the `beam_size` best
    hypotheses overall are kept. Hypotheses that already end in [EOS] are kept
    in the beam unchanged so a finished translation can still win. The search
    stops when every kept hypothesis has ended in [EOS] or one of them has
    reached `max_len` tokens.

    Args:
        model: object exposing `encode`, `decode` and `project`.
        beam_size: number of hypotheses kept per step.
        source: source token ids for a single sentence.
        source_mask: mask passed to the encoder / cross-attention.
        tokenizer_src: unused; kept for signature compatibility.
        tokenizer_tgt: target tokenizer, used to look up [SOS] / [EOS] ids.
        max_len: maximum number of tokens in a hypothesis (including [SOS]).
        device: device on which decoder inputs are created.

    Returns:
        1-D tensor with the token ids of the best-scoring hypothesis.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(source, source_mask)
    # Initialize the decoder input with the sos token
    decoder_initial_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    # Scores are sums of log-probabilities, so the empty hypothesis starts at 0
    candidates = [(decoder_initial_input, 0.0)]

    while True:

        # If a candidate has reached the maximum length, it means we have run the decoding for at least max_len iterations, so stop the search
        if any(cand.size(1) == max_len for cand, _ in candidates):
            break

        # Create a new list of candidates
        new_candidates = []

        for candidate, score in candidates:

            # Do not expand candidates that have reached the eos token, but
            # keep them in the running so a finished hypothesis is not lost
            if candidate[0][-1].item() == eos_idx:
                new_candidates.append((candidate, score))
                continue

            # Build the candidate's mask
            candidate_mask = causal_mask(candidate.size(1)).type_as(source_mask).to(device)
            # calculate output
            out = model.decode(encoder_output, source_mask, candidate, candidate_mask)
            # Next-token log-probabilities. log_softmax is idempotent, so this
            # is safe whether `project` returns raw logits or values that are
            # already normalised log-probabilities.
            log_prob = torch.log_softmax(model.project(out[:, -1]), dim=-1)
            # get the top k candidates (never ask for more than the vocabulary holds)
            k = min(beam_size, log_prob.size(-1))
            topk_log_prob, topk_idx = torch.topk(log_prob, k, dim=1)
            for i in range(k):
                # for each of the top k candidates, get the token and its log-probability
                token = topk_idx[0][i].unsqueeze(0).unsqueeze(0)
                token_log_prob = topk_log_prob[0][i].item()
                # create a new candidate by appending the token to the current candidate
                new_candidate = torch.cat([candidate, token], dim=1)
                # Log-probabilities add up along the sequence
                new_candidates.append((new_candidate, score + token_log_prob))

        # Sort the new candidates by their score
        candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
        # Keep only the top k candidates
        candidates = candidates[:beam_size]

        # If all the candidates have reached the eos token, stop
        if all(cand[0][-1].item() == eos_idx for cand, _ in candidates):
            break

    # Return the best candidate; squeeze only the batch axis so a one-token
    # result stays 1-D
    return candidates[0][0].squeeze(0)

In [None]:
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """
    Decode one source sequence greedily, always taking the highest-scoring token.

    Starts from [SOS] and appends one token per step until [EOS] is produced
    or the sequence holds `max_len` tokens.

    Args:
        model: object exposing `encode`, `decode` and `project`.
        source: source token ids for a single sentence.
        source_mask: mask passed to the encoder / cross-attention.
        tokenizer_src: unused here.
        tokenizer_tgt: target tokenizer, used to look up [SOS] / [EOS] ids.
        max_len: maximum number of tokens generated (including [SOS]).
        device: device on which decoder inputs are created.

    Returns:
        1-D tensor of generated token ids, starting with [SOS].
    """
    start_id = tokenizer_tgt.token_to_id('[SOS]')
    stop_id = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder runs once; its output is reused at every decoding step
    memory = model.encode(source, source_mask)

    # Sequence generated so far, seeded with the start token
    generated = torch.empty(1, 1).fill_(start_id).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask over the tokens generated so far
        tgt_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)

        hidden = model.decode(memory, source_mask, generated, tgt_mask)

        # Score the vocabulary at the last position and take the best entry
        scores = model.project(hidden[:, -1])
        next_word = torch.max(scores, dim=1)[1]

        appended = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        generated = torch.cat([generated, appended], dim=1)

        if next_word == stop_id:
            break

    return generated.squeeze(0)

In [None]:
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, num_examples=3):
    """
    Decode a few validation examples with greedy and beam search and print them.

    The model is put in eval mode and gradients are disabled. For each batch
    (which must contain exactly one sentence) the source text, the reference
    translation, the greedy output and the beam-search output (beam size 3)
    are emitted through `print_msg`. Iteration stops after `num_examples`
    batches.

    Args:
        model: model handed to the decoding helpers.
        validation_ds: iterable of batches with 'encoder_input',
            'encoder_mask', 'src_text' and 'tgt_text' entries.
        tokenizer_src: source tokenizer (forwarded to the decoders).
        tokenizer_tgt: target tokenizer, used to turn ids back into text.
        max_len: maximum decoded length.
        device: device the batch tensors are moved to.
        print_msg: callable receiving each output line.
        num_examples: number of examples to show.
    """
    model.eval()

    console_width = 80
    separator = '-' * console_width

    with torch.no_grad():
        for shown, batch in enumerate(validation_ds, start=1):
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask  = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # Both decoders handle a single sentence at a time
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            greedy_ids = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)
            beam_ids = beam_search_decode(model, 3, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            beam_text = tokenizer_tgt.decode(beam_ids.detach().cpu().numpy())
            greedy_text = tokenizer_tgt.decode(greedy_ids.detach().cpu().numpy())

            # Emit the source, the reference and both predictions, labels right-aligned
            print_msg(separator)
            print_msg(f"{'SOURCE: ':>20}{source_text}")
            print_msg(f"{'TARGET: ':>20}{target_text}")
            print_msg(f"{'PREDICTED GREEDY: ':>20}{greedy_text}")
            print_msg(f"{'PREDICTED BEAM: ':>20}{beam_text}")

            if shown == num_examples:
                print_msg(separator)
                break

# Spot-check three validation sentences, capping decoding at 20 tokens.
# NOTE(review): this relies on `model`, `val_dataloader`, the tokenizers and
# `device` already existing at module level; train_model only creates them as
# locals -- confirm which earlier cell is expected to define them.
max_len = 20
run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, max_len, device, print_msg=print, num_examples=3)

--------------------------------------------------------------------------------
            SOURCE: Other than the precious metals the price of brass scrap, which was until yesterday $3,933 per ton, has been reduced to $3,840 per ton.
            TARGET: कीमती धातुओं के अलावा पीतल स्क्रैप का शुल्क मूल्य घटाकर 3,840 डॉलर प्रति टन कर दिया गया जो कल तक 3,933 डॉलर प्रति टन था।
  PREDICTED GREEDY: 
    PREDICTED BEAM: 
--------------------------------------------------------------------------------
            SOURCE: This not only affects the children's education but also affects the parents' businesses.
            TARGET: इससे बच्चों की पढ़ाई खराब होने के साथ-साथ उनके कारोबार पर भी प्रतिकूल असर पड़ता है।
  PREDICTED GREEDY: यह टैग s एक से नहीं
    PREDICTED BEAM: यह टैग s एक को से नहीं
--------------------------------------------------------------------------------
            SOURCE: People around the house immediately reported the matter to the police.
            TARGET: तत्काल आसपास

In [None]:
# Vocabulary sizes of the source and target tokenizers currently in scope
print(tokenizer_src.get_vocab_size())
print(tokenizer_tgt.get_vocab_size())

3309
3309


In [None]:
# (Re)load the configuration into the module-level `config`
config = get_config()

In [None]:
from tokenizers import Tokenizer, normalizers

# Load the saved source and target tokenizers from disk.
# NOTE(review): these are absolute Google-Drive paths, while train_model and
# translate use the relative "Downloads/Machine Translation/" folder -- confirm
# which location is the intended one.
src_directory_path = f"/content/drive/MyDrive/Project Work/Machine Translation/{config['tokenizer_file'].format(config['lang_src'])}"
tgt_directory_path = f"/content/drive/MyDrive/Project Work/Machine Translation/{config['tokenizer_file'].format(config['lang_tgt'])}"

tokenizer_src = Tokenizer.from_file(str(Path(src_directory_path)))
tokenizer_tgt = Tokenizer.from_file(str(Path(tgt_directory_path)))

In [None]:
# Ids of the [PAD] token in the source and target tokenizers
print(tokenizer_src.token_to_id('[PAD]'))
print(tokenizer_tgt.token_to_id('[PAD]'))

1
1


In [None]:
# Rebuild the data loaders, tokenizers and model at module level, then restore
# the weights saved under checkpoint name "02" for interactive inspection.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
config = get_config()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)


# Report both vocabulary sizes: source first, then target
print(tokenizer_src.get_vocab_size())
print(tokenizer_tgt.get_vocab_size())

model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)

# Load the pretrained weights
model_filename = get_weights_file_path(config, "02")
checkpoint = torch.load(model_filename,  map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# Stored epoch numbers are zero-based; the message below reports them one-based
epoch = checkpoint['epoch']

# The model is already on `device`; only the mode needs switching
model.eval()
print(f'Successfully Loaded The Best Model, achieved on Epoch {epoch+1}')

Using device: cuda
Max length of source sentence: 261
Max length of target sentence: 157
4365
4365
Successfully Loaded The Best Model, achieved on Epoch 3


In [None]:
import sys
from tokenizers import Tokenizer

def translate(sentence: str):
    """
    Translate one sentence with the most recently saved checkpoint.

    Loads the tokenizers and the latest weights, greedily decodes the
    translation token by token (printing each token as it is produced) and
    returns the decoded text.

    Args:
        sentence: text to translate. An int, or a string made only of digits,
            is instead used as an index into the dataset; the source text at
            that index is translated and its reference is printed as TARGET.

    Returns:
        str: the decoded target-language text.

    Raises:
        ValueError: if the tokenized sentence plus [SOS]/[EOS] does not fit in
            `config['seq_len']` positions.
    """
    # Define the device, tokenizers, and model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    config = get_config()
    src_directory_path = f"Downloads/Machine Translation/{config['tokenizer_file'].format(config['lang_src'])}"
    tgt_directory_path = f"Downloads/Machine Translation/{config['tokenizer_file'].format(config['lang_tgt'])}"

    tokenizer_src = Tokenizer.from_file(str(Path(src_directory_path)))
    tokenizer_tgt = Tokenizer.from_file(str(Path(tgt_directory_path)))
    model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)

    # Load the pretrained weights
    model_filename = latest_weights_file_path(config)
    checkpoint = torch.load(model_filename,  map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    epoch = checkpoint['epoch']
    model.eval()
    print(f'Successfully Loaded The Best Model, achieved on Epoch {epoch+1}')

    # if the sentence is a number use it as an index to the test set
    label = ""
    example_id = None
    if isinstance(sentence, int) or sentence.isdigit():
        example_id = int(sentence)
        ds = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='all')
        ds = BilingualDataset(ds, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
        sentence = ds[example_id]['src_text']
        label = ds[example_id]["tgt_text"]
    seq_len = config['seq_len']

    pad_src = tokenizer_src.token_to_id('[PAD]')
    eos_tgt = tokenizer_tgt.token_to_id('[EOS]')

    # translate the sentence
    with torch.no_grad():
        # NOTE(review): get_ds runs clean_text on sentences before tokenizing;
        # confirm whether the same cleaning should be applied to `sentence` here.
        source_ids = tokenizer_src.encode(sentence).ids

        # Two positions are reserved for [SOS] and [EOS]; the rest is padding
        num_padding = seq_len - len(source_ids) - 2
        if num_padding < 0:
            raise ValueError(
                f"Sentence is too long: {len(source_ids)} tokens, "
                f"but at most {seq_len - 2} fit in seq_len={seq_len}"
            )

        # Add a batch axis so the encoder sees (1, seq_len), as in training
        source = torch.cat([
            torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64),
            torch.tensor(source_ids, dtype=torch.int64),
            torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64),
            torch.tensor([pad_src] * num_padding, dtype=torch.int64)
        ], dim=0).unsqueeze(0).to(device)
        source_mask = (source != pad_src).unsqueeze(1).unsqueeze(1).int().to(device) # (1, 1, 1, seq_len)

        # Precompute the encoder output and reuse it for every generation step
        encoder_output = model.encode(source, source_mask)

        # Initialize the decoder input with the sos token
        decoder_input = torch.empty(1, 1).fill_(tokenizer_tgt.token_to_id('[SOS]')).type_as(source).to(device)

        # Print the source sentence and target start prompt
        if label != "": print(f"{'ID: ':>12}{example_id}")
        print(f"{'SOURCE: ':>12}{sentence}")
        if label != "": print(f"{'TARGET: ':>12}{label}")
        print(f"{'PREDICTED: ':>12}", end='')

        # Generate the translation word by word
        while decoder_input.size(1) < seq_len:
            # Build the target mask with the same helper greedy_decode uses, so
            # each position attends to itself and earlier tokens (a bare
            # upper-triangular matrix would keep only the *future* positions)
            decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            # project next token
            prob = model.project(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            decoder_input = torch.cat([decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1)

            # print the translated word
            print(f"{tokenizer_tgt.decode([next_word.item()])}", end=' ')

            # break if we predict the end of sentence token
            if next_word == eos_tgt:
                break

    # convert ids to tokens
    return tokenizer_tgt.decode(decoder_input[0].tolist())

# Translate a hard-coded sample sentence; the returned string is displayed as the cell's output
translate("how are you?")

Using device: cpu
Successfully Loaded The Best Model, achieved on Epoch 6
    SOURCE: i love you
 PREDICTED:                                                                                                                                                                                                                                                                

''

In [None]:
pip install sacrebleu

In [None]:
from pathlib import Path
from tokenizers import Tokenizer
from datasets import load_dataset, load_metric
import torch
import sys

def run_evaluation(test_ds, beam_size, tokenizer_src, tokenizer_tgt, max_len, device):
    """Translate every batch of `test_ds` with beam search and score the output with SacreBLEU.

    Args:
        test_ds: Iterable of batches. Each batch is indexed with the keys
            "encoder_input", "encoder_mask" and "tgt_text", and its
            "encoder_input" must have a leading (batch) dimension of 1.
        beam_size: Forwarded unchanged to `beam_search_decode`.
        tokenizer_src: Not used as passed; it is replaced by the tokenizer read
            from config['tokenizer_file'] for config['lang_src'].
        tokenizer_tgt: Not used as passed; it is replaced by the tokenizer read
            from config['tokenizer_file'] for config['lang_tgt'].
        max_len: Forwarded unchanged to `beam_search_decode`.
        device: Device the model and the batch tensors are moved to.

    Returns:
        The value returned by the SacreBLEU metric's `compute` call.

    Raises:
        AssertionError: If a batch does not have a batch size of 1.
    """
    print("Using device:", device)

    config = get_config()
    # The tokenizers are always re-read from the files named in the config,
    # overriding whatever the caller passed in.
    tokenizer_src = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_src']))))
    tokenizer_tgt = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_tgt']))))

    model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(),
                              config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)

    # Load the pretrained weights. map_location remaps tensors that were saved on a
    # different device (e.g. a CUDA checkpoint opened on a CPU-only machine).
    model_filename = latest_weights_file_path(config)
    state = torch.load(model_filename, map_location=device)
    model.load_state_dict(state['model_state_dict'])

    predictions = []
    target = []

    model.eval()  # evaluation mode
    with torch.no_grad():
        for batch in test_ds:
            encoder_input = batch["encoder_input"].to(device)  # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device)  # (b, 1, 1, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(0) == 1, "Batch size must be 1 for evaluation"

            model_out_beam = beam_search_decode(model, beam_size, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            # The batch holds a single example, so element 0 is the only reference text
            target.append(batch["tgt_text"][0])
            predictions.append(tokenizer_tgt.decode(model_out_beam.detach().cpu().numpy()))

    # The metric expects a list of reference lists: one (single-element) list per prediction
    target_list = [[ref] for ref in target]
    # load_metric returns an object with .compute(); load_dataset would fetch a dataset instead.
    sacrebleu = load_metric('sacrebleu')
    # SacreBLEU operates on raw text, not tokens
    # that's why we've not used BLEU Metric here
    score = sacrebleu.compute(predictions=predictions, references=target_list)

    return score

In [None]:
# Build the test split and wrap it in a DataLoader that yields one example per batch
test_ds_raw     = ds_raw['test']
test_ds         = BilingualDataset(test_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
test_dataloader = DataLoader(test_ds, batch_size=1, shuffle=False)
beam_size       = 3
max_len         = 20
device          = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# seq_len or max_len is the max. no of words in a sentence
# d_model is the dimension of vector embedding of each word

# run_evaluation asserts a leading batch dimension of 1 and reads batch["tgt_text"][0],
# so it has to iterate the DataLoader rather than the un-batched dataset.
bleu_score = run_evaluation(test_dataloader, beam_size, tokenizer_src, tokenizer_tgt, max_len, device)

score      = round(bleu_score['score'],2)
print(f"BLEU Score on test_dataset: {score}")
print(bleu_score)

# Attention Visualization

In [None]:
import torch
import torch.nn as nn
import altair as alt  # visualization library for charts
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
# Rebuild the data loaders / tokenizers and a model sized to both vocabularies
config = get_config()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

# Load the pretrained weights of checkpoint "29". map_location remaps tensors saved on
# another device (e.g. a CUDA checkpoint opened on a CPU-only machine) onto `device`.
model_filename = get_weights_file_path(config, "29")
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state['model_state_dict'])

In [None]:
def load_next_batch():
    """Take a batch from the validation loader and run greedy decoding on it.

    A fresh iterator over `val_dataloader` is created on every call, and only
    its first batch is used.

    Returns:
        A tuple `(batch, encoder_input_tokens, decoder_input_tokens)` where the
        token lists are the string tokens of the first example's encoder and
        decoder inputs.

    Raises:
        AssertionError: If the batch does not have a batch size of 1.
    """
    # Load a sample batch from the validation set
    batch = next(iter(val_dataloader))
    encoder_input = batch["encoder_input"].to(device)
    encoder_mask  = batch["encoder_mask"].to(device)
    decoder_input = batch["decoder_input"].to(device)

    # check that the batch size is 1 before relying on element 0 being the only example
    assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

    # .tolist() yields plain Python ints for the tokenizer lookups
    encoder_input_tokens = [tokenizer_src.id_to_token(idx) for idx in encoder_input[0].tolist()]
    decoder_input_tokens = [tokenizer_tgt.id_to_token(idx) for idx in decoder_input[0].tolist()]

    # The decoded output itself is not needed; the forward pass is run so that the
    # `attention_scores` read by get_attn_map are available on the model's layers
    # (presumably stored there during the forward pass, confirm in the model code).
    greedy_decode(
        model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, config['seq_len'], device)

    return batch, encoder_input_tokens, decoder_input_tokens

In [None]:
def mtx2df(m, max_row, max_col, row_tokens, col_tokens):
    """Flatten the top-left corner of a 2-D matrix into a long-format DataFrame.

    Args:
        m: 2-D indexable object with a `.shape` and `m[r, c]` element access
            whose elements can be converted with `float()`.
        max_row: Exclusive upper bound on the row indices that are emitted.
        max_col: Exclusive upper bound on the column indices that are emitted.
        row_tokens: Labels for the rows; rows beyond its length are labelled "<blank>".
        col_tokens: Labels for the columns; columns beyond its length are labelled "<blank>".

    Returns:
        A DataFrame with one record per kept cell and the columns
        "row", "column", "value", "row_token", "col_token". Token labels are
        prefixed with the zero-padded 3-digit index.
    """
    # Bound the ranges up front instead of visiting every cell and filtering it out
    n_rows = min(m.shape[0], max_row)
    n_cols = min(m.shape[1], max_col)
    return pd.DataFrame(
        [
            (
                r,
                c,
                float(m[r, c]),
                "%.3d %s" % (r, row_tokens[r] if len(row_tokens) > r else "<blank>"),
                "%.3d %s" % (c, col_tokens[c] if len(col_tokens) > c else "<blank>"),
            )
            for r in range(n_rows)
            for c in range(n_cols)
        ],
        columns=["row", "column", "value", "row_token", "col_token"],
    )

def get_attn_map(attn_type: str, layer: int, head: int):
    """Return the stored attention scores of one head of one layer.

    Args:
        attn_type: "encoder" (encoder self-attention), "decoder" (decoder
            self-attention) or "encoder-decoder" (decoder cross-attention).
        layer: Index into the encoder's or decoder's `layers`.
        head: Index of the attention head.

    Returns:
        `attention_scores[0, head].data` of the selected attention block,
        i.e. the scores of the first element along dimension 0 for that head.

    Raises:
        ValueError: If `attn_type` is not one of the three supported values.
    """
    if attn_type == "encoder":
        attention_block = model.encoder.layers[layer].self_attention_block
    elif attn_type == "decoder":
        attention_block = model.decoder.layers[layer].self_attention_block
    elif attn_type == "encoder-decoder":
        attention_block = model.decoder.layers[layer].cross_attention_block
    else:
        # Without this branch an unknown type surfaced as an UnboundLocalError
        raise ValueError(
            f"Unknown attn_type {attn_type!r}; expected 'encoder', 'decoder' or 'encoder-decoder'"
        )
    return attention_block.attention_scores[0, head].data

def attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len):
    """Build an interactive Altair heat map for one attention head.

    The scores come from `get_attn_map(attn_type, layer, head)` and are cut to
    at most `max_sentence_len` rows and columns by `mtx2df`. The chart is
    400x400 and titled "Layer <layer> Head <head>".
    """
    scores = get_attn_map(attn_type, layer, head)
    frame = mtx2df(scores, max_sentence_len, max_sentence_len, row_tokens, col_tokens)
    heading = f"Layer {layer} Head {head}"

    # One rectangle per (row_token, col_token) pair, coloured by the score
    heatmap = alt.Chart(data=frame).mark_rect().encode(
        x=alt.X("col_token", axis=alt.Axis(title="")),
        y=alt.Y("row_token", axis=alt.Axis(title="")),
        color="value",
        tooltip=["row", "column", "value", "row_token", "col_token"],
    )
    return heatmap.properties(height=400, width=400, title=heading).interactive()

def get_all_attention_maps(attn_type: str, layers: list[int], heads: list[int], row_tokens: list, col_tokens, max_sentence_len: int):
    """Arrange the attention heat maps in a grid: one row per layer, one chart per head."""
    layer_rows = [
        # Charts of a single layer sit side by side
        alt.hconcat(*[
            attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len)
            for head in heads
        ])
        for layer in layers
    ]
    # Layers are stacked on top of each other
    return alt.vconcat(*layer_rows)

In [None]:
batch, encoder_input_tokens, decoder_input_tokens = load_next_batch()
print(f'Source: {batch["src_text"][0]}')
print(f'Target: {batch["tgt_text"][0]}')
# Number of tokens in front of the first padding token. A sentence that fills the
# whole sequence contains no "[PAD]", and list.index would raise ValueError for it,
# so fall back to the full token count in that case.
if "[PAD]" in encoder_input_tokens:
    sentence_len = encoder_input_tokens.index("[PAD]")
else:
    sentence_len = len(encoder_input_tokens)

In [None]:
# Layers 0-2 and heads 0-7 are plotted
layers = list(range(3))
heads = list(range(8))

# Encoder Self-Attention: encoder tokens label both axes, at most 20 tokens are shown
get_all_attention_maps(
    "encoder",
    layers,
    heads,
    encoder_input_tokens,
    encoder_input_tokens,
    min(20, sentence_len),
)

In [None]:
# Decoder Self-Attention: decoder tokens label both axes, at most 20 tokens are shown
get_all_attention_maps(
    "decoder",
    layers,
    heads,
    decoder_input_tokens,
    decoder_input_tokens,
    min(20, sentence_len),
)

In [None]:
# Encoder-Decoder Cross-Attention (scores are read from each decoder layer's cross_attention_block)
# NOTE(review): encoder tokens are passed as row labels and decoder tokens as column labels.
# If attention_scores is laid out as (query, key) the rows would be decoder positions and the
# labels would be swapped; confirm the axis order against the attention implementation.
get_all_attention_maps("encoder-decoder", layers, heads, encoder_input_tokens, decoder_input_tokens, min(20, sentence_len)) # Cross-Attention Block

In [None]:
ds_raw = load_dataset('cfilt/iitb-english-hindi') # please check the split parameter here

train_ds_raw = ds_raw['train'].select(range(50000)) # in this way, we can select the no. of examples as much as desired
val_ds_raw   = ds_raw['validation']
test_ds_raw  = ds_raw['test']


def _max_token_lengths(split):
    """Return (longest source, longest target) token counts over all items of `split`."""
    longest_src = 0
    longest_tgt = 0
    for item in split:
        src_ids = tokenizer_src.encode(clean_text(item['translation'][config['lang_src']], config['lang_src'])).ids
        tgt_ids = tokenizer_tgt.encode(clean_text(item['translation'][config['lang_tgt']], config['lang_tgt'])).ids
        longest_src = max(longest_src, len(src_ids))
        longest_tgt = max(longest_tgt, len(tgt_ids))
    return longest_src, longest_tgt


max_len_src, max_len_tgt = _max_token_lengths(val_ds_raw)

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')

# The second pass now stores its result in the *2 variables; previously it wrote to
# max_len_src / max_len_tgt, so the first result was clobbered and 0 was printed here.
# NOTE(review): this pass scans the validation split again, exactly like the first one.
# It was presumably meant for another split (train_ds_raw / test_ds_raw are otherwise
# unused here); confirm which one is intended.
max_len_src2, max_len_tgt2 = _max_token_lengths(val_ds_raw)

print(f'Max length of source sentence: {max_len_src2}')
print(f'Max length of target sentence: {max_len_tgt2}')