<div>
<img src="https://production-media.paperswithcode.com/method_collections/trans.jpeg" width="300" height="500"/>
</div>

In [None]:
# !pip install torch transformers datasets torchmetrics

In [None]:
import torch
import torch.nn as nn
import math
from tqdm import tqdm

### 1-Token Embedding

In [None]:
class tokenEmbedding(nn.Module):
    """Lookup table that turns integer token ids into dense vectors.

    Args:
        embDim: size of each embedding vector.
        vocabSize: number of rows (distinct token ids) in the table.
    """

    def __init__(self,embDim:int,vocabSize:int):
        super().__init__()
        self.embDim, self.vocabSize = embDim, vocabSize
        self.embedding = nn.Embedding(num_embeddings=vocabSize, embedding_dim=embDim)

    def forward(self,x):
        """Embed token ids: (batch, seq_len) --> (batch, seq_len, embDim)."""
        # The paper additionally multiplies by math.sqrt(self.embDim);
        # that scaling is not applied here.
        vectors = self.embedding(x)
        return vectors


In [None]:
# Smoke test for tokenEmbedding: embed a random batch of ids and print the shapes.
embDim = 3
vocabSize = 5
seqLen = 6
batch_size = 2
temb = tokenEmbedding(embDim,vocabSize)
# Random token ids in [0, vocabSize) with shape (batch_size, seqLen)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
# print(f"Output of token embedding layer: {temb(x)}")

### 2-Position Encoding
✅ Summary

- Add fixed positional information to input embeddings.
- Use sine and cosine functions to encode positions.
- Apply dropout to help prevent overfitting.
<img src="https://miro.medium.com/v2/resize:fit:4800/format:webp/1*pxb-wsvNNrvy5j7hLGPRUg.png" alt="Alt Text" width="500" height="200">

In [None]:
class positionEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    For position ``pos`` and pair index ``k`` (dimensions ``2k`` and ``2k+1``):

        PE[pos, 2k]   = sin(pos / 10000 ** (2k / embDim))
        PE[pos, 2k+1] = cos(pos / 10000 ** (2k / embDim))

    so each sin/cos pair shares one frequency. An odd ``embDim`` is
    supported; the last (even) dimension then has no cosine partner.

    Args:
        embDim: size of the embedding vectors the encoding is added to.
        seqLen: number of positions to precompute (maximum usable length).
        dropout: dropout probability applied to the summed output.
    """

    def __init__(self,embDim:int,seqLen:int, dropout:float):
        super().__init__()
        self.embDim = embDim
        self.seqLen = seqLen
        self.dropout = nn.Dropout(dropout)

        # (seqLen, 1) column of position indices
        position = torch.arange(seqLen, dtype=torch.float32).unsqueeze(1)
        # Even dimension indices 0, 2, 4, ... play the role of "2k" in the formula
        evenDims = torch.arange(0, embDim, 2, dtype=torch.float32)
        # 1 / 10000 ** (2k / embDim), computed in log space for numerical stability
        invFreq = torch.exp(evenDims * (-math.log(10000.0) / embDim))
        angles = position * invFreq  # (seqLen, ceil(embDim / 2))

        positions = torch.zeros(seqLen,embDim)
        positions[:, 0::2] = torch.sin(angles)
        # With an odd embDim there is one fewer odd slot than even slots
        positions[:, 1::2] = torch.cos(angles[:, :embDim // 2])

        positions = positions.unsqueeze(0)  # add a batch dimension: (1, seqLen, embDim)
        # Buffer: saved with the model and moved by .to(device), but never trained
        self.register_buffer('positions',positions)

    def forward(self,x):
        """Add the encoding for the first x.shape[1] positions, then apply dropout.

        (batch, seqLen, embDim) --> (batch, seqLen, embDim)
        """
        # Buffers do not require grad, so no explicit requires_grad_(False) is needed
        x = x + self.positions[:, :x.shape[1], :]
        return self.dropout(x)

In [None]:
# Smoke test: token embedding followed by positional encoding; prints the shapes.
embDim = 3
vocabSize = 5
seqLen = 6
batch_size = 2
temb = tokenEmbedding(embDim,vocabSize)
# Random token ids in [0, vocabSize) with shape (batch_size, seqLen)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
PE = positionEmbedding(embDim,seqLen=seqLen,dropout=0.1)
# x is rebound from token ids to the (batch_size, seqLen, embDim) activations
x = PE(temb(x))
print(f"Shape of token embdedding layer + positional embedding layers output: {x.shape}")


### 3-Layer Normalization
✅ Summary

- Normalizes across the features (embedding dimension) for each token independently.
- Helps stabilize training by reducing internal covariate shift.
- Is especially useful in transformers, where it's applied before or after attention and feedforward layers.

In [None]:
class layerNormalization(nn.Module):
    """Layer normalization over the last (feature) dimension.

    Each token vector is shifted to zero mean and divided by its standard
    deviation, then rescaled by the learnable ``alpha`` and shifted by the
    learnable ``beta``. At initialisation ``alpha`` is all ones and ``beta``
    is all zeros, so a fresh layer only normalizes.

    Note: ``Tensor.std`` uses the Bessel-corrected estimator by default and
    ``eps`` is added to the standard deviation (not the variance), so results
    differ slightly from ``torch.nn.LayerNorm``.

    Args:
        embDim: size of the last dimension of the input.
        epsilon: small constant added to the standard deviation to avoid
            division by zero.
    """

    def __init__(self,embDim,epsilon=10**-10):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(embDim))   # multiplicative scale
        self.beta = nn.Parameter(torch.zeros(embDim))   # additive shift, must start at 0
        self.eps = epsilon

    def forward(self,x):
        """Normalize x: (batch, seqLen, embDim) --> (batch, seqLen, embDim)."""
        mean = x.mean(dim=-1,keepdim=True)
        std = x.std(dim=-1,keepdim=True)
        return self.alpha * (x-mean) /(std+self.eps) + self.beta


In [None]:
# Smoke test: token embedding -> positional encoding -> layer normalization.
embDim = 3
vocabSize = 5
seqLen = 6
batch_size = 2
temb = tokenEmbedding(embDim,vocabSize)
# Random token ids in [0, vocabSize) with shape (batch_size, seqLen)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
PE = positionEmbedding(embDim,seqLen=seqLen,dropout=0.1)
x = PE(temb(x))
print(f"Shape of token embdedding layer + positional embedding layers output: {x.shape}")
# Use embDim rather than a literal so the cell keeps working if embDim changes
LN = layerNormalization(embDim=embDim)
x = LN(x)
print(f"Shape of layer normalization output: {x.shape}")



### 4-Feed Forward Network

In [None]:
class feedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        embDim: input and output feature size.
        layerSize: width of the hidden layer.
        dropout: dropout probability applied after the activation.
    """

    def __init__(self,embDim:int,layerSize:int,dropout:float):
        super().__init__()
        # More intermediate layers could be added between these two if desired.
        self.layer1 = nn.Linear(in_features=embDim, out_features=layerSize)
        self.dropout = nn.Dropout(p=dropout)
        self.layer2 = nn.Linear(in_features=layerSize, out_features=embDim)

    def forward(self,x):
        """Apply the MLP: (batch, seqLen, embDim) --> (batch, seqLen, embDim)."""
        hidden = self.layer1(x)
        activated = nn.functional.relu(hidden)
        regularised = self.dropout(activated)
        return self.layer2(regularised)


In [None]:
# Smoke test: embedding -> positional encoding -> layer norm -> feed-forward.
embDim = 3
vocabSize = 5
seqLen = 6
batch_size = 2
layerSize=100
temb = tokenEmbedding(embDim,vocabSize)
# Random token ids in [0, vocabSize) with shape (batch_size, seqLen)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
PE = positionEmbedding(embDim,seqLen=seqLen,dropout=0.1)
x = PE(temb(x))
print(f"Shape of token embdedding layer + positional embedding layers output: {x.shape}")
LN = layerNormalization(embDim=embDim)
x = LN(x)
print(f"Shape of layer normalization output: {x.shape}")
# Hidden width is layerSize; the output is projected back to embDim
FF = feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1)
x = FF(x)
print(f"Shape of feed forward layer output: {x.shape}")


### 5-Residual Connection
✅ Summary
- Normalizes the input x.
- Passes the normalized input through a downblock (e.g., a feedforward or attention block).
- Applies dropout to the output of the downblock.
- Adds the original input x back to the processed output (residual connection).

In [None]:
class residualconnection(nn.Module):
    """Pre-norm residual wrapper computing ``x + dropout(downblock(norm(x)))``.

    Args:
        embdDim: feature size handed to the internal layer normalization.
        dropout: dropout probability applied to the sub-layer output.
    """

    def __init__(self,embdDim:int, dropout:float):
        super().__init__()
        self.dropout= nn.Dropout(dropout)
        self.norm = layerNormalization(embdDim)

    def forward(self,x,downblock):
        """Run ``downblock`` on the normalized input and add the skip path.

        ``downblock`` is any callable taking and returning a tensor shaped like ``x``.
        """
        normalized = self.norm(x)
        branch = downblock(normalized)
        return x + self.dropout(branch)


In [None]:
# Smoke test: the previous pipeline plus a residual connection around the feed-forward block.
embDim = 3
vocabSize = 5
seqLen = 6
batch_size = 2
layerSize=100
temb = tokenEmbedding(embDim,vocabSize)
# Random token ids in [0, vocabSize) with shape (batch_size, seqLen)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
PE = positionEmbedding(embDim,seqLen=seqLen,dropout=0.1)
x = PE(temb(x))
print(f"Shape of token embdedding layer + positional embedding layers output: {x.shape}")
LN = layerNormalization(embDim=embDim)
x = LN(x)
print(f"Shape of layer normalization output: {x.shape}")
FF = feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1)
x = FF(x)
print(f"Shape of feed forward layer output: {x.shape}")


# The same FF module is reused here as the sub-layer wrapped by the residual connection
RC = residualconnection(embdDim=embDim, dropout=0.1)
x = RC(x,FF)
print(f"Shape of residual connection output: {x.shape}")


### 6- Multi-Head Attention
✅ Summary:
- Supports multi-head attention for richer representations.
- Handles masking, useful for:
   - Causal attention (e.g., in autoregressive models).
   - Padding masks (e.g., in variable-length sequences).

In [None]:
class multiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Note the argument order of ``forward``: ``(key, query, value, mask)``.
    The query may have a different sequence length from the key/value pair
    (cross-attention); key and value must share their length.

    Args:
        embDim: model dimension; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
    """

    def __init__(self, embDim, num_heads):
        super().__init__()
        assert embDim % num_heads == 0, "Embedding dimension must be divisible by number of heads"

        self.embDim = embDim
        self.num_heads = num_heads
        self.head_dim = embDim // num_heads

        self.q_linear = nn.Linear(embDim, embDim)
        self.k_linear = nn.Linear(embDim, embDim)
        self.v_linear = nn.Linear(embDim, embDim)
        self.out_linear = nn.Linear(embDim, embDim)

    def _split_heads(self, projected):
        """Reshape (batch, length, embDim) --> (batch, heads, length, head_dim)."""
        batch, length, _ = projected.size()
        return projected.view(batch, length, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, key,query,value, mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Args:
            key: (batch, kvLen, embDim).
            query: (batch, qLen, embDim).
            value: (batch, kvLen, embDim).
            mask: optional tensor broadcastable to (batch, heads, qLen, kvLen);
                positions where it equals 0 are excluded from attention.

        Returns:
            Tensor of shape (batch, qLen, embDim).
        """
        # Each projection is split using its own length so that qLen != kvLen works
        Q = self._split_heads(self.q_linear(query))
        K = self._split_heads(self.k_linear(key))
        V = self._split_heads(self.v_linear(value))

        # Scaled dot-product scores: (batch, heads, qLen, kvLen)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            # A large finite negative instead of -inf: a fully masked row then
            # yields a uniform distribution rather than NaN after the softmax
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)

        # Concatenate heads: (batch, heads, qLen, head_dim) --> (batch, qLen, embDim)
        batch, _, qLen, _ = context.size()
        context = context.transpose(1, 2).contiguous().view(batch, qLen, self.embDim)

        # Final linear layer
        return self.out_linear(context)


In [None]:
# Smoke test: the previous pipeline followed by masked multi-head self-attention.
vocabSize = 5
seqLen = 6
batch_size = 2
layerSize=100
# embDim must be divisible by num_heads; here each head gets a single feature
num_heads = 3
embDim = 3
temb = tokenEmbedding(embDim,vocabSize)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
PE = positionEmbedding(embDim,seqLen=seqLen,dropout=0.1)
x = PE(temb(x))
print(f"Shape of token embdedding layer + positional embedding layers output: {x.shape}")
LN = layerNormalization(embDim=embDim)
x = LN(x)
print(f"Shape of layer normalization output: {x.shape}")
FF = feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1)
x = FF(x)
print(f"Shape of feed forward layer output: {x.shape}")


RC = residualconnection(embdDim=embDim, dropout=0.1)
x = RC(x,FF)
print(f"Shape of residual connection output: {x.shape}")


mask = torch.tril(torch.ones(seqLen, seqLen)).unsqueeze(0).unsqueeze(1)  # Causal mask, shape (1, 1, seqLen, seqLen); broadcasts over (batch_size, num_heads)
# mask = torch.tensor([1,1,1,0,0,0]).unsqueeze(0).unsqueeze(1) # Padding mask, shape (1, 1, seqLen); broadcasts against the (batch_size, num_heads, seqLen, seqLen) scores

MHA = multiHeadAttention(embDim=embDim, num_heads=num_heads)
# Self-attention: the same tensor is passed as key, query and value
x = MHA(x,x,x, mask=mask)
print(x.shape)

### 7-Encoder
✅ Summary:
- encoderBlock = Multi-head attention + Feed-forward + Residual connections.
- encoder = Stack of encoderBlocks + Final normalization.
- Supports masking (e.g., for padding) via srcMask.
- Uses modular design: attention and feed-forward are passed as objects, allowing flexibility.

In [None]:
class encoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper.

    Args:
        embDim: model dimension passed to the residual wrappers.
        MHA: attention module used for self-attention.
        FF: feed-forward module.
        dropout: dropout probability of both residual wrappers.
    """

    def __init__(self, embDim:int,MHA:multiHeadAttention,FF:feedForward,dropout:float):
        super().__init__()
        self.mha = MHA
        self.ff = FF
        self.rc1 = residualconnection(embDim,dropout)
        self.rc2 = residualconnection(embDim,dropout)

    def forward(self,x,srcMask):
        """Apply the layer to ``x``.

        ``srcMask`` masks source positions (e.g. padding) so that other
        tokens do not attend to them.
        """
        def selfAttention(normed):
            # key, query and value are all the normalized input
            return self.mha(normed,normed,normed,srcMask)

        attended = self.rc1(x, selfAttention)
        return self.rc2(attended,self.ff)


class encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalization.

    Args:
        embDim: model dimension used by the final normalization.
        layers: the encoder layers, applied in order.
    """

    def __init__(self,embDim:int, layers:nn.ModuleList):
        super().__init__()
        self.layers  = layers
        self.norm = layerNormalization(embDim)

    def forward(self,x,mask):
        """Feed ``x`` through every layer with the same ``mask``, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden,mask)
        return self.norm(hidden)



In [None]:
# Smoke test: a single encoder block and a two-layer encoder with a padding mask.
embDim = 9
vocabSize = 5
seqLen = 6
batch_size = 2
layerSize=100
num_heads = 3
temb = tokenEmbedding(embDim,vocabSize)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
PE = positionEmbedding(embDim,seqLen=seqLen,dropout=0.1)
x = PE(temb(x))
print(f"Shape of token embdedding layer + positional embedding layers output: {x.shape}")
LN = layerNormalization(embDim=embDim)
x = LN(x)
print(f"Shape of layer normalization output: {x.shape}")
FF = feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1)
x = FF(x)
print(f"Shape of feed forward layer output: {x.shape}")


RC = residualconnection(embdDim=embDim, dropout=0.1)
x = RC(x,FF)
print(f"Shape of residual connection output: {x.shape}")

# Padding mask of shape (1, 1, seqLen): the last three positions are masked out
mask = torch.tensor([1,1,1,0,0,0]).unsqueeze(0).unsqueeze(1)
MHA = multiHeadAttention(embDim=embDim, num_heads=num_heads)
encBlock = encoderBlock(embDim=embDim,MHA=MHA,FF=FF,dropout=0.1)
print(f"Shape of the output of one encoderBlock: {encBlock(x,mask).shape}")


# Build a fresh block (with its own attention and feed-forward modules) per layer;
# repeating one instance in the ModuleList would make all layers share weights.
EN = encoder(embDim=embDim, layers=nn.ModuleList([
    encoderBlock(embDim=embDim,
                 MHA=multiHeadAttention(embDim=embDim, num_heads=num_heads),
                 FF=feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1),
                 dropout=0.1)
    for _ in range(2)
]))

x= EN(x,mask)
print(f"Shape of the output of the encoder: {x.shape}")


### 8-Decoder

In [None]:
class decoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped in its own residual connection.

    Args:
        embDim: model dimension passed to the residual wrappers.
        MHA: attention module used for self-attention over the target.
        crossMha: attention module used to attend over the encoder output.
        FF: feed-forward module.
        dropout: dropout probability of the residual wrappers.
    """

    def __init__(self, embDim:int,MHA:"multiHeadAttention",crossMha:"multiHeadAttention",FF:"feedForward",dropout:float):
        super().__init__()
        self.mha = MHA
        self.c_mha = crossMha
        self.ff = FF
        self.rc = nn.ModuleList([residualconnection(embDim,dropout) for _ in range(3)])

    def forward(self,x, encoderOutput,sourceMask,targetMask):
        """Apply the layer.

        Args:
            x: decoder-side activations.
            encoderOutput: encoder activations attended over in cross-attention.
            sourceMask: mask applied in cross-attention (source positions).
            targetMask: mask applied in self-attention (target positions).
        """
        # The attention modules take (key, query, value, mask); keywords are used
        # so the roles cannot be mixed up by position.
        x = self.rc[0](x,lambda h:self.mha(key=h,query=h,value=h,mask=targetMask))
        # Cross-attention: queries come from the decoder, keys/values from the encoder
        x = self.rc[1](x,lambda h:self.c_mha(key=encoderOutput,query=h,value=encoderOutput,mask=sourceMask))
        x = self.rc[2](x,self.ff)
        return x
    
class decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalization.

    Args:
        embDim: model dimension used by the final normalization.
        layers: the decoder layers, applied in order.
    """

    def __init__(self, embDim:int, layers:nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = layerNormalization(embDim)

    def forward(self,x, encoderOutput,sourceMask,targetMask):
        """Feed ``x`` through every layer with the same encoder output and masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoderOutput, sourceMask, targetMask)
        return self.norm(hidden)

In [None]:
# Smoke test: two-layer encoder followed by a two-layer decoder; prints the shapes.
embDim = 9
vocabSize = 5
seqLen = 6
batch_size = 2
layerSize=100
num_heads = 3
temb = tokenEmbedding(embDim,vocabSize)
x = torch.randint(0,vocabSize,(batch_size,seqLen))
print(f'Input shape : {x.shape}')
print(f"Shape of token embdedding layer's output: {temb(x).shape}")
PE = positionEmbedding(embDim,seqLen=seqLen,dropout=0.1)
x = PE(temb(x))
print(f"Shape of token embdedding layer + positional embedding layers output: {x.shape}")
LN = layerNormalization(embDim=embDim)
x = LN(x)
print(f"Shape of layer normalization output: {x.shape}")
FF = feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1)
x = FF(x)
print(f"Shape of feed forward layer output: {x.shape}")


RC = residualconnection(embdDim=embDim, dropout=0.1)
x = RC(x,FF)
print(f"Shape of residual connection output: {x.shape}")

mask = None
MHA = multiHeadAttention(embDim=embDim, num_heads=num_heads)
encBlock = encoderBlock(embDim=embDim,MHA=MHA,FF=FF,dropout=0.1)
print(f"Shape of the output of one encoderBlock: {encBlock(x,mask).shape}")


# One fresh block per layer; repeating a single instance would share its weights
EN = encoder(embDim=embDim, layers=nn.ModuleList([
    encoderBlock(embDim=embDim,
                 MHA=multiHeadAttention(embDim=embDim, num_heads=num_heads),
                 FF=feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1),
                 dropout=0.1)
    for _ in range(2)
]))
x= EN(x,mask)
print(f"Shape of the output of the encoder: {x.shape}")


# Self-attention and cross-attention get separate modules (separate weights)
deBlock = decoderBlock(embDim=embDim,
                       MHA=multiHeadAttention(embDim=embDim, num_heads=num_heads),
                       crossMha=multiHeadAttention(embDim=embDim, num_heads=num_heads),
                       FF=feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1),
                       dropout=0.1)
y = x.clone()  # Copy the encoder output for the decoder
# x= deBlock(x,y,None,torch.tril(torch.ones(seqLen,seqLen)))  # Mask for decoder
# print(f"shape of the output of one decoderBlock: {x.shape}")


DE = decoder(embDim=embDim, layers=nn.ModuleList([
    decoderBlock(embDim=embDim,
                 MHA=multiHeadAttention(embDim=embDim, num_heads=num_heads),
                 crossMha=multiHeadAttention(embDim=embDim, num_heads=num_heads),
                 FF=feedForward(embDim=embDim,layerSize=layerSize,dropout=0.1),
                 dropout=0.1)
    for _ in range(2)
]))
# Source mask: padding mask of shape (1, 1, seqLen); target mask: causal (seqLen, seqLen)
out= DE(x, y, torch.tensor([1,1,1,0,0,0]).unsqueeze(0).unsqueeze(1), torch.tril(torch.ones(seqLen, seqLen)))
print(f"Shape of the output of the decoder: {out.shape}")


### 9-Mapping

In [None]:
class mappingLayer(nn.Module):
    """Project decoder features onto the vocabulary and return log-probabilities.

    (batch, seqLen, embDim) --> (batch, seqLen, vocab_size)

    Args:
        embDim: size of the incoming feature vectors.
        vocabSize: number of output classes (vocabulary entries).
    """

    def __init__(self,embDim:int,vocabSize:int):
        super().__init__()
        self.map = nn.Linear(in_features=embDim, out_features=vocabSize)

    def forward(self,x):
        """Return log-softmax over the last dimension of the projected input."""
        logits = self.map(x)
        return torch.log_softmax(logits, dim=-1)

### 10-Build a Custom Transformer

In [None]:
class transformer(nn.Module):
    """Container that wires embeddings, encoder, decoder and output projection.

    The three stages are exposed separately (``encode``, ``decode``, ``maper``)
    so the encoder output can be computed once and reused.
    """

    def __init__(self,enc:encoder,dec:decoder,sourceEmb:tokenEmbedding,
                 targetEmb:tokenEmbedding,sourcePos:positionEmbedding,targetPos:positionEmbedding,
                 mapLayer:mappingLayer):
        super().__init__()
        self.encoder = enc
        self.decoder = dec
        self.sourceEmb = sourceEmb
        self.targetEmb = targetEmb
        self.sourcePos = sourcePos
        self.targetPos = targetPos
        self.map = mapLayer

    def encode(self,source,sourceMask):
        """Embed the source ids, add positions and run the encoder stack."""
        embedded = self.sourcePos(self.sourceEmb(source))
        return self.encoder(embedded,sourceMask)

    def decode(self,encOut,sourceMask,target,targetMask):
        """Embed the target ids, add positions and run the decoder stack against ``encOut``."""
        embedded = self.targetPos(self.targetEmb(target))
        return self.decoder(embedded,encOut,sourceMask,targetMask)

    def maper(self,x):
        """Apply the output projection layer to ``x``."""
        projected = self.map(x)
        return projected

In [None]:
def customTransformer(sourceVocabSize,targetVocabSize,sourceSeqLen,targetSeqLen,
                      embDim,n_EncBlocks,n_DecBlocks,n_heads,dropout,ffLayerSize):
    """Assemble an encoder-decoder transformer from the building blocks in this file.

    Args:
        sourceVocabSize / targetVocabSize: vocabulary sizes of the two sides.
        sourceSeqLen / targetSeqLen: number of positions precomputed per side.
        embDim: model dimension.
        n_EncBlocks / n_DecBlocks: number of encoder / decoder layers.
        n_heads: attention heads per attention module.
        dropout: dropout probability used throughout.
        ffLayerSize: hidden width of every feed-forward block.

    Returns:
        A ``transformer`` whose parameters with more than one dimension have
        been re-initialised with Xavier-uniform.
    """
    # Embedding and positional layers, source side first
    srcTokens = tokenEmbedding(embDim,sourceVocabSize)
    srcPositions = positionEmbedding(embDim,sourceSeqLen,dropout)
    trgTokens = tokenEmbedding(embDim,targetVocabSize)
    trgPositions = positionEmbedding(embDim,targetSeqLen,dropout)

    # Every layer gets its own attention and feed-forward modules
    encLayers = [
        encoderBlock(embDim,
                     multiHeadAttention(embDim,n_heads),
                     feedForward(embDim,ffLayerSize,dropout),
                     dropout)
        for _ in range(n_EncBlocks)
    ]
    decLayers = [
        decoderBlock(embDim,
                     multiHeadAttention(embDim,n_heads),   # self-attention
                     multiHeadAttention(embDim,n_heads),   # cross-attention
                     feedForward(embDim,ffLayerSize,dropout),
                     dropout)
        for _ in range(n_DecBlocks)
    ]

    encStack = encoder(embDim,nn.ModuleList(encLayers))
    decStack = decoder(embDim,nn.ModuleList(decLayers))
    outputHead = mappingLayer(embDim,targetVocabSize)

    model = transformer(encStack,decStack,srcTokens,trgTokens,
                        srcPositions,trgPositions,outputHead)

    # Xavier-uniform for weight matrices; 1-D parameters keep their defaults
    for weight in (p for p in model.parameters() if p.dim() > 1):
        nn.init.xavier_uniform_(weight)
    return model


In [None]:
# Hyper-parameters and runtime settings read by the cells below.
# NOTE(review): in the cells visible here, sourceVocabSize, targetVocabSize,
# sourceSeqLen and targetSeqLen are not read when the model is created; that
# call uses the tokenizers' vocabulary sizes and seqLen instead.
sourceVocabSize = 10
targetVocabSize = 10 
sourceSeqLen= 20
targetSeqLen= 20
# Fixed length every source/target sequence is padded or truncated to
seqLen = 20
# Model dimension; must be divisible by n_heads
embDim=50
n_EncBlocks=1 
n_DecBlocks=1 
n_heads=2
dropout=0.1
# Hidden width of the feed-forward blocks
ffLayerSize = 15
# Learning rate handed to the optimizer (1e-6)
lr_rate = 10**-6
n_Epochs = 1
batchSize = 1
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


### 11 - DataSet and Tokenizers Creation

In [None]:
from torch.utils.data import Dataset,DataLoader
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset
import torchmetrics

#### 11.1 - Read and Create Dataset, Build Tokenizers

In [None]:

def causal_mask(size):
    """Return a (1, size, size) int lower-triangular mask of ones.

    Entry [0, i, j] is 1 when j <= i (position i may look at position j)
    and 0 otherwise.
    """
    allOnes = torch.ones((1, size, size), dtype=torch.int)
    return allOnes.tril()

class translationDataSet(Dataset):
    """Wrap a translation dataset and emit fixed-length tensors for training.

    Every item of ``dataset`` is expected to look like
    ``{'translation': {'en': ..., 'pt': ...}}``. Sentences are tokenized,
    framed with special tokens and padded or truncated to ``seqLen``.
    The [SOS]/[EOS]/[PAD] ids are looked up in ``srcTokenizer`` and used for
    both the source and the target side.
    """

    def __init__(self,dataset,srcTokenizer,trgTokenizer,seqLen):
        super().__init__()
        self.seqLen = seqLen
        self.dataset = dataset
        self.srcTokenizer = srcTokenizer
        self.trgTokenizer = trgTokenizer
        self.srcLang = 'en'
        self.trgLang = 'pt'

        def specialToken(name):
            # One-element int64 tensor so it can be concatenated directly
            return torch.tensor([srcTokenizer.token_to_id(name)], dtype=torch.int64)

        self.sos_token = specialToken("[SOS]")
        self.eos_token = specialToken("[EOS]")
        self.pad_token = specialToken("[PAD]")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self,idx):
        """Build encoder input, decoder input, label and masks for item ``idx``."""
        pair = self.dataset[idx]['translation']
        srcText = pair[self.srcLang]
        trgText = pair[self.trgLang]
        srcIds = torch.tensor(self.srcTokenizer.encode(srcText).ids, dtype=torch.int64)
        trgIds = torch.tensor(self.trgTokenizer.encode(trgText).ids, dtype=torch.int64)

        # Room left for real tokens once the special tokens are accounted for
        srcBudget = self.seqLen - 2  # [SOS] and [EOS] are both added
        trgBudget = self.seqLen - 1  # only one special token per target tensor

        # Slicing truncates over-long sentences and is a no-op for short ones;
        # the pad count is zero whenever truncation happened.
        srcBody = srcIds[:srcBudget]
        srcPad = self.pad_token.repeat(max(srcBudget - srcIds.size(0), 0))
        src = torch.cat([self.sos_token, srcBody, self.eos_token, srcPad], dim=0)

        trgBody = trgIds[:trgBudget]
        trgPad = self.pad_token.repeat(max(trgBudget - trgIds.size(0), 0))
        # Decoder input starts with [SOS]; the label is the same tokens followed by [EOS]
        trg = torch.cat([self.sos_token, trgBody, trgPad], dim=0)
        label = torch.cat([trgBody, self.eos_token, trgPad], dim=0)

        # All three tensors must come out exactly seqLen long
        assert src.size(0) == self.seqLen
        assert trg.size(0) == self.seqLen
        assert label.size(0) == self.seqLen

        srcMask = (src != self.pad_token).unsqueeze(0).unsqueeze(0).int()  # (1, 1, seqLen)
        # Padding mask (1, seqLen) combined with the causal mask (1, seqLen, seqLen)
        trgMask = (trg != self.pad_token).unsqueeze(0).int() & causal_mask(trg.size(0))

        return {"encInput":src,"decInput":trg,"label":label,'srcText':srcText,
                'trgText':trgText,'srcMask':srcMask,
                'trgMask':trgMask}




In [None]:
def get_all_sentences(dataset, lang):
    """Lazily yield the ``lang`` sentence of every item in ``dataset``.

    Each item is expected to be a mapping with a 'translation' entry that
    maps language codes to sentences.
    """
    yield from (record['translation'][lang] for record in dataset)
        
def buildTokenizer(ds, lang):
    """Train a whitespace-split word-level tokenizer on the ``lang`` side of ``ds``.

    The special tokens [UNK], [PAD], [SOS] and [EOS] are registered with the
    trainer, and words seen fewer than two times are left out of the vocabulary.
    """
    wordTokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    wordTokenizer.pre_tokenizer = Whitespace()
    wordTrainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=2,
    )
    sentences = get_all_sentences(ds, lang)
    wordTokenizer.train_from_iterator(sentences, trainer=wordTrainer)
    return wordTokenizer

def createHfDataset(seqLen, batchSize=8):
    """Load opus_books (en-pt), build tokenizers and return train/test loaders.

    The English-to-Portuguese split is loaded, one tokenizer per language is
    trained on the full split, and the data is divided 90/10 with a fixed seed.

    Args:
        seqLen: fixed sequence length handed to ``translationDataSet``.
        batchSize: batch size of the training loader (the test loader uses 1).

    Returns:
        (train_dataloader, test_dataloader, srcTokenizer, trgTokenizer)
    """
    corpus = load_dataset("opus_books", "en-pt",split='train')
    srcTokenizer = buildTokenizer(corpus, 'en')
    trgTokenizer = buildTokenizer(corpus, 'pt')

    splits = corpus.train_test_split(test_size=0.1,shuffle=True, seed=42)
    trainSet = translationDataSet(splits['train'], srcTokenizer, trgTokenizer, seqLen)
    testSet = translationDataSet(splits['test'], srcTokenizer, trgTokenizer, seqLen)

    trainLoader = DataLoader(trainSet, batch_size=batchSize, shuffle=True)
    testLoader = DataLoader(testSet, batch_size=1, shuffle=False)
    return trainLoader, testLoader, srcTokenizer, trgTokenizer

# Build the train/test loaders and both tokenizers using the global seqLen and batchSize
trDloader, tsDloader,srcTokenizer, trgTokenizer = createHfDataset(seqLen, batchSize=batchSize)


#### 11.2 - Datasets

In [None]:
# Build the raw train/test datasets directly (no DataLoader wrapping here).
# NOTE(review): this repeats the loading and tokenizer training already done by
# createHfDataset above and rebinds srcTokenizer/trgTokenizer.
ds = load_dataset("opus_books", "en-pt",split='train')
srcTokenizer = buildTokenizer( ds, 'en')
trgTokenizer =buildTokenizer( ds,'pt')
# 90/10 split with a fixed seed, the same arguments createHfDataset uses
ds = ds.train_test_split(test_size=0.1,shuffle=True, seed=42)
train_ds = translationDataSet(ds['train'], srcTokenizer, trgTokenizer, seqLen)
test_ds = translationDataSet(ds['test'], srcTokenizer, trgTokenizer, seqLen)


### 13 Build a custom model and data loaders

In [None]:
def getModelParameters(model):
    """Print the total number of scalar parameters held by ``model``."""
    total_params = 0
    for tensor in model.parameters():
        total_params += tensor.numel()
    print(f"Number of parameters: {total_params}")


# Build the model from the tokenizers' vocabulary sizes and move it to the chosen device.
# The same seqLen is used for the source and the target side.
model = customTransformer(sourceVocabSize=srcTokenizer.get_vocab_size(),
                                  targetVocabSize=trgTokenizer.get_vocab_size(),
                                  sourceSeqLen=seqLen,targetSeqLen=seqLen,
                                  embDim=embDim,n_EncBlocks=n_EncBlocks,
                                  n_DecBlocks=n_DecBlocks,n_heads=n_heads,
                                  dropout=dropout,ffLayerSize=ffLayerSize).to(device)

optimizer = torch.optim.Adam(model.parameters(),lr=lr_rate)
# Positions whose label is the [PAD] id are excluded from the loss. The id is
# looked up in srcTokenizer, which is also where translationDataSet takes the
# padding id it writes into the labels.
# NOTE(review): mappingLayer already returns log-softmax output and
# nn.CrossEntropyLoss applies log-softmax internally - consider nn.NLLLoss.
criterion = nn.CrossEntropyLoss(ignore_index=srcTokenizer.token_to_id('[PAD]')).to(device)
getModelParameters(model)


### 14 - Training 

In [None]:
def train(model,n_Epochs,trDloader,trgTokenizer):
    """Run ``n_Epochs`` passes over ``trDloader``, updating ``model`` after every batch.

    Relies on the module-level ``device``, ``criterion`` and ``optimizer``.

    Args:
        model: object exposing ``encode``, ``decode`` and ``maper``.
        n_Epochs: number of passes over the loader.
        trDloader: iterable of batches with the keys 'encInput', 'decInput',
            'srcMask', 'trgMask' and 'label'.
        trgTokenizer: target tokenizer; its vocabulary size is used to
            flatten the model output for the loss.
    """
    batchKeys = ('encInput', 'decInput', 'srcMask', 'trgMask', 'label')
    for epochIdx in range(n_Epochs):
        torch.cuda.empty_cache()
        model.train()
        progress = tqdm(trDloader, desc=f"Epoch number: {epochIdx+1}")
        for batch in progress:
            # Move every tensor of the batch to the training device
            encIn, decIn, encMask, decMask, labels = (
                batch[name].to(device) for name in batchKeys
            )

            memory = model.encode(encIn,encMask)
            decoded = model.decode(memory,encMask,decIn,decMask)
            logProbs = model.maper(decoded)

            # Flatten to (batch * seqLen, vocab) against (batch * seqLen,)
            flatOutput = logProbs.view(-1,trgTokenizer.get_vocab_size())
            loss = criterion(flatOutput,labels.flatten())
            progress.set_postfix({"Loss is": f"{loss.item()}"})

            loss.backward()
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            
            
# Run the training loop for n_Epochs epochs over the training loader
train(model,n_Epochs,trDloader,trgTokenizer=trgTokenizer)


### 15 - Evaluation


### 16 - Sentence translation