## Import Libraries


In [14]:
! pip install datasets



In [15]:
#important libraries from pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

In [16]:
#huggingface Libraries
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [17]:
#pathlib
from pathlib import Path

In [18]:
#typing
from typing import Any

In [19]:
#library for progress bars in loops
from tqdm import tqdm

In [20]:
#library for warnings
import warnings

In [21]:
import math

## Input Embeddings

In [22]:
# Create input embeddings
class InputEmbeddings(nn.Module):
    """Look up token embeddings and scale them by sqrt(d_model).

    Args:
        d_model: size of each embedding vector.
        vocab_size: number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        # Keep both sizes around; forward() reads d_model for the scaling factor.
        self.d_model = d_model
        self.vocab_size = vocab_size
        # Lookup table with one d_model-sized row per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embeddings of the ids in ``x`` multiplied by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale
    


In [23]:
#Creating the positional encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    The encoding table is computed once in ``__init__`` and stored as the
    buffer ``pe`` with shape (1, seq_len, d_model): even feature indices hold
    ``sin(pos * w_i)`` and odd feature indices hold ``cos(pos * w_i)``, where
    ``w_i = exp(-2i * ln(10000) / d_model)``.

    Args:
        d_model: size of the last (feature) dimension of the input.
        seq_len: number of positions precomputed in the table.
        dropout: dropout probability applied to the summed output.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model #dimensionality of the model
        self.seq_len = seq_len #maximum sequence length
        self.dropout = nn.Dropout(dropout) #dropout layer applied in forward()

        #positional encoding matrix of shape (seq_len, d_model) filled with zeros
        pe = torch.zeros(seq_len, d_model)

        #positions 0..seq_len-1 as a column vector of shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        #one frequency per even feature index; computed in log space
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        #(seq_len, ceil(d_model / 2)) matrix of angles shared by sine and cosine
        angles = position * div_term

        #apply sine to even indices in pe
        pe[:, 0::2] = torch.sin(angles)
        #apply cosine to odd indices in pe; when d_model is odd there is one
        #fewer odd column than even columns, so trim the angles to match
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        #add a leading dimension of size 1 so pe broadcasts over the batch
        pe = pe.unsqueeze(0)

        #register 'pe' as a buffer: it is saved with the module and moved by
        #.to()/.cuda(), but it is not a trainable parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.shape[1], :])``.

        ``x`` is indexed as (batch, position, feature); only the first
        ``x.shape[1]`` rows of the table are used. The buffer is built from
        tensors that do not require grad, so no explicit
        ``requires_grad_(False)`` is needed on the slice.
        """
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x) #dropout for regularization


## Layer Normalization

In [24]:
#creating layer normalization
class LayerNormalization(nn.Module):
    """Normalize the last dimension of the input to zero mean and unit spread.

    Computes ``alpha * (x - mean) / (std + eps) + bias`` where ``mean`` and
    ``std`` are taken over the last dimension. ``alpha`` and ``bias`` are
    single-element trainable parameters, so the same scale and shift are
    broadcast over every feature.

    Args:
        eps: small constant added to the standard deviation to avoid division
            by zero. Defaults to 1e-6.
    """

    def __init__(self, eps: float = 1e-6) -> None:
        super().__init__()
        # The previous default was 10**6 (one million), which swamped the
        # standard deviation and squashed every output to roughly the bias.
        self.eps = eps

        #alpha is a trainable scale, initialized with ones
        self.alpha = nn.Parameter(torch.ones(1)) #one-element tensor that scales the normalized input

        #bias is a trainable shift, initialized with zeros
        self.bias = nn.Parameter(torch.zeros(1)) #one-element tensor that is added to the scaled input

    def forward(self, x):
        """Return ``x`` normalized over its last dimension, then scaled and shifted."""
        #statistics over the last dimension; keepdim=True keeps them broadcastable against x
        mean = x.mean(dim=-1, keepdim=True)
        #NOTE: Tensor.std defaults to the sample (Bessel-corrected) standard deviation
        std = x.std(dim=-1, keepdim=True)

        #eps is added to std (not to the variance) before dividing
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

## Feed Forward Network


In [25]:
class FeedForwardBlock(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature size.
        d_ff: feature size of the hidden layer.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expand from d_model to d_ff features.
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.dropout = nn.Dropout(p=dropout)
        # Project back from d_ff to d_model features.
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply the expand / activate / drop / project pipeline to ``x``."""
        hidden = self.linear_1(x)
        activated = torch.relu(hidden)
        regularized = self.dropout(activated)
        return self.linear_2(regularized)

## Multihead Attention