In [1]:
!pip3 install -q torch==2.2.0 torchtext==0.17.0
!python -m spacy download de

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import torch
from torch import nn
from pathlib import Path
from tokenizers import Tokenizer
from torch.utils.data import Dataset, DataLoader

In [3]:

torch.__version__

'2.2.0+cu121'

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
device

'cuda'

In [5]:
import torch

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

# If CUDA is available, print the GPU name and perform a test operation
if cuda_available:
    # Get the name of the GPU
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU Name: {gpu_name}")

    # Create a tensor and move it to the GPU
    x = torch.tensor([1.0, 2.0, 3.0], device='cuda')
    print(f"Tensor on GPU: {x}")

    # Perform a simple operation
    y = x * 2
    print(f"Result of operation on GPU: {y}")
else:
    print("CUDA is not available. Please check your PyTorch installation and GPU drivers.")

CUDA available: True
GPU Name: Tesla T4
Tensor on GPU: tensor([1., 2., 3.], device='cuda:0')
Result of operation on GPU: tensor([2., 4., 6.], device='cuda:0')


In [6]:
#Data

In [7]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import io

url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')

from torchtext.vocab import build_vocab_from_iterator

def build_vocab(filepath, tokenizer):
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    # Ensure '<pad>' is at index 0 by placing it first in the specials list
    vocab = build_vocab_from_iterator(
        [counter.keys()],
        specials=['<pad>', '<unk>', '<bos>', '<eos>']  # '<pad>' comes first
    )
    vocab.set_default_index(vocab['<unk>'])
    return vocab


de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

def data_process(filepaths):
  raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
  raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
  data = []
  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                            dtype=torch.long)
    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                            dtype=torch.long)
    data.append((de_tensor_, en_tensor_))
  return data

train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)



In [8]:
print(f"DE <pad> index: {de_vocab['<pad>']}")
print(f"EN <pad> index: {en_vocab['<pad>']}")


DE <pad> index: 0
EN <pad> index: 0


In [9]:
#Hyperparameters

block_size = 256
batch_size = 64
src_vocab_size = len(de_vocab)
tgt_vocab_size = len(en_vocab)
embeddings_dims = 384
attn_dropout = 0.1
no_of_heads = 6 #IMP needs to be thoroughly calculated
dropout = 0.1
epochs = 3
max_lr = 2e-4
no_of_decoder_layers = 6 #IMP needs to be thoroughly calculated
attn_dropout = 0.1
weight_decay_optim = 0.01

In [10]:
# from textwrap import indent
# # Custom collate function for padding
# from torch.nn.utils.rnn import pad_sequence

# def collate_fn(batch):
#     # Separate inputs and targets
#     inputs, targets = zip(*batch)

#     # Pad input sequences
#     inputs_padded = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=0)

#     # Pad target sequences (if targets are sequences) or convert them to a tensor
#     targets_padded = pad_sequence([torch.tensor(seq) for seq in targets], batch_first=True, padding_value=0)

#     return inputs_padded, targets_padded

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = batch_size
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Constants
MAX_LENGTH = block_size  # Desired sequence length for padding

def generate_batch(data_batch):
    de_batch, en_batch, src_padding_masks, tgt_padding_masks= [], [], [], []
    for (de_item, en_item) in data_batch:
        # Ensure <BOS> is at the beginning and <EOS> at the end
        de_item = torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0)
        en_item = torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0)

        # Manually pad sequences to the maximum length (256 in this case)
        de_item_padded = F.pad(de_item, (0, MAX_LENGTH - de_item.size(0)), value=PAD_IDX)
        en_item_padded = F.pad(en_item, (0, MAX_LENGTH - en_item.size(0)), value=PAD_IDX)

         # Generate padding masks (1 for non-padding, 0 for padding)
        src_mask = (de_item_padded != PAD_IDX).int()  # Source mask
        tgt_mask = (en_item_padded != PAD_IDX).int()  # Target mask

        # Append to the batch
        de_batch.append(de_item_padded)
        en_batch.append(en_item_padded)
        src_padding_masks.append(src_mask)
        tgt_padding_masks.append(tgt_mask)

    # Stack batches together
    de_batch = torch.stack(de_batch, dim=0)
    en_batch = torch.stack(en_batch, dim=0)
    src_padding_masks = torch.stack(src_padding_masks, dim=0)  # Shape: (batch_size, MAX_LENGTH)
    tgt_padding_masks = torch.stack(tgt_padding_masks, dim=0)  # Shape: (batch_size, MAX_LENGTH)

    return de_batch, en_batch, src_padding_masks, tgt_padding_masks

# DataLoader with the custom collate_fn
train_dataloader = DataLoader(train_data, batch_size=batch_size,
                               shuffle=True, collate_fn=generate_batch)
val_dataloader = DataLoader(val_data, batch_size=batch_size,
                             shuffle=False, collate_fn=generate_batch)
# test_iter = DataLoader(test_data, batch_size=batch_size,
#                        shuffle=True, collate_fn=generate_batch)




In [11]:


# #Dataloaders
# train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
# test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [12]:
# Get the vocabulary size for German and English
german_vocab_size = len(de_vocab)
english_vocab_size = len(en_vocab)

print(f"German Vocabulary Size: {german_vocab_size}")
print(f"English Vocabulary Size: {english_vocab_size}")

German Vocabulary Size: 19215
English Vocabulary Size: 10838


In [13]:
next(iter(train_dataloader))

(tensor([[    2,  2473, 15830,  ...,     0,     0,     0],
         [    2,  2481,  6546,  ...,     0,     0,     0],
         [    2,  2481,  4170,  ...,     0,     0,     0],
         ...,
         [    2,  2473, 13509,  ...,     0,     0,     0],
         [    2,  2473,  6546,  ...,     0,     0,     0],
         [    2,  2473,  6546,  ...,     0,     0,     0]]),
 tensor([[    2,   102, 10819,  ...,     0,     0,     0],
         [    2,   102,  6509,  ...,     0,     0,     0],
         [    2,   102,  5371,  ...,     0,     0,     0],
         ...,
         [    2,   102,  6365,  ...,     0,     0,     0],
         [    2,   102,  6509,  ...,     0,     0,     0],
         [    2,   102,  6509,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32),
 tensor([[1, 1,

In [14]:

# for x, y in train_dataloader:
#     print(x.shape)
#     print(y.shape)
#     break

In [15]:
# Text embeddings
class TgtTextEmbeddings(nn.Module):
    def __init__(
        self,
        vocab_size = tgt_vocab_size,
        embeddings_dims = embeddings_dims
    ):
        super().__init__()
        self.embeddings_table = nn.Embedding(num_embeddings = tgt_vocab_size, embedding_dim=embeddings_dims, device=device) #Just a look up table to convert the toekns_ids to some numbers
        # nn.init.normal_(self.embeddings_table.weight.data, mean=0, std=0.02)

    def forward(self, x):
        return self.embeddings_table(x)

In [16]:
# Text embeddings
class SrcTextEmbeddings(nn.Module):
    def __init__(
        self,
        vocab_size = src_vocab_size,
        embeddings_dims = embeddings_dims
    ):
        super().__init__()
        self.embeddings_table = nn.Embedding(num_embeddings = src_vocab_size, embedding_dim=embeddings_dims, device=device) #Just a look up table to convert the toekns_ids to some numbers
        # nn.init.normal_(self.embeddings_table.weight.data, mean=0, std=0.02)

    def forward(self, x):
        return self.embeddings_table(x)

In [17]:

#Layer Normalization

class LayerNormalization(nn.Module):
    def __init__(
        self,
        embeddings_dims = embeddings_dims
    ):
        super().__init__()
        self.norm = nn.LayerNorm(normalized_shape=embeddings_dims)
    def forward(self, x):

        return self.norm(x)

In [18]:

#FeedForward Neural Network

class MLPBlock(nn.Module):
    def __init__(
        self,
        dropout = dropout,
        embeddings_size = embeddings_dims,
        # inner_dimensional_states: int = 3072
    ):
        super().__init__()

        self.mlp = nn.Sequential(
            nn.Linear(device=device, in_features=embeddings_size, out_features= 4 * embeddings_dims),
            nn.GELU(),
            nn.Linear(device=device, in_features= 4 * embeddings_dims, out_features=embeddings_size),
            nn.Dropout(p = dropout)
        )

    def forward(self, x):
        # mlp_weights_init = self.mlp.apply(weights_init)
        return self.mlp(x)

In [19]:

class MaskedAttentionHead(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.head_size = embeddings_dims // no_of_heads
        self.query = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.keys = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,device=device, bias=False)
        self.values = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device,bias=False)
        self.dropout = nn.Dropout(p = attn_dropout)


    def forward(self, x):
        batch, block_size, embd_dims = x.shape
        k = self.keys(x)
        q = self.query(x)
        v = self.values(x)
        masked_table = torch.tril(torch.ones(block_size, block_size, device=device))
        weights = q @ torch.transpose(k, dim0=-2, dim1=-1) * (k.shape[-1] ** -0.5)
        masked_values = weights.masked_fill(masked_table[: block_size, : block_size] == 0, float('-inf'))
        weights_normalized = nn.functional.softmax(masked_values, dim=-1) #Normalize along the embeddings dimension for all the tokens
        weights_normalized = self.dropout(weights_normalized)
        out = weights_normalized @ v
        return out


In [20]:

class MaskedMHA(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.heads = nn.ModuleList([MaskedAttentionHead(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads) for _ in range(no_of_heads)])
        self.dropout = nn.Dropout(p = attn_dropout)
        self.linear = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims, device=device, bias=False) # 12 (no of heads) * (batch_size) 64 = 768 -> gives out the text embeddings

    def forward(self, x):
        concat = torch.cat([head(x) for head in self.heads], dim=-1)
        linear_layer = self.linear(concat)
        out = self.dropout(linear_layer)
        return out

In [21]:
#Single Attention Head

class CrossAttentionHead(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.head_size = embeddings_dims // no_of_heads
        self.query = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.keys = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,device=device, bias=False)
        self.values = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device,bias=False)
        self.dropout = nn.Dropout(p = attn_dropout)


    def forward(self, query, key, value, mask=None):
        # batch, block_size, embd_dims = x.shape

        # masked_table = torch.tril(torch.ones(block_size, block_size, device=device))
        weights = query @ torch.transpose(key, dim0=-2, dim1=-1) * (key.shape[-1] ** -0.5)
        if(mask != None):
            mask = mask.unsqueeze(1)
            masked_values = weights.masked_fill(mask == 0, float('-inf'))
            weights_normalized = nn.functional.softmax(masked_values, dim=-1) #Normalize along the embeddings dimension for all the tokens
            # weights_normalized = self.dropout(weights_normalized)
            out = weights_normalized @ value
            out = self.dropout(out)
            return out
        else:
            weights_normalized = nn.functional.softmax(weights, dim=-1) #Normalize along the embeddings dimension for all the tokens
            # weights_normalized = self.dropout(weights_normalized)
            out = weights_normalized @ value
            out = self.dropout(out)
            return out

In [22]:
#Single Attention Head

class FullAttentionHead(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.head_size = embeddings_dims // no_of_heads
        self.query = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.keys = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,device=device, bias=False)
        self.values = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device,bias=False)
        self.dropout = nn.Dropout(p = attn_dropout)


    def forward(self, x, mask=None):
        # batch, block_size, embd_dims = x.shape
        k = self.keys(x)
        q = self.query(x)
        v = self.values(x)
        # masked_table = torch.tril(torch.ones(block_size, block_size, device=device))
        weights = q @ torch.transpose(k, dim0=-2, dim1=-1) * (k.shape[-1] ** -0.5)
        if(mask != None):
            mask = mask.unsqueeze(1)
            masked_values = weights.masked_fill(mask == 0, float('-inf'))
            weights_normalized = nn.functional.softmax(masked_values, dim=-1) #Normalize along the embeddings dimension for all the tokens
            # weights_normalized = self.dropout(weights_normalized)
            out = weights_normalized @ v
            out = self.dropout(out)
            return out
        else:
            weights_normalized = nn.functional.softmax(weights, dim=-1) #Normalize along the embeddings dimension for all the tokens
            # weights_normalized = self.dropout(weights_normalized)
            out = weights_normalized @ v
            out = self.dropout(out)
            return out

In [23]:

class FullMHA(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.heads = nn.ModuleList([FullAttentionHead(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads) for _ in range(no_of_heads)])
        self.dropout = nn.Dropout(p = attn_dropout)
        self.linear = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims, device=device, bias=False) # 12 (no of heads) * (batch_size) 64 = 768 -> gives out the text embeddings

    def forward(self, x, mask=None):
        concat = torch.cat([head(x, mask) for head in self.heads], dim=-1)
        linear_layer = self.linear(concat)
        out = self.dropout(linear_layer)
        return out

In [24]:


class CrossMHA(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.heads = nn.ModuleList([CrossAttentionHead(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads) for _ in range(no_of_heads)])
        self.dropout = nn.Dropout(p = attn_dropout)
        self.linear = nn.Linear(in_features=no_of_decoder_layers * embeddings_dims, out_features=embeddings_dims, device=device, bias=False) # 12 (no of heads) * (batch_size) 64 = 768 -> gives out the text embeddings

    def forward(self, query, key, x, mask=None):
        concat = torch.cat([head(query, key, x,  mask) for head in self.heads], dim=-1)
        linear_layer = self.linear(concat)
        out = self.dropout(linear_layer)
        return out

In [1]:
# Decoder Block

class TransformerDecoderBlock(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        dropout = dropout,
        # vocab_size = vocab_size
    ):
        super().__init__()

        self.cross = CrossMHA(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads)
        self.masked = MaskedMHA(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads)
        self.layer_norm1 = LayerNormalization(embeddings_dims)
        self.layer_norm2 = LayerNormalization(embeddings_dims)
        # self.layer_norm3 = LayerNormalization(embeddings_dims=embeddings_dims)
        self.layer_norm4 = LayerNormalization(embeddings_dims)
        self.mlp_block = MLPBlock(dropout=dropout, embeddings_size=embeddings_dims)

    def forward(self, key, value, x, mask=None):
        x = self.layer_norm1(x + self.masked(x)) #Very important step -> Layer Norm on input and then passes it to the subsequent blocks
        x = self.layer_norm2(x + self.cross(key, value, x, mask)) #Very important step
        # x = x + self.mha(self.layer_norm1(x))  #Very important step -> Layer Norm on input and then passes it to the subsequent blocks
        x = self.layer_norm4(x + self.mlp_block(x)) #Very important step

        return x

NameError: name 'nn' is not defined

In [26]:
# Decoder Block

class DecoderModel(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        block_size = block_size,
        dropout = dropout,
        no_of_decoder_layers = no_of_decoder_layers,
        # vocab_size = vocab_size
    ):
        super().__init__()


        # self.positional_embeddings_tgt = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True) #To give positional embeddings to each token of the input text, hence num_embeddings=block_size


        # torch.nn.init.normal_(self.positional_embeddings_tgt, mean=0.0, std=0.02)

        # self.text_embds = TextEmbeddings(vocab_size=vocab_size, embeddings_dims=embeddings_dims)


        self.tgt_text_embds = TgtTextEmbeddings(vocab_size=tgt_vocab_size, embeddings_dims=embeddings_dims)
        self.linear_layer = nn.Linear(in_features=embeddings_dims, out_features=tgt_vocab_size, device=device, bias=False) # Takes in logits of dimensions- embeds_dims and converts it into dimension of vocab_size (logits in range of vocab_size)
        # self.layer_norm = LayerNormalization(embeddings_dims=embeddings_dims)
        self.decoder_layers = nn.ModuleList([TransformerDecoderBlock(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, dropout=dropout) for _ in range(no_of_decoder_layers)])
        self.apply(self._init_weights)
        self.positional_embeddings_tgt = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True) #To give positional embeddings to each token of the input text, hence num_embeddings=block_size
        torch.nn.init.normal_(self.positional_embeddings_tgt, mean=0.0, std=0.02)

        # out = self.decoder_layers(query, key, x)
        # Loop through each decoder layer
    def _init_weights(self, module):  #Weight Initialization
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, key, value, x, mask):
        x = self.tgt_text_embds(x)
        x = x + self.positional_embeddings_tgt
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(key, value, x, mask)
        # x = self.layer_norm(x)

        return x

In [27]:

#Encoder

In [28]:



class TransformerEncoderBlock(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        dropout = dropout,
        mask=None
    ):
        super().__init__()

        self.mha = FullMHA(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads)
        self.layer_norm1 = LayerNormalization(embeddings_dims)
        self.layer_norm2 = LayerNormalization(embeddings_dims)
        self.mlp_block = MLPBlock(dropout=dropout, embeddings_size=embeddings_dims)

    def forward(self, x, mask=None):
        x = self.layer_norm1(x + self.mha(x, mask))
        x = self.layer_norm2(x + self.mlp_block(x))

        return x

In [29]:


class EncoderModel(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        block_size = block_size,
        dropout = dropout,
        no_of_decoder_layers = no_of_decoder_layers,
        # vocab_size = vocab_size
    ):
        super().__init__()

        # self.positional_embeddings_src = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True) #To give positional embeddings to each token of the input text, hence num_embeddings=block_size

        # torch.nn.init.normal_(self.positional_embeddings_src, mean=0.0, std=0.02)

        # self.text_embds = TextEmbeddings(vocab_size=vocab_size, embeddings_dims=embeddings_dims)

        self.positional_embeddings_src = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True) #To give positional embeddings to each token of the input text, hence num_embeddings=block_size
        torch.nn.init.normal_(self.positional_embeddings_src, mean=0.0, std=0.02)

        self.src_text_embeds = SrcTextEmbeddings(vocab_size=src_vocab_size, embeddings_dims=embeddings_dims)

        self.encoder_layers = nn.ModuleList([TransformerEncoderBlock(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, dropout=dropout) for _ in range(no_of_decoder_layers)])
        self.apply(self._init_weights)

    def _init_weights(self, module):  #Weight Initialization
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x, mask):

        # print(x.shape)
        x = self.src_text_embeds(x)
        # print(self.positional_embeddings_src.shape)
        # print(x.shape)
        x = x + self.positional_embeddings_src

        # print(x.shape)
        # Loop through each encoder layer
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, mask)
        return x



In [30]:

class Transformer(nn.Module):
    def __init__(
        self,

    ):
        super().__init__()

        self.encoder = EncoderModel()
        self.decoder = DecoderModel()
        self.linear_layer = nn.Linear(in_features=embeddings_dims, out_features=tgt_vocab_size, device=device, bias=False) # Takes in logits of dimensions- embeds_dims and converts it into dimension of vocab_size (logits in range of vocab_size)


    def forward(self, src, tgt, src_mask, tgt_mask):
        x = self.encoder(src, src_mask)
        x = self.decoder(x, x, tgt, None)
        out = self.linear_layer(x)
        return out



In [31]:
#Instantiating the model
model = Transformer()
model = model.to(device)


In [32]:
train_data[2][1][1].shape

torch.Size([])

In [33]:

#Printing a summary of the architecture
!pip install torchinfo
from torchinfo import summary
# idx, targets = get_batch('test')
# idx = idx.to(device)

# Generate random sample data
src_data = torch.randint(1, src_vocab_size, (batch_size, block_size)).to(device)  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (batch_size, block_size)).to(device)  # (batch_size, seq_length)
src_mask = torch.randint(1, src_vocab_size, (batch_size, block_size)).to(device)  # 
tgt_mask = torch.randint(1, tgt_vocab_size, (batch_size, block_size)).to(device)  #
# print(src_data.shape)
summary(model=model,
        input_data=(src_data, tgt_data, src_mask, tgt_mask),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])



Layer (type (var_name))                                 Input Shape          Output Shape         Param #              Trainable
Transformer (Transformer)                               [64, 256]            [64, 256, 10838]     --                   True
├─EncoderModel (encoder)                                [64, 256]            [64, 256, 384]       98,304               True
│    └─SrcTextEmbeddings (src_text_embeds)              [64, 256]            [64, 256, 384]       --                   True
│    │    └─Embedding (embeddings_table)                [64, 256]            [64, 256, 384]       7,378,560            True
│    └─ModuleList (encoder_layers)                      --                   --                   --                   True
│    │    └─TransformerEncoderBlock (0)                 [64, 256, 384]       [64, 256, 384]       1,772,928            True
│    │    └─TransformerEncoderBlock (1)                 [64, 256, 384]       [64, 256, 384]       1,772,928            True
│  

In [34]:
# Optimizer setup and scheduler steup
out = {"Train": None, "val": None}
optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr)
# optimizer = torch.optim.Adam(model.parameters(), lr=max_lr, weight_decay=weight_decay_optim)
# initial_iters = 2000
# total_steps = 10000
eval_iters = 5
loss_fn = nn.CrossEntropyLoss()
count = 0

@torch.inference_mode()
def estimate_loss():
    out = {}
    model.eval()
    losses = torch.zeros(eval_iters * len(val_dataloader))
    count = 0
    for k in range(eval_iters):
        for src_idx, tgt_idx, src_pad, tgt_pad in val_dataloader:
            # idx, targets = get_batch(split=split)
            src_idx, tgt_idx, src_pad, tgt_pad = src_idx.to(device), tgt_idx.to(device), src_pad.to(device), tgt_pad.to(device)
          
            logits = model(src_idx, tgt_idx, src_pad, tgt_pad)
            batch_size, block_size, embeddings_dims = logits.shape
            logits = logits.view(batch_size*block_size, embeddings_dims) # Total tokens(words) => batch_size * block_size
            targets = tgt_idx.view(batch_size * block_size)
            loss = nn.functional.cross_entropy(logits, targets)
            losses[count] = loss.item()
            count += 1

    out['val'] = losses.mean()
    model.train()
    return out

In [35]:
#Train the  model
from tqdm import tqdm
loss_ = torch.zeros(epochs * len(train_dataloader))
model.train()
count1 = 0
# batch_counter = 0
for epoch  in tqdm(range(epochs)):
    eval = True
    loss_ = torch.zeros(epochs * len(train_dataloader))
    count1 = 0
    for src_idx, tgt_idx, src_pad, tgt_pad in train_dataloader:

      # Evaluate and print loss every epoch for a total of 5 val epochs (kinda like cross-val)
      # is_eval_iter = (batch_counter % eval_iters == 0 and batch_counter > 0)
      # is_last_batch = (epoch == epochs - 1 and batch_counter == epochs * len(train_dataloader))

      # print(batch_counter)
      if eval and epoch != 0:
          # print(batch_counter)
          # print(is_eval_iter)
          # print(is_last_batch)
          eval = False
          losses = estimate_loss()
          print(f"epoch {epoch}: train loss {loss.item():.4f}, val loss {losses['val']:.4f}")

      else:
        src_idx, tgt_idx, src_pad, tgt_pad = src_idx.to(device), tgt_idx.to(device), src_pad.to(device), tgt_pad.to(device)
          
        # idx, targets = get_batch(split='train')
        logits = model(src_idx, tgt_idx, src_pad, tgt_pad)
        batch_size, block_size, embeddings_dims = logits.shape
        logits = logits.view(batch_size*block_size, embeddings_dims)
        targets = tgt_idx.view(batch_size * block_size)
        loss = nn.functional.cross_entropy(logits, targets)
        # loss_[count1] = loss.item()
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        # count1 += 1
        # batch_counter -= 1

 33%|███▎      | 1/3 [12:34<25:08, 754.42s/it]

epoch 1: train loss 0.0248, val loss 0.0315


 67%|██████▋   | 2/3 [25:51<12:59, 779.70s/it]

epoch 2: train loss 0.0147, val loss 0.0153


100%|██████████| 3/3 [39:10<00:00, 783.61s/it]
