In [125]:
import torch 
import math
from torch import nn,Tensor
from torch.nn import MultiheadAttention,LayerNorm,Embedding
import numpy as np



In [126]:
device=('cuda:0' if torch.cuda.is_available() else 'cpu' )

Building Vanilla Transformer Encoder from scracth 

Step 1: Build Input Embdedding and add Positional Encoding to it 

In [127]:
### Creating a random tensor 
batch_size=8
num_tokens=10

example_tensors=torch.rand(size=(batch_size,num_tokens)).to(dtype=torch.int64)


In [128]:
example_tensors.device

device(type='cpu')

In [129]:
### Creating Embedding Layer 
embed_dim=256
vocab_size=100
emb_layer=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embed_dim)

##
embedded_tensor=emb_layer(example_tensors)

print(f'embedded_tensor.shape: {embedded_tensor.shape}')

embedded_tensor.shape: torch.Size([8, 10, 256])


In [130]:
# Define positional encoding function
def positional_encoding(seq_len, embed_dim):
    pos = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, embed_dim, 2, dtype=torch.float32) * -(torch.log(torch.tensor(10000.0)) / embed_dim))
    pe = torch.zeros(seq_len, embed_dim)
    pe[:, 0::2] = torch.sin(pos * div_term)
    pe[:, 1::2] = torch.cos(pos * div_term)
    pe = pe.unsqueeze(0)  # Add batch dimension
    return pe


In [131]:
pos_enc=positional_encoding(seq_len=num_tokens,embed_dim=embed_dim)

In [132]:
pos_enc.shape

torch.Size([1, 10, 256])

In [133]:
### Defining Embedding Class now 

class EmbeddingLayer(nn.Module):

    def __init__(self,vocab_size,embed_dim):
        super(EmbeddingLayer,self).__init__()
        self.vocab_size=vocab_size
        self.embed_dim=embed_dim
        self.emb_layer=nn.Embedding(num_embeddings=self.vocab_size,embedding_dim=self.embed_dim)

    def positional_encoding(self,seq_len, embed_dim):
        pos = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2, dtype=torch.float32) * -(torch.log(torch.tensor(10000.0)) / embed_dim))
        pe = torch.zeros(seq_len, embed_dim)
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        pe = pe.unsqueeze(0)  # Add batch dimension
        return pe
    
    def forward(self,x):
        x=self.emb_layer(x)
        seq_len=x.shape[1]
        pos_enc=self.positional_encoding(seq_len=seq_len,embed_dim=self.embed_dim).to(x.device)
        return torch.add(x,pos_enc)


example

In [134]:
embedded_layer=EmbeddingLayer(vocab_size=vocab_size,embed_dim=embed_dim).to(device)(example_tensors.to(device))

In [135]:
embedded_layer.shape

torch.Size([8, 10, 256])

In [136]:
embedded_layer[:,0,:].shape

torch.Size([8, 256])

In [137]:
embedded_layer.shape

torch.Size([8, 10, 256])

### Now Defining Transformer Encoder; 

1. First MHA
2. Add and Norm Layer
3. feedforward layer



In [138]:
linear_layer=nn.Sequential(
    nn.Linear(in_features=embed_dim,out_features=embed_dim*4),
    nn.Dropout(0.3),
    nn.GELU(),
    nn.Linear(in_features=embed_dim*4,out_features=embed_dim)
    )


In [139]:
class FFL(nn.Module): ## feedforward Layer
    def __init__(self,embed_dim,dff,dropout):
        super(FFL,self).__init__()
        self.ffl_layer=nn.Sequential(
            nn.Linear(in_features=embed_dim,out_features=dff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=dff,out_features=embed_dim)
        )

    def forward(self,x):
        return self.ffl_layer(x)

In [140]:
## FeedForward Layer
FFL(embed_dim=embed_dim,dff=embed_dim*4,dropout=0.3).to(device)(embedded_layer).shape

torch.Size([8, 10, 256])

In [104]:
### Now we will define base transformer Encoder (single layer); In NLP we usuallu apply layer normalization to emb_dimension only 
layer_norm=nn.LayerNorm(normalized_shape=embed_dim).to(device)


layer_norm(embedded_layer+embedded_layer)

tensor([[[-0.9979, -0.2928,  0.2799,  ..., -1.2851, -0.4490,  1.7955],
         [-0.2755, -0.7590,  0.8744,  ..., -1.3427, -0.4938,  1.7849],
         [-0.2078, -1.6316,  1.1672,  ..., -1.3345, -0.4877,  1.7848],
         ...,
         [-0.3582, -0.4737, -0.4577,  ..., -1.2477, -0.4053,  1.8535],
         [-0.0468, -1.2494,  0.0753,  ..., -1.2029, -0.3866,  1.8016],
         [-0.5243, -1.8473,  0.7113,  ..., -1.1474, -0.3587,  1.7551]],

        [[-0.9979, -0.2928,  0.2799,  ..., -1.2851, -0.4490,  1.7955],
         [-0.2755, -0.7590,  0.8744,  ..., -1.3427, -0.4938,  1.7849],
         [-0.2078, -1.6316,  1.1672,  ..., -1.3345, -0.4877,  1.7848],
         ...,
         [-0.3582, -0.4737, -0.4577,  ..., -1.2477, -0.4053,  1.8535],
         [-0.0468, -1.2494,  0.0753,  ..., -1.2029, -0.3866,  1.8016],
         [-0.5243, -1.8473,  0.7113,  ..., -1.1474, -0.3587,  1.7551]],

        [[-0.9979, -0.2928,  0.2799,  ..., -1.2851, -0.4490,  1.7955],
         [-0.2755, -0.7590,  0.8744,  ..., -1

In [141]:
class TransformerEncoder(nn.Module):

    def __init__(self,embed_dim,vocab_size,droput,num_heads):
        super(TransformerEncoder,self).__init__()
        ##initializing variables
        self.vocab_size=vocab_size
        self.embed_dim=embed_dim
        self.dff=self.embed_dim*4
        self.num_heads=num_heads
        self.dropout=droput
        
        ##nn.NN
        self.mha=nn.MultiheadAttention(embed_dim=self.embed_dim,num_heads=self.num_heads,dropout=self.dropout)
        self.layer_nomr0=nn.LayerNorm(normalized_shape=self.embed_dim)
        self.layer_nomr1=nn.LayerNorm(normalized_shape=self.embed_dim)
        self.layer_nomr2=nn.LayerNorm(normalized_shape=self.embed_dim)

        ##Class Objects
        
        self.feedforward_layer=FFL(embed_dim=self.embed_dim,dff=self.dff,dropout=self.dropout)

    
    def forward(self,x):

        #x is of shape batch_size,seq_len,embed_dim; output from Embedding Layer

        x=self.layer_nomr0(x)
        mha_output,_=self.mha(x,x,x)
        
        normlized_1=self.layer_nomr1(x+mha_output)
        ffd_output=self.feedforward_layer(normlized_1)
        normlized_2=self.layer_nomr2(normlized_1+ffd_output)

        return normlized_2
        


        



In [142]:
### checking TransformerEncoder
## defining Encoder Parameter using baseline from Attention is all we need Paper
num_heads=8
num_layers=6

encoder=TransformerEncoder(embed_dim=embed_dim,vocab_size=vocab_size,
                           num_heads=num_heads,droput=0.3).to(device)

print(f'encoder_output_shape: {encoder(embedded_layer).shape}')

encoder_output_shape: torch.Size([8, 10, 256])


In [143]:
encoder_layers=[TransformerEncoder(embed_dim=embed_dim,vocab_size=vocab_size,
                           num_heads=num_heads,droput=0.3)
                           for i in range(5)]

In [144]:
### defining Encoder with num_of_layers
class TransformerEncoderLayers(nn.Module):
    
    def __init__(self,num_layers,embed_dim,vocab_size,droput,num_heads):
        super(TransformerEncoderLayers,self).__init__()
        self.num_layers=num_layers
        self.embedding_layers=EmbeddingLayer(vocab_size=vocab_size,embed_dim=embed_dim)
        self.transformer_encoders=nn.ModuleList([TransformerEncoder(embed_dim,vocab_size,droput,num_heads)
                                  for i in range(self.num_layers)])
    def forward(self,x):

        x=self.embedding_layers(x)

        for i in range(self.num_layers):
            x=self.transformer_encoders[i](x)
        
        return x



In [145]:
example_tensors.device

device(type='cpu')

In [146]:
### checking TransformerEncoder
## defining Encoder Parameter using baseline from Attention is all we need Paper
num_heads=8
num_layers=6

encoder_layers=TransformerEncoderLayers(num_layers=num_layers,embed_dim=embed_dim,vocab_size=vocab_size,
                           num_heads=num_heads,droput=0.3)

print(f'encoder_output_shape: {encoder_layers(example_tensors).shape}')

encoder_output_shape: torch.Size([8, 10, 256])


In [147]:
rand_tensor=torch.LongTensor(size=(10,10))

In [148]:
example_tensors

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [149]:
### when both input_tensor and encoder_layers are in cpu 

encoder_layers(example_tensors)

tensor([[[-1.0254,  1.2196, -0.9225,  ...,  1.4687,  0.4730, -0.2067],
         [-0.3676,  0.7876, -1.0055,  ...,  1.3824,  0.9694,  0.1047],
         [-0.6959,  0.2638, -0.5278,  ...,  1.3144,  1.5179,  0.2961],
         ...,
         [-0.1148,  0.0841, -0.5788,  ...,  1.3095,  0.7129,  0.6355],
         [ 0.3884,  0.0905, -0.5934,  ...,  1.6724,  0.4144,  0.5311],
         [-0.1556, -1.0732, -0.8616,  ...,  0.9064,  0.3509,  0.0690]],

        [[-0.5176,  0.6613, -1.1251,  ...,  1.3231,  0.9872,  0.1495],
         [-0.6503,  0.7359, -1.0623,  ...,  1.1830,  0.3046, -0.4694],
         [-0.4996,  0.4626, -0.6042,  ...,  1.8565,  1.1727,  0.6076],
         ...,
         [-0.4223, -0.0653, -0.8974,  ...,  1.9943,  0.3851,  0.4359],
         [-0.4256,  0.0300,  0.0233,  ...,  1.4695,  0.6478,  0.5803],
         [-0.3249, -0.0456, -0.6723,  ...,  1.5817,  0.4523,  0.0068]],

        [[-0.2590,  0.9130, -0.7669,  ...,  2.1335,  0.4574,  0.4472],
         [-0.3560,  0.0356, -0.4256,  ...,  1

In [150]:
##
encoder_layers=encoder_layers.to(device=device)

In [151]:
encoder_layers(example_tensors.to(device))

tensor([[[-1.0581,  1.0266, -1.3102,  ...,  1.9474,  0.4914, -0.0629],
         [-0.4414,  0.3293, -0.6951,  ...,  1.4459,  0.5976, -0.3158],
         [-0.2268, -0.7159, -0.0768,  ...,  1.8520,  1.2939,  0.3036],
         ...,
         [ 0.2306, -0.4386, -0.2850,  ...,  1.7340,  0.8953,  0.7186],
         [ 0.8397, -0.3872, -0.3890,  ...,  1.8776,  0.7422,  0.8469],
         [ 0.1667, -0.8590, -1.1839,  ...,  1.7494,  0.2738,  0.0736]],

        [[-0.2761,  0.7627, -1.2999,  ...,  1.7479,  0.5531, -0.1315],
         [-0.2227,  0.5590, -0.8152,  ...,  2.0304,  0.6778,  0.2967],
         [-0.2613,  0.8290, -0.2587,  ...,  1.3891,  0.7312, -0.2080],
         ...,
         [ 0.3941, -0.1155, -0.3587,  ...,  1.6393,  1.2288,  1.3723],
         [-0.1064, -0.6976, -1.2014,  ...,  1.7687,  1.1368,  1.0333],
         [-0.5414, -0.3658, -0.6667,  ...,  1.7077,  0.2410,  0.4198]],

        [[-0.5980, -0.1019, -0.9714,  ...,  1.7979,  0.8000,  0.0844],
         [-0.3700, -0.0319, -0.8894,  ...,  1

In [None]:
rand_tensor=torch.LongTensor(size=(10,10)).to(device=device)

In [None]:
rand_tensor.device

In [None]:
encoder_layers=encoder_layers.to(device=device)


In [None]:
### Both input and model are in cuda now 

### 
encoder_layers