In [2]:
import torch
import torch.nn as nn
import math

In [2]:
# Pick the compute device: use the GPU when a usable CUDA runtime is present,
# otherwise fall back to the CPU. `device` is kept as a plain string.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
def even_position(p, i, dim):
    """Sine component of the sinusoidal positional encoding.

    Args:
        p: Position of the token in the sequence.
        i: Frequency index used in the exponent ``2 * i / dim``.
        dim: Embedding dimension.

    Returns:
        float: ``sin(p / 10000 ** (2 * i / dim))``.
    """
    return math.sin(p / (10000 ** ((2 * i) / dim)))


def odd_position(p, i, dim):
    """Cosine component of the sinusoidal positional encoding.

    Args:
        p: Position of the token in the sequence.
        i: Frequency index used in the exponent ``2 * i / dim``.
        dim: Embedding dimension.

    Returns:
        float: ``cos(p / 10000 ** (2 * i / dim))``.
    """
    return math.cos(p / (10000 ** ((2 * i) / dim)))


def postional_encoding(tokens_len, embed_dim):
    """Build the sinusoidal positional-encoding table.

    For position ``p`` and embedding dimension index ``k``:

    * even ``k`` -> ``sin(p / 10000 ** (2 * (k // 2) / embed_dim))``
    * odd ``k``  -> ``cos(p / 10000 ** (2 * (k // 2) / embed_dim))``

    so each (even, odd) pair of dimensions shares one frequency.

    Args:
        tokens_len: Number of positions (rows) to generate.
        embed_dim: Width of each encoding vector (columns).

    Returns:
        list[list[float]]: ``tokens_len`` rows of ``embed_dim`` floats.
        Empty list when ``tokens_len`` is 0.
    """
    return [
        [
            # Odd dimensions use the cosine helper, and both members of a
            # pair receive the same frequency index k // 2 (not the raw
            # dimension index, which would double the exponent).
            (even_position if k % 2 == 0 else odd_position)(p, k // 2, embed_dim)
            for k in range(embed_dim)
        ]
        for p in range(tokens_len)
    ]



In [5]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected by three independent linear layers into queries,
    keys and values of width ``dim``. Attention weights are the softmax
    (over the last axis) of ``Q @ K^T / sqrt(dim)`` and are used to mix the
    values.

    Args:
        embed_dim: Size of the last dimension of the input.
        dim: Output width of the query/key/value projections.
    """

    def __init__(self, embed_dim, dim):
        super().__init__()
        # Learned projections: embedding width -> head width.
        self.query_w = nn.Linear(embed_dim, dim)
        self.key_w = nn.Linear(embed_dim, dim)
        self.value_w = nn.Linear(embed_dim, dim)
        # Normalises each row of the score matrix into a distribution.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, embed):
        """Attend over ``embed`` and return the mixed values.

        Args:
            embed: Tensor whose last dimension is ``embed_dim``; the
                second-to-last axis is the one attended over (presumably
                ``(batch, seq, embed_dim)`` - confirm against callers).

        Returns:
            Tensor with the same leading dimensions as ``embed`` and a last
            dimension of ``dim``.
        """
        q, k, v = self.query_w(embed), self.key_w(embed), self.value_w(embed)
        # Scale by sqrt(key width) to keep the logits in a softmax-friendly range.
        scale = k.shape[-1] ** 0.5
        logits = (q @ k.transpose(-2, -1)) / scale
        return self.softmax(logits) @ v

In [6]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention built from independent ``SelfAttention`` heads.

    Every head sees the full input and produces a ``head_dim``-wide output;
    the head outputs are concatenated on the last axis and projected back to
    ``embed_dim`` by a final linear layer.

    Args:
        num_heads: Number of attention heads.
        embed_dim: Size of the last dimension of the input and of the output.
        head_dim: Output width of each individual head.
    """

    def __init__(self, num_heads, embed_dim, head_dim):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.embed_dim = embed_dim

        # One independent attention module per head.
        heads = nn.ModuleList()
        for _ in range(num_heads):
            heads.append(SelfAttention(embed_dim, head_dim))
        self.multi_head_attn = heads
        # Output projection: concatenated heads -> embedding width.
        self.W = nn.Linear(num_heads * head_dim, embed_dim)

    def forward(self, embed):
        """Run every head on ``embed`` and merge the results.

        Args:
            embed: Tensor whose last dimension is ``embed_dim``.

        Returns:
            Tensor with the same leading dimensions as ``embed`` and a last
            dimension of ``embed_dim``.
        """
        per_head = tuple(attn(embed) for attn in self.multi_head_attn)
        return self.W(torch.cat(per_head, dim=-1))


        

In [3]:
class LayerNormalization(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Each vector along the last axis is standardised using its own mean and
    biased variance, then scaled by ``alpha`` and shifted by ``beta``.

    Args:
        embed_dim: Size of the last dimension of the input.
        eps: Constant added to the variance before the square root to avoid
            division by zero.
    """

    def __init__(self, embed_dim, eps=1e-5):
        super().__init__()
        # Learnable gain (starts at 1) and bias (starts at 0).
        self.alpha = nn.Parameter(torch.ones(embed_dim))
        self.beta = nn.Parameter(torch.zeros(embed_dim))
        self.eps = eps

    def forward(self, embed):
        """Normalise ``embed`` along its last dimension.

        Args:
            embed: Tensor whose last dimension is ``embed_dim``.

        Returns:
            Tensor of the same shape as ``embed``.
        """
        mu = embed.mean(dim=-1, keepdim=True)
        # Biased (population) variance.
        sigma2 = embed.var(dim=-1, keepdim=True, unbiased=False)
        centered = embed - mu
        standardised = centered / torch.sqrt(sigma2 + self.eps)
        return self.alpha * standardised + self.beta


In [4]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    The ReLU between the two linear layers is what makes this a two-layer
    network; without it ``W2(W1(x))`` collapses to a single affine map.

    Args:
        embed_dim: Size of the last dimension of the input and of the output.
        hidden_dim: Width of the hidden layer. Defaults to ``embed_dim``.
        dropout: Dropout probability applied to the hidden activations.
            Defaults to 0.2.
    """

    def __init__(self, embed_dim, hidden_dim=None, dropout=0.2):
        super().__init__()
        if hidden_dim is None:
            # Default keeps W1/W2 square (embed_dim x embed_dim).
            hidden_dim = embed_dim
        self.W1 = nn.Linear(embed_dim, hidden_dim)
        self.W2 = nn.Linear(hidden_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embed):
        """Apply the feed-forward network along the last dimension.

        Args:
            embed: Tensor whose last dimension is ``embed_dim``.

        Returns:
            Tensor of the same shape as ``embed``.
        """
        hidden = torch.relu(self.W1(embed))
        # Dropout is only active in training mode (module.train()).
        hidden = self.dropout(hidden)
        return self.W2(hidden)


In [5]:
class Encoder(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalisation (``LayerNorm(sublayer(x) + x)``).

    Args:
        num_heads: Number of attention heads.
        embed_dim: Size of the last dimension of the input and of the output.
        head_dim: Output width of each attention head.
    """

    def __init__(self, num_heads, embed_dim, head_dim):
        super().__init__()
        # Sub-layer 1: multi-head self-attention + its normalisation.
        self.multiheadattention = MultiheadAttention(num_heads, embed_dim, head_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim)
        # Sub-layer 2: feed-forward network + its normalisation.
        self.feedforward = FeedForward(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Run the encoder layer.

        Args:
            x: Tensor whose last dimension is ``embed_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Residual around attention, then normalise.
        hidden = self.layernorm1(self.multiheadattention(x) + x)
        # Residual around the feed-forward network, then normalise.
        return self.layernorm2(self.feedforward(hidden) + hidden)


In [7]:
class StackEncoder(nn.Module):
    """A stack of identical ``Encoder`` layers applied one after another.

    Args:
        num_heads: Number of attention heads in every layer.
        embed_dim: Size of the last dimension of the input and of the output.
        head_dim: Output width of each attention head.
        num_layers: How many ``Encoder`` layers to stack. Defaults to 6.
    """

    def __init__(self, num_heads, embed_dim, head_dim, num_layers=6):
        super().__init__()
        self.num_layers = num_layers
        # Each layer gets its own, independently initialised parameters.
        self.encoders = nn.Sequential(
            *(Encoder(num_heads, embed_dim, head_dim) for _ in range(num_layers))
        )

    def forward(self, x):
        """Pass ``x`` through every encoder layer in order.

        Args:
            x: Tensor whose last dimension is ``embed_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        return self.encoders(x)
