### Describe the transformer architecture commonly used in LLMs

### Encoder

1 - Multi-Head Self-Attention Module
2 - Feed-Forward Neural Network

In [None]:
import torch
import torch.nn as nn

class EncoderLayer(nn.Module):
    """Single transformer encoder layer: self-attention then feed-forward.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalisation, i.e. ``x = norm(x + sublayer(x))``.

    Args:
        embed_dim: Size of the last (feature) dimension of the input.
        num_heads: Number of attention heads passed to
            ``nn.MultiheadAttention``.
    """

    def __init__(self, embed_dim, num_heads):
        # nn.Module.__init__ takes no positional arguments; passing the class
        # and instance (Python 2 ``super`` style) raises TypeError.
        super().__init__()

        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        # Position-wise MLP: expand to 4x the embedding size and project back.
        self.feedforward = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),
        )
        # No trailing comma here: a tuple would neither be registered as a
        # submodule nor be callable in ``forward``.
        self.norm_layer_1 = nn.LayerNorm(embed_dim)
        self.norm_layer_2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Apply the encoder layer to ``x``.

        Args:
            x: Input tensor whose last dimension is ``embed_dim``. The
                attention module is built with its default
                ``batch_first=False``, so a 3-D input is interpreted as
                ``(seq_len, batch, embed_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # nn.MultiheadAttention returns (output, weights); only the output is
        # needed, so skip computing the averaged attention weights.
        atn_output, _ = self.attention(x, x, x, need_weights=False)
        x = self.norm_layer_1(x + atn_output)
        ff_output = self.feedforward(x)
        x = self.norm_layer_2(x + ff_output)
        return x
        
        


### Positional Encoding

In [5]:
import numpy as np
# Positions 0..4 as a flat vector of shape (5,) ...
s_1 = np.arange(5)
# ... and as a column vector of shape (5, 1).
s = np.arange(5).reshape(-1, 1)
# Show the column form first, then the flat form.
print(s, s_1, sep="\n")

[[0]
 [1]
 [2]
 [3]
 [4]]
[0 1 2 3 4]


In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads of size ``d_model // num_heads``, attended
    independently, concatenated again and passed through a final linear
    layer.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            # Otherwise ``split_heads`` fails later with an opaque view error.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads  # per-head feature size
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, depth)``."""
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    @staticmethod
    def _scaled_dot_product_attention(q, k, v, mask=None):
        """Return ``softmax(q @ k^T / sqrt(depth) [+ mask]) @ v``.

        A boolean ``mask`` marks the positions that MAY be attended to
        (``True`` = keep); any other dtype is added to the attention logits.
        The mask must broadcast against ``(batch, heads, seq_q, seq_k)``.
        """
        logits = torch.matmul(q, k.transpose(-2, -1)) / (q.size(-1) ** 0.5)
        if mask is not None:
            if mask.dtype == torch.bool:
                logits = logits.masked_fill(~mask, float("-inf"))
            else:
                logits = logits + mask
        weights = torch.softmax(logits, dim=-1)
        return torch.matmul(weights, v)

    def forward(self, q, k, v, mask=None):
        """Compute multi-head attention.

        Args:
            q: Query tensor of shape ``(batch, seq_q, d_model)``.
            k: Key tensor of shape ``(batch, seq_k, d_model)``.
            v: Value tensor of shape ``(batch, seq_k, d_model)``.
            mask: Optional attention mask, see
                ``_scaled_dot_product_attention`` for the convention.

        Returns:
            Tensor of shape ``(batch, seq_q, d_model)``.
        """
        batch_size = q.size(0)
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        scaled_attention = self._scaled_dot_product_attention(q, k, v, mask)
        # (batch, heads, seq, depth) -> (batch, seq, heads, depth); the permute
        # leaves the tensor non-contiguous, so make it contiguous before view.
        concat_attention = scaled_attention.permute(0, 2, 1, 3).contiguous()
        concat_attention = concat_attention.view(batch_size, -1, self.d_model)

        return self.dense(concat_attention)
    

### Feed-Forward Network

In [None]:
class FeedForward(nn.Module):
    """Two-layer position-wise feed-forward network with a ReLU in between.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden layer.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))``.

        Args:
            x: Tensor whose last dimension is ``d_model``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Reach ReLU through the ``nn`` alias: no ``F`` alias for
        # torch.nn.functional is imported in this file.
        hidden = nn.functional.relu(self.linear1(x))
        return self.linear2(hidden)