<a href="https://colab.research.google.com/github/adnaen/machine-learning-notes/blob/main/llm/transformers/encoder/encoder_block.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LET'S CREATE AN ENCODER FROM SCRATCH**

In [21]:
import torch

## **POSITIONAL ENCODER**

In [69]:
# TODO

## **MULTI-HEAD ATTENTION NN**

In [62]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(
            self,
            d_model: int,
            seq_len: int,
            heads: int=9
        ) -> None:
        super().__init__()

        self.q_w = torch.nn.Linear(d_model, d_model)
        self.k_w = torch.nn.Linear(d_model, d_model)
        self.v_w = torch.nn.Linear(d_model, d_model)

        self.d_model = d_model
        self.no_of_heads = heads
        self.d_k = self.d_model // self.no_of_heads
        self.seq_len = seq_len

        print(
            f"INFO\n No.Of Heads: {self.no_of_heads}\n Model Dim(d_model) : {self.d_model}\n DK: {self.d_k}"
        )

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        q_val = self.q_w(X)
        k_val = self.k_w(X)
        v_val = self.v_w(X)

        # split Q, K, V for multi-heads with shape (no_of_heads, seq_len, d_k)
        q_split = q_val.view(self.no_of_heads, self.seq_len, self.d_k)
        k_split = k_val.view(self.no_of_heads, self.seq_len, self.d_k)
        v_split = v_val.view(self.no_of_heads, self.seq_len, self.d_k)

        # calculate Attention score for each head
        attention_score = []

        for q, k, v in zip(q_split, k_split, v_split):
            attention_score.append(
                self._attention_calculation(q, k, v, self.d_k)
                )

        heads_result = torch.stack(
            attention_score, dim=0
            )  # convert the list to tensor

        merged_result = (
            heads_result.permute(1, 0, 2)
            .reshape(self.seq_len, self.d_model)
        )   # combine all heads output to single tensor --
            #  shape (seq_len, d_model) / input_size

        # THE RESULT IS NOTHING BUT,
        # CONTEXTUAL EMBEDDING OF THE INPUT EMBEDDING.
        # THIS HELP THE MODEL TO UNDERSTAND CONTEXTUAL MEANING OF WORDS.
        # SO IT SHOULD BE THE SAME SHAPE.
        return merged_result

    def _attention_calculation(
            self,
            Q: torch.Tensor,
            K: torch.Tensor,
            V: torch.Tensor,
            dk: int
        ) -> torch.Tensor:
        # Attention (Q, K, V) = softmax((Q . k^T / root of dk)) . V

        softmax_out = torch.softmax((Q @ K.T) / torch.sqrt(torch.tensor(dk)), dim=1)
        return softmax_out @ V

In [63]:
ip_embedding = torch.randn((5,20))
ip_embedding

tensor([[ 0.1557,  0.1457,  1.1797,  0.0227,  0.1882, -0.4213, -0.8316,  3.3527,
          0.7365, -0.3173,  0.7133, -0.6473,  1.7734, -2.7667, -0.5315,  1.6694,
         -0.3138, -0.1087, -0.2894, -1.8129],
        [-1.3165,  0.0331,  0.7439, -1.3518,  0.9552, -1.7826, -0.1749,  0.3523,
         -0.3660,  0.5370, -0.7630, -1.7973, -1.7436,  1.0824,  1.6633,  0.1598,
          0.0099, -1.3341, -0.1542,  0.8692],
        [ 0.4177, -1.3074,  1.4559,  0.7019,  1.1017,  0.8272,  0.4233, -0.6521,
          0.1267, -0.1935, -0.5222,  0.0935,  0.3764,  0.7102, -0.2608, -0.3452,
         -0.1504,  0.1802,  0.3866, -0.8744],
        [-1.6731,  1.2850,  0.0205,  1.8303,  0.0707,  0.7198, -0.5310, -0.5819,
          0.3390,  0.2329,  1.4084, -0.2606, -0.2410, -0.8269,  0.1043, -0.1089,
          0.7794,  0.2922, -2.2894,  0.0086],
        [ 0.1274,  1.6530,  0.6031,  1.0910, -0.8939,  0.6834,  0.3508, -1.9509,
          0.1218, -0.5213,  0.2998,  1.2847, -0.3003,  0.1796, -0.5410, -1.6766,
      

In [64]:
model = MultiHeadAttention(d_model=20, heads=4, seq_len=5)

INFO
 No.Of Heads: 4
 Model Dim(d_model) : 20
 DK: 5


In [67]:
res = model(ip_embedding)
res

tensor([[ 0.4258, -0.0375, -0.1155,  0.3464,  0.4537,  0.1527,  0.2004,  0.3676,
         -0.1934, -0.7494,  0.1169, -0.2387, -0.0049,  0.0505, -0.1386,  0.2337,
         -0.3314,  0.1832, -0.5774,  0.0385],
        [ 0.3898, -0.0371, -0.1146,  0.3455,  0.5007,  0.1759,  0.2645,  0.1734,
         -0.0868, -0.5134,  0.0742, -0.2502,  0.0102,  0.0213, -0.0571,  0.2139,
         -0.2073,  0.3164, -0.6263, -0.0985],
        [ 0.3092, -0.0635, -0.0811,  0.4539,  0.6176,  0.1076,  0.2691,  0.2388,
         -0.1340, -0.5651,  0.1258, -0.1532,  0.0224, -0.0200, -0.0663,  0.2383,
         -0.1465,  0.3350, -0.6699, -0.1644],
        [ 0.3519,  0.0769, -0.1410,  0.1825,  0.3607,  0.1492,  0.2705,  0.1778,
         -0.0808, -0.4741,  0.1418, -0.1836, -0.0034,  0.0046, -0.1354,  0.1612,
         -0.3300,  0.2692, -0.5066, -0.0693],
        [ 0.3561, -0.0961, -0.0865,  0.4727,  0.6272,  0.1777,  0.2421,  0.2201,
         -0.0942, -0.5291,  0.1162, -0.1538,  0.0165, -0.0181, -0.0626,  0.1577,
      

In [68]:
res.shape

torch.Size([5, 20])

## **Layer Norm**

In [70]:
# TODO

## **ADD / RESIDUAL**

In [71]:
# TODO

## **PASS THROUGH FFN LAYER (2 LAYER MLP)**