<a href="https://colab.research.google.com/github/adnaen/machine-learning-notes/blob/main/llm/transformers/transformer/transformer_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Transformer Implementation Using PyTorch**

In [1]:
import torch

- A Transformer consists of a stack of encoders and decoders.
- However, modern LLMs such as `GPT` do **not** use the full encoder–decoder architecture.
- `GPT` models are built only from stacked **decoder blocks**.
- I implemented this according to the [**Attention Is All You Need paper (2017)**](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) *by Google*.
- I chose **PyTorch over NumPy** because working with it builds a deeper understanding of tensors, and it is widely used in production-level LLM development.
- **We'll implement the core components from scratch.**

## **Setup Input Data**

In [2]:
# Random stand-in for an already-embedded input sequence:
#   rows    -> 100 tokens
#   columns -> 10 embedding features per token
input_token = torch.randn(100, 10)
input_token.size()

torch.Size([100, 10])

## **Helper Functions**

In [3]:
def positional_encoding(seq_len: int = 100, d_model: int = 10) -> torch.Tensor:
    """Return the sinusoidal positional encoding from "Attention Is All You Need".

    For position ``pos`` and even feature index ``2i``:

        PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
        PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))

    Args:
        seq_len: Number of positions (rows) to encode. Defaults to 100, the
            sequence length of the notebook's example input.
        d_model: Embedding size (columns). Defaults to 10, the embedding size
            of the notebook's example input. Odd values are supported.

    Returns:
        A float tensor of shape ``(seq_len, d_model)`` that can be added to
        token embeddings of the same shape.

    Raises:
        ValueError: If ``seq_len`` is negative or ``d_model`` is not positive.
    """
    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")
    if d_model < 1:
        raise ValueError(f"d_model must be positive, got {d_model}")

    positions = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)  # (seq_len, 1)
    even_dims = torch.arange(0, d_model, 2, dtype=torch.float32)  # the "2i" indices
    inv_freq = 10000.0 ** (-even_dims / d_model)  # 1 / 10000^(2i / d_model)
    angles = positions * inv_freq  # (seq_len, ceil(d_model / 2))

    encoding = torch.zeros(seq_len, d_model)
    encoding[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns.
    encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return encoding

## **Self Attention**

In [4]:
def get_attention_score(
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        d_model: int,
        num_of_heads: int,
        masking: torch.Tensor | None = None
        ) -> torch.Tensor:

        """
        Attention (Q, K, V) = Softmax((Q * K^T) / root of d_k) * V
        """
        d_k = torch.tensor(d_model // num_of_heads) # no.of feature per head

        val_1 = q @ k.T / torch.sqrt(d_k)
        if masking:
            val_1 += masking
        return torch.softmax(val_1, dim=1) @ v

## **Multi Head Attention**

In [5]:
class MultiHeadAttention(torch.nn.Module):
    """Multi-head self-attention over a single, unbatched sequence.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of ``d_model // num_heads`` features each, attended
    per head with scaled dot-product attention, and the heads are
    concatenated back to ``d_model`` features.

    NOTE: the paper additionally applies an output projection ``W^O`` to the
    concatenated heads; this implementation returns the concatenation as is.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model: int, num_heads: int) -> None:
        super().__init__()
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.d_model = d_model
        self.num_of_heads = num_heads

        self.q_w = torch.nn.Linear(d_model, d_model)
        self.k_w = torch.nn.Linear(d_model, d_model)
        self.v_w = torch.nn.Linear(d_model, d_model)

        self.d_k = self.d_model // self.num_of_heads  # features per head

    def _split_heads(self, projected: torch.Tensor) -> torch.Tensor:
        """Reshape ``(seq_len, d_model)`` to ``(num_heads, seq_len, d_k)``."""
        seq_len = projected.shape[0]
        return projected.view(seq_len, self.num_of_heads, self.d_k).transpose(0, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape ``(seq_len, d_model)``.

        Returns:
            Tensor of shape ``(seq_len, d_model)``: the per-head attention
            outputs concatenated along the feature axis.
        """
        seq_len = x.shape[0]

        q = self._split_heads(self.q_w(x))
        k = self._split_heads(self.k_w(x))
        v = self._split_heads(self.v_w(x))

        # Batched over the leading head axis: one matmul handles every head.
        scores = q @ k.transpose(-2, -1) / self.d_k ** 0.5
        per_head = torch.softmax(scores, dim=-1) @ v  # (num_heads, seq_len, d_k)

        # (num_heads, seq_len, d_k) -> (seq_len, num_heads * d_k)
        return per_head.transpose(0, 1).reshape(seq_len, self.d_model)

## **Masked Multi Head Attention**

In [6]:
class MaskedMultiHeadAttention(torch.nn.Module):
    """Causally masked multi-head self-attention over one unbatched sequence.

    Works like ordinary multi-head self-attention, except that position ``i``
    may only attend to positions ``<= i``. Scores for later positions are set
    to ``-inf`` before the softmax, so they receive zero attention weight.

    NOTE: the paper additionally applies an output projection ``W^O`` to the
    concatenated heads; this implementation returns the concatenation as is.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model: int, num_heads: int) -> None:
        super().__init__()
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.d_model = d_model
        self.num_of_heads = num_heads

        self.q_w = torch.nn.Linear(d_model, d_model)
        self.k_w = torch.nn.Linear(d_model, d_model)
        self.v_w = torch.nn.Linear(d_model, d_model)

        self.d_k = self.d_model // self.num_of_heads  # features per head

    def _split_heads(self, projected: torch.Tensor) -> torch.Tensor:
        """Reshape ``(seq_len, d_model)`` to ``(num_heads, seq_len, d_k)``."""
        seq_len = projected.shape[0]
        return projected.view(seq_len, self.num_of_heads, self.d_k).transpose(0, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply causally masked self-attention to ``x``.

        Args:
            x: Input of shape ``(seq_len, d_model)``.

        Returns:
            Tensor of shape ``(seq_len, d_model)``; row ``i`` depends only on
            input rows ``0..i``.
        """
        seq_len = x.shape[0]

        q = self._split_heads(self.q_w(x))
        k = self._split_heads(self.k_w(x))
        v = self._split_heads(self.v_w(x))

        scores = q @ k.transpose(-2, -1) / self.d_k ** 0.5  # (heads, seq, seq)

        # Additive causal mask: -inf strictly above the diagonal (future
        # positions), 0 elsewhere. It broadcasts over the head axis.
        causal_mask = torch.triu(
            torch.full(
                (seq_len, seq_len),
                float("-inf"),
                dtype=scores.dtype,
                device=scores.device,
            ),
            diagonal=1,
        )
        weights = torch.softmax(scores + causal_mask, dim=-1)
        per_head = weights @ v  # (num_heads, seq_len, d_k)

        # (num_heads, seq_len, d_k) -> (seq_len, num_heads * d_k)
        return per_head.transpose(0, 1).reshape(seq_len, self.d_model)

## **Encoder Block**

In [7]:
class EncoderBlock(torch.nn.Module):
    """One Transformer encoder layer (post-norm, as in the 2017 paper).

    Two sub-layers, each wrapped as ``LayerNorm(input + SubLayer(input))``:

    1. multi-head self-attention,
    2. a position-wise feed-forward network ``d_model -> 4*d_model -> d_model``.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
            Defaults to 1.
    """

    def __init__(self, d_model: int, num_heads: int = 1) -> None:
        super().__init__()

        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = torch.nn.LayerNorm(d_model)
        self.norm2 = torch.nn.LayerNorm(d_model)
        self.ffn = torch.nn.Sequential(
            torch.nn.Linear(d_model, d_model * 4),
            torch.nn.ReLU(),
            # Project back to d_model so the residual addition is shape-compatible.
            torch.nn.Linear(d_model * 4, d_model),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the encoder layer.

        Args:
            x: Input whose last dimension is ``d_model``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mha_out = self.mha(x)  # multi-head attention sub-layer
        norm1_result = self.norm1(x + mha_out)  # residual connection + layer norm
        ffn_result = self.ffn(norm1_result)  # feed-forward sub-layer (2-layer MLP)
        # The residual uses the FFN's own input (norm1_result), not the block input.
        norm2_result = self.norm2(norm1_result + ffn_result)

        return norm2_result


## **Decoder Block**

In [8]:
class DecoderBlock(torch.nn.Module):
    """Placeholder for a Transformer decoder layer (not implemented yet)."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self) -> torch.Tensor:
        """Run the decoder layer.

        Raises:
            NotImplementedError: Always; the decoder block has no
                implementation yet, and failing loudly is safer than silently
                returning ``None`` where a tensor is expected.
        """
        # TODO: implement masked self-attention, cross-attention and the FFN.
        raise NotImplementedError("DecoderBlock.forward is not implemented yet")