## Implementation

#### Coding the architecture

In [2]:
# Hyperparameters consumed by the model classes below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # number of entries in the token vocabulary
    "context_length": 1024,  # number of positions in the positional embedding
    "emb_dim": 768,          # width of each embedding vector
    "n_heads": 12,           # attention heads
    "n_layers": 12,          # number of stacked layers
    "drop_rate": 0.1,        # dropout rate
    "qkv_bias": False,       # query-key-value bias
}

In [22]:
import torch
import torch.nn as nn

class DummyGPT(nn.Module):
    """Skeleton GPT model assembled from placeholder sub-modules.

    Token and positional embeddings are summed, passed through dropout, a
    stack of ``DummyTransformerBlock`` modules, the final
    ``DummyLayerNorm``, and a bias-free linear head that projects each
    position to ``vocab_size`` logits.

    Args:
        config: Mapping that provides ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, config):
        super().__init__()
        self.tok_embdgs = nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.pos_embdgs = nn.Embedding(config["context_length"], config["emb_dim"])
        self.drop_embdgs = nn.Dropout(config["drop_rate"])

        # Transformer block placeholder
        self.transformer_block = nn.Sequential(
            *[DummyTransformerBlock(config) for _ in range(config["n_layers"])]
        )

        # LayerNorm placeholder
        self.normalization_layer = DummyLayerNorm(config["emb_dim"])
        self.out_head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map a batch of token ids to per-position vocabulary logits.

        Args:
            in_idx: 2-D integer tensor of token ids, unpacked as
                ``(batch_size, sequence_len)``.

        Returns:
            Tensor of logits with ``vocab_size`` entries per input position.
        """
        _, sequence_len = in_idx.shape
        tok_embdgs = self.tok_embdgs(in_idx)
        # Build position indices on the input's device so the lookup also
        # works when the model lives on an accelerator.
        positions = torch.arange(sequence_len, device=in_idx.device)
        pos_embdgs = self.pos_embdgs(positions)

        x = self.drop_embdgs(tok_embdgs + pos_embdgs)
        x = self.transformer_block(x)
        # Apply the final normalization layer before the output head; it was
        # constructed in __init__ but previously never invoked.
        x = self.normalization_layer(x)
        return self.out_head(x)
    
class DummyLayerNorm(nn.Module):
    """Stand-in for a layer-normalization module.

    The constructor accepts ``normalized_shape`` and ``eps`` but ignores
    both; the forward pass hands its input back untouched.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        # No parameters or buffers are registered: this is a pure placeholder.
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` unchanged (identity)."""
        return x
    
class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block.

    ``config`` is accepted in the constructor but not read; the forward
    pass hands its input back untouched.
    """

    def __init__(self, config):
        # No sub-modules are created: this is a pure placeholder.
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` unchanged (identity)."""
        return x
# 


In [None]:
import tiktoken

In [24]:
tokenizer = tiktoken.get_encoding("gpt2")
text1 = "I really need to go"
text2 = "Every day is a new"

# Encode each prompt to token ids and stack them along a new leading
# dimension; torch.stack requires both encodings to have the same length.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (text1, text2)],
    dim=0,
)
print(batch)

tensor([[  40, 1107,  761,  284,  467],
        [6109, 1110,  318,  257,  649]])


In [25]:
# Seed the RNG before building the model so the randomly initialised
# weights (and therefore the printed logits) are reproducible.
torch.manual_seed(123)
model = DummyGPT(GPT_CONFIG_124M)
# Run the token-id batch through the model; the result has one row of
# vocabulary logits per input position.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 5, 50257])
tensor([[[ 0.7386, -1.2247, -0.3308,  ..., -0.2005,  0.3440, -0.2257],
         [-1.9745,  0.9537, -0.6724,  ..., -0.9423, -0.0838,  0.9865],
         [-0.8479,  1.4844,  0.0894,  ..., -0.0867,  0.6976, -0.7501],
         [-1.2166,  1.5133,  0.2305,  ...,  2.3118, -0.0691,  0.5523],
         [-0.4713, -0.8240,  1.2236,  ...,  0.3428, -0.0308, -1.6088]],

        [[-1.4299,  0.1282, -0.9106,  ..., -1.6346, -0.3399, -0.5688],
         [-0.3886,  0.2121, -0.4795,  ...,  0.0446,  0.2682,  1.3582],
         [ 0.6756, -0.5993, -0.4150,  ...,  0.3043,  0.1444, -0.1647],
         [ 0.3197, -0.7921, -0.2955,  ...,  1.8263,  0.0524, -0.0759],
         [-0.5478, -0.7816,  0.1229,  ..., -0.8491, -1.2927, -1.1232]]],
       grad_fn=<UnsafeViewBackward0>)


#### Normalizing activations with layer normalization

We implement layer normalization to improve the training stability of the neural network. It adjusts the outputs (activations) of a layer so that they have a mean of 0 and a variance of 1 (unit variance). We apply layer normalization both before and after the multi-head attention module.