## Implementation

#### Coding the architecture

In [2]:
# Hyperparameters consumed by the model classes defined in this notebook.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # number of entries in the vocabulary
    "context_length": 1024,  # number of positions the model can address
    "emb_dim": 768,          # width of every embedding vector
    "n_heads": 12,           # attention heads
    "n_layers": 12,          # stacked layers
    "drop_rate": 0.1,        # dropout rate
    "qkv_bias": False,       # Query-Key-Value bias flag
}

In [22]:
import torch
import torch.nn as nn

class DummyGPT(nn.Module):
    """Skeleton GPT model: embeddings, placeholder blocks, placeholder norm, output head.

    Args:
        config: Mapping providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, config):
        super().__init__()
        self.tok_embdgs = nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.pos_embdgs = nn.Embedding(config["context_length"], config["emb_dim"])
        self.drop_embdgs = nn.Dropout(config["drop_rate"])

        # Transformer block placeholder: n_layers DummyTransformerBlock modules in sequence.
        self.transformer_block = nn.Sequential(
            *[DummyTransformerBlock(config) for _ in range(config["n_layers"])]
        )

        # LayerNorm placeholder, applied once after the block stack.
        self.normalization_layer = DummyLayerNorm(config["emb_dim"])
        self.out_head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map a batch of token ids to the output of ``out_head``.

        Args:
            in_idx: 2-D tensor of token ids; its shape is unpacked as
                (batch, sequence).

        Returns:
            The ``out_head`` projection of the processed embeddings; the last
            dimension has size ``vocab_size``.
        """
        _, sequence_len = in_idx.shape
        tok_embdgs = self.tok_embdgs(in_idx)
        # Position ids 0..sequence_len-1, created on the same device as the input.
        pos_embdgs = self.pos_embdgs(torch.arange(sequence_len, device=in_idx.device))

        x = self.drop_embdgs(tok_embdgs + pos_embdgs)
        x = self.transformer_block(x)
        # Previously this layer was constructed but never applied in forward.
        x = self.normalization_layer(x)
        return self.out_head(x)
    
class DummyLayerNorm(nn.Module):
    """Placeholder normalization layer that hands its input back unchanged.

    ``normalized_shape`` and ``eps`` are accepted by the constructor but
    are not stored or used.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # Both arguments are intentionally ignored by this placeholder.
        del normalized_shape, eps

    def forward(self, x):
        """Return ``x`` as-is (no normalization is performed)."""
        return x
    
class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that hands its input back unchanged.

    ``config`` is accepted by the constructor but is not stored or used.
    """

    def __init__(self, config):
        super().__init__()
        # The configuration is intentionally ignored by this placeholder.
        del config

    def forward(self, x):
        """Return ``x`` as-is (no computation is performed)."""
        return x
# 


In [None]:
import tiktoken

In [24]:
# Encode two sample texts with the "gpt2" encoding and stack them into one batch.
text1 = "I really need to go"
text2 = "Every day is a new"
tokenizer = tiktoken.get_encoding("gpt2")

# One 1-D tensor of token ids per text; torch.stack needs them to be the same length.
token_rows = [torch.tensor(tokenizer.encode(text)) for text in (text1, text2)]
batch = torch.stack(token_rows, dim=0)
print(batch)

tensor([[  40, 1107,  761,  284,  467],
        [6109, 1110,  318,  257,  649]])


In [25]:
# Seed the RNG before building the model so its parameter initialization is reproducible.
torch.manual_seed(123)
model = DummyGPT(GPT_CONFIG_124M)
# Calling the module runs DummyGPT.forward on the token-id batch.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 5, 50257])
tensor([[[ 0.7386, -1.2247, -0.3308,  ..., -0.2005,  0.3440, -0.2257],
         [-1.9745,  0.9537, -0.6724,  ..., -0.9423, -0.0838,  0.9865],
         [-0.8479,  1.4844,  0.0894,  ..., -0.0867,  0.6976, -0.7501],
         [-1.2166,  1.5133,  0.2305,  ...,  2.3118, -0.0691,  0.5523],
         [-0.4713, -0.8240,  1.2236,  ...,  0.3428, -0.0308, -1.6088]],

        [[-1.4299,  0.1282, -0.9106,  ..., -1.6346, -0.3399, -0.5688],
         [-0.3886,  0.2121, -0.4795,  ...,  0.0446,  0.2682,  1.3582],
         [ 0.6756, -0.5993, -0.4150,  ...,  0.3043,  0.1444, -0.1647],
         [ 0.3197, -0.7921, -0.2955,  ...,  1.8263,  0.0524, -0.0759],
         [-0.5478, -0.7816,  0.1229,  ..., -0.8491, -1.2927, -1.1232]]],
       grad_fn=<UnsafeViewBackward0>)


#### Normalizing activations with layer normalization

We implement layer normalization to improve the stability of the neural network. We have to adjust the outputs (activations) of each layer so that they have a mean of 0 and a variance of 1 (unit variance). We apply layer normalization both before and after the multi-head attention module.

In [29]:
# Fixed seed so the random example and the layer's initial weights are reproducible.
torch.manual_seed(123)
example = torch.randn(2,5)
layer = nn.Sequential(nn.Linear(5,6), nn.ReLU())
# layer is a Linear layer (5 inputs -> 6 outputs) followed by the non-linear
# ReLU activation, which turns negative values into 0

result = layer(example)
print(result)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [30]:
# Inspect the per-row mean and variance of the layer output.
# dim=-1 reduces over the last dimension; keepdim=True keeps it with size 1.
mean = torch.mean(result, dim=-1, keepdim=True)
variance = torch.var(result, dim=-1, keepdim=True)

print("Mean:", mean)
print("Variance:", variance)

Mean: tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance: tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


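`keepdim=True` keeps the reduced dimension in the output with size 1, so the result has the same number of dimensions as the input (here shape `[2, 1]` rather than `[2]`)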

`dim=-1` calculates the statistic along the last dimension; in a 2D tensor this means across the columns, giving one value per row

Now we'll apply the normalization layer to the layer outputs

In [31]:
# Standardize each row: subtract its mean, then divide by its standard deviation.
centered = result - mean
normalized_result = centered / variance.sqrt()

# Re-measure the statistics on the normalized values (this rebinds mean/variance).
mean = torch.mean(normalized_result, dim=-1, keepdim=True)
variance = torch.var(normalized_result, dim=-1, keepdim=True)

print("Normalized output:", normalized_result)
print("Normalized Mean:", mean)
print("Normalized variance:", variance)

Normalized output: tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Normalized Mean: tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)
Normalized variance: tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [32]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs; ``scale`` and
            ``shift`` each hold one parameter per position in it.
        eps: Small constant added to the variance before the square root so
            the division cannot be by zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialized to ones/zeros, so the layer starts as plain normalization.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, result):
        """Normalize ``result`` along its last dimension, then scale and shift.

        Args:
            result: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor with the same shape as ``result``.
        """
        mean = result.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n (not n - 1) when computing the variance.
        variance = result.var(dim=-1, keepdim=True, unbiased=False)
        normalized_result = (result - mean) / torch.sqrt(variance + self.eps)
        return self.scale * normalized_result + self.shift

`self.eps` prevents division by zero from occurring

`unbiased=False` because we divide by *n* when calculating the variance. With Bessel's correction, we would instead divide by *n-1* to correct the bias in the sample variance estimate; dividing by *n* results in a biased estimate. In an LLM, where the embedding size is very large, there's practically no difference between *n* and *n-1*, so we choose this approach to mimic GPT-2

In [34]:
# Run the LayerNorm module on the random example and check its per-row statistics.
ln = LayerNorm(emb_dim=5)
out_ln = ln(example)

# unbiased=False matches the variance definition used inside LayerNorm.forward.
mean = torch.mean(out_ln, dim=-1, keepdim=True)
variance = torch.var(out_ln, dim=-1, keepdim=True, unbiased=False)

print("Mean:", mean)
print("Variance:", variance)

Mean: tensor([[-2.9802e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
Variance: tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
