### Architecture

GPT consists of 124M parameters. Parameters refers to the number of trainable weights

we will now have a configuration specifing the model architecture. The configuration will be a dictionary with the following keys:

In [1]:
GPT_config = {
    "vocab_size":50257,
    "context_length":1024,
    "emb_dim":768,
    "n_heads":12,
    "n_layers":12,
    "drop_rate":0.1,
    "qkv_bias":False # query key value bias
}

### Backbone of GPT architecture:

The idea is to assemble the GPT architecture first by creating a backbone which helps us understand the overall structure of the model.

In [2]:
import torch
import torch.nn as nn

In [3]:
class DummyTransformerBlock(nn.Module): #placeholder for the transformer block
    def __init__(self,cfg):
        super().__init__()

    def forward(self,x):
        return x
    
class DummyLayerNorm(nn.Module):
    def __init__(self,normalized_shape,eps=1e-5): #placeholder for layernorm
        super().__init__()

    def forward(self,x):
        return x

class DummyGPT(nn.Module):
    def __init__(self,cfg):
        super(DummyGPT,self).__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"],cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.tr_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg)
              for _ in range(cfg["n_layers"])]
        )
        self.final_norm = DummyLayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(
            cfg["emb_dim"],cfg["vocab_size"],bias=False)
    
    def forward(self,in_idx):
        batch_size , seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
            torch.arange(seq_len,device=in_idx.device)
        )
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.tr_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
    

### Preview of the process

Given an input to the model
- it is tokenized
- then embedded and fed to the GPT model
The output token dimensions matches the input token dimensions and each of the output token is of 50,257 dimensions as specified in the config

In [4]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "every day is a good"
txt2 = "the sky shines and is"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch,dim=0)
print(batch)

tensor([[16833,  1110,   318,   257,   922],
        [ 1169,  6766, 32481,   290,   318]])


In [5]:
torch.manual_seed(42)
model = DummyGPT(GPT_config)
logits = model(batch)
print(f"Output size:{logits.shape}")
print(logits)

Output size:torch.Size([2, 5, 50257])
tensor([[[ 2.2379e-01,  1.9160e+00,  1.4026e+00,  ...,  2.5329e-01,
           8.0193e-01, -5.4960e-01],
         [-8.2327e-01,  8.4603e-01, -6.6348e-01,  ..., -2.0132e-01,
           1.2730e+00, -8.0405e-01],
         [-7.4022e-01,  6.8229e-01, -1.9328e-01,  ..., -2.9264e-01,
           7.4103e-01, -9.3218e-01],
         [ 7.7214e-01, -2.4004e-01, -8.4904e-01,  ...,  1.7395e-01,
           1.1160e+00,  1.8534e+00],
         [-8.3695e-01,  1.6836e-01, -7.3559e-01,  ..., -8.2297e-02,
           8.2059e-02,  6.8131e-01]],

        [[ 1.0112e-01,  6.1023e-04,  1.2280e+00,  ...,  1.4239e+00,
          -1.0427e+00, -7.7041e-01],
         [-1.9589e+00,  2.5010e-01, -3.8665e-01,  ...,  8.4582e-01,
           5.6371e-01,  8.1461e-02],
         [-2.0782e-01, -1.1903e+00,  8.4159e-01,  ..., -1.2821e+00,
           6.5002e-01, -1.3746e+00],
         [ 5.2558e-01,  4.8669e-01, -5.6087e-01,  ..., -5.1120e-01,
           1.5828e-01,  1.1571e+00],
         [-4.83

### Layer normalization
To avoid problems such as vanishing or exploding gradients, we normalize the output of each layer before applying the activation function. This is done by applying a layer normalization operation to the output of each sub-layer.

In [6]:
torch.manual_seed(42)
batch_example = torch.randn(2,5)
layer = nn.Sequential(nn.Linear(5,6),nn.ReLU())
out = layer(batch_example)
print(out)

tensor([[0.0000, 0.1842, 0.0052, 0.7233, 0.0000, 0.5298],
        [0.0000, 0.0000, 0.0000, 0.2237, 0.0000, 0.7727]],
       grad_fn=<ReluBackward0>)


- here the relu activate function creates a theshold such that negative inputs are made 0, ensuring only positive values are present.

In [7]:
mean = out.mean(dim=-1,keepdim=True)
var = out.mean(dim=-1,keepdim=True)
print(f"Mean:{mean}")
print(f"variance:{var}")

Mean:tensor([[0.2404],
        [0.1661]], grad_fn=<MeanBackward1>)
variance:tensor([[0.2404],
        [0.1661]], grad_fn=<MeanBackward1>)


- now applying layer normalization to layer outputs obtained previously, we ensure that the output is scaled to a mean of 0 and variance of 1.

In [8]:
out_norm = (out-mean)/torch.sqrt(var)
mean = out_norm.mean(dim=-1,keepdim=True)
var = out_norm.var(dim=-1,keepdim=True)
print(f"Normalized layer outputs:{out_norm}")
print(f"Mean:{mean}")
print(f"Variance:{var}")

Normalized layer outputs:tensor([[-0.4903, -0.1147, -0.4798,  0.9849, -0.4903,  0.5902],
        [-0.4075, -0.4075, -0.4075,  0.1415, -0.4075,  1.4886]],
       grad_fn=<DivBackward0>)
Mean:tensor([[1.9868e-08],
        [1.4901e-08]], grad_fn=<MeanBackward1>)
Variance:tensor([[0.4085],
        [0.5800]], grad_fn=<VarBackward0>)


### Layer normalization class:

In [9]:
class LayerNorm(nn.Module):
    def __init__(self,emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self,x):
        mean = x.mean(dim=-1,keepdim=True)
        var = x.var(dim=-1,keepdim=True,unbiased=False)
        norm_x = (x-mean)/torch.sqrt(var+self.eps)
        return self.scale * norm_x + self.shift

### Gelu activation function:

In [10]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self,x):
        return 0.5*x*(1+torch.tanh(
            torch.sqrt(torch.tensor(2.0/torch.pi))*
            (x+0.044715+torch.pow(x,3))
        ))

### Feed forward network:

In [11]:
class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"],cfg["emb_dim"]*4),
            GELU(),
            nn.Linear(cfg["emb_dim"]*4,cfg["emb_dim"])
        )

    def forward(self,x):
        return self.layers(x)

In [12]:
ffn = FeedForward(GPT_config)
x = torch.rand(2,3,768)
out = ffn(x)
print(out.shape)

torch.Size([2, 3, 768])


### Adding shortcut connections:

The goal is to understand the impact of adding residual connections to the network.

In [13]:
class SampleDNN(nn.Module):
    def __init__(self,layer_sizes,use_residual):
        super().__init__()
        self.use_residual = use_residual
        self.layers = nn.ModuleList()

        for i in range(len(layer_sizes)-1):
            self.layers.append(
                nn.Sequential(
                    nn.Linear(layer_sizes[i],layer_sizes[i+1])
                    )
                )
    
    def forward(self,x):
        for layer in self.layers:
            layer_output = layer(x)
            if self.use_residual and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x

In [14]:
layer_sizes = [3,3,3,3,3,1]
sample_input = torch.tensor([[1.,0.,-1.]])
torch.manual_seed(42)
non_residual_model = SampleDNN(layer_sizes,use_residual=False)

In [15]:
#print gradients to check the impact of the residual connection
def print_gradients(model,x):
    output = model(x)
    target = torch.tensor([[0.]])
    loss = nn.MSELoss()
    loss = loss(output,target)
    loss.backward()
    for name,param in model.named_parameters():
        if 'weight' in name:
            print(f"{name}:{param.grad.abs().mean().item()}")

In [16]:
print_gradients(non_residual_model,sample_input)

layers.0.0.weight:0.010758275166153908
layers.1.0.weight:0.02833496779203415
layers.2.0.weight:0.031643934547901154
layers.3.0.weight:0.12355596572160721
layers.4.0.weight:0.18377549946308136


In [17]:
torch.manual_seed(420)
residual_model = SampleDNN(layer_sizes,use_residual=True)
print_gradients(residual_model,sample_input)

layers.0.0.weight:0.18751747906208038
layers.1.0.weight:0.16522449254989624
layers.2.0.weight:0.21649211645126343
layers.3.0.weight:0.25283390283584595
layers.4.0.weight:0.9302193522453308


- While we do not see a stark difference in the output, we do see the gradients in the initial layers being stable and rsising gradually in the model with residual connections.
- The residual connections help in training the model faster and also helps in avoiding the vanishing gradient problem.

### Transformer block

In [18]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,
                 context_length,dropout,
                 num_heads, qkv_bias=False):
        
        super().__init__()
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_key = nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_value = nn.Linear(d_in,d_out,bias=qkv_bias)
        self.out_proj = nn.Linear(d_out,d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length,context_length),
                       diagonal=1)
        )


    def forward(self,x):
        b, num_tokens , d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Splitting the matrix by adding num_heads
        keys = keys.view(b,num_tokens,self.num_heads,self.head_dim)
        queries = queries.view(b,num_tokens,self.num_heads,self.head_dim)
        values = values.view(b,num_tokens,self.num_heads,self.head_dim)


        # Converting the shape
        # b,num_tokens,num_heads,head_dim = b,num_heads,num_tokens,head_dim
        # crucial for aligning the keys and values in multiple heads
        keys = keys.transpose(1,2) 
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)

        attention_scores = queries @ keys.transpose(2,3) # dot product for each head
        mask_bool = self.mask.bool()[:num_tokens,:num_tokens]
        attention_scores.masked_fill_(mask_bool, -torch.inf)

        attention_weights = torch.softmax(
            attention_scores / keys.shape[-1]**0.5 , dim = -1
        )
        attention_weights = self.dropout(attention_weights)

        # b , num_token , n_heads , head_dim
        context_vectors = (attention_weights @ values).transpose(1,2) 

        # combines the heads 
        context_vectors = context_vectors.contiguous().view(
            b,num_tokens,self.d_out
        )
        context_vectors = self.out_proj(context_vectors)
        return context_vectors
    

class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in = cfg["emb_dim"],
            d_out= cfg["emb_dim"],
            context_length= cfg["context_length"],
            num_heads= cfg["n_heads"],
            dropout= cfg["drop_rate"],
            qkv_bias= cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])   

    def forward(self,x):
        residual = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + residual

        residual = x
        x = self.norm2(x)
        x = self.ff(x)  
        x = self.drop_shortcut(x)
        x = x + residual
        return x

In [19]:
torch.manual_seed(42)
x = torch.rand(2,4,768)
block = TransformerBlock(GPT_config)
out = block(x)
print(f"input shape:{x.shape}")
print(f"output shape:{out.shape}")

input shape:torch.Size([2, 4, 768])
output shape:torch.Size([2, 4, 768])


### GPT architecture:

In [20]:
class GPTModel(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"],cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for  _ in range(cfg["n_layers"])]
        )
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"],cfg["vocab_size"],bias=False
        )

    def forward(self,in_idx):
        batch_size , seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
            torch.arange(seq_len,device=in_idx.device)
        )
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [21]:
torch.manual_seed(42)
model = GPTModel(GPT_config)
out = model(batch)
print(f"input batch:\n{batch}")
print(f"output shape:{out.shape}")
print(out)

input batch:
tensor([[16833,  1110,   318,   257,   922],
        [ 1169,  6766, 32481,   290,   318]])
output shape:torch.Size([2, 5, 50257])
tensor([[[ 0.6484, -0.1046, -0.1722,  ..., -0.2517, -0.4366, -0.6187],
         [-0.0685,  0.4742,  0.6509,  ...,  0.1216, -0.1821, -0.5249],
         [ 0.3725,  1.2968,  0.4850,  ...,  0.6677,  0.6186, -0.6493],
         [-0.2975, -0.2977,  1.1011,  ..., -0.2127, -0.2804, -0.9128],
         [-0.3206,  0.0167,  0.1753,  ..., -0.1021, -0.5538,  0.1212]],

        [[-0.1876, -0.0367,  1.3480,  ...,  0.1788,  0.5199,  0.1640],
         [ 0.1850, -0.5256,  0.2818,  ...,  0.4252, -0.1914,  0.1537],
         [ 0.1942,  0.7726,  0.0324,  ...,  0.8447, -0.4039, -0.3113],
         [-1.0523, -0.5826,  0.1886,  ...,  0.9998, -0.1479, -1.0728],
         [ 0.4366, -0.1486, -0.0560,  ...,  0.1409,  0.1826, -0.7229]]],
       grad_fn=<UnsafeViewBackward0>)


In [22]:
total_parameters = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in GPT:{total_parameters}")

Total number of parameters in GPT:163009536


GPT-2 consists of 124M parameters.However here we see 163M parameters. The reason is that GPT-2 reuses the weights from the token embedding layer in the output layer. This process is called weight tying.
This practice of removing the weights from the output layer helps in reducing the overall memory footprint.

In [27]:
#checking the number of parameters in each individual module
block = TransformerBlock(GPT_config)
mha_params = sum(p.numel() for p in block.att.parameters())
print(f"Parameters in MultiHeadAttention:{mha_params:,}")

ff_params = sum(p.numel() for p in block.ff.parameters())
print(f"Parameters in FeedForward:{ff_params:,}")

Parameters in MultiHeadAttention:2,360,064
Parameters in FeedForward:4,722,432


### Generating text:

In [29]:
def generate_text(model,idx,max_new_tokens,context_size):
    for _ in range(max_new_tokens):
        idx_cond = idx[:,-context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:,-1,:] 
        probas = torch.softmax(logits,dim=-1)
        idx_next = torch.argmax(probas,dim=-1,keepdim=True)
        idx = torch.cat((idx,idx_next),dim=-1)
    return idx

In [30]:
example_context = "Hello my name"
encoded = tokenizer.encode(example_context)
print(f"encoded text:{encoded}")
encoded_tensors = torch.tensor(encoded).unsqueeze(0)
print("encoded tensors:",encoded_tensors.shape)  

encoded text:[15496, 616, 1438]
encoded tensors: torch.Size([1, 3])


In [31]:
model.eval()
out = generate_text(model,encoded_tensors,6,GPT_config["context_length"])
print(out)
print(f"output length:{len(out[0])}")

tensor([[15496,   616,  1438, 18612, 48670, 28246, 39567, 46805, 44013]])
output length:9


In [32]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello my name Professional rg hemp Warn PROGRAM ABE
