## Investigating model sizes with different configurations

- **GPT2-small** (the 124M configuration we already implemented):
    - "emb_dim" = 768
    - "n_layers" = 12
    - "n_heads" = 12

- **GPT2-medium:**
    - "emb_dim" = 1024
    - "n_layers" = 24
    - "n_heads" = 16

- **GPT2-large:**
    - "emb_dim" = 1280
    - "n_layers" = 36
    - "n_heads" = 20

- **GPT2-XL:**
    - "emb_dim" = 1600
    - "n_layers" = 48
    - "n_heads" = 25

In [14]:
# Base hyperparameters. Every other configuration below starts from these
# values and differs only in "emb_dim", "n_layers" and "n_heads".
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

# Each variant is its own dict (a shallow copy of the base), so editing one
# configuration never changes another. Overriding keys that already exist
# keeps the base key order.
GPT_CONFIG_SMALL = dict(GPT_CONFIG_124M)

GPT_CONFIG_MEDIUM = {
    **GPT_CONFIG_124M,
    "emb_dim": 1024,
    "n_layers": 24,
    "n_heads": 16,
}

GPT_CONFIG_LARGE = {
    **GPT_CONFIG_124M,
    "emb_dim": 1280,
    "n_layers": 36,
    "n_heads": 20,
}

GPT_CONFIG_XL = {
    **GPT_CONFIG_124M,
    "emb_dim": 1600,
    "n_layers": 48,
    "n_heads": 25,
}

### Code from chapter

In [15]:
import torch
import torch.nn as nn
class GELU(nn.Module):
    """GELU activation evaluated with the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise. The module has no parameters.
    """

    def forward(self, x):
        """Return the activation of ``x``; the result has the same shape as ``x``."""
        # sqrt(2 / pi) is rebuilt as a 0-dim tensor on every call.
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic_term)
        return 0.5 * x * gate
    
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension of its input.

    Expands ``cfg["emb_dim"]`` features to four times that width, applies
    GELU, then projects back down to ``cfg["emb_dim"]``.
    """

    def __init__(self, cfg):
        """Create the expand / activate / project stack from ``cfg["emb_dim"]``."""
        super().__init__()
        width = cfg["emb_dim"]
        hidden_width = 4 * width
        # Modules are created in the order they are applied.
        expand = nn.Linear(width, hidden_width)
        activation = GELU()
        project = nn.Linear(hidden_width, width)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.layers(x)

class LayerNorm(nn.Module):
    """Normalize over the last dimension, then apply a learnable scale and shift.

    ``scale`` starts at ones and ``shift`` at zeros, both of length ``emb_dim``.
    """

    def __init__(self, emb_dim):
        """Create the per-feature ``scale`` and ``shift`` parameters."""
        super().__init__()
        self.eps = 1e-5  # added to the variance before taking the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return ``x`` standardized along its last dimension and affinely transformed."""
        mu = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by N rather than N - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        centered = x - mu
        denom = torch.sqrt(sigma_sq + self.eps)
        return self.scale * (centered / denom) + self.shift
        
from previous_chapters import MultiHeadAttention
class TransformerBlock(nn.Module):
    """One transformer layer: an attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer normalizes its input first, runs the sub-module, applies
    dropout, and adds the result back onto the sub-layer's input.
    """

    def __init__(self, cfg):
        """Build the attention, feed-forward, normalization and dropout modules from ``cfg``."""
        super().__init__()
        width = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            n_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # A single dropout module is shared by both residual branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        # Attention branch, added back onto its own input.
        attn_branch = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_branch + x

        # Feed-forward branch, added back onto the attention sub-layer's output.
        ff_branch = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_branch + x


class GPTModel(nn.Module):
    """GPT-style model: token + position embeddings, a stack of transformer blocks, and an output projection.

    ``cfg`` supplies "vocab_size", "context_length", "emb_dim", "n_layers" and
    "drop_rate" here; the transformer blocks read further keys from the same dict.
    """

    def __init__(self, cfg):
        """Create the embeddings, ``cfg["n_layers"]`` transformer blocks, final norm and output head."""
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Separate weight matrix: it is not shared with ``tok_emb``.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
        # Unpacking requires a 2-D input; only the sequence length is used.
        _, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = tok_embeds + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))

In [16]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to token ids and stack the two rows into one batch.
# torch.stack needs both prompts to encode to the same number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

# Seed right before building the model so its random initial weights are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4223, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)


### Now Evaluating

In [17]:
# Configurations to compare, with the label printed for each one.
model_list = [GPT_CONFIG_SMALL, GPT_CONFIG_MEDIUM,
              GPT_CONFIG_LARGE, GPT_CONFIG_XL]
model_name = ["GPT_CONFIG_SMALL", "GPT_CONFIG_MEDIUM",
              "GPT_CONFIG_LARGE", "GPT_CONFIG_XL"]

BYTES_PER_MB = 1024 * 1024

for name, config in zip(model_name, model_list):
    # Drop this cell's reference to the previously built model before
    # allocating the next (larger) one, so both need not be alive at once.
    model = None
    model = GPTModel(config)

    # Counts every parameter tensor. tok_emb and out_head are separate
    # matrices in GPTModel, so both are included; that is why the smallest
    # configuration reports 163,009,536 here (124,412,160 without out_head).
    total_params = sum(p.numel() for p in model.parameters())
    print(f"\n\nModel configuration: {name}")
    for k in ["emb_dim", "n_layers", "n_heads"]:
        print(f"{k} = {config[k]}")
    print(f"Total number of parameters: {total_params:,}")

    # Use each tensor's actual element size rather than assuming 4 bytes
    # (float32), so the figure stays right if the dtype changes.
    total_size_bytes = sum(
        p.numel() * p.element_size() for p in model.parameters()
    )
    # Convert to megabytes
    total_size_mb = total_size_bytes / BYTES_PER_MB
    print(f"Total size of the model: {total_size_mb:.2f} MB")




Model configuration: GPT_CONFIG_SMALL
emb_dim = 768
n_layers = 12
n_heads = 12
Total number of parameters: 163,009,536
Total size of the model: 621.83 MB


Model configuration: GPT_CONFIG_MEDIUM
emb_dim = 1024
n_layers = 24
n_heads = 16
Total number of parameters: 406,212,608
Total size of the model: 1549.58 MB


Model configuration: GPT_CONFIG_LARGE
emb_dim = 1280
n_layers = 36
n_heads = 20
Total number of parameters: 838,220,800
Total size of the model: 3197.56 MB


Model configuration: GPT_CONFIG_XL
emb_dim = 1600
n_layers = 48
n_heads = 25
Total number of parameters: 1,637,792,000
Total size of the model: 6247.68 MB
