# GPT Architecture
**with placeholder transformer block and layer norm**

## Configuration Params

In [22]:
# Hyperparameters for the GPT configuration used by the model cells below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # size of the token vocabulary
    context_length=1024,  # context length (number of positions)
    emb_dim=768,          # embedding dimension
    n_heads=12,           # number of attention heads
    n_layers=12,          # number of transformer layers
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # query-key-value bias
)

## GPT Skeleton Class

In [23]:
import torch
import torch.nn as nn

In [34]:
class GPTSkeleton(nn.Module):
    """Skeleton of a GPT-style model built from placeholder sub-modules.

    Token embeddings and learned positional embeddings are summed, passed
    through dropout, a stack of ``TransformerBlockSkeleton`` modules and a
    ``LayerNormSkeleton``, then projected to vocabulary logits by a bias-free
    linear layer.

    Args:
        config: Dictionary providing the keys ``"vocab_size"``,
            ``"context_length"``, ``"emb_dim"``, ``"drop_rate"`` and
            ``"n_layers"``.
    """

    def __init__(self, config: dict):
        super().__init__()
        # Lookup tables: one row per vocabulary entry / per position.
        self.token_embedding = nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.position_embedding = nn.Embedding(config["context_length"], config["emb_dim"])

        self.dropout = nn.Dropout(config["drop_rate"])

        # n_layers blocks applied one after another.
        self.transformer_layers = nn.Sequential(
            *[TransformerBlockSkeleton(config) for _ in range(config["n_layers"])]
        )

        self.final_norm = LayerNormSkeleton(config["emb_dim"])

        # Output head: maps each embedding vector to one score per vocabulary entry.
        self.out = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, token_ids):
        """Compute vocabulary logits for a batch of token-ID sequences.

        Args:
            token_ids: Integer tensor of shape ``(batch_size, seq_length)``.

        Returns:
            Tensor of shape ``(batch_size, seq_length, vocab_size)``.

        Raises:
            ValueError: If ``token_ids`` is not 2-dimensional (raised by the
                shape unpacking).
            IndexError: If ``seq_length`` exceeds the number of positions in
                the positional embedding table (the context length).
        """
        # Unpacking into exactly two names enforces a 2-D input; only the
        # sequence length is needed below.
        _, seq_length = token_ids.shape

        # Fail early with a readable message instead of an opaque index error
        # from inside the positional embedding lookup.
        max_positions = self.position_embedding.num_embeddings
        if seq_length > max_positions:
            raise IndexError(
                f"Sequence length {seq_length} exceeds the model's "
                f"context length {max_positions}."
            )

        token_embeds = self.token_embedding(token_ids)
        # Positions 0..seq_length-1, created on the same device as the input.
        positions = torch.arange(seq_length, device=token_ids.device)
        position_embeds = self.position_embedding(positions)

        # (batch, seq, emb) + (seq, emb): positions broadcast over the batch.
        x = token_embeds + position_embeds
        x = self.dropout(x)
        x = self.transformer_layers(x)
        x = self.final_norm(x)
        logits = self.out(x)
        return logits

In [35]:
class TransformerBlockSkeleton(nn.Module):
    """Placeholder transformer block: accepts a config and passes input through."""

    def __init__(self, config: dict):
        # `config` is accepted for interface parity but not used here.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x

In [36]:
class LayerNormSkeleton(nn.Module):
    """Placeholder layer norm: accepts the usual arguments and passes input through."""

    def __init__(self, normalized_shape, eps: float = 1e-5):
        # Both arguments are accepted for interface parity but not used here.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x

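Recall that `nn.Embedding` works like a lookup table: each integer index (a token ID or a position) selects one row of a learnable weight matrix, and that row is the embedding vector.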

## Sample Input and Output

### Input Text

In [37]:
# Two short sample sentences used as model input in the cells below.
txt_1, txt_2 = "He is from India.", "She is from Germany."

### Input Token IDs

#### Tokenizer

In [38]:
import tiktoken

In [39]:
# Load tiktoken's "gpt2" encoding; it is used below to turn text into token IDs.
tokenizer = tiktoken.get_encoding("gpt2")

#### Token IDs

In [40]:
# Encode both sample sentences into lists of token IDs.
token_ids_1, token_ids_2 = (tokenizer.encode(text) for text in (txt_1, txt_2))

print(f"Text 1 Token IDs: {token_ids_1}\n")

print(f"Text 2 Token IDs: {token_ids_2}")

Text 1 Token IDs: [1544, 318, 422, 3794, 13]

Text 2 Token IDs: [3347, 318, 422, 4486, 13]


#### Token ID Tensors

In [41]:
# Convert each list of token IDs into a 1-D tensor.
token_id_tensor_1, token_id_tensor_2 = map(torch.tensor, (token_ids_1, token_ids_2))

print(f"Token ID Tensor 1:\n{token_id_tensor_1}, Shape: {token_id_tensor_1.shape}\n")
print(f"Token ID Tensor 2:\n{token_id_tensor_2}, Shape: {token_id_tensor_2.shape}")

Token ID Tensor 1:
tensor([1544,  318,  422, 3794,   13]), Shape: torch.Size([5])

Token ID Tensor 2:
tensor([3347,  318,  422, 4486,   13]), Shape: torch.Size([5])


#### Batch of Input ID Tensors

In [42]:
# Stack the two equal-length sequences along a new leading (batch) axis;
# torch.stack uses dim=0 by default.
input_batch = torch.stack((token_id_tensor_1, token_id_tensor_2))

print(f"Input Batch:\n{input_batch}\n\nShape: {input_batch.shape}")

Input Batch:
tensor([[1544,  318,  422, 3794,   13],
        [3347,  318,  422, 4486,   13]])

Shape: torch.Size([2, 5])


### Model Instance

In [43]:
# Seed PyTorch's global RNG so the randomly initialised weights (and the
# dropout mask, which is active because the model is left in its default
# training mode) are identical on every run, making the printed logits
# reproducible under Restart & Run All.
torch.manual_seed(123)

model = GPTSkeleton(GPT_CONFIG_124M)
# One vector of vocabulary scores per input token: (batch, seq_length, vocab_size).
logits = model(input_batch)

print(f"Output Logits:\n{logits}\n")

print(f"Output Logits Shape: {logits.shape}")

Output Logits:
tensor([[[ 0.4530,  0.1816, -0.9035,  ..., -1.5501, -1.7228, -0.4438],
         [ 0.9557,  0.0509,  0.3496,  ...,  0.1179,  0.9291, -0.1950],
         [ 1.0723, -0.8035, -0.6887,  ..., -0.8423, -0.4015,  0.5355],
         [ 1.2788, -0.0959, -0.6675,  ..., -0.3568,  1.3401,  0.4810],
         [ 1.1990,  0.2454, -1.7165,  ...,  0.7395, -1.8960,  0.0944]],

        [[-0.5037,  0.9369, -2.1984,  ..., -0.0597, -0.9168,  0.8416],
         [ 1.2974,  0.2501,  0.4299,  ...,  0.5218,  1.0064,  0.0227],
         [ 1.1131, -0.4677, -0.8729,  ..., -0.4538, -0.1353,  0.1290],
         [ 1.2739, -0.2676,  1.0142,  ...,  0.0481,  2.2718,  0.2456],
         [ 0.9434, -0.3837, -1.2274,  ...,  0.9049, -1.6810, -0.1954]]],
       grad_fn=<UnsafeViewBackward0>)

Output Logits Shape: torch.Size([2, 5, 50257])
