# Implementing a GPT model from scratch to generate text

In [2]:
# Configuration of the GPT-2 model.
# Each key is read by the model classes defined later in this notebook.

GPT_CONFIG_124M = {
    # Rows of the token-embedding table and width of the output logits.
    "vocab_size" : 50257,
    # Rows of the positional-embedding table and side length of the causal mask.
    "context_length" : 1024,
    # Embedding / hidden width used throughout the model.
    "emb_dim" : 768,
    # Number of attention heads in each MultiHeadAttention module.
    "n_heads" : 12,
    # Number of transformer blocks stacked in the model.
    "n_layers" : 12,
    # Drop probability passed to every nn.Dropout.
    "drop_rate": 0.1,
    # Whether the query/key/value projections carry a bias term.
    "qkv_bias" : False
}

# 1. Building the Placeholder GPT model Architecture

In [3]:
import torch
import torch.nn as nn



In [13]:
class DummyGPTModel(nn.Module):
    """Skeleton of the GPT architecture built from placeholder sub-modules.

    Token and positional embeddings are summed, passed through dropout, a stack
    of ``DummyTransformerBlock`` modules and a ``DummyLayerNorm``, and finally
    projected to vocabulary-sized logits.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_length``, ``emb_dim``,
            ``n_layers`` and ``drop_rate``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        # Sub-modules are created in a fixed order so that seeded weight
        # initialisation stays reproducible.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        placeholder_blocks = [
            DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])
        ]
        self.trf_blocks = nn.Sequential(*placeholder_blocks)

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token IDs of shape (batch, seq_len) to logits of shape
        (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape

        # Position indices live on the same device as the input IDs.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)

        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [14]:
class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that returns its input unchanged."""

    def __init__(self, cfg):
        # cfg is accepted but not used; the block has no parameters.
        super().__init__()

    def forward(self, x):
        # Identity pass-through.
        return x

In [7]:
class DummyLayerNorm(nn.Module):
    """Placeholder normalization layer that returns its input unchanged.

    The constructor only mirrors a layer-norm-style signature; neither argument
    is used.

    Args:
        normalized_shape: Size of the dimension that would be normalized (unused).
        eps: Small positive stabiliser that would be added to the variance
            (unused). Defaults to 1e-5; the previous default of -5 was a typo.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()

    def forward(self, x):
        # Identity pass-through.
        return x

## Preparing the input data and initializing the GPT model

In [8]:
import tiktoken

In [9]:
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt into a 1-D tensor of token IDs and stack the tensors along
# a new leading batch dimension.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


## Initializing a 124-million-parameter DummyGPTModel instance

In [15]:
# Seed before constructing the model so the randomly initialized weights are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
# `batch` holds token IDs; the model returns one logit per vocabulary entry
# for every token position.
logits = model(batch)
print("Output shape: ", logits.shape)
print(logits)

Output shape:  torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


## Normalizing Activations with Layer Normalization

In [21]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
        eps: Small constant added to the variance before the square root to
            avoid division by zero. Defaults to 1e-5.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to ones/zeros, so the affine step starts as the identity.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n rather than n - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [22]:
# Testing the normalization layer.
# After normalization each row should have mean ~0 and variance ~1.
# NOTE(review): no seed is set in this cell, so the exact printed values depend
# on how the global RNG was used before it.

batch_example = torch.randn(2, 5) 
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
# Statistics are taken over the last dimension, the one LayerNorm normalizes.
mean = out_ln.mean(dim=-1, keepdim=True)
# unbiased=False matches the variance estimator used in LayerNorm.forward.
var = out_ln.var(dim=-1, unbiased=False, keepdim = True)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[1.1921e-08],
        [0.0000e+00]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


# 2. GELU activation

In [24]:
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``
    element-wise.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Constant factor sqrt(2 / pi), held as a 0-dim tensor.
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [26]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The first linear layer widens the features to four times ``emb_dim`` and
    the second maps them back, so the output shape equals the input shape.

    Args:
        cfg: Mapping providing ``emb_dim``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        # Layers are created in application order so that seeded
        # initialisation stays reproducible.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        contract = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Apply the three layers in sequence to ``x``."""
        return self.layers(x)

In [27]:
# Initialize the feed-forward module (embedding size 768 from GPT_CONFIG_124M)
# and feed it a batch of two samples with three tokens each.

ffn = FeedForward(GPT_CONFIG_124M)
x = torch.rand(2, 3, 768)
out = ffn(x)
# The output shape is expected to match the input shape.
print(out.shape)

torch.Size([2, 3, 768])


# 3. Creating the Transformer block of the GPT

In [28]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Feature size of the inputs.
        d_out: Total feature size of the output; it is split evenly across
            the heads and must be divisible by ``num_heads``.
        context_length: Side length of the pre-computed causal mask, i.e. the
            longest sequence the mask can cover.
        dropout: Drop probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Projections are created in a fixed order so that seeded
        # initialisation stays reproducible.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions a query may not
        # attend to.
        upper_triangle = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", upper_triangle)

    def forward(self, x):
        """Return context vectors of shape (batch, num_tokens, d_out) for an
        input of shape (batch, num_tokens, d_in)."""
        batch, seq_len, _ = x.shape

        def split_heads(projected):
            # (batch, seq, d_out) -> (batch, heads, seq, head_dim)
            return projected.view(
                batch, seq_len, self.num_heads, self.head_dim
            ).transpose(1, 2)

        keys = split_heads(self.W_key(x))
        queries = split_heads(self.W_query(x))
        values = split_heads(self.W_value(x))

        # Per-head dot products between every query and every key.
        scores = queries @ keys.transpose(2, 3)

        # Crop the mask to the current sequence length and set the masked
        # positions to -inf so softmax assigns them zero weight.
        causal = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(causal, -torch.inf)

        weights = torch.softmax(scores / keys.shape[-1] ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # (batch, heads, seq, head_dim) -> (batch, seq, heads, head_dim),
        # then merge the heads back into a single feature dimension.
        merged = (weights @ values).transpose(1, 2)
        merged = merged.contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)

In [29]:
class TransformerBlock(nn.Module):
    """Transformer block: self-attention and feed-forward sub-layers, each
    preceded by LayerNorm and wrapped in a residual (shortcut) connection.

    Args:
        cfg: Mapping providing ``emb_dim``, ``context_length``, ``n_heads``,
            ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        # Sub-modules are created in a fixed order so that seeded
        # initialisation stays reproducible.
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # A single dropout module is shared by both residual branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both residual sub-layers; the output shape equals the input shape."""
        # Attention branch: norm -> attention -> dropout, added to the input.
        attn_branch = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_branch + x

        # Feed-forward branch: norm -> feed-forward -> dropout, added to the input.
        ff_branch = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_branch + x

In [31]:
# Let's instantiate a transformer block and feed it some sample data.
# The seed is set first so the random input and the block's initial weights are reproducible.

torch.manual_seed(123)
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

# The block is expected to preserve the shape of its input.
print("Input shape: ", x.shape)
print("Output shape:", output.shape)

Input shape:  torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


# 4. Coding the GPT model

In [32]:
class GPTModel(nn.Module):
    """GPT model: token and positional embeddings, a stack of transformer
    blocks, a final LayerNorm and a linear head producing vocabulary logits.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_length``, ``emb_dim``,
            ``n_layers`` and ``drop_rate``, plus the keys read by
            ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        # Sub-modules are created in a fixed order so that seeded
        # initialisation stays reproducible.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Separate weight matrix from tok_emb (no weight sharing).
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token IDs of shape (batch, seq_len) to logits of shape
        (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape

        # Position indices live on the same device as the input IDs.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)

        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [33]:
# Initializing the model instance.
# The seed is set first so the randomly initialized weights are reproducible.

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# `batch` holds token IDs; the result has one logit per vocabulary entry for
# every token position.
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape: ", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape:  torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)


In [36]:
# Checking the total number of model parameters.
# GPTModel keeps tok_emb and out_head as separate parameter tensors, so both
# are included in this count.

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters:  {total_params:,}")

Total number of parameters:  163,009,536


In [37]:
# Calculating the memory requirements.
# Sum the actual storage of every parameter (element count times bytes per
# element) instead of assuming 4 bytes each, so the figure stays correct if the
# model is cast to another dtype. This also makes the cell independent of
# `total_params` from the previous cell.

total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


## Generating text

In [41]:
def generate_text_simple(model, idx,max_new_tokens, context_size):
    """Extend ``idx`` by greedily picking the most probable next token.

    Args:
        model: Callable mapping token IDs of shape (batch, n_tokens) to logits
            of shape (batch, n_tokens, vocab_size).
        idx: Tensor of token IDs with shape (batch, n_tokens).
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens passed to the model at
            each step.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens) holding the original
        tokens followed by the generated ones.
    """
    for _ in range(max_new_tokens):
        # Keep only the most recent `context_size` tokens as model input.
        window = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(window)

        # Only the prediction at the last position is needed.
        last_logits = logits[:, -1, :]
        probabilities = torch.softmax(last_logits, dim=-1)
        next_token = torch.argmax(probabilities, dim=-1, keepdim=True)

        # Append the chosen token to the running sequence.
        idx = torch.cat((idx, next_token), dim=1)
    return idx

In [42]:
# Testing out the text generation function.
# Encode the prompt and add a leading batch dimension, because the model's
# forward pass unpacks a 2-D (batch, seq_len) input.

start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [43]:
# Switch to evaluation mode so the nn.Dropout layers are inactive during generation.
model.eval()
out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output:", out)
# out[0] is the single sequence in the batch: the prompt tokens followed by
# the 6 generated ones.
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10


In [44]:
# Drop the batch dimension and convert the token IDs back to text.
# NOTE(review): no training step appears in the cells shown here, so the model
# presumably still has its random initial weights and the continuation is not
# expected to be meaningful.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue
