<a href="https://colab.research.google.com/github/ahinagangopadhyay/Machine-Learning/blob/main/LLM_BuildFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies (most are preinstalled in Colab)
!pip install -q numpy torch tqdm

# Import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Toy training corpus: a single short sentence
text = "hello there general kenobi"

# The vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Two lookup tables: integer id -> character, and character -> integer id
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

def encode(s):
    """Convert a string into a list of integer token ids.

    Each character of `s` is looked up in the module-level `stoi` table.
    A character that is not a key of `stoi` raises KeyError.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids

def decode(indices):
    """Convert an iterable of integer token ids back into a string.

    Each id is looked up in the module-level `itos` table and the resulting
    characters are concatenated. An id that is not a key of `itos` raises
    KeyError.
    """
    pieces = []
    for i in indices:
        pieces.append(itos[i])
    return ''.join(pieces)

# Round-trip check: encoding then decoding must give back the original word
sample_ids = encode("hello")
print("Vocab:", chars)
print("Encoded:", sample_ids)
print("Decoded:", decode(sample_ids))


Vocab: [' ', 'a', 'b', 'e', 'g', 'h', 'i', 'k', 'l', 'n', 'o', 'r', 't']
Encoded: [5, 3, 8, 8, 10]
Decoded: hello


In [3]:
import torch

# Text used for the embedding walkthrough
data = "hello there general kenobi"
block_size = 8  # Number of tokens in one training window

# The whole text as a 1-D tensor of integer token IDs
encoded_data = torch.tensor(encode(data), dtype=torch.long)

# Look at the first window of block_size token IDs
x = encoded_data[:block_size]
print("Token IDs:", x)

# Learned lookup table: one embedding_dim-sized vector per vocabulary entry
embedding_dim = 16
embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Indexing the table with the token IDs yields one vector per token
x_embed = embedding_table(x)
print("Embeddings shape:", x_embed.shape)  # Should be [block_size, embedding_dim]


Token IDs: tensor([ 5,  3,  8,  8, 10,  0, 12,  5])
Embeddings shape: torch.Size([8, 16])


In [4]:
# Learned positional embeddings: one vector per slot of the context window
position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=embedding_dim)

# Slot indices 0, 1, ..., block_size-1
position_ids = torch.arange(0, block_size)

# Look up the vector for every slot
position_embeddings = position_embedding_table(position_ids)

# The model input is the element-wise sum of token and position embeddings
x_final = torch.add(x_embed, position_embeddings)
print("Final input shape:", x_final.shape)


Final input shape: torch.Size([8, 16])


In [5]:
class SelfAttentionHead(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        embedding_dim: size of the last axis of the input.
        head_size: size of the key/query/value projections and of the output.

    Input:  x of shape (B, T, embedding_dim).
    Output: tensor of shape (B, T, head_size).

    Position t only attends to positions <= t. Dropout (p=0.1) is applied to
    the attention weights while the module is in training mode.
    """

    def __init__(self, embedding_dim, head_size):
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        _, T, _ = x.shape  # Batch, Time (sequence), Channels

        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # Attention scores ("affinities"). The dot products are taken over
        # head_size-dimensional vectors, so the scale is sqrt(head_size)
        # (the key dimension d_k), not sqrt(embedding_dim).
        wei = q @ k.transpose(-2, -1) / (self.head_size ** 0.5)  # (B, T, T)

        # Causal mask: lower-triangular, created on the same device as x so
        # the module also works when the input lives on a GPU.
        causal = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
        wei = wei.masked_fill(~causal, float('-inf'))

        # Softmax over the key axis turns scores into attention weights;
        # masked (future) positions get weight exactly 0.
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        # Weighted sum of values
        return wei @ v  # (B, T, head_size)


In [6]:
# x_final is (sequence=8, embedding_dim=16); the attention head unpacks a
# 3-D shape, so add a leading batch axis: x_input is (batch=1, sequence=8, embedding_dim=16)
x_input = x_final[None, ...]

head = SelfAttentionHead(16, 8)  # embedding_dim=16, head_size=8
out = head(x_input)

print("Self-attention output shape:", out.shape)  # Should be [1, 8, 8]


Self-attention output shape: torch.Size([1, 8, 8])


In [7]:
class MultiHeadAttention(nn.Module):
    """Several SelfAttentionHead modules run side by side on the same input.

    The per-head outputs are concatenated along the last axis, passed through
    a linear projection to `embedding_dim`, and then through dropout (p=0.1).

    Args:
        num_heads: how many SelfAttentionHead modules to create.
        embedding_dim: input size of every head and output size of the projection.
        head_size: output size of each individual head.
    """

    def __init__(self, num_heads, embedding_dim, head_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(SelfAttentionHead(embedding_dim, head_size))
        self.heads = nn.ModuleList(head_modules)
        concat_dim = num_heads * head_size
        self.proj = nn.Linear(concat_dim, embedding_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        # Each head sees the full input; results are joined on the feature axis
        per_head = [h(x) for h in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


In [8]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear -> Dropout.

    The hidden layer is four times as wide as `embedding_dim`; the output has
    the same last-axis size as the input. Dropout uses p=0.1.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        hidden_dim = 4 * embedding_dim
        layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),
            nn.Dropout(0.1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Apply the layers of self.net one after another
        for layer in self.net:
            x = layer(x)
        return x


In [9]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention then feed-forward, each residual.

    LayerNorm is applied to the input of each sub-layer (not to its output),
    and the sub-layer result is added back onto the un-normalized stream.

    Args:
        embedding_dim: feature size of the input and output.
        num_heads: number of attention heads; each head gets
            `embedding_dim // num_heads` features.
    """

    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        self.sa = MultiHeadAttention(num_heads, embedding_dim, embedding_dim // num_heads)
        self.ffwd = FeedForward(embedding_dim)
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        # Attention sub-layer with residual connection
        attended = self.sa(self.ln1(x))
        x = x + attended
        # Feed-forward sub-layer with residual connection
        transformed = self.ffwd(self.ln2(x))
        return x + transformed


In [10]:
# Shape check: a transformer block must return the same shape it receives
block = TransformerBlock(16, 4)  # embedding_dim=16, num_heads=4
out = block(x_input)
print("Transformer output shape:", out.shape)  # Should be [1, 8, 16]


Transformer output shape: torch.Size([1, 8, 16])


In [11]:
class GPTMini(nn.Module):
    """Minimal GPT-style language model.

    Token and position embeddings are summed, passed through `n_layers`
    TransformerBlock modules and a final LayerNorm, then projected to
    per-position vocabulary logits.

    Args:
        vocab_size: number of distinct token ids; also the size of the output logits.
        embedding_dim: width of the embeddings and of every block.
        block_size: number of rows in the position-embedding table, i.e. the
            longest sequence the position lookup can index.
        n_heads: attention heads per TransformerBlock.
        n_layers: number of stacked TransformerBlock modules.
    """

    def __init__(self, vocab_size, embedding_dim=32, block_size=8, n_heads=4, n_layers=2):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding_table = nn.Embedding(block_size, embedding_dim)

        layers = []
        for _ in range(n_layers):
            layers.append(TransformerBlock(embedding_dim, n_heads))
        self.blocks = nn.Sequential(*layers)

        self.ln_f = nn.LayerNorm(embedding_dim)  # Final layer norm
        self.lm_head = nn.Linear(embedding_dim, vocab_size)  # Language model head

    def forward(self, idx):
        """Return logits of shape (B, T, vocab_size) for token ids `idx` of shape (B, T)."""
        _, seq_len = idx.shape

        # Positions 0..T-1, created on the same device as the token ids
        positions = torch.arange(seq_len, device=idx.device)
        # (B, T, C) + (T, C) broadcasts over the batch axis
        hidden = self.token_embedding_table(idx) + self.position_embedding_table(positions)

        hidden = self.blocks(hidden)   # All transformer blocks in order
        hidden = self.ln_f(hidden)     # Final layer norm
        return self.lm_head(hidden)    # (B, T, vocab_size)


In [13]:
# Smoke test: push one 8-token prompt through an untrained model
model = GPTMini(vocab_size, embedding_dim=32, block_size=8, n_heads=4, n_layers=2)

# encode() returns a flat list of ids; add a batch axis to get shape (1, 8)
sample_idx = torch.tensor(encode("hello th"), dtype=torch.long).unsqueeze(0)
logits = model(sample_idx)

print("Logits shape:", logits.shape)  # Should be [1, 8, vocab_size]


Logits shape: torch.Size([1, 8, 13])


In [14]:
import random

# Whole corpus as a 1-D tensor of token ids
data = torch.tensor(encode(text), dtype=torch.long)

# Slide a window of block_size tokens over the corpus, one step at a time.
# The target for each window is the same window shifted right by one token.
block_size = 8
window_starts = range(len(data) - block_size)

X = torch.stack([data[s:s + block_size] for s in window_starts])
Y = torch.stack([data[s + 1:s + block_size + 1] for s in window_starts])

print("Input shape:", X.shape)   # [n_samples, block_size]
print("Target shape:", Y.shape)  # same


Input shape: torch.Size([18, 8])
Target shape: torch.Size([18, 8])


In [15]:
from torch.nn import functional as F

# Fresh model + optimizer for training on the (X, Y) windows built above
model = GPTMini(vocab_size=vocab_size, embedding_dim=32, block_size=block_size, n_heads=4, n_layers=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

batch_size = 4
max_iters = 1000
log_interval = 100  # Print the mini-batch loss every this many steps

# Make training mode explicit so dropout is active regardless of what ran before
model.train()

# `step` (not `iter`) so the builtin iter() is not shadowed for later cells
for step in range(max_iters):
    # Sample a random mini-batch of window indices (with replacement)
    ix = torch.randint(0, X.shape[0], (batch_size,))
    x_batch = X[ix]
    y_batch = Y[ix]

    # Forward pass: flatten (B, T, vocab) -> (B*T, vocab) for cross-entropy
    logits = model(x_batch)
    loss = F.cross_entropy(logits.view(-1, vocab_size), y_batch.view(-1))

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # The logged value is the loss of the current mini-batch only, so it is noisy
    if step % log_interval == 0:
        print(f"Step {step}: loss = {loss.item():.4f}")


Step 0: loss = 2.6735
Step 100: loss = 0.8155
Step 200: loss = 0.2828
Step 300: loss = 0.1159
Step 400: loss = 0.1669
Step 500: loss = 0.2494
Step 600: loss = 0.1561
Step 700: loss = 0.0581
Step 800: loss = 0.0850
Step 900: loss = 0.2004


In [16]:
def generate(model, idx, max_new_tokens, context_size=None):
    """Autoregressively sample `max_new_tokens` tokens and append them to `idx`.

    Args:
        model: callable module mapping token ids of shape (B, T) to logits of
            shape (B, T, vocab_size).
        idx: LongTensor of shape (B, T0) holding the prompt token ids.
        max_new_tokens: number of tokens to sample; 0 returns `idx` unchanged.
        context_size: maximum number of trailing tokens fed to the model at
            each step. When None, it is taken from
            `model.position_embedding_table.num_embeddings` if the model has
            that attribute, otherwise from the notebook-level `block_size`.

    Returns:
        LongTensor of shape (B, T0 + max_new_tokens). The input tensor is not
        modified in place.

    Raises:
        ValueError: if the resolved context size is smaller than 1.

    Side effect: the model is switched to eval mode and left there.
    """
    if context_size is None:
        # Prefer the model's own context length over a notebook global, which
        # may have been reassigned since the model was built.
        pos_table = getattr(model, "position_embedding_table", None)
        if pos_table is not None:
            context_size = pos_table.num_embeddings
        else:
            context_size = block_size
    if context_size < 1:
        raise ValueError(f"context_size must be >= 1, got {context_size}")

    model.eval()
    # Sampling needs no gradients; skipping autograd avoids building a graph
    with torch.no_grad():
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_size:]  # Last context_size tokens
            logits = model(idx_cond)
            logits = logits[:, -1, :]  # Take the last token's logits
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
    return idx

# Prompt the trained model with "hello th" and sample 20 more characters
prompt_ids = encode("hello th")
context = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)  # shape (1, 8)
generated = generate(model, context, max_new_tokens=20)
generated_ids = generated[0].tolist()
print("Generated text:", decode(generated_ids))


Generated text: hello there general kenobike
