In [None]:
# ================================
# Mini-GPT for Text Generation
# ================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random

# ------------------
# 1️⃣ Set random seeds
# ------------------
# Seed PyTorch, NumPy and the stdlib RNG (in that order) with the same value
# so repeated runs draw the same random numbers.
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(42)

# ------------------
# 2️⃣ Sample dataset
# ------------------
# Training corpus: four sentences joined into a single string. Each piece
# except the last carries its own trailing space.
text = "".join((
    "Once upon a time, in a land far away, there was a small village. ",
    "In this village, lived a young girl named Alice who loved adventures. ",
    "Every day, she would explore the forest, discovering new secrets and magical creatures. ",
    "One day, she stumbled upon a mysterious door hidden behind the trees.",
))

# ------------------
# 3️⃣ Tokenization
# ------------------
# Character-level vocabulary: every distinct character in the corpus, sorted,
# with its position in that ordering used as its integer id.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = dict(zip(chars, range(vocab_size)))  # character -> id
itos = dict(enumerate(chars))               # id -> character

def encode(s):
    """Convert a string into a list of integer ids, one per character.

    Each character is looked up in ``stoi``; a character that is not a key
    of ``stoi`` raises ``KeyError``.
    """
    ids = [stoi[ch] for ch in s]
    return ids

def decode(l):
    """Convert a sequence of integer ids back into a string.

    Each id is looked up in ``itos`` and the resulting characters are
    concatenated; an id that is not a key of ``itos`` raises ``KeyError``.
    """
    return "".join(itos[token] for token in l)

# Whole corpus as a 1-D tensor of int64 character ids.
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% of the ids are used for training, the remainder for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

# ------------------
# 4️⃣ Hyperparameters
# ------------------
# Batch geometry: sequences sampled per step, and context size of each one.
batch_size, block_size = 4, 8

# Model shape: embedding width, attention heads per block, number of blocks.
embedding_dim, n_heads, n_layers = 32, 4, 2
dropout = 0.1

# Optimisation: learning rate and number of training-loop iterations.
lr, epochs = 1e-2, 1000

# Use the GPU when PyTorch reports one is available, otherwise the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# ------------------
# 5️⃣ Data loader
# ------------------
def get_batch(data, batch_size, block_size):
    """Sample a random mini-batch of input/target windows from ``data``.

    Draws ``batch_size`` random start offsets. For each one the input is the
    ``block_size`` ids starting at the offset, and the target is the same
    window shifted one position to the right.

    Returns:
        ``(x, y)``: two stacked tensors, both moved to the module-level
        ``device``.
    """
    offsets = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([data[o:o + block_size] for o in offsets])
    targets = torch.stack([data[o + 1:o + 1 + block_size] for o in offsets])
    return inputs.to(device), targets.to(device)

# ------------------
# 6️⃣ Mini-GPT Model
# ------------------
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    The input is projected into key, query and value spaces of width
    ``head_size``. Every position attends to itself and to earlier positions
    only; later positions are masked out before the softmax.

    Args:
        head_size: Output width of the key/query/value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        # Lower-triangular matrix of ones used as the causal mask. Registered
        # as a buffer so it is module state but not a trainable parameter.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (B, T, C).

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Scaled dot-product attention divides by sqrt(d_k), where d_k is the
        # width of the key/query vectors (head_size) -- not the width of the
        # incoming embedding, which is what was used before.
        scale = k.shape[-1] ** -0.5
        wei = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)
        # Hide future positions: entries above the diagonal become -inf and
        # therefore receive zero weight after the softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        v = self.value(x)  # (B, T, head_size)
        return wei @ v

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then merged.

    Args:
        num_heads: Number of ``Head`` modules to create.
        head_size: Width handed to each ``Head``.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(num_heads * head_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout.

    Args:
        embedding_dim: Input and output width of the MLP.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        hidden_dim = 4 * embedding_dim
        layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer receives a layer-normalised copy of its input, and its
    output is added back onto that input (residual connection).

    Args:
        embedding_dim: Width of the token representations.
        n_heads: Number of attention heads; each head gets
            ``embedding_dim // n_heads`` channels.
    """

    def __init__(self, embedding_dim, n_heads):
        super().__init__()
        self.sa = MultiHeadAttention(n_heads, embedding_dim // n_heads)
        self.ffwd = FeedForward(embedding_dim)
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class MiniGPT(nn.Module):
    """Small decoder-only Transformer language model over integer token ids.

    Token and position embeddings are summed, passed through ``n_layers``
    ``Block`` modules and a final LayerNorm, then projected to one logit per
    vocabulary entry.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding_table = nn.Embedding(block_size, embedding_dim)
        self.blocks = nn.Sequential(*[Block(embedding_dim, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(embedding_dim)
        self.lm_head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: Integer tensor of shape (B, T) holding token ids.
            targets: Optional integer tensor of shape (B, T) with the ids the
                model should predict at each position.

        Returns:
            ``(logits, loss)``. ``logits`` has shape (B, T, vocab_size).
            ``loss`` is ``None`` when ``targets`` is ``None``, otherwise the
            cross-entropy over all B*T positions.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Create the position ids on the same device as the input rather than
        # on the module-level `device` global, so the forward pass works
        # wherever the model and its inputs actually live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb  # (T, C) broadcasts over the batch axis
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            # cross_entropy is fed one row per position, so flatten the
            # batch and time axes together.
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` ids after ``idx``.

        Sampling runs in eval mode so dropout is disabled; the module's
        previous train/eval mode is restored before returning.

        Args:
            idx: Integer tensor of shape (B, T) used as the prompt.
            max_new_tokens: Number of ids to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only has `block_size` rows, so feed at
                # most the last `block_size` ids.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # logits for the final position only
                probs = F.softmax(logits, dim=-1)
                next_idx = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, next_idx), dim=1)
        finally:
            self.train(was_training)
        return idx

# Build the network and move it to the selected device, then create an AdamW
# optimizer over all of its parameters.
model = MiniGPT()
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

# ------------------
# 7️⃣ Training loop
# ------------------
# Each "epoch" here is one optimisation step on one freshly sampled mini-batch.
for epoch in range(epochs):
    xb, yb = get_batch(train_data, batch_size=batch_size, block_size=block_size)
    logits, loss = model(xb, targets=yb)

    # Clear the previous step's gradients, backpropagate, update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch on every 50th step.
    if epoch % 50 != 0:
        continue
    print(f"Epoch {epoch}, Loss {loss.item():.4f}")

# ------------------
# 8️⃣ Generate text
# ------------------
# Encode the prompt and add a leading batch axis, giving shape (1, T).
context = torch.tensor(encode("Alice"), dtype=torch.long, device=device).unsqueeze(0)

# Sample in eval mode so dropout is switched off, and under no_grad so no
# autograd graph is recorded. Afterwards put the model back into whichever
# mode it was in.
was_training = model.training
model.eval()
with torch.no_grad():
    generated_idx = model.generate(context, max_new_tokens=50)[0].tolist()
model.train(was_training)
print("Generated Text:\n", decode(generated_idx))


Epoch 0, Loss 3.5432
Epoch 50, Loss 2.2185
Epoch 100, Loss 1.6028
Epoch 150, Loss 1.5447
Epoch 200, Loss 1.8483
Epoch 250, Loss 1.4301
Epoch 300, Loss 0.8368
Epoch 350, Loss 1.2287
Epoch 400, Loss 1.2923
Epoch 450, Loss 0.9025
Epoch 500, Loss 1.0508
Epoch 550, Loss 0.7236
Epoch 600, Loss 1.0718
Epoch 650, Loss 0.4333
Epoch 700, Loss 0.7527
Epoch 750, Loss 0.6715
Epoch 800, Loss 0.7509
Epoch 850, Loss 0.8164
Epoch 900, Loss 0.6230
Epoch 950, Loss 0.4751
Epoch 1000, Loss 0.5099
Epoch 1050, Loss 0.9172
Epoch 1100, Loss 0.4152
Epoch 1150, Loss 0.6441
Epoch 1200, Loss 0.5952
Epoch 1250, Loss 0.5502
Epoch 1300, Loss 0.5558
Epoch 1350, Loss 0.4457
Epoch 1400, Loss 0.5304
Epoch 1450, Loss 0.5324
Epoch 1500, Loss 0.5823
Epoch 1550, Loss 0.5969
Epoch 1600, Loss 0.7007
Epoch 1650, Loss 0.4960
Epoch 1700, Loss 0.4824
Epoch 1750, Loss 0.6776
Epoch 1800, Loss 0.4685
Epoch 1850, Loss 0.2972
Epoch 1900, Loss 0.4093
Epoch 1950, Loss 0.5261
Epoch 2000, Loss 0.6645
Epoch 2050, Loss 0.3795
Epoch 2100, Los