In [1]:
import torch
from pathlib import Path

# reproducibility
torch.manual_seed(1337)

# device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("device:", device)

# repo root + data path 
REPO_ROOT = Path.cwd()
if (REPO_ROOT / "data").exists() is False and (REPO_ROOT.parent / "data").exists():
    REPO_ROOT = REPO_ROOT.parent

data_path = REPO_ROOT / "data" / "names.txt"
words = data_path.read_text(encoding="utf-8").splitlines()

print("repo root:", REPO_ROOT)
print("num words:", len(words))
print("first 5 words:", words[:5])

device: mps
repo root: /Users/home/Developer/github/makemore-notes
num words: 32033
first 5 words: ['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [None]:
# build vocabulary + train/dev/test split (reproducible)

# vocabulary (same convention: '.' = 0)
chars = sorted(set("".join(words)))
stoi = {ch: i + 1 for i, ch in enumerate(chars)}
stoi["."] = 0
itos = {i: ch for ch, i in stoi.items()}
vocab_size = len(itos)

print("vocab_size:", vocab_size)
print("itos sample:", list(itos.items())[:10])

# reproducible shuffle + split
g_cpu = torch.Generator().manual_seed(1337)
words_shuf = words[:]  # copy
perm = torch.randperm(len(words_shuf), generator=g_cpu).tolist()
words_shuf = [words_shuf[i] for i in perm]

n1 = int(0.8 * len(words_shuf))
n2 = int(0.9 * len(words_shuf))
words_tr = words_shuf[:n1]
words_dev = words_shuf[n1:n2]
words_te = words_shuf[n2:]

print("splits:", len(words_tr), len(words_dev), len(words_te))
print("sample train words:", words_tr[:5])

vocab_size: 27
itos sample: [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e'), (6, 'f'), (7, 'g'), (8, 'h'), (9, 'i'), (10, 'j')]
splits: 25626 3203 3204
sample train words: ['christie', 'edi', 'wallace', 'arieliz', 'aboubacar']


In [3]:
# dataset builder with configurable context length (block_size)

block_size = 3  # context length (Karpathy uses 3 early; we'll grow later)

def build_dataset(words_list: list[str], block_size: int):
    X, Y = [], []
    for w in words_list:
        context = [0] * block_size
        for ch in w + ".":
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # shift window and append
    X = torch.tensor(X, dtype=torch.long, device=device)
    Y = torch.tensor(Y, dtype=torch.long, device=device)
    return X, Y

Xtr, Ytr = build_dataset(words_tr, block_size)
Xdev, Ydev = build_dataset(words_dev, block_size)
Xte, Yte = build_dataset(words_te, block_size)

print("Xtr/Ytr:", Xtr.shape, Ytr.shape)
print("Xdev/Ydev:", Xdev.shape, Ydev.shape)
print("Xte/Yte:", Xte.shape, Yte.shape)

# quick sanity peek
def decode_ctx(ctx):
    return "".join(itos[int(i)] for i in ctx)

for i in [0, 1, 2, 3, 4]:
    print(f"{i:>3}  X='{decode_ctx(Xtr[i])}' -> Y='{itos[int(Ytr[i])]}'")

Xtr/Ytr: torch.Size([182508, 3]) torch.Size([182508])
Xdev/Ydev: torch.Size([22860, 3]) torch.Size([22860])
Xte/Yte: torch.Size([22778, 3]) torch.Size([22778])
  0  X='...' -> Y='c'
  1  X='..c' -> Y='h'
  2  X='.ch' -> Y='r'
  3  X='chr' -> Y='i'
  4  X='hri' -> Y='s'


In [4]:
# define a deeper MLP with BatchNorm (no training yet)

import torch.nn.functional as F

# hyperparameters (explicit and easy to tune)
n_embed = 10
n_hidden = 200

g = torch.Generator(device=device).manual_seed(1337)

# parameters
C = torch.randn((vocab_size, n_embed), generator=g, device=device)

W1 = torch.randn((block_size * n_embed, n_hidden), generator=g, device=device) * (5/3) / (block_size * n_embed) ** 0.5
b1 = torch.zeros((n_hidden,), device=device)

W2 = torch.randn((n_hidden, n_hidden), generator=g, device=device) * (5/3) / n_hidden ** 0.5
b2 = torch.zeros((n_hidden,), device=device)

W3 = torch.randn((n_hidden, vocab_size), generator=g, device=device) * 0.01
b3 = torch.zeros((vocab_size,), device=device)

# BatchNorm parameters (learned scale + shift)
bn_gain = torch.ones((n_hidden,), device=device)
bn_bias = torch.zeros((n_hidden,), device=device)

parameters = [C, W1, b1, W2, b2, W3, b3, bn_gain, bn_bias]
for p in parameters:
    p.requires_grad_(True)

print("num parameters:", sum(p.nelement() for p in parameters))

num parameters: 52497


In [6]:
# forward pass with BatchNorm (train-mode), plus running stats init

# running stats (buffers, not trained by gradient descent)
bn_running_mean = torch.zeros((n_hidden,), device=device)
bn_running_var = torch.ones((n_hidden,), device=device)

bn_momentum = 0.1
bn_eps = 1e-5

def forward(Xb: torch.Tensor, *, train: bool) -> torch.Tensor:
    """
    Xb: (B, block_size) long
    returns logits: (B, vocab_size)
    """
    global bn_running_mean, bn_running_var

    # embeddings
    emb = C[Xb]                           # (B, block_size, n_embed)
    x = emb.view(emb.shape[0], -1)        # (B, block_size*n_embed)

    # layer 1
    h1 = x @ W1 + b1                      # (B, n_hidden)
    h1 = torch.tanh(h1)

    # layer 2 pre-activation
    h2_pre = h1 @ W2 + b2                 # (B, n_hidden)

    # BatchNorm on h2_pre
    if train:
        batch_mean = h2_pre.mean(dim=0, keepdim=False)                   # (n_hidden,)
        batch_var = h2_pre.var(dim=0, unbiased=False, keepdim=False)     # (n_hidden,)

        h2_hat = (h2_pre - batch_mean) / torch.sqrt(batch_var + bn_eps)

        # update running stats (no grad)
        with torch.no_grad():
            bn_running_mean = (1 - bn_momentum) * bn_running_mean + bn_momentum * batch_mean
            bn_running_var  = (1 - bn_momentum) * bn_running_var  + bn_momentum * batch_var
    else:
        h2_hat = (h2_pre - bn_running_mean) / torch.sqrt(bn_running_var + bn_eps)

    # scale + shift (learned)
    h2 = bn_gain * h2_hat + bn_bias
    h2 = torch.tanh(h2)

    # output
    logits = h2 @ W3 + b3                 # (B, vocab_size)
    return logits

# quick sanity forward pass
B = 32
ix = torch.randint(0, Xtr.shape[0], (B,), generator=g, device=device)
logits = forward(Xtr[ix], train=True)
loss = F.cross_entropy(logits, Ytr[ix])

print("logits:", logits.shape, "loss:", loss.item())
print("bn_running_mean/std (first 5):",
      bn_running_mean[:5].tolist(),
      torch.sqrt(bn_running_var[:5]).tolist())

logits: torch.Size([32, 27]) loss: 3.294424533843994
bn_running_mean/std (first 5): [0.07393120229244232, -0.04032566025853157, -0.029199523851275444, -0.029229525476694107, 0.06531120091676712] [1.0036672353744507, 1.0191179513931274, 1.0848532915115356, 1.0036523342132568, 1.0340840816497803]


In [7]:
# training loop (BatchNorm train-mode) + periodic dev evaluation

import math

def split_loss(X: torch.Tensor, Y: torch.Tensor, batch_size: int = 4096) -> float:
    """Average loss over a split (uses BN in eval-mode)."""
    losses = []
    for start in range(0, X.shape[0], batch_size):
        xb = X[start:start+batch_size]
        yb = Y[start:start+batch_size]
        logits = forward(xb, train=False)
        losses.append(F.cross_entropy(logits, yb).detach())
    return torch.stack(losses).mean().item()

# training hyperparams
max_steps = 20_000
batch_size = 32
eval_interval = 1_000

# simple learning-rate schedule (Karpathy-style, but clean)
def lr_at(step: int) -> float:
    return 0.1 if step < 10_000 else 0.01

lossi = []

for step in range(max_steps + 1):
    # minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g, device=device)
    xb, yb = Xtr[ix], Ytr[ix]

    # forward (BN in train mode)
    logits = forward(xb, train=True)
    loss = F.cross_entropy(logits, yb)

    # backward
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = lr_at(step)
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # track
    if step % eval_interval == 0:
        train_loss = split_loss(Xtr, Ytr)
        dev_loss = split_loss(Xdev, Ydev)
        print(f"{step:6d}/{max_steps}  lr={lr:.3g}  train={train_loss:.4f}  dev={dev_loss:.4f}")
        lossi.append(dev_loss)

     0/20000  lr=0.1  train=3.3085  dev=3.3079
  1000/20000  lr=0.1  train=2.3706  dev=2.3782
  2000/20000  lr=0.1  train=2.3264  dev=2.3385
  3000/20000  lr=0.1  train=2.2762  dev=2.2885
  4000/20000  lr=0.1  train=2.2922  dev=2.3094
  5000/20000  lr=0.1  train=2.2549  dev=2.2700
  6000/20000  lr=0.1  train=2.2333  dev=2.2508
  7000/20000  lr=0.1  train=2.2396  dev=2.2614
  8000/20000  lr=0.1  train=2.2337  dev=2.2584
  9000/20000  lr=0.1  train=2.2184  dev=2.2403
 10000/20000  lr=0.01  train=2.2209  dev=2.2471
 11000/20000  lr=0.01  train=2.1487  dev=2.1773
 12000/20000  lr=0.01  train=2.1436  dev=2.1732
 13000/20000  lr=0.01  train=2.1392  dev=2.1680
 14000/20000  lr=0.01  train=2.1381  dev=2.1667
 15000/20000  lr=0.01  train=2.1358  dev=2.1644
 16000/20000  lr=0.01  train=2.1330  dev=2.1624
 17000/20000  lr=0.01  train=2.1293  dev=2.1596
 18000/20000  lr=0.01  train=2.1286  dev=2.1596
 19000/20000  lr=0.01  train=2.1267  dev=2.1578
 20000/20000  lr=0.01  train=2.1258  dev=2.1574


In [8]:
# sample names from the trained BN model (eval-mode forward)

@torch.no_grad()
def sample_names(num_samples: int = 20, max_len: int = 30, temperature: float = 1.0):
    out = []
    for _ in range(num_samples):
        context = [0] * block_size
        name_chars = []
        while True:
            x = torch.tensor([context], dtype=torch.long, device=device)     # (1, block_size)
            logits = forward(x, train=False)                                 # BN uses running stats
            logits = logits / temperature
            probs = torch.softmax(logits, dim=1)                             # (1, vocab_size)

            ix = torch.multinomial(probs, num_samples=1).item()
            if ix == 0:
                break

            name_chars.append(itos[ix])
            context = context[1:] + [ix]

            if len(name_chars) >= max_len:
                break

        out.append("".join(name_chars))
    return out

for t in [0.7, 1.0, 1.3]:
    print(f"\n--- temperature={t} ---")
    for n in sample_names(num_samples=10, temperature=t):
        print(n)


--- temperature=0.7 ---
rayshika
bracolten
madari
samar
quinn
jacott
faon
anleann
damaris
dylani

--- temperature=1.0 ---
joa
taz
malan
rhin
sywadepha
kemae
alynn
amari
taer
kya

--- temperature=1.3 ---
scenney
bod
khyrikzangi
zaxi
nafelsa
briglazaryn
chripm
ratyn
lie
donkianori


In [9]:
# final evaluation (train/dev/test) using BN eval-mode

train_loss = split_loss(Xtr, Ytr)
dev_loss   = split_loss(Xdev, Ydev)
test_loss  = split_loss(Xte, Yte)

print(f"final  train loss: {train_loss:.4f}")
print(f"final    dev loss: {dev_loss:.4f}")
print(f"final   test loss: {test_loss:.4f}")

final  train loss: 2.1258
final    dev loss: 2.1574
final   test loss: 2.1506


In [10]:
# experiment summary

summary = {
    "part": "03",
    "notebook": "04_mlp_deepening_and_normalization.ipynb",
    "device": str(device),
    "block_size": block_size,
    "vocab_size": vocab_size,
    "n_embed": n_embed,
    "n_hidden": n_hidden,
    "max_steps": 20_000,
    "batch_size": 32,
    "lr_schedule": "0.1 for <10k steps, then 0.01",
    "batchnorm": {
        "momentum": bn_momentum,
        "eps": bn_eps,
        "running_stats_used_in_eval": True,
    },
    "loss": {
        "train": float(train_loss),
        "dev": float(dev_loss),
        "test": float(test_loss),
    },
}

for k, v in summary.items():
    print(f"{k}: {v}")

part: 03
notebook: 04_mlp_deepening_and_normalization.ipynb
device: mps
block_size: 3
vocab_size: 27
n_embed: 10
n_hidden: 200
max_steps: 20000
batch_size: 32
lr_schedule: 0.1 for <10k steps, then 0.01
batchnorm: {'momentum': 0.1, 'eps': 1e-05, 'running_stats_used_in_eval': True}
loss: {'train': 2.125819683074951, 'dev': 2.1573522090911865, 'test': 2.1505839824676514}
