## Clean Neural Network Implementation

### Performance log

Step 1
hidden dim 256
1. epochs 5, num_layers 1: 2.173
2. epochs 20, num layers 1: 1.906
    - cpu (Apple M4): 2m 2.3s
    - mps: 2m 12.6s
3. epochs 20, num_layers 2: 1.821
    - mps: 3m 50.8s
4. epochs 20, num_layers 3: 1.793
    - mps: 5m 47.8s
5. epochs 20, num_layers 2, hidden_dim 512: 1.772
    - mps: 5m 52.1s

Step 2
seq len 128, grad clip 1.0
1. seq len 256: 1.737
    - mps: 5m 44.2s

seq len 256, learning rate 2e-3, grad clip 1.0, batch size 128, dropout 0.1

Nan fix: seq len 384, learning rate 1e-4, grad clip 0.1, batch size 32, dropout 0.3: 2.059 still learning
    - mps: 16m 55.4s

2. Add scheduler, 2e-3: 1.735

Base learning rate optimization:
5e-3: 1.771
3e-3: 1.760
1e-3: 1.753

Add weight decay to optimizer, 2e-3: 1.733

3. overlap step original: None
- 64: 1.741
    - 25m 44.9s
- 32: 1.753

4. dropout original: 0.1
- 0.2: 1.731
- 0.3: 1.731

5. model architecture original: GRU
- LSTM: 1.835

In [12]:
import math, os, random, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic=True
    torch.backends.cudnn.benchmark=False

set_seed(42)
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple Silicon GPU (MPS)")
elif torch.cuda.is_available():
    device = torch.device("cuda") 
    print("Using CUDA GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

# device detection
print(f"Training on: {device}")
print(f"Target: Reduce BPC from current baseline\n")

config = {
    "data_path": "data/modern_chronicle.txt",
    "seq_len": 256,
    "batch_size": 128,
    "embedding_dim": 256,
    "hidden_dim": 512,
    "num_layers": 2,
    "dropout": 0.2,
    "rnn_type": "LSTM",
    "num_epochs": 20,
    "learning_rate": 2e-3,
    "grad_clip": 1.0,
    "log_every": 100,
    "sample_every": 100,
    "max_generate": 400,
    "temperature": 0.9,
    "top_k": 40,
    "top_p": 0.9,
    "val_fraction": 0.05,
    "overlap_step": None,
    "save_path": "char_rnn_checkpoint.pt"
}

if config["data_path"] and os.path.exists(config["data_path"]):
    with open(config["data_path"], "r", encoding="utf-8") as f: text = f.read()
else:
    text = "ROMEO:\nBut soft, what light through yonder window breaks?\nIt is the east, and Juliet is the sun.\n"

class CharVocab:
    def __init__(self, text):
        chars = sorted(list(set(text)))
        self.itos = chars
        self.stoi = {c: i for i, c in enumerate(chars)}
    
    def encode(self, s):
        return [self.stoi[c] for c in s if c in self.stoi]
    
    def decode(self, ids):
        return "".join(self.itos[i] for i in ids)

vocab = CharVocab(text)
vocab_size = len(vocab.itos)
data_ids = torch.tensor(vocab.encode(text), dtype=torch.long)
n_total = len(data_ids)
n_val = max(1, int(n_total * config["val_fraction"]))
train_ids = data_ids[:-n_val]
val_ids = data_ids[-n_val:]

class CharChunkDataset(Dataset):
    def __init__(self, ids, seq_len, step=None):
        self.ids = ids
        self.T = seq_len
        self.step = step if step is not None else seq_len
        self.num_chunks = (len(ids) - 1 - seq_len) // self.step + 1
        self.starts = [i * self.step for i in range(self.num_chunks)]
    
    def __len__(self):
        return self.num_chunks
    
    def __getitem__(self, idx):
        s = self.starts[idx]
        return self.ids[s:s + self.T], self.ids[s + 1:s + 1 + self.T]

train_ds = CharChunkDataset(train_ids, config["seq_len"], config["overlap_step"])
val_ds = CharChunkDataset(val_ids, config["seq_len"], config["overlap_step"])
train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False, drop_last=True)

class CharRNN(nn.Module):
    def __init__(self, vocab_size, emb, hidden, layers, dropout, rnn_type="GRU"):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb)
        rnn_cls = {"GRU": nn.GRU, "LSTM": nn.LSTM}[rnn_type.upper()]
        self.rnn = rnn_cls(emb, hidden, num_layers=layers, dropout=dropout if layers > 1 else 0.0, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden, vocab_size)
        self.rnn_type = rnn_type.upper()
        self.layers = layers
        self.hidden = hidden
    
    def forward(self, x, h=None):
        x = self.emb(x)
        x, h = self.rnn(x, h)
        x = self.drop(x)
        return self.fc(x), h
    
    def init_hidden(self, batch_size, device):
        if self.rnn_type == "LSTM":
            return (torch.zeros(self.layers, batch_size, self.hidden, device=device),
                    torch.zeros(self.layers, batch_size, self.hidden, device=device))
        else:
            return torch.zeros(self.layers, batch_size, self.hidden, device=device)

model = CharRNN(vocab_size, config["embedding_dim"], config["hidden_dim"], config["num_layers"], config["dropout"], config["rnn_type"]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"], weight_decay=1e-5)

# scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3
)

def bpc_from_loss(loss_val):
    return loss_val / math.log(2.0)

def evaluate_loss(data_loader):
    model.eval(); total = 0.0; count = 0
    
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            loss = criterion(logits.view(-1, vocab_size), y.view(-1))
            total += loss.item(); count += 1
    
    if count == 0:
        return float("nan"), float("nan")
    
    avg = total / count
    return avg, bpc_from_loss(avg)

def sample_text(model, vocab, max_new_tokens=300, temperature=1.0, top_k=None, top_p=None, prompt="", device="cpu"):
    model.eval()
    
    with torch.no_grad():
        if not prompt:
            prompt = random.choice(vocab.itos)
        
        input_ids = torch.tensor(vocab.encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
        h = None
        out = list(prompt)
        
        for _ in range(max_new_tokens):
            logits, h = model(input_ids, h)
            last_logits = logits[0, -1, :] / max(1e-6, temperature)
            probs = torch.softmax(last_logits, dim=-1)
            
            if top_k is not None:
                k = min(top_k, probs.numel())
                topk_vals, topk_idx = torch.topk(probs, k)
                mask = torch.zeros_like(probs, dtype=torch.bool); mask[topk_idx] = True
                probs = probs.masked_fill(~mask, 0)
            
            if top_p is not None:
                sorted_probs, sorted_idx = torch.sort(probs, descending=True)
                cumsum = torch.cumsum(sorted_probs, dim=0)
                keep = cumsum <= top_p; keep[0] = True
                filtered = torch.zeros_like(sorted_probs).masked_scatter(keep, sorted_probs[keep])
                probs = torch.zeros_like(probs).scatter(0, sorted_idx, filtered)
            
            s = probs.sum()
            
            if s <= 0 or torch.isnan(s):
                next_id = torch.argmax(last_logits)
            
            else:
                probs = probs / s
                next_id = torch.multinomial(probs, 1).item()
            
            out.append(vocab.itos[int(next_id)])
            input_ids = torch.tensor([[next_id]], device=device)
        
        return "".join(out)

global_step = 0; best_val = float("inf")

for epoch in range(1, config["num_epochs"] + 1):
    model.train()
    running = 0.0
    
    for i, (x, y) in enumerate(train_loader, start=1):
        x, y = x.to(device), y.to(device)
        logits, _ = model(x)
        loss = criterion(logits.view(-1, vocab_size), y.view(-1))
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        
        if config["grad_clip"] is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_clip"])
        
        optimizer.step()
        running += loss.item(); global_step += 1
        
        if global_step % config["log_every"] == 0:
            avg_loss = running / config["log_every"]
            avg_bpc = bpc_from_loss(avg_loss)
            # epoch num and bpc
            print(f"Epoch {epoch} | Training BPC: {avg_bpc:.3f}")
            running = 0.0
        
        if global_step % config["sample_every"] == 0:
            print("\n--- Generated Text ---")
            sample = sample_text(model, vocab, max_new_tokens=config["max_generate"], temperature=config["temperature"], top_k=config["top_k"], top_p=config["top_p"], prompt="ROMEO:", device=device)
            # only first 200 chars
            print(sample[:200] + "..." if len(sample) > 200 else sample)
            print("---\n")
    
    val_loss, val_bpc = evaluate_loss(val_loader)
    print(f"Epoch {epoch} Complete | Validation BPC: {val_bpc:.3f}")
    
    if val_loss < best_val:
        best_val = val_loss
        torch.save({"model_state": model.state_dict(), "config": config, "stoi": vocab.stoi, "itos": vocab.itos}, config["save_path"])
        print(f"New best model saved (BPC: {val_bpc:.3f})")
    # after validation
    scheduler.step(val_loss)
    print()

print(f"Training Complete")
print(f"Best Validation BPC: {bpc_from_loss(best_val):.3f}")

Using Apple Silicon GPU (MPS)
Training on: mps
Target: Reduce BPC from current baseline

Epoch 1 Complete | Validation BPC: 3.174
New best model saved (BPC: 3.174)

Epoch 2 Complete | Validation BPC: 2.643
New best model saved (BPC: 2.643)

Epoch 3 Complete | Validation BPC: 2.390
New best model saved (BPC: 2.390)

Epoch 4 | Training BPC: 0.525

--- Generated Text ---
ROMEO: Poth sirtrip criatatured to the wouldn so the had shew speid, owred and a man we know-cansualaty andrestode took as
she asdastias, to atring in Honora candole as he sittamstly his
resturing,
af...
---

Epoch 4 Complete | Validation BPC: 2.241
New best model saved (BPC: 2.241)

Epoch 5 Complete | Validation BPC: 2.145
New best model saved (BPC: 2.145)

Epoch 6 Complete | Validation BPC: 2.073
New best model saved (BPC: 2.073)

Epoch 7 Complete | Validation BPC: 2.020
New best model saved (BPC: 2.020)

Epoch 8 | Training BPC: 0.363

--- Generated Text ---
ROMEO:. My enterious asked his argulamed came appranatimently 

In [16]:
import torch

# Load checkpoint
checkpoint_path = "char_rnn_checkpoint.pt"
device = torch.device("mps" if torch.backends.mps.is_available()
                      else "cuda" if torch.cuda.is_available()
                      else "cpu")

checkpoint = torch.load(checkpoint_path, map_location=device)

# Rebuild the model with saved config
model = CharRNN(
    vocab_size=len(checkpoint["itos"]),
    emb=checkpoint["config"]["embedding_dim"],
    hidden=checkpoint["config"]["hidden_dim"],
    layers=checkpoint["config"]["num_layers"],
    dropout=checkpoint["config"]["dropout"],
    rnn_type=checkpoint["config"]["rnn_type"]
).to(device)

# Load weights
model.load_state_dict(checkpoint["model_state"])
model.eval()

# Restore vocab mappings
itos = checkpoint["itos"]
stoi = checkpoint["stoi"]

class CharVocab:
    def __init__(self, itos, stoi):
        self.itos = itos
        self.stoi = stoi
    def encode(self, s): return [self.stoi[c] for c in s if c in self.stoi]
    def decode(self, ids): return "".join(self.itos[i] for i in ids)

vocab = CharVocab(itos, stoi)

# --- Generate text ---
sample = sample_text(
    model,
    vocab,
    max_new_tokens=400,              # how many characters to generate
    temperature=0.9,                 # randomness (lower = more deterministic)
    top_k=40,                        # restrict to top-K predictions
    top_p=0.9,                       # nucleus sampling
    prompt="ROMEO:",                 # optional starting text
    device=device
)

print("\n--- Generated Text ---")
print(sample)
print("--- End ---")


--- Generated Text ---
ROMEO: Honora golded herself to relate that he was not presently, presently the
calls a scires of the forested wall was the memory of the fashionable
experience in the carved slopes. He was still anything in spite of summer
singing at her with a long true, when the ball of strength had been
started, and she caught her arm at the terrace at any time something of
herself. She was to read a subject of its 
--- End ---


- modern chronicle:
- tinyshakespeare:

In [15]:
import math, os, random, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic=True
    torch.backends.cudnn.benchmark=False

set_seed(42)
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple Silicon GPU (MPS)")
elif torch.cuda.is_available():
    device = torch.device("cuda") 
    print("Using CUDA GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

# device detection
print(f"Training on: {device}")
print(f"Target: Reduce BPC from current baseline\n")

config = {
    "data_path": "data/modern_chronicle.txt",
    "seq_len": 256,
    "batch_size": 128,
    "embedding_dim": 256,
    "hidden_dim": 512,
    "num_layers": 2,
    # Slightly more regularization to help generalization
    "dropout": 0.25,
    "rnn_type": "GRU",
    # Let scheduler work across more epochs; best checkpoint is saved
    "num_epochs": 30,
    # Lower LR for steadier convergence on GRU char models
    "learning_rate": 1e-3,
    # Tighter clip to stabilize updates
    "grad_clip": 0.5,
    "log_every": 100,
    "sample_every": 200,
    "max_generate": 400,
    "temperature": 0.9,
    "top_k": 40,
    "top_p": 0.9,
    # Larger val split for a more stable plateau signal
    "val_fraction": 0.10,
    # Overlap for better context continuity (train & val share this in current logic)
    "overlap_step": 64,
    "save_path": "char_rnn_checkpoint.pt"
}

if config["data_path"] and os.path.exists(config["data_path"]):
    with open(config["data_path"], "r", encoding="utf-8") as f: text = f.read()
else:
    text = "ROMEO:\nBut soft, what light through yonder window breaks?\nIt is the east, and Juliet is the sun.\n"

class CharVocab:
    def __init__(self, text):
        chars = sorted(list(set(text)))
        self.itos = chars
        self.stoi = {c: i for i, c in enumerate(chars)}
    
    def encode(self, s):
        return [self.stoi[c] for c in s if c in self.stoi]
    
    def decode(self, ids):
        return "".join(self.itos[i] for i in ids)

vocab = CharVocab(text)
vocab_size = len(vocab.itos)
data_ids = torch.tensor(vocab.encode(text), dtype=torch.long)
n_total = len(data_ids)
n_val = max(1, int(n_total * config["val_fraction"]))
train_ids = data_ids[:-n_val]
val_ids = data_ids[-n_val:]

class CharChunkDataset(Dataset):
    def __init__(self, ids, seq_len, step=None):
        self.ids = ids
        self.T = seq_len
        self.step = step if step is not None else seq_len
        self.num_chunks = (len(ids) - 1 - seq_len) // self.step + 1
        self.starts = [i * self.step for i in range(self.num_chunks)]
    
    def __len__(self):
        return self.num_chunks
    
    def __getitem__(self, idx):
        s = self.starts[idx]
        return self.ids[s:s + self.T], self.ids[s + 1:s + 1 + self.T]

train_ds = CharChunkDataset(train_ids, config["seq_len"], config["overlap_step"])
val_ds = CharChunkDataset(val_ids, config["seq_len"], config["overlap_step"])
train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False, drop_last=True)

class CharRNN(nn.Module):
    def __init__(self, vocab_size, emb, hidden, layers, dropout, rnn_type="GRU"):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb)
        rnn_cls = {"GRU": nn.GRU, "LSTM": nn.LSTM}[rnn_type.upper()]
        self.rnn = rnn_cls(emb, hidden, num_layers=layers, dropout=dropout if layers > 1 else 0.0, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden, vocab_size)
        self.rnn_type = rnn_type.upper()
        self.layers = layers
        self.hidden = hidden
    
    def forward(self, x, h=None):
        x = self.emb(x)
        x, h = self.rnn(x, h)
        x = self.drop(x)
        return self.fc(x), h
    
    def init_hidden(self, batch_size, device):
        if self.rnn_type == "LSTM":
            return (torch.zeros(self.layers, batch_size, self.hidden, device=device),
                    torch.zeros(self.layers, batch_size, self.hidden, device=device))
        else:
            return torch.zeros(self.layers, batch_size, self.hidden, device=device)

model = CharRNN(vocab_size, config["embedding_dim"], config["hidden_dim"], config["num_layers"], config["dropout"], config["rnn_type"]).to(device)
criterion = nn.CrossEntropyLoss()

# Stronger L2 helps generalization; allowed as fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"], weight_decay=1e-4)

# scheduler (tighter patience, min_lr, cooldown) â€” still same logic location (epoch-end)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2, threshold=1e-3, cooldown=1, min_lr=1e-5,
)

def bpc_from_loss(loss_val):
    return loss_val / math.log(2.0)

def evaluate_loss(data_loader):
    model.eval(); total = 0.0; count = 0
    
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            loss = criterion(logits.view(-1, vocab_size), y.view(-1))
            total += loss.item(); count += 1
    
    if count == 0:
        return float("nan"), float("nan")
    
    avg = total / count
    return avg, bpc_from_loss(avg)

def sample_text(model, vocab, max_new_tokens=300, temperature=1.0, top_k=None, top_p=None, prompt="", device="cpu"):
    model.eval()
    
    with torch.no_grad():
        if not prompt:
            prompt = random.choice(vocab.itos)
        
        input_ids = torch.tensor(vocab.encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
        h = None
        out = list(prompt)
        
        for _ in range(max_new_tokens):
            logits, h = model(input_ids, h)
            last_logits = logits[0, -1, :] / max(1e-6, temperature)
            probs = torch.softmax(last_logits, dim=-1)
            
            if top_k is not None:
                k = min(top_k, probs.numel())
                topk_vals, topk_idx = torch.topk(probs, k)
                mask = torch.zeros_like(probs, dtype=torch.bool); mask[topk_idx] = True
                probs = probs.masked_fill(~mask, 0)
            
            if top_p is not None:
                sorted_probs, sorted_idx = torch.sort(probs, descending=True)
                cumsum = torch.cumsum(sorted_probs, dim=0)
                keep = cumsum <= top_p; keep[0] = True
                filtered = torch.zeros_like(sorted_probs).masked_scatter(keep, sorted_probs[keep])
                probs = torch.zeros_like(probs).scatter(0, sorted_idx, filtered)
            
            s = probs.sum()
            
            if s <= 0 or torch.isnan(s):
                next_id = torch.argmax(last_logits)
            
            else:
                probs = probs / s
                next_id = torch.multinomial(probs, 1).item()
            
            out.append(vocab.itos[int(next_id)])
            input_ids = torch.tensor([[next_id]], device=device)
        
        return "".join(out)

global_step = 0; best_val = float("inf")

for epoch in range(1, config["num_epochs"] + 1):
    model.train()
    running = 0.0
    
    for i, (x, y) in enumerate(train_loader, start=1):
        x, y = x.to(device), y.to(device)
        logits, _ = model(x)
        loss = criterion(logits.view(-1, vocab_size), y.view(-1))
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        
        if config["grad_clip"] is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_clip"])
        
        optimizer.step()
        running += loss.item(); global_step += 1
        
        if global_step % config["log_every"] == 0:
            avg_loss = running / config["log_every"]
            avg_bpc = bpc_from_loss(avg_loss)
            # epoch num and bpc
            print(f"Epoch {epoch} | Training BPC: {avg_bpc:.3f}")
            running = 0.0
        
        if global_step % config["sample_every"] == 0:
            print("\n--- Generated Text ---")
            sample = sample_text(model, vocab, max_new_tokens=config["max_generate"], temperature=config["temperature"], top_k=config["top_k"], top_p=config["top_p"], prompt="ROMEO:", device=device)
            # only first 200 chars
            print(sample[:200] + "..." if len(sample) > 200 else sample)
            print("---\n")
    
    val_loss, val_bpc = evaluate_loss(val_loader)
    print(f"Epoch {epoch} Complete | Validation BPC: {val_bpc:.3f}")
    
    if val_loss < best_val:
        best_val = val_loss
        torch.save({"model_state": model.state_dict(), "config": config, "stoi": vocab.stoi, "itos": vocab.itos}, config["save_path"])
        print(f"New best model saved (BPC: {val_bpc:.3f})")
    # after validation
    scheduler.step(val_loss)
    print()

print(f"Training Complete")
print(f"Best Validation BPC: {bpc_from_loss(best_val):.3f}")

Using Apple Silicon GPU (MPS)
Training on: mps
Target: Reduce BPC from current baseline



KeyboardInterrupt: 