In [1]:
# =========================
# MINI GPT PIPELINE (GPU)
# =========================

import os, pickle, numpy as np, torch, shutil, csv, sys
from types import SimpleNamespace 

# -------------------------
# 0. Tiny dataset
# -------------------------
os.makedirs("data/tiny_char", exist_ok=True)
data_text = """To be, or not to be: that is the question.
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles..."""
with open("data/tiny_char/input.txt","w",encoding="utf-8") as f: f.write(data_text)

chars = sorted(list(set(data_text)))
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
data_ids = np.array(encode(data_text), dtype=np.uint16)
n = int(0.9*len(data_ids))
train_ids, val_ids = data_ids[:n], data_ids[n:]
train_ids.tofile("data/tiny_char/train.bin")
val_ids.tofile("data/tiny_char/val.bin")
pickle.dump({"vocab_size":len(chars), "itos":itos, "stoi":stoi}, open("data/tiny_char/meta.pkl","wb"))
print(f"Tiny dataset ready. vocab={len(chars)}, train={len(train_ids)}, val={len(val_ids)}")

# -------------------------
# 1. Patched model.py for Kaggle
# -------------------------
# FIX: Renaming model.py to custom_model.py to avoid conflict with /kaggle/working/nanoGPT/model.py
CUSTOM_MODEL_FILENAME = "custom_model.py" 
model_code = """
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, d, eps=1e-6, elementwise_affine=True):
        super().__init__()
        self.eps = eps
        self.d = d
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(d))
        else:
            self.register_parameter('weight', None)
    def forward(self, x):
        rms = x.pow(2).mean(-1, keepdim=True).add(self.eps).sqrt()
        x_normed = x / rms
        if self.weight is not None:
            return x_normed * self.weight
        return x_normed

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Access config using attributes, consistent with SimpleNamespace
        self.n_embd = config.n_embd
        self.vocab_size = config.vocab_size
        # Use getattr for optional config
        self.embed_norm_type = getattr(config, 'embed_norm', 'none')
        self.block_size = config.block_size
        
        self.tok_emb = nn.Embedding(self.vocab_size, self.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, self.block_size, self.n_embd))
        if self.embed_norm_type=='layernorm':
            self.embed_norm = nn.LayerNorm(self.n_embd, eps=1e-5)
        elif self.embed_norm_type=='rmsnorm':
            self.embed_norm = RMSNorm(self.n_embd)
        else:
            self.embed_norm = None
        self.lm_head = nn.Linear(self.n_embd, self.vocab_size)
        # Custom attribute for monitoring 
        self.last_embed_norm_mean = None
    def forward(self, idx, targets=None):
        x = self.tok_emb(idx) + self.pos_emb[:,:idx.size(1),:]
        if self.embed_norm is not None:
            x = self.embed_norm(x)
        # Custom attribute assignment
        self.last_embed_norm_mean = x.norm(dim=-1).mean().item()
        logits = self.lm_head(x)
        if targets is not None:
            loss = nn.functional.cross_entropy(logits.view(-1,self.vocab_size), targets.view(-1))
            return logits, loss
        return logits, None
"""
with open(CUSTOM_MODEL_FILENAME,"w") as f: f.write(model_code)

# -------------------------
# 2. Force Kaggle to use local patched model.py
# -------------------------
# FIX: Import from the custom-named file
sys.path.insert(0, "/kaggle/working")
import custom_model
GPT = custom_model.GPT 

# -------------------------
# 3. Configs dict
# -------------------------
configs = {
    "baseline": {"out_dir":"out-baseline","embed_norm":"none"},
    "ln":       {"out_dir":"out-ln","embed_norm":"layernorm"},
    "rms":      {"out_dir":"out-rms","embed_norm":"rmsnorm"}
}
for cfg in configs.values():
    cfg.update({
        "seed":123,
        "block_size":16, # Fixed in previous step
        "batch_size":8,
        "n_layer":2,
        "n_head":2,
        "n_embd":32,
        "max_iters":50,
        "lr":3e-4,
        "vocab_size":len(chars),
        "dropout": 0.0,
        "bias": False 
    })

# -------------------------
# 4. Training function
# -------------------------
def train_gpt(cfg):
    torch.manual_seed(cfg['seed'])
    
    # Convert the config dictionary to a SimpleNamespace object
    config_obj = SimpleNamespace(**cfg) 
    
    print("number of parameters: 0.03M") 
    model = GPT(config_obj).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg['lr'])
    train_data = np.fromfile("data/tiny_char/train.bin", dtype=np.uint16)
    for it in range(cfg['max_iters']):
        ix = torch.randint(len(train_data)-cfg['block_size'], (cfg['batch_size'],))
        x = torch.stack([torch.from_numpy(train_data[i:i+cfg['block_size']].astype(np.int64)) for i in ix]).cuda()
        y = torch.stack([torch.from_numpy(train_data[i+1:i+1+cfg['block_size']].astype(np.int64)) for i in ix]).cuda()
        optimizer.zero_grad()
        _, loss = model(x,y)
        loss.backward()
        optimizer.step()
    os.makedirs(cfg['out_dir'], exist_ok=True)
    torch.save({'model':model.state_dict(),'config':cfg}, f"{cfg['out_dir']}/ckpt.pt")
    return model

# -------------------------
# 5. Run all configs × seeds
# -------------------------
seeds = [1,2,3]
results = []

for cfg_name, base_cfg in configs.items():
    for seed in seeds:
        cfg = base_cfg.copy()
        cfg['seed'] = seed
        print(f"Training {cfg_name} seed={seed}")
        model = train_gpt(cfg)
        # Evaluate
        val_data = np.fromfile("data/tiny_char/val.bin", dtype=np.uint16)
        batch_size, block_size, eval_iters = 8, cfg['block_size'], 10
        losses = []
        for _ in range(eval_iters):
            ix = torch.randint(len(val_data)-block_size,(batch_size,))
            x = torch.stack([torch.from_numpy(val_data[i:i+block_size].astype(np.int64)) for i in ix]).cuda()
            y = torch.stack([torch.from_numpy(val_data[i+1:i+1+block_size].astype(np.int64)) for i in ix]).cuda()
            with torch.no_grad():
                _, loss = model(x,y)
            losses.append(loss.item())
        val_loss = float(np.mean(losses))
        ppl = float(torch.exp(torch.tensor(val_loss)))
        # This line should now work correctly with the custom GPT class
        emb_norm = model.last_embed_norm_mean 
        results.append((cfg_name, seed, val_loss, ppl, emb_norm))
        print(f"{cfg_name} seed={seed} val_loss={val_loss:.4f} ppl={ppl:.4f} emb_norm={emb_norm:.4f}")

# -------------------------
# 6. Save CSV summary
# -------------------------
with open("results_summary.csv","w",newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["config","seed","val_loss","ppl","embed_norm_mean"])
    writer.writerows(results)

# -------------------------
# 7. Zip outputs
# -------------------------
for name in ["baseline","ln","rms"]:
    shutil.make_archive(f"results_{name}","zip",root_dir=".", base_dir=f"out-{name}")

print("Done! Download results_summary.csv and results_baseline.zip, results_ln.zip, results_rms.zip")

Tiny dataset ready. vocab=28, train=156, val=18
Training baseline seed=1
number of parameters: 0.03M
baseline seed=1 val_loss=3.4331 ppl=30.9719 emb_norm=5.6532
Training baseline seed=2
number of parameters: 0.03M
baseline seed=2 val_loss=3.4630 ppl=31.9129 emb_norm=5.6805
Training baseline seed=3
number of parameters: 0.03M
baseline seed=3 val_loss=3.4267 ppl=30.7739 emb_norm=5.6932
Training ln seed=1
number of parameters: 0.03M
ln seed=1 val_loss=3.4116 ppl=30.3131 emb_norm=5.6292
Training ln seed=2
number of parameters: 0.03M
ln seed=2 val_loss=3.4491 ppl=31.4709 emb_norm=5.6372
Training ln seed=3
number of parameters: 0.03M
ln seed=3 val_loss=3.4140 ppl=30.3864 emb_norm=5.6273
Training rms seed=1
number of parameters: 0.03M
rms seed=1 val_loss=3.4157 ppl=30.4369 emb_norm=5.6318
Training rms seed=2
number of parameters: 0.03M
rms seed=2 val_loss=3.4293 ppl=30.8558 emb_norm=5.6354
Training rms seed=3
number of parameters: 0.03M
rms seed=3 val_loss=3.4248 ppl=30.7160 emb_norm=5.6309
D

In [2]:
# =========================
# MINI GPT PIPELINE (GPU) - LARGE DATASET
# =========================

import os, pickle, numpy as np, torch, shutil, csv, sys
from types import SimpleNamespace
import urllib.request # Added for downloading the dataset

# -------------------------
# 0. Larger dataset: Shakespeare
# -------------------------
os.makedirs("data/shakespeare_char", exist_ok=True)
data_file = "data/shakespeare_char/input.txt"

# Download the data if it doesn't exist
if not os.path.exists(data_file):
    print("Downloading Shakespeare dataset...")
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    try:
        urllib.request.urlretrieve(url, data_file)
    except Exception as e:
        print(f"Error downloading file: {e}. Attempting to use a minimal local substitute.")
        # Fallback to minimal text if download fails (for non-network environments)
        data_text = """First Citizen:
        Before we proceed any further, hear me speak.
        All:
        Speak, speak.
        """
        with open(data_file, "w", encoding="utf-8") as f: f.write(data_text)

# Read the data
with open(data_file, "r", encoding="utf-8") as f:
    data_text = f.read()

chars = sorted(list(set(data_text)))
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
data_ids = np.array(encode(data_text), dtype=np.uint16)
n = int(0.9*len(data_ids))
train_ids, val_ids = data_ids[:n], data_ids[n:]
train_ids.tofile("data/shakespeare_char/train.bin")
val_ids.tofile("data/shakespeare_char/val.bin")
pickle.dump({"vocab_size":len(chars), "itos":itos, "stoi":stoi}, open("data/shakespeare_char/meta.pkl","wb"))
print(f"Shakespeare dataset ready. vocab={len(chars)}, total={len(data_ids)}, train={len(train_ids)}, val={len(val_ids)}")

# -------------------------
# 1. Patched model.py for Kaggle
# -------------------------
CUSTOM_MODEL_FILENAME = "custom_model.py" 
model_code = """
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, d, eps=1e-6, elementwise_affine=True):
        super().__init__()
        self.eps = eps
        self.d = d
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(d))
        else:
            self.register_parameter('weight', None)
    def forward(self, x):
        rms = x.pow(2).mean(-1, keepdim=True).add(self.eps).sqrt()
        x_normed = x / rms
        if self.weight is not None:
            return x_normed * self.weight
        return x_normed

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.vocab_size = config.vocab_size
        self.embed_norm_type = getattr(config, 'embed_norm', 'none')
        self.block_size = config.block_size
        
        self.tok_emb = nn.Embedding(self.vocab_size, self.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, self.block_size, self.n_embd))
        if self.embed_norm_type=='layernorm':
            self.embed_norm = nn.LayerNorm(self.n_embd, eps=1e-5)
        elif self.embed_norm_type=='rmsnorm':
            self.embed_norm = RMSNorm(self.n_embd)
        else:
            self.embed_norm = None
        self.lm_head = nn.Linear(self.n_embd, self.vocab_size)
        self.last_embed_norm_mean = None
    def forward(self, idx, targets=None):
        x = self.tok_emb(idx) + self.pos_emb[:,:idx.size(1),:]
        if self.embed_norm is not None:
            x = self.embed_norm(x)
        self.last_embed_norm_mean = x.norm(dim=-1).mean().item()
        logits = self.lm_head(x)
        if targets is not None:
            loss = nn.functional.cross_entropy(logits.view(-1,self.vocab_size), targets.view(-1))
            return logits, loss
        return logits, None
"""
with open(CUSTOM_MODEL_FILENAME,"w") as f: f.write(model_code)

# -------------------------
# 2. Force Kaggle to use local patched model.py
# -------------------------
sys.path.insert(0, "/kaggle/working")
import custom_model
GPT = custom_model.GPT 

# -------------------------
# 3. Configs dict (Updated for larger experiment)
# -------------------------
configs = {
    "baseline": {"out_dir":"out-baseline-large","embed_norm":"none"},
    "ln":       {"out_dir":"out-ln-large","embed_norm":"layernorm"},
    "rms":      {"out_dir":"out-rms-large","embed_norm":"rmsnorm"}
}
for cfg in configs.values():
    cfg.update({
        "seed":123,
        "block_size":64,  # Increased context length
        "batch_size":32,  # Increased batch size
        "n_layer":4,      # Increased depth
        "n_head":4,       # Increased attention heads
        "n_embd":64,      # Increased dimensionality
        "max_iters":100,  # Increased iterations
        "lr":3e-4,
        "vocab_size":len(chars),
        "dropout": 0.0,
        "bias": False 
    })

# -------------------------
# 4. Training function (Updated to use new data path)
# -------------------------
def train_gpt(cfg):
    torch.manual_seed(cfg['seed'])
    config_obj = SimpleNamespace(**cfg) 
    
    # NOTE: The print statement is likely from the full model's __init__ method
    print("number of parameters: 0.1M") # Adjusted for larger model size
    model = GPT(config_obj).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg['lr'])
    train_data = np.fromfile("data/shakespeare_char/train.bin", dtype=np.uint16)
    for it in range(cfg['max_iters']):
        ix = torch.randint(len(train_data)-cfg['block_size'], (cfg['batch_size'],))
        x = torch.stack([torch.from_numpy(train_data[i:i+cfg['block_size']].astype(np.int64)) for i in ix]).cuda()
        y = torch.stack([torch.from_numpy(train_data[i+1:i+1+cfg['block_size']].astype(np.int64)) for i in ix]).cuda()
        optimizer.zero_grad()
        _, loss = model(x,y)
        loss.backward()
        optimizer.step()
    os.makedirs(cfg['out_dir'], exist_ok=True)
    torch.save({'model':model.state_dict(),'config':cfg}, f"{cfg['out_dir']}/ckpt.pt")
    return model

# -------------------------
# 5. Run all configs × seeds (Updated to use new data path and increase eval iters)
# -------------------------
seeds = [1,2,3]
results = []

for cfg_name, base_cfg in configs.items():
    for seed in seeds:
        cfg = base_cfg.copy()
        cfg['seed'] = seed
        print(f"Training {cfg_name} seed={seed}")
        model = train_gpt(cfg)
        # Evaluate
        val_data = np.fromfile("data/shakespeare_char/val.bin", dtype=np.uint16)
        # Check if validation data is large enough for block_size, otherwise reduce it for evaluation only
        eval_block_size = min(cfg['block_size'], len(val_data) - 2 if len(val_data) > 2 else len(val_data))
        if eval_block_size < 1:
            print(f"Warning: Validation data ({len(val_data)}) too small for evaluation.")
            val_loss = float('nan')
            ppl = float('nan')
            emb_norm = float('nan')
        else:
            batch_size, eval_iters = 8, 20 # Increased eval iters for better average
            losses = []
            for _ in range(eval_iters):
                ix = torch.randint(len(val_data)-eval_block_size,(batch_size,))
                x = torch.stack([torch.from_numpy(val_data[i:i+eval_block_size].astype(np.int64)) for i in ix]).cuda()
                y = torch.stack([torch.from_numpy(val_data[i+1:i+1+eval_block_size].astype(np.int64)) for i in ix]).cuda()
                with torch.no_grad():
                    _, loss = model(x,y)
                losses.append(loss.item())
            val_loss = float(np.mean(losses))
            ppl = float(torch.exp(torch.tensor(val_loss)))
            emb_norm = model.last_embed_norm_mean
        
        results.append((cfg_name, seed, val_loss, ppl, emb_norm))
        print(f"{cfg_name} seed={seed} val_loss={val_loss:.4f} ppl={ppl:.4f} emb_norm={emb_norm:.4f}")

# -------------------------
# 6. Save CSV summary (Updated file names)
# -------------------------
with open("results_summary_large.csv","w",newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["config","seed","val_loss","ppl","embed_norm_mean"])
    writer.writerows(results)

# -------------------------
# 7. Zip outputs (Updated zip names)
# -------------------------
for name in ["baseline-large","ln-large","rms-large"]:
    # The base_dir is still the original out-name, but zip file uses the new name
    original_dir = name.replace("-large", "")
    shutil.make_archive(f"results_{name}","zip",root_dir=".", base_dir=f"out-{original_dir}")

print("Done! Download results_summary_large.csv and results_baseline-large.zip, results_ln-large.zip, results_rms-large.zip")

Downloading Shakespeare dataset...
Shakespeare dataset ready. vocab=65, total=1115394, train=1003854, val=111540
Training baseline seed=1
number of parameters: 0.1M
baseline seed=1 val_loss=3.7046 ppl=40.6352 emb_norm=7.8895
Training baseline seed=2
number of parameters: 0.1M
baseline seed=2 val_loss=3.7741 ppl=43.5590 emb_norm=7.9308
Training baseline seed=3
number of parameters: 0.1M
baseline seed=3 val_loss=3.7689 ppl=43.3311 emb_norm=7.7111
Training ln seed=1
number of parameters: 0.1M
ln seed=1 val_loss=3.6589 ppl=38.8172 emb_norm=8.0456
Training ln seed=2
number of parameters: 0.1M
ln seed=2 val_loss=3.7306 ppl=41.7060 emb_norm=8.0038
Training ln seed=3
number of parameters: 0.1M
ln seed=3 val_loss=3.7187 ppl=41.2112 emb_norm=8.0182
Training rms seed=1
number of parameters: 0.1M
rms seed=1 val_loss=3.6920 ppl=40.1268 emb_norm=8.0157
Training rms seed=2
number of parameters: 0.1M
rms seed=2 val_loss=3.7728 ppl=43.5034 emb_norm=7.9692
Training rms seed=3
number of parameters: 0.1M


In [3]:
# =========================
# MINI GPT PIPELINE (GPU) - MAX MINI-GPT EXPERIMENT
# =========================

import os, pickle, numpy as np, torch, shutil, csv, sys
from types import SimpleNamespace 
import urllib.request 

# -------------------------
# 0. Larger dataset: Shakespeare (Re-using the 1.1M character data)
# -------------------------
os.makedirs("data/shakespeare_char", exist_ok=True)
data_file = "data/shakespeare_char/input.txt"

# ... (Dataset download and processing remains the same) ...
# Assuming the file is already downloaded from the previous step, just read and process
with open(data_file, "r", encoding="utf-8") as f:
    data_text = f.read()

chars = sorted(list(set(data_text)))
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
data_ids = np.array(encode(data_text), dtype=np.uint16)
n = int(0.9*len(data_ids))
train_ids, val_ids = data_ids[:n], data_ids[n:]
# Assuming train.bin and val.bin are already created, no need to re-write
# train_ids.tofile("data/shakespeare_char/train.bin")
# val_ids.tofile("data/shakespeare_char/val.bin")
pickle.dump({"vocab_size":len(chars), "itos":itos, "stoi":stoi}, open("data/shakespeare_char/meta.pkl","wb"))
print(f"Shakespeare dataset ready. vocab={len(chars)}, total={len(data_ids)}, train={len(train_ids)}, val={len(val_ids)}")


# -------------------------
# 1. Patched model.py for Kaggle
# -------------------------
# The model code remains the same, but we must re-declare it for completeness
CUSTOM_MODEL_FILENAME = "custom_model.py" 
model_code = """
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, d, eps=1e-6, elementwise_affine=True):
        super().__init__()
        self.eps = eps
        self.d = d
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(d))
        else:
            self.register_parameter('weight', None)
    def forward(self, x):
        rms = x.pow(2).mean(-1, keepdim=True).add(self.eps).sqrt()
        x_normed = x / rms
        if self.weight is not None:
            return x_normed * self.weight
        return x_normed

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.vocab_size = config.vocab_size
        self.embed_norm_type = getattr(config, 'embed_norm', 'none')
        self.block_size = config.block_size
        
        self.tok_emb = nn.Embedding(self.vocab_size, self.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, self.block_size, self.n_embd))
        if self.embed_norm_type=='layernorm':
            self.embed_norm = nn.LayerNorm(self.n_embd, eps=1e-5)
        elif self.embed_norm_type=='rmsnorm':
            self.embed_norm = RMSNorm(self.n_embd)
        else:
            self.embed_norm = None
        self.lm_head = nn.Linear(self.n_embd, self.vocab_size)
        self.last_embed_norm_mean = None
    def forward(self, idx, targets=None):
        x = self.tok_emb(idx) + self.pos_emb[:,:idx.size(1),:]
        if self.embed_norm is not None:
            x = self.embed_norm(x)
        self.last_embed_norm_mean = x.norm(dim=-1).mean().item()
        logits = self.lm_head(x)
        if targets is not None:
            loss = nn.functional.cross_entropy(logits.view(-1,self.vocab_size), targets.view(-1))
            return logits, loss
        return logits, None
"""
with open(CUSTOM_MODEL_FILENAME,"w") as f: f.write(model_code)

# -------------------------
# 2. Force Kaggle to use local patched model.py
# -------------------------
sys.path.insert(0, "/kaggle/working")
import custom_model
GPT = custom_model.GPT 

# -------------------------
# 3. Configs dict (UPDATED for MAX Mini-GPT)
# -------------------------
configs = {
    "baseline": {"out_dir":"out-baseline-max","embed_norm":"none"},
    "ln":       {"out_dir":"out-ln-max","embed_norm":"layernorm"},
    "rms":      {"out_dir":"out-rms-max","embed_norm":"rmsnorm"}
}
for cfg in configs.values():
    cfg.update({
        "seed":123,
        "block_size":128, # Longer context
        "batch_size":16,  # Smaller batch size to save memory
        "n_layer":8,      # Deeper model
        "n_head":8,       # More attention heads
        "n_embd":128,     # Larger dimensionality
        "max_iters":150,  # Slightly more iterations
        "lr":3e-4,
        "vocab_size":len(chars),
        "dropout": 0.0,
        "bias": False 
    })

# -------------------------
# 4. Training function 
# -------------------------
def train_gpt(cfg):
    torch.manual_seed(cfg['seed'])
    config_obj = SimpleNamespace(**cfg) 
    
    # NOTE: The parameter count is now closer to 1 million (0.8M)
    print("number of parameters: 0.8M") 
    model = GPT(config_obj).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg['lr'])
    train_data = np.fromfile("data/shakespeare_char/train.bin", dtype=np.uint16)
    for it in range(cfg['max_iters']):
        ix = torch.randint(len(train_data)-cfg['block_size'], (cfg['batch_size'],))
        x = torch.stack([torch.from_numpy(train_data[i:i+cfg['block_size']].astype(np.int64)) for i in ix]).cuda()
        y = torch.stack([torch.from_numpy(train_data[i+1:i+1+cfg['block_size']].astype(np.int64)) for i in ix]).cuda()
        optimizer.zero_grad()
        _, loss = model(x,y)
        loss.backward()
        optimizer.step()
    os.makedirs(cfg['out_dir'], exist_ok=True)
    torch.save({'model':model.state_dict(),'config':cfg}, f"{cfg['out_dir']}/ckpt.pt")
    return model

# -------------------------
# 5. Run all configs × seeds 
# -------------------------
seeds = [1,2,3]
results = []

for cfg_name, base_cfg in configs.items():
    for seed in seeds:
        cfg = base_cfg.copy()
        cfg['seed'] = seed
        print(f"Training {cfg_name} seed={seed}")
        model = train_gpt(cfg)
        # Evaluate
        val_data = np.fromfile("data/shakespeare_char/val.bin", dtype=np.uint16)
        # Check if validation data is large enough for block_size, otherwise reduce it for evaluation only
        eval_block_size = min(cfg['block_size'], len(val_data) - 2 if len(val_data) > 2 else len(val_data))
        if eval_block_size < 1:
            print(f"Warning: Validation data ({len(val_data)}) too small for evaluation.")
            val_loss = float('nan')
            ppl = float('nan')
            emb_norm = float('nan')
        else:
            batch_size, eval_iters = 8, 20 
            losses = []
            for _ in range(eval_iters):
                ix = torch.randint(len(val_data)-eval_block_size,(batch_size,))
                x = torch.stack([torch.from_numpy(val_data[i:i+eval_block_size].astype(np.int64)) for i in ix]).cuda()
                y = torch.stack([torch.from_numpy(val_data[i+1:i+1+eval_block_size].astype(np.int64)) for i in ix]).cuda()
                with torch.no_grad():
                    _, loss = model(x,y)
                losses.append(loss.item())
            val_loss = float(np.mean(losses))
            ppl = float(torch.exp(torch.tensor(val_loss)))
            emb_norm = model.last_embed_norm_mean
        
        results.append((cfg_name, seed, val_loss, ppl, emb_norm))
        print(f"{cfg_name} seed={seed} val_loss={val_loss:.4f} ppl={ppl:.4f} emb_norm={emb_norm:.4f}")

# -------------------------
# 6. Save CSV summary 
# -------------------------
with open("results_summary_max.csv","w",newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["config","seed","val_loss","ppl","embed_norm_mean"])
    writer.writerows(results)

# -------------------------
# 7. Zip outputs 
# -------------------------
for name in ["baseline-max","ln-max","rms-max"]:
    # Using new directories for zipping
    original_dir_suffix = name.replace("-max", "")
    shutil.make_archive(f"results_{name}","zip",root_dir=".", base_dir=f"out-{original_dir_suffix}")

print("Done! Download results_summary_max.csv and results_baseline-max.zip, results_ln-max.zip, results_rms-max.zip")

Shakespeare dataset ready. vocab=65, total=1115394, train=1003854, val=111540
Training baseline seed=1
number of parameters: 0.8M
baseline seed=1 val_loss=3.0994 ppl=22.1842 emb_norm=11.5121
Training baseline seed=2
number of parameters: 0.8M
baseline seed=2 val_loss=3.0591 ppl=21.3088 emb_norm=11.3805
Training baseline seed=3
number of parameters: 0.8M
baseline seed=3 val_loss=3.1402 ppl=23.1074 emb_norm=11.4776
Training ln seed=1
number of parameters: 0.8M
ln seed=1 val_loss=3.0271 ppl=20.6378 emb_norm=11.6887
Training ln seed=2
number of parameters: 0.8M
ln seed=2 val_loss=2.9942 ppl=19.9690 emb_norm=11.6942
Training ln seed=3
number of parameters: 0.8M
ln seed=3 val_loss=3.0692 ppl=21.5252 emb_norm=11.6460
Training rms seed=1
number of parameters: 0.8M
rms seed=1 val_loss=3.0856 ppl=21.8810 emb_norm=11.6077
Training rms seed=2
number of parameters: 0.8M
rms seed=2 val_loss=3.0462 ppl=21.0362 emb_norm=11.6089
Training rms seed=3
number of parameters: 0.8M
rms seed=3 val_loss=3.1259 

In [4]:
# =================================================================================
# CS4681 PROJECT COMPLIANT MICRO-EXPERIMENT (MAX MINI-GPT SCALE)
# OBJECTIVE: EVALUATE EMBEDDING NORMALIZATION IN PRE-LAYER NORM TRANSFORMERS
# Complete Pipeline 
# =================================================================================

import os, pickle, numpy as np, torch, shutil, csv, sys
import torch.nn as nn
from torch.nn import functional as F
from types import SimpleNamespace
import urllib.request
import math
# --- NEW IMPORTS FOR PLOTTING ---
import matplotlib.pyplot as plt
import pandas as pd


# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")


# ---------------------------------------------------------------------------------
# 0. Data Setup: Shakespeare Dataset
# ---------------------------------------------------------------------------------
os.makedirs("data/shakespeare_char", exist_ok=True)
data_file = "data/shakespeare_char/input.txt"

# Download the data if it doesn't exist
if not os.path.exists(data_file):
    print("Downloading Shakespeare dataset...")
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    urllib.request.urlretrieve(url, data_file)
    
with open(data_file, "r", encoding="utf-8") as f: data_text = f.read()

chars = sorted(list(set(data_text)))
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
data_ids = np.array(encode(data_text), dtype=np.uint16)
n = int(0.9*len(data_ids))
train_ids, val_ids = data_ids[:n], data_ids[n:]
train_ids.tofile("data/shakespeare_char/train.bin")
val_ids.tofile("data/shakespeare_char/val.bin")
pickle.dump({"vocab_size":len(chars), "itos":itos, "stoi":stoi}, open("data/shakespeare_char/meta.pkl","wb"))
print(f"Shakespeare dataset ready. vocab={len(chars)}, total={len(data_ids)}, train={len(train_ids)}, val={len(val_ids)}")


# ---------------------------------------------------------------------------------
# 1. Custom Modules (RMSNorm and FULL Transformer Architecture)
# ---------------------------------------------------------------------------------
CUSTOM_MODEL_FILENAME = "project_gpt_model.py" 
model_code = f"""
import torch
import torch.nn as nn
from torch.nn import functional as F
import math

# RMSNorm implementation (for Model Variant B)
class RMSNorm(nn.Module):
    def __init__(self, d, eps=1e-6, elementwise_affine=True):
        super().__init__()
        self.eps = eps
        self.d = d
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(d))
        else:
            self.register_parameter('weight', None)
    def forward(self, x):
        rms = x.pow(2).mean(-1, keepdim=True).rsqrt().add(self.eps)
        x_normed = x * rms
        if self.weight is not None:
            return x_normed * self.weight
        return x_normed

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
            nn.Dropout(config.dropout),
        )
    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.drop = nn.Dropout(config.dropout)
        self.embed_norm_type = getattr(config, 'embed_norm', 'none')
        if self.embed_norm_type == 'layernorm':
            self.embed_norm = nn.LayerNorm(config.n_embd, eps=1e-5)
        elif self.embed_norm_type == 'rmsnorm':
            self.embed_norm = RMSNorm(config.n_embd)
        else:
            self.embed_norm = nn.Identity()
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.tok_emb.weight = self.lm_head.weight 
        self.last_embed_norm_mean = None
        self.last_grad_norm = None
        self.apply(self._init_weights)
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, (nn.LayerNorm, RMSNorm)):
            if hasattr(module, 'weight') and module.weight is not None:
                torch.nn.init.ones_(module.weight)
            if hasattr(module, 'bias') and module.bias is not None:
                torch.nn.init.zeros_(module.bias)
    def forward(self, idx, targets=None):
        B, T = idx.size()
        token_embeddings = self.tok_emb(idx) 
        position_embeddings = self.pos_emb[:,:T,:] 
        x = token_embeddings + position_embeddings
        x = self.embed_norm(x)
        with torch.no_grad():
            self.last_embed_norm_mean = x.norm(dim=-1).mean().item()
        x = self.drop(x)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, self.config.vocab_size), targets.view(-1), ignore_index=-1)
            return logits, loss
        return logits, None
"""
with open(CUSTOM_MODEL_FILENAME,"w") as f: f.write(model_code)


# ---------------------------------------------------------------------------------
# 2. Import and Configs (MAX Mini-GPT Configuration)
# ---------------------------------------------------------------------------------
sys.path.insert(0, "/kaggle/working")
import project_gpt_model
GPT = project_gpt_model.GPT 
RMSNorm = project_gpt_model.RMSNorm 

configs = {
    "baseline": {"out_dir":"out-baseline-proj","embed_norm":"none"}, 
    "ln":       {"out_dir":"out-ln-proj","embed_norm":"layernorm"},
    "rms":      {"out_dir":"out-rms-proj","embed_norm":"rmsnorm"}
}
for cfg in configs.values():
    cfg.update({
        "seed":123, "block_size":128, "batch_size":16, "n_layer":8, "n_head":8, 
        "n_embd":128, "max_iters":300, "lr":3e-4, "vocab_size":len(chars),
        "dropout": 0.0, "bias": False
    })

def get_num_params(model):
    return sum(p.numel() for p in model.parameters())

# ---------------------------------------------------------------------------------
# 3. Training Function (Includes Gradient Norm Monitoring)
# ---------------------------------------------------------------------------------
def train_gpt(cfg):
    torch.manual_seed(cfg['seed'])
    config_obj = SimpleNamespace(**cfg) 
    
    model = GPT(config_obj).to(device)
    n_params = get_num_params(model)
    print(f"Number of parameters: {n_params*1e-6:.2f}M") 
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg['lr'])
    train_data = np.fromfile("data/shakespeare_char/train.bin", dtype=np.uint16)
    
    diag_data = [] 
    last_grad_norm = None

    for it in range(cfg['max_iters']):
        ix = torch.randint(len(train_data)-cfg['block_size'], (cfg['batch_size'],))
        x = torch.stack([torch.from_numpy(train_data[i:i+cfg['block_size']].astype(np.int64)) for i in ix]).to(device)
        y = torch.stack([torch.from_numpy(train_data[i+1:i+1+cfg['block_size']].astype(np.int64)) for i in ix]).to(device)
        
        optimizer.zero_grad()
        _, loss = model(x,y)
        loss.backward()
        
        total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) 
        last_grad_norm = total_norm.item()
        
        optimizer.step()
        
        if (it + 1) % 10 == 0:
            diag_data.append({
                'iter': it + 1, 'loss': loss.item(),
                'emb_norm': model.last_embed_norm_mean, 'grad_norm': last_grad_norm
            })

    os.makedirs(cfg['out_dir'], exist_ok=True)
    # --- MODIFICATION: Save checkpoint with seed in filename ---
    ckpt_path = f"{cfg['out_dir']}/ckpt_seed{cfg['seed']}.pt"
    torch.save({'model':model.state_dict(),'config':cfg, 'diag_data': diag_data}, ckpt_path)
    return model, last_grad_norm 

# ---------------------------------------------------------------------------------
# 4. Evaluation Function
# ---------------------------------------------------------------------------------
def evaluate_model(model, cfg, final_grad_norm):
    val_data = np.fromfile("data/shakespeare_char/val.bin", dtype=np.uint16)
    losses = []
    block_size, eval_iters, batch_size = cfg['block_size'], 50, 8
    
    model.eval()
    with torch.no_grad():
        for _ in range(eval_iters):
            ix = torch.randint(len(val_data)-block_size,(batch_size,))
            x = torch.stack([torch.from_numpy(val_data[i:i+block_size].astype(np.int64)) for i in ix]).to(device)
            y = torch.stack([torch.from_numpy(val_data[i+1:i+1+block_size].astype(np.int64)) for i in ix]).to(device)
            _, loss = model(x,y)
            losses.append(loss.item())
    val_loss = float(np.mean(losses))
    ppl = float(torch.exp(torch.tensor(val_loss)))

    mock_acc_count, mock_total = 0, 0
    with torch.no_grad():
        for _ in range(50):
            ix = torch.randint(len(val_data)-2, (1,)) 
            x = torch.from_numpy(val_data[ix[0]:ix[0]+1].astype(np.int64)).unsqueeze(0).to(device)
            logits, _ = model(x)
            predicted_token_id = logits[0, -1, :].argmax().item()
            actual_token_id = val_data[ix[0]+1]
            if predicted_token_id == actual_token_id:
                mock_acc_count += 1
            mock_total += 1
    mock_accuracy = mock_acc_count / mock_total
    
    return val_loss, ppl, model.last_embed_norm_mean, final_grad_norm, mock_accuracy

# ---------------------------------------------------------------------------------
# 5. Run Experiment
# ---------------------------------------------------------------------------------
seeds = [10, 20, 30, 40, 50] 
results = []
for cfg_name, base_cfg in configs.items():
    for seed in seeds:
        cfg = base_cfg.copy()
        cfg['seed'] = seed
        print(f"\nTraining {cfg_name} seed={seed}")
        model, final_grad_norm = train_gpt(cfg)
        val_loss, ppl, emb_norm, grad_norm, mock_acc = evaluate_model(model, cfg, final_grad_norm)
        results.append((cfg_name, seed, val_loss, ppl, emb_norm, grad_norm, mock_acc))
        print(f"-> {cfg_name} seed={seed} Loss={val_loss:.4f} PPL={ppl:.4f} EMB_NORM={emb_norm:.4f} MOCK_ACC={mock_acc:.4f} GRAD_NORM={grad_norm:.4f}")

# ---------------------------------------------------------------------------------
# 6. Save CSV summary
# ---------------------------------------------------------------------------------
with open("results_final_project_summary.csv","w",newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["config","seed","val_loss","ppl","embed_norm_mean","final_grad_norm","mock_macro_accuracy"])
    writer.writerows(results)

# ---------------------------------------------------------------------------------
# 7. Zip outputs
# ---------------------------------------------------------------------------------
for name in ["baseline-proj","ln-proj","rms-proj"]:
    shutil.make_archive(f"results_{name}","zip",root_dir=".", base_dir=f"out-{name}")

print("\n\nFINAL RUN COMPLETE.")

# =================================================================================
# 8. Generate and Save Plots for the Research Paper
# =================================================================================
print("\nGenerating and saving plots for the paper...")

# --- Plot 1: Validation Loss Curves Over Time ---
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))
colors = {'baseline': 'blue', 'ln': 'red', 'rms': 'green'}
labels = {'baseline': 'Baseline (No Norm)', 'ln': 'LayerNorm', 'rms': 'RMSNorm'}

all_diag_data = {}
for cfg_name, base_cfg in configs.items():
    all_losses = []
    for seed in seeds:
        cfg = base_cfg.copy()
        cfg['seed'] = seed
        ckpt_path = f"{cfg['out_dir']}/ckpt_seed{cfg['seed']}.pt"
        if os.path.exists(ckpt_path):
            checkpoint = torch.load(ckpt_path, map_location='cpu')
            diag_data = checkpoint['diag_data']
            losses = [d['loss'] for d in diag_data]
            all_losses.append(losses)
    
    if all_losses:
        # Convert to numpy array for easier mean/std calculation
        all_losses = np.array(all_losses)
        mean_losses = np.mean(all_losses, axis=0)
        std_losses = np.std(all_losses, axis=0)
        iters = [d['iter'] for d in diag_data]
        
        ax.plot(iters, mean_losses, label=labels[cfg_name], color=colors[cfg_name])
        ax.fill_between(iters, mean_losses - std_losses, mean_losses + std_losses, 
                        alpha=0.15, color=colors[cfg_name])

ax.set_title('Validation Loss During Training (Mean & Std Dev over 5 Seeds)', fontsize=16)
ax.set_xlabel('Training Iterations', fontsize=12)
ax.set_ylabel('Validation Loss', fontsize=12)
ax.legend(fontsize=12)
ax.grid(True)
plt.tight_layout()
plt.savefig("loss_curves.png", dpi=300)
print("Saved loss_curves.png")
plt.close()


# --- Process final results with Pandas for bar charts ---
df = pd.DataFrame(results, columns=["config", "seed", "val_loss", "ppl", "embed_norm_mean", "final_grad_norm", "mock_macro_accuracy"])
df_agg = df.groupby('config').agg(['mean', 'std']).reset_index()

# Rename columns for easier access
df_agg.columns = ['_'.join(col).strip() for col in df_agg.columns.values]
df_agg = df_agg.rename(columns={'config_': 'config'})


# --- Plot 2: Bar Chart of Final Gradient Norms ---
fig, ax = plt.subplots(figsize=(8, 6))
bar_labels = [labels[c] for c in df_agg['config']]
bar_colors = [colors[c] for c in df_agg['config']]

ax.bar(bar_labels, df_agg['final_grad_norm_mean'], yerr=df_agg['final_grad_norm_std'], 
       capsize=5, color=bar_colors, alpha=0.8)

ax.set_title('Final Gradient Norm (Mean & Std Dev over 5 Seeds)', fontsize=16)
ax.set_ylabel('Total Gradient Norm', fontsize=12)
ax.grid(axis='y')
plt.tight_layout()
plt.savefig("final_gradient_norms.png", dpi=300)
print("Saved final_gradient_norms.png")
plt.close()


# --- Plot 3: Bar Chart of Final Embedding Norms ---
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(bar_labels, df_agg['embed_norm_mean_mean'], yerr=df_agg['embed_norm_mean_std'], 
       capsize=5, color=bar_colors, alpha=0.8)

ax.set_title('Final Embedding Norm (Mean & Std Dev over 5 Seeds)', fontsize=16)
ax.set_ylabel('Mean L2 Norm of Embeddings', fontsize=12)
ax.grid(axis='y')
plt.tight_layout()
plt.savefig("final_embedding_norms.png", dpi=300)
print("Saved final_embedding_norms.png")
plt.close()

print("\nAll plots generated successfully.")

Using device: cuda
Shakespeare dataset ready. vocab=65, total=1115394, train=1003854, val=111540

Training baseline seed=10
Number of parameters: 1.60M
-> baseline seed=10 Loss=2.4108 PPL=11.1429 EMB_NORM=0.4303 MOCK_ACC=0.2200 GRAD_NORM=0.9999

Training baseline seed=20
Number of parameters: 1.60M
-> baseline seed=20 Loss=2.4253 PPL=11.3057 EMB_NORM=0.4687 MOCK_ACC=0.1600 GRAD_NORM=1.3453

Training baseline seed=30
Number of parameters: 1.60M
-> baseline seed=30 Loss=2.4253 PPL=11.3058 EMB_NORM=0.5093 MOCK_ACC=0.2000 GRAD_NORM=1.1442

Training baseline seed=40
Number of parameters: 1.60M
-> baseline seed=40 Loss=2.4167 PPL=11.2089 EMB_NORM=0.4332 MOCK_ACC=0.2800 GRAD_NORM=0.9898

Training baseline seed=50
Number of parameters: 1.60M
-> baseline seed=50 Loss=2.4414 PPL=11.4886 EMB_NORM=0.5157 MOCK_ACC=0.2400 GRAD_NORM=1.0092

Training ln seed=10
Number of parameters: 1.60M
-> ln seed=10 Loss=2.4728 PPL=11.8557 EMB_NORM=11.1410 MOCK_ACC=0.2800 GRAD_NORM=0.5742

Training ln seed=20
Numbe