In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. my_tokenizer, my_gpt) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from pathlib import Path
import requests
from my_tokenizer import CharDataset
from my_gpt import SmolGPT

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cuda


#### Unit testing

In [12]:
# Run the unit tests in my_tests.py in verbose mode through a shell escape.
# NOTE(review): `!pytest` resolves to whichever pytest is first on PATH, which
# may not be the kernel's environment - confirm the interpreter in the output.
!pytest my_tests.py -v

platform win32 -- Python 3.11.5, pytest-9.0.1, pluggy-1.6.0 -- C:\Git\learning_pytorch\.venv\Scripts\python.exe
cachedir: .pytest_cache
rootdir: c:\Git\mini_project_language_model\src
plugins: anyio-4.10.0
[1mcollecting ... [0mcollected 6 items

my_tests.py::test_tokenizer_roundtrip [32mPASSED[0m[32m                             [ 16%][0m
my_tests.py::test_single_attention_head [32mPASSED[0m[32m                           [ 33%][0m
my_tests.py::test_multi_attention_head [32mPASSED[0m[32m                            [ 50%][0m
my_tests.py::test_ffn [32mPASSED[0m[32m                                             [ 66%][0m
my_tests.py::test_transformer_block [32mPASSED[0m[32m                               [ 83%][0m
my_tests.py::test_full_model [32mPASSED[0m[32m                                      [100%][0m



#### Get data using Dataset / DataLoader

In [13]:
from torch.utils.data import DataLoader

# Load Shakespeare data. The encoding is stated explicitly so the decoded text
# (and therefore the vocabulary) does not depend on the platform's default
# codec.
with open('../data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Build vocab from entire text ONCE so the train and val datasets share the
# same character set.
vocab = sorted(set(text))

# Split into train/val (80/20): the first 80% of characters is used for
# training, the remaining 20% for validation.
n = int(0.8 * len(text))
train_text, val_text = text[:n], text[n:]

# Create datasets with shared vocab.
# NOTE(review): block_size=128 and batch_size=64 are literals here; the model
# is later built with `block_size` from the hyperparameter cell, so the two
# must be kept in sync by hand.
train_dataset = CharDataset(train_text, block_size=128, vocab=vocab)
val_dataset = CharDataset(val_text, block_size=128, vocab=vocab)

# Create dataloaders (training batches are shuffled, validation batches are not)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

vocab_size = train_dataset.get_vocab_size()

In [14]:
@torch.no_grad()
def estimate_loss(model, train_loader, val_loader, device, eval_batches=50):
    """
    Estimate the mean loss on the train and val sets.

    Gradients are disabled and the model is switched to eval mode for the
    duration of the call. The train/eval mode the model had on entry is
    restored afterwards, even if evaluation raises.

    Args:
        model: module called as ``model(x, y)`` and returning ``(_, loss)``
        train_loader: iterable of ``(x, y)`` training batches
        val_loader: iterable of ``(x, y)`` val batches
        device: device the batches are moved to (cpu or cuda)
        eval_batches: max nb of batches to average over, per split
    Returns:
        a Dictionary with 'train' and 'val' losses; a split that yields no
        batches (empty loader or ``eval_batches <= 0``) is reported as NaN
    """
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split, loader in [('train', train_loader), ('val', val_loader)]:
            losses = []
            # range() is the first zip argument on purpose: zip stops as soon
            # as it is exhausted, so no extra batch is pulled from the loader.
            for _, (x, y) in zip(range(eval_batches), loader):
                x, y = x.to(device), y.to(device)
                _, loss = model(x, y)
                losses.append(loss.item())
            out[split] = sum(losses) / len(losses) if losses else float('nan')
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)
    return out

#### Setting hyperparameters

In [24]:
# Hyperparameters for the run below. The project was tested with 12 layers,
# 8 attention heads, and 768 embedding dimensions, on a single GPU.
#
# Reference "big" preset (kept for comparison, not executed):
#   n_embd=768, block_size=128, batch_size=64, num_head=8, num_layers=12,
#   dropout=0.1, learning_rate=1e-4
#
# Active preset: same model size as the reference preset, but with
# batch_size 32 and dropout 0.3.
# NOTE(review): the DataLoaders in the data cell are created with a literal
# batch_size=64, so `batch_size` below is not read by any visible cell.

# Model architecture
n_embd = 768
num_head = 8
num_layers = 12
block_size = 128
dropout = 0.3

# Optimisation
batch_size = 32
learning_rate = 1e-4


#### Training loop

In [25]:
# Create model
model = SmolGPT(
    vocab_size=vocab_size,
    n_embd=n_embd,
    block_size=block_size,
    num_head=num_head,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

training_steps = 6000
# Evaluate about 100 times per run. Clamped to at least 1 so the modulo in the
# loop cannot divide by zero when training_steps < 100.
eval_interval = max(1, training_steps // 100)

# add early stopping
best_val_loss = float('inf')
patience = 5  # stop if val loss doesn't improve for 5 evals
patience_counter = 0


def _batches_forever(loader):
    """Yield batches from `loader` epoch after epoch; stop if an epoch yields nothing."""
    while True:
        saw_batch = False
        for batch in loader:
            saw_batch = True
            yield batch
        if not saw_batch:
            # An empty loader would otherwise spin forever.
            return


# Named `step` rather than `iter` so the builtin iter() is not shadowed for
# every later cell in the kernel.
step = 0

# Cycling over the loader lets the run reach `training_steps` even when one
# pass over train_loader has fewer batches than that.
for x, y in _batches_forever(train_loader):
    if step % eval_interval == 0:
        losses = estimate_loss(model, train_loader, val_loader, device)
        print(f"Step {step:4d} | Train: {losses['train']:.4f} | Val: {losses['val']:.4f}")

        # Early stopping check
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            patience_counter = 0
        else:
            patience_counter += 1
            print(f"  (no improvement, {patience_counter}/{patience})")

            if patience_counter >= patience:
                print(f"Early stopping at step {step}")
                break

    x, y = x.to(device), y.to(device)

    _, loss = model(x, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    step += 1
    if step >= training_steps:
        break

print("Training complete!")


Step    0 | Train: 4.3082 | Val: 4.3158
Step   60 | Train: 2.5195 | Val: 2.5087
Step   60 | Train: 2.5195 | Val: 2.5087
Step  120 | Train: 2.4474 | Val: 2.4487
Step  120 | Train: 2.4474 | Val: 2.4487
Step  180 | Train: 2.3827 | Val: 2.3872
Step  180 | Train: 2.3827 | Val: 2.3872
Step  240 | Train: 2.2876 | Val: 2.2977
Step  240 | Train: 2.2876 | Val: 2.2977
Step  300 | Train: 2.1558 | Val: 2.1765
Step  300 | Train: 2.1558 | Val: 2.1765
Step  360 | Train: 2.0728 | Val: 2.0949
Step  360 | Train: 2.0728 | Val: 2.0949
Step  420 | Train: 1.9919 | Val: 2.0169
Step  420 | Train: 1.9919 | Val: 2.0169
Step  480 | Train: 1.9250 | Val: 1.9537
Step  480 | Train: 1.9250 | Val: 1.9537
Step  540 | Train: 1.8519 | Val: 1.8960
Step  540 | Train: 1.8519 | Val: 1.8960
Step  600 | Train: 1.7950 | Val: 1.8414
Step  600 | Train: 1.7950 | Val: 1.8414
Step  660 | Train: 1.7421 | Val: 1.7949
Step  660 | Train: 1.7421 | Val: 1.7949
Step  720 | Train: 1.7019 | Val: 1.7580
Step  720 | Train: 1.7019 | Val: 1.7580


#### Save model

In [None]:
# Persist only the learned parameters (state_dict), not the whole module object.
model_path = Path('../models/mini_transformer_model_simple_big.pth')
# torch.save does not create missing directories, so make sure the target
# folder exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)





#### Generation

In [32]:
# Sample a continuation of a fixed prompt from the trained model.
model.eval()
# Pass the same `vocab` and `block_size` the training datasets and model were
# built with, so the encoder/decoder is explicitly tied to the training setup.
dataset = CharDataset(text, block_size=block_size, vocab=vocab)
max_new_tokens = 500
with torch.no_grad():
    context = "O God, O God!"
    tokens = dataset.encode(context)
    # Add a leading batch dimension of size 1: (len(tokens),) -> (1, len(tokens)).
    # NOTE(review): assumes encode() returns a flat sequence of token ids - confirm.
    idx = torch.as_tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
    generated = model.generate(idx, max_new_tokens)
    # Row 0 is the only sequence in the batch; decode it back to text.
    completion = dataset.decode(generated[0].tolist())
    print(completion)

O God, O God! O hope, for I pray to thy brother,
I prithee my hand--

PAULINA:
I do put him doth
My lord of a cousing duty,
But he had 'twas, heard me pity, and mother
I determine. Never here more, relived to sleep.

ROMEO:
The sun of my ghost: he didst he doth greets me appear
Thy was forbid, with me came gorment to kiss;
His courage years in a lay cast,
To make him pace to late the day of cointrade,
And, tell thee them in her.

Provost:

Son:
Good where I all.

JULIET:
My husband, for sovereign, being dayi
