In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from pathlib import Path
import requests
from my_tokenizer import CharDataset
from my_gpt import SmolGPT

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cpu


#### Unit testing

In [3]:
!pytest my_tests.py -v

platform linux -- Python 3.12.3, pytest-9.0.0, pluggy-1.6.0 -- /home/aenh/git/mini_project_language_model/.venv/bin/python3
cachedir: .pytest_cache
rootdir: /home/aenh/git/mini_project_language_model/src
plugins: anyio-4.11.0
collected 6 items                                                              [0m[1m

my_tests.py::test_tokenizer_roundtrip ]9;4;1;0\[32mPASSED[0m[32m                             [ 16%][0m
my_tests.py::test_single_attention_head ]9;4;1;16\[32mPASSED[0m[32m                           [ 33%][0m
my_tests.py::test_multi_attention_head ]9;4;1;33\[32mPASSED[0m[32m                            [ 50%][0m
my_tests.py::test_ffn ]9;4;1;50\[32mPASSED[0m[32m                                             [ 66%][0m
my_tests.py::test_transformer_block ]9;4;1;66\[32mPASSED[0m[32m                               [ 83%][0m
my_tests.py::test_full_model ]9;4;1;83\[32mPASSED[0m[32m                                      [100%][0m]9;4;0;\



#### Get Data using DataSet / DataLoader

In [4]:
from torch.utils.data import DataLoader

# Load Shakespeare data
with open('../data/shakespeare.txt', 'r') as f:
    text = f.read()

# Split into train/val (90/10)
n = int(0.9 * len(text))
train_text, val_text = text[:n], text[n:]

# Create datasets using your CharDataset
train_dataset = CharDataset(train_text, block_size=128)
val_dataset = CharDataset(val_text, block_size=128)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [5]:
@torch.no_grad()
def estimate_loss(model, train_loader, val_loader, device, eval_batches=50):
    """
    Estimate loss on train and val sets
    Args:
        model: GPT model
        train_loader: training DataLoader
        val_loader: val DataLoader
        device: cpu or cuda
        eval_batches: nb of batches to average over
    Returns:
        a Dictionary with 'train' and 'val' losses
    """
    out = {}
    model.eval()
    
    for split, loader in [('train', train_loader), ('val', val_loader)]:
        losses = []
        for i, (x, y) in enumerate(loader):
            if i >= eval_batches:
                break
            x, y = x.to(device), y.to(device)
            _, loss = model(x, y)
            losses.append(loss.item())
        out[split] = sum(losses) / len(losses)
    
    model.train()
    return out

#### Setting hyperparameters

In [6]:
# The project was tested with 12 layers, 8 attention heads, and 768 embedding dimensions, on a single GPU.
vocab_size=train_dataset.get_vocab_size(),
n_embd=768,
block_size=128,
num_head=8,
num_layers=12,
dropout=0.1
learning_rate = 1e-4

#### Training loop

In [7]:
# Create model
model = SmolGPT(
    vocab_size=vocab_size,
    n_embd=n_embd,
    block_size=block_size,
    num_head=num_head,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        
        logits, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # evaluation after each epoch
    losses = estimate_loss(model, train_loader, val_loader, device)
    print(f"Epoch {epoch:2d} | Train: {losses['train']:.4f} | Val: {losses['val']:.4f}")

print("Training complete!")
    

TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, *, torch.memory_format memory_format = None, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)


#### Generation

In [None]:
model.eval()
dataset = CharDataset(text, block_size=128)
max_new_tokens = 100
with torch.no_grad():
    context = "O God, O God!"
    tokens = dataset.encode(context)
    #not sure about this, wanting to change the tensor shape
    idx = tokens.view(1, len(tokens))
    y = model.generate(idx, max_new_tokens)
    completion = dataset.itos(y)
    print(completion)