In [1]:
# Install PyTorch and TensorBoard.
# Disabled: the commands are wrapped in a string literal, so nothing is installed
# when this cell runs. Remove the surrounding triple quotes to enable them.
'''
import sys
!{sys.executable} -m pip install torch==2.2.2+cu118 torchvision==0.17.2+cu118 torchaudio==2.2.2+cu118 --index-url https://download.pytorch.org/whl/cu118
!{sys.executable} -m pip install tensorboard==2.9.1
'''
# Check the installed torch / torchvision versions against the expected 2.2 / 0.17.
# Also disabled via a string literal; because this string is the last expression
# in the cell, the notebook echoes it verbatim as the cell output.
'''
import torch
import torchvision
print(f"PyTorch version Installed: {torch.__version__}\nTorchvision version Installed: {torchvision.__version__}\n")
if not torch.__version__.startswith("2.2"):
    print("you are using an another version of PyTorch. We expect PyTorch 2.2. You may continue using your version but it"
          " might cause dependency and compatibility issues.")
if not torchvision.__version__.startswith("0.17"):
    print("you are using an another version of torchvision. We expect torchvision 0.17. You can continue with your version but it"
          " might cause dependency and compatibility issues.")
'''

'\nimport torch\nimport torchvision\nprint(f"PyTorch version Installed: {torch.__version__}\nTorchvision version Installed: {torchvision.__version__}\n")\nif not torch.__version__.startswith("2.2"):\n    print("you are using an another version of PyTorch. We expect PyTorch 2.2. You may continue using your version but it"\n          " might cause dependency and compatibility issues.")\nif not torchvision.__version__.startswith("0.17"):\n    print("you are using an another version of torchvision. We expect torchvision 0.17. You can continue with your version but it"\n          " might cause dependency and compatibility issues.")\n'

In [2]:
#Important imports
import sys
import torch
import torchvision
import tensorboard
import numpy as np
import random
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.tensorboard import SummaryWriter

import torch.nn as nn
import torch.nn.functional as F
from models import TokenAndPositionEmbedding
from models.transformer import TransformerBlock

from models import tokenizer, embedding, transformer

import os
import requests
import pandas as pd
import time

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # pandas default is 'warn'

# Re-import edited project modules (e.g. `models`) automatically before each cell runs,
# and render matplotlib figures inside the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

os.environ['KMP_DUPLICATE_LIB_OK']='True' # Workaround to prevent the kernel from dying (presumably on duplicate OpenMP runtimes — confirm).

In [3]:
# --- Paths and data ---------------------------------------------------------
# NOTE: `data_path` is reassigned to the full-Shakespeare file in the data cell
# below, so the value set here is not the corpus that actually gets trained on.
data_path     = "data/tiny_shakespeare.txt"
save_path = "output/best.pt"       # where the best checkpoint is written
split_ratio   = (0.6, 0.2, 0.2)   # train/val/test
block_size    = 256                # context length (tokens per training window)
batch_size    = 32
#vocab_size = 65 
patience  = 5                      # evaluations without improvement before early stopping

# --- Model size -------------------------------------------------------------
d_model       = 256
n_heads       = 8
n_layers      = 6
d_ff          = 4 * d_model
dropout       = 0.1

# --- Optimisation -----------------------------------------------------------
learning_rate = 0.002
weight_decay  = 0.01
grad_clip     = 1.0                # max gradient norm
max_iters     = 5000
eval_interval = 100                # validate every N training steps
eval_iters    = 100                # batches averaged per loss estimate
seed          = 1337

# Apply the seed to every RNG the notebook draws from. Without this, `seed` is
# defined but unused and batch sampling, weight initialisation and text
# generation differ on every run.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [4]:

#Download dataset tiny shakespeare
'''data_path = "data/tiny_shakespeare.txt"
if os.path.exists(data_path):
    print(f"'{data_path}' already exists, skipping download.")
else:
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    text = requests.get(url).text
    with open("data/tiny_shakespeare.txt", "w", encoding="utf-8") as f:
        f.write(text)
    print("Tiny Shakespeare downloaded! File size:", len(text), "characters")'''

#Download dataset full shakespeare
# The file is cached on disk: the download only happens when it is missing.
data_path = "data/full_shakespeare.txt"
if os.path.exists(data_path):
    print(f"'{data_path}' already exists, skipping download.")
else:
    # Derive the directory from data_path so the two cannot drift apart.
    os.makedirs(os.path.dirname(data_path) or ".", exist_ok=True)
    url = "https://www.gutenberg.org/files/100/100-0.txt"
    print("Downloading full Shakespeare from Project Gutenberg...")
    # Fail loudly on a hung connection or an HTTP error instead of silently
    # caching an error page as the training corpus.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    text = response.text

    # Keep only the text between the Gutenberg START/END markers.
    # (The remainder of the START marker line stays at the top of the text.)
    if "*** START" in text:
        text = text.split("*** START")[1]
    if "*** END" in text:
        text = text.split("*** END")[0]

    with open(data_path, "w", encoding="utf-8") as f:
        f.write(text)
    print("Full Shakespeare downloaded! File size:", len(text), "characters")

#Tokenizer
'''with open("data/tiny_shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()'''

# Always read back from disk so a fresh download and a cached file take the same path.
with open(data_path, "r", encoding="utf-8") as f:
    text = f.read()

tok = tokenizer.CharTokenizer(text)
print(len(tok.chars), "unique chars")

ids = tok.encode(text)
data = torch.tensor(ids, dtype=torch.long)

#Calculate vocab_size using the actual character set of the tokenizer
vocab_size = getattr(tok, "vocab_size", len(tok.chars))
print("vocab_size =", vocab_size)
# Every token id must index a row of the embedding / output layer.
mx = int(max(ids)) if len(ids) > 0 else -1
assert mx < int(vocab_size), f"max id {mx} >= vocab_size {int(vocab_size)}"

# Split by split_ratio (can be adjusted in hyperparameter tuning).
# The split is contiguous (no shuffling); the test split takes whatever is left
# after the rounded-down train and val sizes.
n = len(data)
n_train = int(split_ratio[0] * n)
n_val = int(split_ratio[1] * n)
n_test = n - n_train - n_val

train_data = data[:n_train]
val_data = data[n_train:n_train + n_val]
test_data = data[n_train + n_val:]

print(f"Total tokens: {n:,}")
print(f"Train: {len(train_data):,}, Val: {len(val_data):,}, Test: {len(test_data):,}")


#Function for get batch
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: "train", "val" or "test". Any other key raises KeyError.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size) on `device`.
        y holds the same windows as x shifted one token to the right, i.e.
        y[b, t] is the token that follows x[b, t] in the corpus.
    """
    data_split = {"train": train_data, "val": val_data, "test": test_data}[split]
    # A window starting at i reads data_split[i : i + block_size + 1], so the
    # last valid start is len - block_size - 1. torch.randint's upper bound is
    # exclusive, hence `len - block_size` (the previous `- 1` skipped the final
    # window and rejected a split of exactly block_size + 1 tokens).
    ix = torch.randint(0, len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i     : i + block_size]     for i in ix])
    y = torch.stack([data_split[i + 1 : i + 1 + block_size] for i in ix])
    return x.to(device), y.to(device)

# quick shape check: inputs and targets must both be (batch_size, block_size)
xb, yb = get_batch("train")
print(xb.shape, yb.shape)  # Expect: (batch_size, block_size)
assert xb.shape == yb.shape == (batch_size, block_size), (
    f"unexpected batch shapes: x={tuple(xb.shape)}, y={tuple(yb.shape)}"
)


#define module
class MiniTransformerLM(nn.Module):
    """Small transformer language model assembled from the project's `models` package.

    Architecture: token + position embedding -> `n_layers` TransformerBlocks ->
    final LayerNorm -> linear head producing `vocab_size` logits per position.

    All sizes (`vocab_size`, `d_model`, `block_size`, `n_heads`, `d_ff`,
    `dropout`, `n_layers`) are read from notebook-level globals at construction
    time, so those cells must have run first.
    """

    def __init__(self):
        super().__init__()
        self.embed = TokenAndPositionEmbedding(vocab_size, d_model, block_size)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, dropout, block_size)
            for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: integer token ids, shape (B, T).
                 NOTE(review): presumably T must not exceed block_size — confirm
                 against TokenAndPositionEmbedding.
            targets: optional token ids with the same shape as `idx`.

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is the
            mean cross-entropy over all B*T positions, or None when `targets`
            is not given.
        """
        x = self.embed(idx)                      # (B,T,C)
        for blk in self.blocks:
            x = blk(x)
        x = self.ln_f(x)
        logits = self.head(x)                    # (B,T,V)
        loss = None
        if targets is not None:
            # cross_entropy wants (N, V) logits and (N,) targets: flatten B and T.
            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1)
            )
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=200, temperature=1.0, top_k=None):
        """Autoregressively sample `max_new_tokens` tokens after the prompt `idx`.

        Side effect: switches the module to eval mode and leaves it there.

        Args:
            idx: prompt token ids, shape (B, T).
            max_new_tokens: number of tokens to append.
            temperature: logits are divided by this before softmax; values
                below 1 sharpen the distribution, values above 1 flatten it.
            top_k: if given, sample only from the `top_k` highest-scoring
                tokens. Values larger than the vocabulary are clamped to it.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the prompt followed by
            the sampled tokens.
        """
        self.eval()
        for _ in range(max_new_tokens):
            # Only the most recent block_size tokens are fed to the model.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature   # last position only: (B, V)
            if top_k is not None:
                # torch.topk raises if k exceeds the last dimension, so clamp it.
                k = min(top_k, logits.size(-1))
                v, _ = torch.topk(logits, k=k)
                # Mask everything below the k-th largest logit of each row.
                logits[logits < v[:, [-1]]] = -float("inf")
            probs = torch.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)
        return idx

'data/full_shakespeare.txt' already exists, skipping download.
100 unique chars
vocab_size = 100
Total tokens: 5,359,388
Train: 3,215,632, Val: 1,071,877, Test: 1,071,879
torch.Size([32, 256]) torch.Size([32, 256])


In [5]:
# Build a freshly initialised model on the selected device and report how many
# parameters will actually be trained.
model = MiniTransformerLM().to(device)
n_trainable = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)
print("Params:", n_trainable)

Params: 4855908


In [6]:
#Tensorboard setup
# Each run logs to its own timestamped directory so runs can be compared side by side.
run_dir = f"runs/tt_{time.strftime('%Y%m%d-%H%M%S')}"
writer = SummaryWriter(log_dir=run_dir)
print("TensorBoard logdir:", run_dir)

# Create the checkpoint directory. os.makedirs("") raises, so skip it when
# save_path is a bare filename in the current directory.
ckpt_dir = os.path.dirname(save_path)
if ckpt_dir:
    os.makedirs(ckpt_dir, exist_ok=True)


#Start training process
best_val   = float("inf")   # lowest validation loss seen so far
bad_epochs = 0              # consecutive evaluations without improvement

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

@torch.no_grad()
def estimate_loss(split):
    """Return the mean loss over `eval_iters` random batches drawn from `split`.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. Gradient tracking is disabled by the decorator.
    """
    model.eval()
    batch_losses = [
        model(*get_batch(split))[1].item()   # model(x, y) -> (logits, loss)
        for _ in range(eval_iters)
    ]
    model.train()
    return sum(batch_losses) / len(batch_losses)

# generate() leaves the model in eval mode, so re-running this cell after
# sampling would otherwise train in eval mode. Switch to train mode explicitly.
model.train()

try:
    for step in range(1, max_iters + 1):
        xb, yb = get_batch("train")
        _, loss = model(xb, yb)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        # Cap the global gradient norm at grad_clip before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        # Record the loss for each step
        train_loss = loss.item()
        writer.add_scalar("loss/train", train_loss, step)

        # Validate on the first step and then every eval_interval steps.
        if step % eval_interval == 0 or step == 1:
            val_loss = estimate_loss("val")
            print(f"step {step:4d} | train_loss {train_loss:.4f} | val_loss {val_loss:.4f}")

            # Record loss and learning rate
            writer.add_scalar("loss/val", val_loss, step)
            writer.add_scalar("lr", optimizer.param_groups[0]["lr"], step)

            # Early Stopping: checkpoint on every improvement, stop after
            # `patience` consecutive evaluations without one.
            if val_loss < best_val:
                best_val = val_loss
                bad_epochs = 0
                torch.save({
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "step": step,
                    "best_val": best_val,
                }, save_path)
                print(f"improved! best_val={best_val:.4f} (saved)")
            else:
                bad_epochs += 1
                print(f"no improvement ({bad_epochs}/{patience})")
                if bad_epochs >= patience:
                    print("early stopping triggered.")
                    break
finally:
    # Push buffered events to disk even if training is interrupted, so the
    # TensorBoard curves include the final steps.
    writer.flush()


#test-set evaluation helper
@torch.no_grad()
def evaluate_test_set():
    """Return the mean loss over `eval_iters` random batches of the test split.

    Evaluates whatever weights `model` currently holds and leaves the model in
    eval mode. Gradient tracking is disabled by the decorator.
    """
    model.eval()
    per_batch = []
    for _ in range(eval_iters):
        inputs, targets = get_batch("test")
        per_batch.append(model(inputs, targets)[1].item())   # [1] is the loss
    return sum(per_batch) / len(per_batch)

# Score the *best* checkpoint, not the last training step: when early stopping
# fires, the in-memory weights are `patience` evaluations past the best ones.
if os.path.exists(save_path):
    checkpoint = torch.load(save_path, map_location=device)
    model.load_state_dict(checkpoint["model"])
    print(f"Loaded best checkpoint from step {checkpoint['step']} "
          f"(val_loss {checkpoint['best_val']:.4f})")
else:
    print(f"No checkpoint found at '{save_path}'; evaluating the current weights.")

test_loss = evaluate_test_set()
print(f"Final Test Loss: {test_loss:.4f}")
# Perplexity is exp(mean cross-entropy).
print(f"Perplexity (PPL): {torch.exp(torch.tensor(test_loss)):.2f}")

TensorBoard logdir: runs/tt_20251018-175737
step    1 | train_loss 4.8431 | val_loss 3.7555
improved! best_val=3.7555 (saved)
step  100 | train_loss 2.4878 | val_loss 2.5030
improved! best_val=2.5030 (saved)
step  200 | train_loss 2.3784 | val_loss 2.3183
improved! best_val=2.3183 (saved)
step  300 | train_loss 1.9972 | val_loss 2.0274
improved! best_val=2.0274 (saved)
step  400 | train_loss 1.8852 | val_loss 1.8750
improved! best_val=1.8750 (saved)
step  500 | train_loss 1.7375 | val_loss 1.7815
improved! best_val=1.7815 (saved)
step  600 | train_loss 1.6813 | val_loss 1.7171
improved! best_val=1.7171 (saved)
step  700 | train_loss 1.6112 | val_loss 1.6739
improved! best_val=1.6739 (saved)
step  800 | train_loss 1.5338 | val_loss 1.6454
improved! best_val=1.6454 (saved)
step  900 | train_loss 1.5259 | val_loss 1.6096
improved! best_val=1.6096 (saved)
step 1000 | train_loss 1.5166 | val_loss 1.5913
improved! best_val=1.5913 (saved)
step 1100 | train_loss 1.4551 | val_loss 1.5777
improv

In [7]:
# Qualitative check: let the model continue the prompt "ROMEO:" and print the result.
model.eval()
start_ids = tok.encode("ROMEO:")
# Wrap the prompt in a list to add the batch dimension expected by generate().
idx = torch.tensor([start_ids], device=device, dtype=torch.long)
out = model.generate(
    idx,
    max_new_tokens=400,
    temperature=0.9,
    top_k=50,
)
generated_ids = out[0].tolist()   # first (and only) sequence in the batch
print(tok.decode(generated_ids))

ROMEO: ’tis at enemies.

POMPEY.
If it so, all proud as far you, Clifford as they say at a
state heart, the clome reckon hath his change.

OPHELIA.
Peace, sir, had he;
All hoisted
More of your discharge your lute bones upon you.

AUDREY.
Sir, not begget him; Clifford, from his sons!

ALEXAS.
And fetch the mellaries of his scroll brave
Have been arged hereafter of sacrifice,
Nor come than makes her merch
