In [1]:
from model import ModelConfig, LlamaModel
from train import TrainerConfig, DataLoader, Trainer

from transformers import AutoTokenizer

In [2]:
# Hugging Face Hub identifier of the tokenizer loaded by
# AutoTokenizer.from_pretrained in the next cell.
tokenizer_id: str = "HuggingFaceTB/SmolLM2-135M"

In [3]:
# Load the tokenizer published under `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Reuse the end-of-sequence token as the padding token. This has to run before
# the model config is built, because that cell reads `tokenizer.pad_token_id`.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Architecture hyperparameters for the LlamaModel instantiated later in the
# notebook. Keyword arguments are grouped by concern; values are unchanged
# except for how the vocabulary size is obtained.
model_config = ModelConfig(
    # --- vocabulary / embeddings ---
    # len(tokenizer) counts added and special tokens as well, whereas
    # tokenizer.vocab_size reports only the base vocabulary. Sizing the
    # embedding table from the full length guarantees every id the tokenizer
    # can emit (including the pad token id used below) is a valid index.
    vocab_size=len(tokenizer),
    padding_idx=tokenizer.pad_token_id,
    # --- transformer stack ---
    d_model=576,
    d_mlp_proj=1536,
    n_layers=30,
    # --- attention ---
    n_attn_heads=9,
    # Fewer KV heads than attention heads; presumably grouped-query
    # attention -- confirm against model.py.
    n_kv_heads=3,
    d_head=64,  # equals d_model / n_attn_heads (576 / 9)
    rope_theta=100000.0,
    # --- normalisation / initialisation ---
    rms_norm_eps=1e-5,
    initializer_range=0.041666666666666664,  # 1/24, i.e. 1/sqrt(d_model)
)

In [5]:
# Training-run settings; this object is handed to both DataLoader and Trainer
# further down. Arguments are grouped by concern, values are unchanged.
train_config = TrainerConfig(
    # --- data / batching ---
    max_seq_len=128,
    per_device_train_batch_size=32,
    val_size=0.2,
    # --- optimisation schedule ---
    num_epochs=12,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    grad_clip_norm=1.0,
    # --- evaluation / logging ---
    eval_interval_steps=25,
    log_dir="runs/shakespeare",
)

In [6]:
# Read the whole training corpus into memory as one string.
# The encoding is pinned explicitly: without it, open() decodes using the
# platform's locale default, so the decoded text (and therefore the token
# stream fed to training) could differ between machines.
with open("data/tiny_shakespeare.txt", encoding="utf-8") as f:
    text = f.read()

In [7]:
# NOTE(review): no random seed is set anywhere in the visible notebook; unless
# the `model` / `train` modules seed internally, weight init and data order
# will differ between runs -- confirm.

# Build the model from the architecture config defined above.
model = LlamaModel(model_config)
# Wrap the raw corpus text for training; receives the training config and the
# tokenizer alongside the text.
dataloader = DataLoader(train_config, tokenizer, text=text)
# Create the trainer for this model. Constructing these objects prints the
# run summary shown in the cell output (token count, parameter count, device,
# precision, Flash Attention and torch.compile() flags).
trainer = Trainer(train_config, model)

Total tokens                   | 341,120
Num Trainable Params           | 162,826,560
Train device                   | cuda, NVIDIA GeForce RTX 3090, N=1
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True





In [8]:
# Run training on the prepared data. Per the cell output, each step logs the
# training loss, learning rate and tokens/sec, with an eval loss reported
# periodically.
trainer.train(dataloader)

Training steps                 | 804 
Step: 0, Training Loss: 11.31816, LR: 0.0000013, Tokens/sec: 170.54935834817286
Computing Eval loss, steps: 17
Step: 0, Eval Loss: 11.27680
Step: 1, Training Loss: 11.29193, LR: 0.0000025, Tokens/sec: 181.5487660845126
Step: 2, Training Loss: 11.29968, LR: 0.0000038, Tokens/sec: 74733.96459704115
Step: 3, Training Loss: 11.26571, LR: 0.0000050, Tokens/sec: 75182.05492310377
Step: 4, Training Loss: 11.23035, LR: 0.0000063, Tokens/sec: 81731.63147243724
Step: 5, Training Loss: 11.22250, LR: 0.0000075, Tokens/sec: 89066.35475309359
Step: 6, Training Loss: 11.23940, LR: 0.0000087, Tokens/sec: 70858.7236512347
Step: 7, Training Loss: 11.09252, LR: 0.0000100, Tokens/sec: 83605.2084499759
Step: 8, Training Loss: 11.12186, LR: 0.0000113, Tokens/sec: 87362.69968022296
Step: 9, Training Loss: 11.01938, LR: 0.0000125, Tokens/sec: 74693.19033100348
Step: 10, Training Loss: 10.94824, LR: 0.0000138, Tokens/sec: 88827.23607928514
Step: 11, Training Loss: 10.95432

In [9]:
# Sample a short continuation from the model for a fixed prompt.
# NOTE(review): the visible code does not switch the model to eval mode or
# disable gradients before generating -- confirm model.generate handles this.
prompt = "HAMLET:\nTo be or"

# Tokenize as a batch of one and move the ids to the trainer's device.
encoded = tokenizer([prompt], return_tensors="pt")
input_ids = encoded["input_ids"].to(trainer.device)

# Low temperature with top-k filtering, capped at 64 new tokens.
idx = model.generate(
    input_ids,
    temperature=0.25,
    top_k=50,
    max_new_tokens=64,
)

# Decode the batch and print its only sequence.
decoded = tokenizer.batch_decode(idx)
print(decoded[0])

HAMLET:
To be or for with with with now.

BENVOLIO:
I'll be so my heart's thee.

HORTENSIO:
That's the king, I'll not I am him.

BAPTISTA:
I'll be so I have thee, and make thee,
