In [1]:
from model import ModelConfig, LlamaModel
from train import TrainerConfig, DataLoader, Trainer

from transformers import AutoTokenizer

In [2]:
# Hugging Face Hub repo id used to load the tokenizer. In the visible cells only
# the tokenizer comes from this repo; the model is constructed from ModelConfig.
tokenizer_id = "HuggingFaceTB/SmolLM2-135M"

In [3]:
# Load the tokenizer identified by `tokenizer_id` through transformers' AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Reuse the end-of-sequence token as the padding token, so that
# tokenizer.pad_token_id (handed to ModelConfig as padding_idx) is the EOS id.
tokenizer.pad_token = tokenizer.eos_token


In [4]:
# Architecture hyperparameters for the LlamaModel built in this notebook.
# NOTE(review): these values appear to mirror the SmolLM2-135M checkpoint's
# published config — confirm against its config.json if parity matters.
model_config = ModelConfig(
    # len(tokenizer) includes added/special tokens; `tokenizer.vocab_size` only
    # reports the base vocabulary, so a tokenizer carrying added tokens could
    # emit ids that fall outside the embedding table.
    vocab_size=len(tokenizer),
    d_model=576,          # equals n_attn_heads * d_head (9 * 64)
    d_head=64,
    d_mlp_proj=1536,
    n_layers=30,
    n_kv_heads=3,         # 9 query heads / 3 KV heads — presumably grouped-query attention
    n_attn_heads=9,
    rms_norm_eps=1e-5,
    initializer_range=0.041666666666666664,  # numerically 1/24, i.e. 1/sqrt(576)
    rope_theta=100000.0,
    # The tokenizer's pad token is aliased to EOS, so this is the EOS id.
    # NOTE(review): if LlamaModel forwards this to nn.Embedding(padding_idx=...),
    # the EOS embedding row would receive no gradient — confirm in model.py.
    padding_idx=tokenizer.pad_token_id
)

In [5]:
# Training hyperparameters consumed by both the DataLoader and the Trainer below.
train_config = TrainerConfig(
    per_device_train_batch_size=8,
    max_seq_len=2048,  # with batch size 8, presumably 8 * 2048 = 16,384 tokens per step per device
    num_epochs=50,
    learning_rate=1e-3,
    # Relative path, resolved against the notebook's working directory.
    log_dir="runs/shakespeare"
)

In [6]:
# Read the whole training corpus into memory as a single string.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (e.g. cp1252 on Windows), keeping the token stream reproducible.
with open("data/tiny_shakespeare.txt", encoding="utf-8") as f:
    text = f.read()

In [7]:
# Instantiate the model from the architecture config; no pretrained weights are
# loaded in the visible cells.
model = LlamaModel(model_config)
# Project-local DataLoader (imported from `train`, not torch.utils.data): it is
# given the raw corpus string plus the tokenizer, so it presumably tokenizes and
# batches the text itself according to train_config.
dataloader = DataLoader(train_config, tokenizer, text=text)
# Bind the model to the trainer. The summary printed by this cell reports the
# device, precision, Flash Attention and torch.compile() settings in effect.
# Statement order is kept as-is: the three constructors are not known to be
# independent (e.g. RNG consumption during weight init).
trainer = Trainer(train_config, model)

Total train tokens             | 342,016
Num Trainable Params           | 162,826,560
Train device                   | cuda, NVIDIA GeForce RTX 3090, N=1
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True





In [8]:
# Run the training loop over the prepared dataloader; it logs loss and
# throughput per step (950 steps were reported for this configuration).
# The first two logged steps are ~50x slower than the rest — presumably
# torch.compile() warm-up rather than a steady-state figure.
trainer.train(dataloader)

Training steps                 | 950 
Step: 0, Training Loss: 11.31190, Tokens/sec: 1490.120841474059
Step: 1, Training Loss: 9.69302, Tokens/sec: 1605.889267044552
Step: 2, Training Loss: 12.86033, Tokens/sec: 87127.19376181753
Step: 3, Training Loss: 10.63981, Tokens/sec: 88562.79837798575
Step: 4, Training Loss: 9.64350, Tokens/sec: 88759.39753710364
Step: 5, Training Loss: 8.81989, Tokens/sec: 87127.25399751143
Step: 6, Training Loss: 8.48675, Tokens/sec: 87472.40680398425
Step: 7, Training Loss: 7.74859, Tokens/sec: 87093.91736807216
Step: 8, Training Loss: 7.59799, Tokens/sec: 86772.18886528346
Step: 9, Training Loss: 7.34216, Tokens/sec: 88012.3603651888
Step: 10, Training Loss: 7.11566, Tokens/sec: 87378.20169779556
Step: 11, Training Loss: 6.82298, Tokens/sec: 84790.50715805775
Step: 12, Training Loss: 6.68290, Tokens/sec: 86732.86018722753
Step: 13, Training Loss: 6.63552, Tokens/sec: 88123.0075860156
Step: 14, Training Loss: 6.62263, Tokens/sec: 88060.90426346315
Step: 15, T

In [9]:
# Qualitative check: sample a short continuation of a Shakespeare-style prompt.
prompt_batch = ["HAMLET:\nTo be or"]  # batch containing a single prompt

# Tokenize to PyTorch tensors, keep only the token ids, and move them to the
# device the trainer is using.
input_ids = tokenizer(prompt_batch, return_tensors="pt")["input_ids"]
input_ids = input_ids.to(trainer.device)

# Low-temperature, top-k-restricted sampling of up to 64 new tokens.
idx = model.generate(
    input_ids,
    temperature=0.25,
    top_k=50,
    max_new_tokens=64,
)

# Decode the batch back to text and show the only sequence in it.
decoded_batch = tokenizer.batch_decode(idx)
print(decoded_batch[0])

HAMLET:
To be or tombs, there as that ere I was said
That now, now, I cannot thou:
Therefore I have disont to come back to joy;
Which now I am all, for thy soul:
What then is't that now but that far;
But, he's fallen not honest against her
