In [1]:
from model import ModelConfig, LlamaModel
from train import TrainerConfig, SimpleDataLoader, Trainer

from transformers import AutoTokenizer

In [2]:
# Hugging Face Hub repo id passed to AutoTokenizer.from_pretrained in the next cell.
tokenizer_id = "HuggingFaceTB/SmolLM2-135M"

In [3]:
# Load the tokenizer published under `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Reuse the end-of-sequence token as the padding token; from here on
# tokenizer.pad_token_id refers to the same id as the EOS token.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Architecture hyper-parameters for the LlamaModel instantiated later in this notebook.
model_config = ModelConfig(
    # len(tokenizer) includes added/special tokens, whereas tokenizer.vocab_size
    # reports only the base vocabulary. The embedding table has to cover every
    # id the tokenizer can emit, so size it from the full length.
    vocab_size=len(tokenizer),
    d_model=576,
    d_head=64,  # n_attn_heads * d_head == 9 * 64 == d_model
    d_mlp_proj=1536,
    n_layers=4,
    # Fewer KV heads than attention heads (3 vs 9) — presumably grouped-query
    # attention; confirm against model.py.
    n_kv_heads=3,
    n_attn_heads=9,
    rms_norm_eps=1e-5,
    initializer_range=0.041666666666666664,  # == 1/24 == 1/sqrt(d_model)
    rope_theta=100000.0,
    # NOTE(review): pad_token was aliased to eos_token when the tokenizer was
    # loaded, so this is the EOS id. If LlamaModel forwards padding_idx to an
    # embedding layer that freezes that row (torch.nn.Embedding does), the EOS
    # embedding would never be trained — confirm in model.py.
    padding_idx=tokenizer.pad_token_id,
)

In [5]:
# Run settings shared by SimpleDataLoader and Trainer (both receive this object below).
train_config = TrainerConfig(
    # --- batch geometry ---
    per_device_train_batch_size=8,
    max_seq_len=1024,
    # --- optimisation ---
    learning_rate=1e-4,
    warmup_ratio=0.1,
    grad_clip_norm=1.0,
    num_epochs=64,
    # --- evaluation and logging ---
    val_size=0.1,
    eval_interval_steps=25,
    log_dir="runs/shakespeare",
)

In [6]:
# Read the whole training corpus into memory as one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (which differs between Linux/macOS and Windows).
with open("data/tiny_shakespeare.txt", encoding="utf-8") as f:
    text = f.read()

In [7]:
# Build the three objects needed for training. The run summary printed in this
# cell's output (token count, parameter count, device, precision, ...) is
# emitted while these are constructed.
# Statement order is kept as-is: it may affect RNG state and the order of the printed summary.
model = LlamaModel(model_config)
# Tokenizes `text` with `tokenizer`; presumably also performs the train/validation
# split governed by train_config.val_size — confirm in train.py.
dataloader = SimpleDataLoader(train_config, tokenizer, text=text)
# The trainer holds the model; later cells read `trainer.device` to place inputs.
trainer = Trainer(train_config, model)

Total tokens                   | 342,016
Num Trainable Params           | 205,307,712
Train device                   | cuda, NVIDIA GeForce RTX 3090, N=1
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True
DistributedDataParallel        | False
Batch size                     | 8,192




In [8]:
# Run the training loop over `dataloader`. Per the cell output it logs training
# loss, learning rate and tokens/sec for each step, plus periodic eval loss.
# The first two logged steps are far slower than the rest — presumably
# torch.compile warm-up (the summary above reports torch.compile() = True).
trainer.train(dataloader)

Training steps                 | 2,432 
Step: 0, Training Loss: 11.27814, LR: 0.0000050, Tokens/sec: 245.21
Step: 1, Training Loss: 11.26458, LR: 0.0000054, Tokens/sec: 257.17
Step: 2, Training Loss: 11.16380, LR: 0.0000058, Tokens/sec: 43166.34
Step: 3, Training Loss: 11.08978, LR: 0.0000062, Tokens/sec: 49253.00
Computing Eval loss, steps: 5
Step: 3, Eval Loss: 11.05825
Step: 4, Training Loss: 11.04331, LR: 0.0000066, Tokens/sec: 56529.39
Step: 5, Training Loss: 11.02092, LR: 0.0000070, Tokens/sec: 53418.12
Step: 6, Training Loss: 10.91256, LR: 0.0000073, Tokens/sec: 55672.44
Step: 7, Training Loss: 10.78383, LR: 0.0000077, Tokens/sec: 54809.09
Step: 8, Training Loss: 10.64979, LR: 0.0000081, Tokens/sec: 55418.08
Step: 9, Training Loss: 10.64204, LR: 0.0000085, Tokens/sec: 49197.20
Step: 10, Training Loss: 10.51057, LR: 0.0000089, Tokens/sec: 56150.28
Step: 11, Training Loss: 10.47069, LR: 0.0000093, Tokens/sec: 56400.19
Step: 12, Training Loss: 10.26100, LR: 0.0000097, Tokens/sec: 5

In [12]:
# Prompt: a short exchange ending on a speaker tag, so the model has to
# continue with that speaker's line. strip() drops the leading/trailing newline
# introduced by the triple-quoted literal.
input_text = """
All:
Content, content.

MENENIUS:
O sir, you are not right: have you not known
The worthiest men have done't?

CORIOLANUS:
""".strip()

# Tokenize the prompt as a batch of one and move the ids to the trainer's device.
encoded = tokenizer([input_text], return_tensors="pt")
input_ids = encoded['input_ids'].to(trainer.device)

# Sample a continuation; temperature / top_k / max_new_tokens are handed to
# the project's generate() unchanged.
idx = model.generate(
    input_ids,
    temperature=0.25,
    top_k=50,
    max_new_tokens=128,
)

# Decode the only sequence in the batch; the printed text starts with the prompt.
print(tokenizer.batch_decode(idx)[0])

All:
Content, content.

MENENIUS:
O sir, you are not right: have you not known
The worthiest men have done't?

CORIOLANUS:
Away for them.

COR:
How my I will so.

CORIOLANUS:
We wouldst; I will be you, sir.

CORIOLANUS:
Your away is a heads.

CORIOLANUS:
Pray you home.

CORIOLANUS:
My son, leave the senate in a C was,
E night; he shall never.

BRUTUS:
When we will, sir.

MENENIUS:
That a day yourI have done King me.


