In [1]:
from model import ModelConfig, LlamaModel
from train import TrainerConfig, FileDataLoader, Trainer

from transformers import AutoTokenizer

In [2]:
# Identifier handed to AutoTokenizer.from_pretrained in the next cell
# (a Hugging Face Hub repo id of the form "<org>/<name>").
tokenizer_id = "HuggingFaceTB/SmolLM2-135M"

In [3]:
# Load the tokenizer named by `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Reuse the end-of-sequence token as the padding token; the resulting
# `pad_token_id` is later passed to ModelConfig as `padding_idx`.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Architecture hyperparameters for the LlamaModel built further down.
# NOTE(review): the values look like they mirror the published SmolLM2-135M
# configuration — confirm before changing any of them independently.
model_config = ModelConfig(
    # Use len(tokenizer) rather than tokenizer.vocab_size: for Hugging Face
    # tokenizers `vocab_size` reports only the base vocabulary, while
    # len(tokenizer) also counts added/special tokens. Sizing the model from
    # the full length keeps every id the tokenizer can emit (including
    # `padding_idx` below) inside the vocabulary range.
    vocab_size=len(tokenizer),
    d_model=576,
    d_head=64,  # 9 attention heads * 64 = 576 = d_model
    d_mlp_proj=1536,
    n_layers=30,
    # Fewer KV heads than attention heads (3 vs 9) — presumably grouped-query
    # attention; verify against model.py.
    n_kv_heads=3,
    n_attn_heads=9,
    rms_norm_eps=1e-5,
    initializer_range=0.041666666666666664,  # == 1/24
    rope_theta=100000.0,
    # The pad token was set to the EOS token when the tokenizer was loaded,
    # so this is the EOS id.
    padding_idx=tokenizer.pad_token_id
)

In [5]:
# Settings for the training run, grouped by concern. All values are passed by
# keyword, so the grouping below does not affect the resulting config.
train_config = TrainerConfig(
    # --- data ---
    tokens_folder="fineweb-edu_tok",
    max_seq_len=2048,
    per_device_train_batch_size=32,
    val_size=5e-4,
    # --- run length / evaluation cadence ---
    num_epochs=1,
    max_steps=1_000,
    eval_interval_steps=100,
    # --- optimisation ---
    learning_rate=0.001,
    warmup_ratio=0.1,
    grad_clip_norm=1.0,
    # --- logging ---
    log_dir="runs/fineweb",
)

In [6]:
# Instantiate the model, the data loader over the pre-tokenized files, and the
# trainer from the two configs defined above.
# NOTE(review): these constructors appear to print the run summary captured in
# this cell's output (token count, trainable parameters, device, precision,
# flash-attention / torch.compile flags). Which constructor prints which line
# is not visible here, so the statement order is kept as-is.
model = LlamaModel(model_config)
dataloader = FileDataLoader(train_config, tokenizer)
trainer = Trainer(train_config, model)

Total tokens                   | 10,101,737,472
Num Trainable Params           | 162,826,560
Train device                   | cuda, NVIDIA A100-SXM4-80GB, N=8
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True





In [7]:
# Run the training loop over `dataloader`. The captured output below logs the
# training loss, learning rate and tokens/sec for each step, plus an eval loss
# at step 0.
trainer.train(dataloader)

Training steps                 | 1,000 
Step: 0, Training Loss: 11.27769, LR: 0.0000100, Tokens/sec: 1425.5069511910128
Computing Eval loss, steps: 78
Step: 0, Eval Loss: 11.24022
Step: 1, Training Loss: 11.23619, LR: 0.0000200, Tokens/sec: 1779.3861157874926
Step: 2, Training Loss: 11.14878, LR: 0.0000300, Tokens/sec: 246992.93520791107
Step: 3, Training Loss: 10.97960, LR: 0.0000400, Tokens/sec: 246286.03921751503
Step: 4, Training Loss: 10.83010, LR: 0.0000500, Tokens/sec: 248138.47047151692
Step: 5, Training Loss: 10.59780, LR: 0.0000600, Tokens/sec: 247080.66778849915
Step: 6, Training Loss: 10.20184, LR: 0.0000700, Tokens/sec: 248375.73130491533
Step: 7, Training Loss: 9.94503, LR: 0.0000800, Tokens/sec: 247456.64110105
Step: 8, Training Loss: 9.64306, LR: 0.0000900, Tokens/sec: 247568.435538636
Step: 9, Training Loss: 9.58912, LR: 0.0001000, Tokens/sec: 248998.8865784912
Step: 10, Training Loss: 9.42804, LR: 0.0001100, Tokens/sec: 248277.19359325164
Step: 11, Training Loss: 9.37

In [8]:
# Sample a continuation from the model for a short prompt.
prompt = "The world is"

# Tokenize a one-element batch and move the ids to the trainer's device.
encoded = tokenizer([prompt], return_tensors="pt")
input_ids = encoded["input_ids"].to(trainer.device)

# Generate up to 256 new tokens with temperature 0.25 and top-k 50.
idx = model.generate(
    input_ids,
    max_new_tokens=256,
    temperature=0.25,
    top_k=50,
)

# Decode the batch and show the first (only) sequence.
decoded = tokenizer.batch_decode(idx)
print(decoded[0])

The world is a very important part of the world.
The first time the world is the world’s largest and most important. The world is the world’s largest and most important. The world is the world’s largest and most important.
The world is the world’s largest and most important. The world is the world’s largest and most important and most important. The world is the world’s largest and most important.
The world is the world’s largest and most important and most important.
The world is the world’s most important and most important.
The world is the world’s most important and most important.
The world is the world’s most important and most important.
The world is the world’s largest nation.
The world is the world’s largest and most important.
The world is the world’s largest.
The world is the world’s largest.
The world is the world’s largest.
The world is the world’s largest.
The world is the world’s largest.
The world is the world’s largest.
The world is the world’s largest.
The world is th