In [1]:
from model import ModelConfig, LlamaModel
from train import TrainerConfig, SimpleDataLoader, Trainer

from transformers import AutoTokenizer

In [2]:
# Identifier of the pretrained tokenizer; it is passed to
# AutoTokenizer.from_pretrained in the next cell.
tokenizer_id = "HuggingFaceTB/SmolLM2-135M"

In [3]:
# Load the tokenizer named by `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Reuse the end-of-sequence token as the padding token, so pad_token_id and
# eos_token_id refer to the same ID from here on.
# NOTE(review): the model config later receives tokenizer.pad_token_id as
# `padding_idx`; if the model treats that index specially (e.g. a frozen
# embedding row), the EOS token is affected too — confirm in model.py.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Architecture hyperparameters for the LlamaModel instantiated later in the
# notebook.
model_config = ModelConfig(
    # len(tokenizer) is the full vocabulary size including added/special
    # tokens. tokenizer.vocab_size only reports the base vocabulary, which can
    # leave the embedding table too small for added token IDs.
    vocab_size=len(tokenizer),
    d_model=960,  # equals n_attn_heads * d_head (15 * 64)
    d_head=64,
    d_mlp_proj=2560,
    n_layers=32,
    n_kv_heads=5,  # 15 attention heads / 5 KV heads = 3 (presumably grouped-query attention — confirm in model.py)
    n_attn_heads=15,
    rms_norm_eps=1e-5,
    initializer_range=0.02,
    rope_theta=100000.0,
    # pad_token was set to eos_token in the tokenizer cell, so this is also the
    # EOS token ID.
    # NOTE(review): if ModelConfig.padding_idx is forwarded to an embedding
    # layer's padding_idx, the EOS embedding would receive no gradient —
    # confirm against model.py.
    padding_idx=tokenizer.pad_token_id,
)

In [5]:
# Training hyperparameters shared by the data loader and the trainer.
train_config = TrainerConfig(
    # --- batching ---
    # 8 sequences x 512 tokens = 4,096, which matches the "Batch size" value
    # printed when the trainer is set up on a single device.
    max_seq_len=512,
    per_device_train_batch_size=8,
    # --- schedule / optimisation ---
    num_epochs=12,
    learning_rate=1e-3,
    warmup_ratio=0.1,
    grad_clip_norm=1.0,
    # --- evaluation ---
    val_size=0.05,
    eval_interval_steps=25,
    # --- logging ---
    # The run name encodes the experiment settings (32 layers, lr 1e-3,
    # 12 epochs); keep it in sync by hand when those values change.
    log_dir="runs/think_exp1_base_nothink_32layer_big_lr1e-3_12epochs",
)

In [6]:
# Read the whole training corpus into memory as a single string.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (e.g. cp1252 on Windows), which would corrupt non-ASCII
# characters such as typographic apostrophes.
with open("data/complete_shakespeare.txt", encoding="utf-8") as f:
    text = f.read()

In [7]:
# Instantiate the model, data loader and trainer. The statement order is kept
# as-is: the recorded output of this cell is a setup summary (total tokens,
# trainable parameter count, device, precision, flash attention,
# torch.compile, DDP, batch size).

# Build the model from the architecture config.
model = LlamaModel(model_config)
# The loader receives the training config, the tokenizer and the raw corpus.
# presumably it tokenizes `text` and applies the val_size split — confirm in train.py
dataloader = SimpleDataLoader(train_config, tokenizer, text=text)
# The trainer is given the config and the model; the data loader is supplied
# later through trainer.train(dataloader).
trainer = Trainer(train_config, model)

Total tokens                   | 1,596,416
Num Trainable Params           | 409,007,040
Train device                   | cuda, NVIDIA GeForce RTX 3090, N=1
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True
DistributedDataParallel        | False
Batch size                     | 4,096




In [8]:
# Run training over `dataloader`. The recorded output reports the total number
# of training steps, then a line per step with training loss, learning rate and
# tokens/sec, interleaved with eval-loss reports.
trainer.train(dataloader)

Training steps                 | 4,452 
Step: 0, Training Loss: 10.96608, LR: 0.0000500, Tokens/sec: 365.53
Step: 1, Training Loss: 10.72938, LR: 0.0000521, Tokens/sec: 393.28
Step: 2, Training Loss: 10.25065, LR: 0.0000543, Tokens/sec: 41905.32
Step: 3, Training Loss: 10.04979, LR: 0.0000564, Tokens/sec: 39745.76
Computing Eval loss, steps: 20
Step: 3, Eval Loss: 9.58651
Step: 4, Training Loss: 9.70757, LR: 0.0000585, Tokens/sec: 36094.66
Step: 5, Training Loss: 9.45795, LR: 0.0000607, Tokens/sec: 40488.38
Step: 6, Training Loss: 9.29785, LR: 0.0000628, Tokens/sec: 41442.04
Step: 7, Training Loss: 9.23244, LR: 0.0000649, Tokens/sec: 40687.52
Step: 8, Training Loss: 9.21922, LR: 0.0000671, Tokens/sec: 37934.86
Step: 9, Training Loss: 9.06639, LR: 0.0000692, Tokens/sec: 40421.01
Step: 10, Training Loss: 8.92146, LR: 0.0000713, Tokens/sec: 40734.68
Step: 11, Training Loss: 8.85211, LR: 0.0000735, Tokens/sec: 40871.76
Step: 12, Training Loss: 8.72774, LR: 0.0000756, Tokens/sec: 38833.94
S

In [9]:
# Prompt: a short excerpt that ends on a speaker cue, so the model continues
# with that speaker's line.
input_text = (
    "ALL. Content, content.\n"
    "\n"
    "MENENIUS. O sir, you are not right: have you not known\n"
    "The worthiest men have done't?\n"
    "\n"
    "CORIOLANUS."
)

# Tokenize as a batch of one and move the IDs to the device the trainer uses.
encoded = tokenizer([input_text], return_tensors="pt")
input_ids = encoded["input_ids"].to(trainer.device)

# Sample up to 128 new tokens with the given temperature and top-k settings.
idx = model.generate(
    input_ids,
    max_new_tokens=128,
    temperature=0.25,
    top_k=50,
)

# Decode the batch and print its single sequence.
decoded = tokenizer.batch_decode(idx)
print(decoded[0])

ALL. Content, content.

MENENIUS. O sir, you are not right: have you not known
The worthiest men have done't?

CORIOLANUS.
O, content him!
The gods have lost a deed with men in Rome,
And none but we are merry, not without cause.
The enemy is foolish, and we must change;
If you receive the king, you must not use
How things goers, and be at further
As you can wish your honour and your nature
As it is like to be so.

HAMLET.
That’s certain,
And must change’t.

QUEEN.
To cut and kill him.

HAMLET.
Have you forgot me?

POL
