In [3]:
from transformers import AutoTokenizer, set_seed

from model import ModelConfig, LlamaModel
from train import TrainerConfig, FileDataLoader, Trainer

In [4]:
# Hugging Face model identifier whose tokenizer is loaded below; only the
# tokenizer is taken from it, the model itself is built from ModelConfig.
tokenizer_id = "HuggingFaceTB/SmolLM-360M"

In [5]:
# Load the tokenizer identified by `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Alias the pad token to the EOS token so that batches can be padded.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
# Side effect: pad_token_id and eos_token_id become the same id from here on.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Display the EOS token id. Because pad_token was set to eos_token when the
# tokenizer was loaded, this is also the id used for padding.
tokenizer.eos_token_id

0

In [17]:
# Tokenize a toy batch of two strings of different length. The shorter one is
# padded on the left up to the longest sequence, and only the token-id tensor
# of the encoding is kept.
x = tokenizer(
    ["Aman", "You are a doofus"],
    padding="longest",
    padding_side="left",
    return_tensors="pt",
)["input_ids"]

In [19]:
# Round-trip check: decode the padded id batch back to text. With
# skip_special_tokens=True the special tokens (here the pad/EOS padding) are
# dropped, so the original strings should come back unchanged.
tokenizer.batch_decode(x, skip_special_tokens=True)

['Aman', 'You are a doofus']

In [4]:
# Architecture hyper-parameters for the LlamaModel that is trained from scratch.
model_config = ModelConfig(
    # Size of the embedding table. len(tokenizer) counts the base vocabulary
    # *plus* any added tokens, whereas tokenizer.vocab_size reports the base
    # vocabulary only; using the latter would leave added-token ids outside the
    # embedding table. The two agree when the tokenizer has no added tokens.
    vocab_size=len(tokenizer),
    # 15 attention heads x 64 dims per head = 960 = d_model.
    d_model=960,
    d_head=64,
    d_mlp_proj=2560,
    n_layers=32,
    # 15 query heads share 5 KV heads (3 per KV head).
    # NOTE(review): presumably grouped-query attention — confirm in model.py.
    n_kv_heads=5,
    n_attn_heads=15,
    rms_norm_eps=1e-5,
    initializer_range=0.02,
    rope_theta=100000.0,
    # pad_token is aliased to eos_token on the tokenizer, so this is the EOS id.
    # NOTE(review): if LlamaModel forwards padding_idx to nn.Embedding, the EOS
    # embedding row receives no gradient updates — confirm this is intended.
    padding_idx=tokenizer.pad_token_id
)

In [5]:
# Training-run settings, grouped by concern. All values are passed by keyword,
# so the grouping is purely for readability.
train_config = TrainerConfig(
    # --- data --------------------------------------------------------------
    tokens_folder="wiki_hindi_tok/",
    val_size=0.005,
    max_seq_len=2048,
    # --- batching ----------------------------------------------------------
    # 32 sequences x 2048 tokens x 4 accumulation steps = 262,144 tokens,
    # which matches the "Batch size" line printed when the trainer is built.
    per_device_train_batch_size=32,
    grad_accumulation_steps=4,
    # --- optimisation ------------------------------------------------------
    num_epochs=2,
    learning_rate=1e-3,
    warmup_ratio=0.01,
    grad_clip_norm=1.0,
    # --- evaluation, logging and checkpoints -------------------------------
    eval_interval_steps=500,
    checkpoint_save_interval=1000,
    log_dir="runs/hindi_wiki",
)

In [6]:
# Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs before any stochastic
# work so that a Restart & Run All starts from the same random state.
# NOTE(review): model.py / train.py may seed again internally — confirm; bf16,
# flash attention and torch.compile can still introduce small run-to-run drift.
SEED = 42
set_seed(SEED)

# Build the model, the token-file data loader and the trainer, in that order.
# Constructing these prints the run summary shown in the cell output (token
# count, shard range, parameter count, device, precision, batch size).
model = LlamaModel(model_config)
dataloader = FileDataLoader(train_config, tokenizer)
trainer = Trainer(train_config, model)

Total tokens                   | 270,505,984
Shard range rank:0             | (0,131423)
Num Trainable Params           | 409,007,040
Train device                   | cuda, NVIDIA H200, N=4
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True
DistributedDataParallel        | False
Batch size                     | 262,144




In [7]:
# Run the training loop over `dataloader`. The captured output logs, per step,
# the training loss, learning rate and tokens/sec, plus eval-loss reports.
# This is the long-running cell of the notebook.
trainer.train(dataloader)

Training steps                 | 2,054 


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step: 0, Training Loss: 11.00659, LR: 0.0000500, Tokens/sec: 32287.57
Step: 1, Training Loss: 9.38478, LR: 0.0000975, Tokens/sec: 630610.14
Step: 2, Training Loss: 8.16895, LR: 0.0001450, Tokens/sec: 624619.20
Step: 3, Training Loss: 7.52400, LR: 0.0001925, Tokens/sec: 624507.36
Computing Eval loss, steps: 21
Step: 3, Eval Loss: 7.51354


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step: 4, Training Loss: 7.45392, LR: 0.0002400, Tokens/sec: 628849.92
Step: 5, Training Loss: 7.14768, LR: 0.0002875, Tokens/sec: 625545.73
Step: 6, Training Loss: 6.66279, LR: 0.0003350, Tokens/sec: 626470.59
Step: 7, Training Loss: 6.22066, LR: 0.0003825, Tokens/sec: 624861.38
Step: 8, Training Loss: 5.75518, LR: 0.0004300, Tokens/sec: 626854.53
Step: 9, Training Loss: 5.30798, LR: 0.0004775, Tokens/sec: 627772.67
Step: 10, Training Loss: 4.84102, LR: 0.0005250, Tokens/sec: 623923.06
Step: 11, Training Loss: 4.45114, LR: 0.0005725, Tokens/sec: 625755.42
Step: 12, Training Loss: 4.16633, LR: 0.0006200, Tokens/sec: 624366.42
Step: 13, Training Loss: 3.99765, LR: 0.0006675, Tokens/sec: 625337.98
Step: 14, Training Loss: 3.84446, LR: 0.0007150, Tokens/sec: 625368.88
Step: 15, Training Loss: 3.78879, LR: 0.0007625, Tokens/sec: 623409.40
Step: 16, Training Loss: 3.74289, LR: 0.0008100, Tokens/sec: 624871.62
Step: 17, Training Loss: 3.74924, LR: 0.0008575, Tokens/sec: 623160.72
Step: 18, Tr

In [11]:
# Qualitative check of the trained model: encode a Hindi prompt and move the
# token ids to the device the trainer uses.
input_ids = tokenizer(["आज की चर्चा"], return_tensors="pt")['input_ids'].to(trainer.device)
# Generate up to 256 new tokens from the prompt.
# NOTE(review): presumably temperature-scaled top-k sampling — confirm in model.py.
# NOTE(review): confirm generate() switches to eval mode and disables gradient
# tracking; nothing in this cell does so explicitly.
idx = model.generate(input_ids, temperature=0.25, top_k=50, max_new_tokens=256)
# Decode the batch and print its first (only) sequence. Special tokens are kept
# in the printed text because skip_special_tokens is not passed here.
print(tokenizer.batch_decode(idx)[0])

आज की चर्चा करता है। अपने पिता के निर्देशन के लिए प्रेम का निर्माण जब होता है तो अपने पिता के पिता के रूप में बहुत प्रसिद्ध होता है। इस प्रकार वह अपने पिता के पिता के रूप में अपने पिता के रूप में अपने पिता के रूप में प्रेम करते हैं। उन्होंने अपने पि
