In [1]:
import torch
from transformers import AutoTokenizer

from model import ModelConfig, LlamaModel
from train import TrainerConfig, FileDataLoader, Trainer

In [2]:
# Identifier handed to `AutoTokenizer.from_pretrained` in the next cell.
# Only the tokenizer is loaded from it in this notebook; the model itself is
# built from `ModelConfig` further down.
tokenizer_id = "HuggingFaceTB/SmolLM2-135M"

In [3]:
# Load the pretrained tokenizer named by `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): `tokenizer.pad_token_id` is later passed to ModelConfig as
# `padding_idx`, so padding and EOS share one id — confirm the model handles
# that id the way you intend (e.g. the EOS embedding is not frozen/zeroed).
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Architecture hyperparameters for the LlamaModel constructed in a later cell.
# NOTE(review): the numbers look like the SmolLM2-135M reference config —
# confirm against the upstream config.json if exact parity matters.
model_config = ModelConfig(
    # Size the embedding table from len(tokenizer): it counts added/special
    # tokens too, whereas `tokenizer.vocab_size` reports only the base
    # vocabulary and can be smaller than the largest id the tokenizer emits.
    vocab_size=len(tokenizer),
    d_model=576,  # equals n_attn_heads * d_head (9 * 64)
    d_head=64,
    d_mlp_proj=1536,
    n_layers=30,
    # Fewer KV heads than attention heads (3 vs 9) — presumably grouped-query
    # attention; verify in model.py.
    n_kv_heads=3,
    n_attn_heads=9,
    rms_norm_eps=1e-5,
    initializer_range=0.041666666666666664,  # 1/24, i.e. 1/sqrt(d_model)
    rope_theta=100000.0,
    # Same id as EOS, because the tokenizer cell sets pad_token = eos_token.
    padding_idx=tokenizer.pad_token_id,
)

In [5]:
# Training-run settings consumed by FileDataLoader and Trainer below.
# Keyword arguments are grouped by concern; values are unchanged.
train_config = TrainerConfig(
    # --- data ---
    tokens_folder="wiki_hindi_tok",
    val_size=0.005,  # presumably the held-out fraction — confirm in train.py
    max_seq_len=2048,
    per_device_train_batch_size=8,
    # --- schedule ---
    num_epochs=1,
    max_steps=2000,
    warmup_ratio=0.1,
    # --- optimisation ---
    learning_rate=1e-3,
    grad_clip_norm=1.0,
    # --- evaluation / logging ---
    eval_interval_steps=100,
    log_dir="runs/hindi_wiki",
)

In [6]:
# Seed PyTorch's global RNGs before anything stochastic is constructed, so a
# fresh "Restart & Run All" starts from the same random state each time.
# NOTE(review): assumes weight init (and any shuffling in FileDataLoader /
# Trainer) draws from torch's global RNG — confirm in model.py / train.py.
SEED = 42
torch.manual_seed(SEED)

# Build the model from the architecture config, the data loader from the
# training config plus tokenizer, and the trainer that ties config and model
# together.
model = LlamaModel(model_config)
dataloader = FileDataLoader(train_config, tokenizer)
trainer = Trainer(train_config, model)

Total tokens                   | 270,491,648
Num Trainable Params           | 162,826,560
Train device                   | cuda, NVIDIA GeForce RTX 3090, N=1
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True





In [None]:
# Run training on the batches supplied by `dataloader`.
# Long-running cell: the config above caps the run at max_steps=2000. The
# captured output shows per-step training loss / LR / tokens-per-second lines
# and an eval-loss line.
trainer.train(dataloader)

Training steps                 | 2,000 
Step: 0, Training Loss: 11.31632, LR: 0.0000050, Tokens/sec: 1472.411248717527
Computing Eval loss, steps: 83
Step: 0, Eval Loss: 11.09461
Step: 1, Training Loss: 11.04354, LR: 0.0000100, Tokens/sec: 1631.558609134983
Step: 2, Training Loss: 10.77904, LR: 0.0000150, Tokens/sec: 84052.02716088382
Step: 3, Training Loss: 10.29739, LR: 0.0000200, Tokens/sec: 86054.25690366127
Step: 4, Training Loss: 9.77577, LR: 0.0000250, Tokens/sec: 81848.7191781952
Step: 5, Training Loss: 9.60134, LR: 0.0000300, Tokens/sec: 83818.23877841422
Step: 6, Training Loss: 8.84350, LR: 0.0000350, Tokens/sec: 83332.67424299945
Step: 7, Training Loss: 8.33997, LR: 0.0000400, Tokens/sec: 80517.70288147315
Step: 8, Training Loss: 7.92904, LR: 0.0000450, Tokens/sec: 83517.34097057937
Step: 9, Training Loss: 7.53227, LR: 0.0000500, Tokens/sec: 84051.03498362956
Step: 10, Training Loss: 7.28492, LR: 0.0000550, Tokens/sec: 81848.68605997278
Step: 11, Training Loss: 7.09386, LR: 

In [10]:
# Sample a continuation for a short Hindi prompt and print the decoded text.
prompt = "आज की चर्चा"

# Tokenise as a batch of one and move the ids to the trainer's device.
encoded = tokenizer([prompt], return_tensors="pt")
input_ids = encoded['input_ids'].to(trainer.device)

idx = model.generate(
    input_ids,
    temperature=0.25,
    top_k=50,
    max_new_tokens=256,
)

# Only the first (and only) sequence of the batch is shown.
decoded = tokenizer.batch_decode(idx)
print(decoded[0])

आज की चर्चा मिलता है। इसके प्रमुख प्रभावित होने के लिए प्रति प्राप्त होते हैं। इसका प्राचीन प्रकाशित होता है। इसके अतिरिक्त प्रसारण में ही प्रतिभाग्य प्राप्त होता है। इसके प्रभावित हैं। इस प्रकार के अनुसार अपने प्रतिपत्ति में ही है। विश्वास म
