In [1]:
from model import ModelConfig, LlamaModel
from train import TrainerConfig, FileDataLoader, Trainer

from transformers import AutoTokenizer

In [2]:
# Hugging Face Hub repository id; the next cell passes it to
# AutoTokenizer.from_pretrained to load the tokenizer used in this notebook.
tokenizer_id = "HuggingFaceTB/SmolLM2-135M"

In [3]:
# Load the pretrained tokenizer identified by `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=tokenizer_id,
)

# Alias the padding token to the end-of-sequence token, so that
# `tokenizer.pad_token_id` (read by the model config cell) is the EOS id.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Architecture hyper-parameters for the LlamaModel instantiated in a later cell.
model_config = ModelConfig(
    # Use len(tokenizer) rather than tokenizer.vocab_size: the latter reports
    # only the base vocabulary and excludes added tokens, so a vocabulary
    # dimension sized from it can be too small for ids the tokenizer emits.
    vocab_size=len(tokenizer),
    d_model=576,
    d_head=64,  # equals d_model / n_attn_heads (576 / 9)
    d_mlp_proj=1536,
    n_layers=30,
    # 3 KV heads vs 9 attention heads -- presumably grouped-query attention;
    # confirm against model.py.
    n_kv_heads=3,
    n_attn_heads=9,
    rms_norm_eps=1e-5,
    # Numerically 1/24, i.e. 1/sqrt(576); presumably tied to d_model -- confirm.
    initializer_range=0.041666666666666664,
    rope_theta=100000.0,
    # NOTE(review): the pad token is aliased to EOS earlier in this notebook,
    # so this is the EOS id. If LlamaModel forwards it to
    # nn.Embedding(padding_idx=...), the EOS embedding row would receive no
    # gradient -- verify in model.py.
    padding_idx=tokenizer.pad_token_id,
)

In [5]:
# Settings for the training run; consumed by FileDataLoader and Trainer below.
train_config = TrainerConfig(
    # Input data
    tokens_folder="wiki_hindi_tok",
    max_seq_len=2048,
    # Batching and run length (the training cell's output reports 2,000 steps)
    per_device_train_batch_size=8,
    num_epochs=1,
    max_steps=2000,
    # Optimisation
    learning_rate=1e-3,
)

In [6]:
# Build the model, the training data loader and the trainer.
# The order is kept as-is: Trainer takes the model instance, and this cell's
# output (token count, parameter count, device, precision, flash attention,
# torch.compile) is printed while these objects are constructed.
model = LlamaModel(model_config)
dataloader = FileDataLoader(train_config, tokenizer)
trainer = Trainer(train_config, model)

Total train tokens             | 270,491,648
Num Trainable Params           | 162,826,560
Train device                   | cuda, NVIDIA GeForce RTX 3090, N=1
Training precision             | torch.bfloat16
Flash Attention                | True
torch.compile()                | True





In [7]:
# Run the training loop over `dataloader`; the cell output logs the training
# loss and tokens/sec for every step.
# NOTE(review): the first two logged steps are far slower than the rest --
# presumably torch.compile warm-up; confirm before using them in throughput
# figures.
trainer.train(dataloader)

Training steps                 | 2,000 
Step: 0, Training Loss: 11.44452, Tokens/sec: 1523.6792020482028
Step: 1, Training Loss: 9.92958, Tokens/sec: 1658.0446699343945
Step: 2, Training Loss: 7.48547, Tokens/sec: 88535.48187497325
Step: 3, Training Loss: 7.08707, Tokens/sec: 85621.38507734686
Step: 4, Training Loss: 6.42159, Tokens/sec: 86666.16139748672
Step: 5, Training Loss: 5.76335, Tokens/sec: 85075.0035268395
Step: 6, Training Loss: 4.90985, Tokens/sec: 87301.0103515507
Step: 7, Training Loss: 4.56522, Tokens/sec: 85954.65296049694
Step: 8, Training Loss: 4.28419, Tokens/sec: 84144.34179420414
Step: 9, Training Loss: 4.03407, Tokens/sec: 87916.10755802804
Step: 10, Training Loss: 3.93829, Tokens/sec: 87618.75117636021
Step: 11, Training Loss: 3.97180, Tokens/sec: 86250.11046215301
Step: 12, Training Loss: 3.90830, Tokens/sec: 87801.91245281768
Step: 13, Training Loss: 3.79044, Tokens/sec: 86141.00576033592
Step: 14, Training Loss: 3.74311, Tokens/sec: 86699.7608479747
Step: 15, 

In [10]:
# Encode a single Hindi prompt and move its token ids to the trainer's device.
prompt_batch = ["आज की चर्चा"]
encoded_prompt = tokenizer(prompt_batch, return_tensors="pt")
input_ids = encoded_prompt["input_ids"].to(trainer.device)

# Generate up to 256 new tokens with low temperature and top-k filtering.
# NOTE(review): no RNG seed is set in the visible cells; if generate() samples
# stochastically, the printed text will differ between runs.
idx = model.generate(
    input_ids,
    temperature=0.25,
    top_k=50,
    max_new_tokens=256,
)

# Decode the batch and show the first (only) sequence.
decoded_batch = tokenizer.batch_decode(idx)
print(decoded_batch[0])

आज की चर्चा मिलता है। इसके प्रमुख प्रभावित होने के लिए प्रति प्राप्त होते हैं। इसका प्राचीन प्रकाशित होता है। इसके अतिरिक्त प्रसारण में ही प्रतिभाग्य प्राप्त होता है। इसके प्रभावित हैं। इस प्रकार के अनुसार अपने प्रतिपत्ति में ही है। विश्वास म
