In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd
import json
from datasets import Dataset, DatasetDict


In [None]:
def read_json_from_file(filepath):
    """Load a UTF-8 encoded JSON file and return its contents as a DataFrame.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A ``pandas.DataFrame`` built directly from the parsed JSON payload.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Explicit UTF-8 so non-ASCII text decodes the same way on every platform.
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    frame = pd.DataFrame(parsed)
    return frame

# Load the raw records and wrap them in a Hugging Face Dataset.
data = read_json_from_file("./data/refined_hindi_data.json")
dataset = Dataset.from_pandas(data)

# `train_test_split` returns a DatasetDict keyed "train" / "test". Unpacking it
# as a tuple would iterate over the dict KEYS and bind the strings "train" and
# "test" instead of the actual splits, so index the result explicitly.
split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
val_dataset = split["test"]

# Re-key the held-out split as "val" for downstream use.
combined_dataset = DatasetDict({
    "train": train_dataset,
    "val": val_dataset,
})

combined_dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")

def tokenize_function(examples):
    """Tokenize a batch of instruction/output pairs for causal-LM training.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` exposing the
            ``"instruction"`` and ``"output"`` columns.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every padded position is set to -100.
    """
    # The two columns are passed as a text pair; every row is truncated/padded
    # to exactly 512 tokens.
    # NOTE(review): how the pair is joined (separator / EOS) is decided by the
    # tokenizer, not by this code — confirm it matches the intended prompt format.
    inputs = tokenizer(
        examples["instruction"],
        examples["output"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt",
    )

    # Labels start as a copy of the input ids, but padded positions must not
    # contribute to the loss. -100 is the index the causal-LM loss ignores.
    # Mask via attention_mask rather than by pad token id so that real tokens
    # sharing the pad id are not masked by accident.
    labels = inputs["input_ids"].clone()
    labels[inputs["attention_mask"] == 0] = -100
    inputs["labels"] = labels
    return inputs

# NOTE(review): this maps over the full `dataset`, not a train/val split —
# confirm that is intended before using the result for training.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["instruction", "output"])



In [None]:
# Training hyperparameters, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Schedule and batch size.
    num_train_epochs=3,
    per_device_train_batch_size=50,
    warmup_steps=500,
    # Regularisation and numeric precision.
    weight_decay=0.01,
    bf16=True,
)

# Same checkpoint as the tokenizer; device placement is left to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B", device_map="auto")

# NOTE(review): no eval_dataset is passed, so this run performs no evaluation.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

print("Starting training...")
trainer.train()