In [None]:
pip uninstall -y bitsandbytes typing_extensions
pip install --upgrade bitsandbytes typing_extensions
pip install pandas
pip install accelerate
pip install transformers
pip install datasets
pip install Scikit-Learn

In [None]:
import os
import gc
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from accelerate import dispatch_model
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorWithPadding
)


In [None]:
# Hugging Face access token.
# Read from the HF_TOKEN environment variable so the secret does not have to be
# pasted into (and accidentally committed with) the notebook. When the variable
# is unset, the original placeholder string is used, so it can still be edited
# in place.
access_token = os.environ.get("HF_TOKEN", "your-access-token")

# Hub id of the base checkpoint that is fine-tuned below.
model_name = 'meta-llama/Llama-3.2-1B'

In [None]:
# Tokenizer first: it is given a pad token by reusing the end-of-sequence token,
# then handed to the padding collator used by the Trainer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=access_token,
)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Base causal-LM weights, downloaded with the same access token.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
)
# model.gradient_checkpointing_enable()

In [None]:
# data = pd.read_csv('subset_1.csv')
# Load the question/answer pairs. read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapping is needed.
qa_pairs = pd.read_csv('qa_pairs.csv')
data = qa_pairs  # alias kept so any code that still refers to `data` keeps working

# Fail early, with a readable message, if the CSV lacks one of the columns that
# process_data() reads from every row.
_missing_columns = {'question', 'answer', 'verse_text'} - set(qa_pairs.columns)
if _missing_columns:
    raise KeyError(
        f"qa_pairs.csv is missing required column(s): {sorted(_missing_columns)}"
    )


# ✅ Step 5: Process Data (Prepare Question-Answer Pairs)
def process_data(qa_pairs):
    """Turn each row of ``qa_pairs`` into a prompt/target training example.

    Args:
        qa_pairs: DataFrame with ``question``, ``answer`` and ``verse_text``
            columns.

    Returns:
        A list with one dict per row, in row order. ``input_text`` holds
        ``"question: <question> context: <verse_text>"`` and ``target_text``
        holds the row's ``answer`` value unchanged.
    """

    def _to_example(record):
        # Build a single example from one row (a pandas Series).
        q_text = record['question']
        a_text = record['answer']
        verse = record['verse_text']
        return {
            'input_text': f"question: {q_text} context: {verse}",
            'target_text': a_text,
        }

    return [_to_example(record) for _, record in qa_pairs.iterrows()]

# def tokenize_function(examples):
#     # Format input as: "Question: ... Context: ... Answer: ..."
#     full_text = [f"Question: {q} Context: {c} Answer: {a}" for q, c, a in zip(examples["input_text"], examples["context"], examples["target_text"])]

#     # Tokenize the full sequence
#     model_inputs = tokenizer(
#         full_text,
#         max_length=512,  
#         truncation=True,
#         padding="max_length"
#     )

#     # Shift labels for causal LM loss (ignore padding)
#     labels = model_inputs["input_ids"].copy()
#     labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]

#     model_inputs["labels"] = labels
#     return model_inputs

def tokenize_function(examples, tok=None, max_length=512):
    """Tokenize a batch of prompt/answer pairs for causal-LM fine-tuning.

    Each example becomes ``"<input_text> <target_text><eos>"``, tokenized and
    padded/truncated to ``max_length``. Labels are a copy of ``input_ids`` (they
    are not shifted here) with every padding position replaced by -100.

    Padding is identified through ``attention_mask`` rather than by comparing
    token ids with ``pad_token_id``: this notebook sets the pad token to the EOS
    token, so an id comparison would also blank out the real end-of-sequence
    token and the model would never be trained to stop.

    Args:
        examples: Batch mapping with ``"input_text"`` and ``"target_text"``
            lists of strings (what ``Dataset.map(..., batched=True)`` passes).
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer loaded in an earlier cell

    # Terminate every answer with EOS so there is an end-of-answer target.
    # (If the text is longer than max_length, truncation may drop it.)
    eos = tok.eos_token or ""
    full_text = [
        prompt + " " + answer + eos
        for prompt, answer in zip(examples["input_text"], examples["target_text"])
    ]

    # Tokenize the full sequence to a fixed length.
    model_inputs = tok(
        full_text,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # -100 on padding positions (attention_mask == 0); real tokens, including
    # the appended EOS, keep their id.
    model_inputs["labels"] = [
        [token if attended else -100 for token, attended in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs


# Build the list of {'input_text', 'target_text'} examples from the raw frame.
training_data = process_data(qa_pairs)

# ✅ Step 6: Split Data into Train & Eval Sets
# 80/20 split with a fixed random_state.
train_data, eval_data = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42,
)

# list of dicts -> DataFrame -> datasets.Dataset, for each split.
train_df, eval_df = (pd.DataFrame(split) for split in (train_data, eval_data))
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)

# ✅ Step 8: Tokenize the Dataset
# batched=True hands tokenize_function a batch of rows per call.
tokenized_train_dataset, tokenized_eval_dataset = (
    dataset.map(tokenize_function, batched=True)
    for dataset in (train_dataset, eval_dataset)
)

# ✅ Step 9: Define Training Arguments (Low Memory Usage)
training_args = TrainingArguments(
    # --- output locations ---
    output_dir="./llama3_finetune_new",
    logging_dir="./llama3_logs_new",
    save_strategy="no",
    # logging_steps=50,  # Log training progress every 50 steps

    # --- batch sizing (kept small to fit the GPU) ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Gradients are accumulated over 4 batches of 2, i.e. 2 x 4 = 8 sequences
    # per optimizer step on each device.
    gradient_accumulation_steps=4,

    # --- optimisation ---
    num_train_epochs=3,      # full passes over the training set
    learning_rate=2e-5,
    warmup_steps=500,        # learning rate ramps up over the first 500 steps
    weight_decay=0.01,       # penalises large weights to limit overfitting
    optim="adamw_8bit",
    bf16=True,               # bfloat16 mixed-precision training
)
# ✅ Ensure model is in training mode
model.train()

# Full fine-tuning: mark every parameter of the model as trainable.
for weight in model.parameters():
    weight.requires_grad_(True)

In [None]:
# Wire the model, tokenized splits, collator and training arguments together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # batches the tokenized rows for the model
    # packing=False       # rows are processed separately
)

# ✅ Step 11: Train the Model
trainer.train()

In [None]:
# Target repository on the Hugging Face Hub for the fine-tuned weights.
model_name1 = "AbhirajSinghRajpurohit/Llama-3.2-1B-karma-finetuned"

# Authenticate, then upload the model followed by its tokenizer to that repo.
login(token=access_token)
model.push_to_hub(model_name1)
tokenizer.push_to_hub(model_name1)

print(f"✅ Model uploaded to: https://huggingface.co/{model_name1}")

In [None]:
# Step	Training Loss
# 500	2.682600
# 1000	0.740300
# 1500	0.262900
# 2000	0.163400
# 2500	0.121700
# 3000	0.089800
# 3500	0.078600
# 4000	0.069800
# 4500	0.061800
# 5000	0.055100
# 5500	0.049200
# 6000	0.044800
# 6500	0.041700
# 7000	0.039200
