### Installation

In [None]:
%%capture
# Install Unsloth and its dependencies; the %%capture magic above hides the cell's output.
import os
# Branch on whether any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last installs pass --no-deps; the middle one installs
    # version-constrained datasets / huggingface_hub with normal dependency resolution.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

In [None]:
import pandas as pd
from unsloth import FastLanguageModel
import torch
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
from unsloth.chat_templates import get_chat_template

### Unsloth

In [None]:
# Loading options. `max_seq_length` is reused when the trainer is built.
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Reference list of model ids; nothing in the visible notebook reads it.
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # Llama 3.2
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    # Llama 3.3
    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
]

# Keyword arguments for the loader, gathered in one place.
_load_options = dict(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...",
)

# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Projection layers that receive LoRA adapters.
_lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Adapter hyper-parameters, collected so they can be read at a glance.
_lora_options = dict(
    r=16,
    target_modules=_lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Rebind `model` to the PEFT-wrapped version returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **_lora_options)

<a name="Data"></a>
### Data Prep
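Load the question–answer CSV, split it 80/10/10 into train, validation and test sets, and render each pair with the Llama-3 chat template.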

In [None]:
# Load the Q&A pairs; later cells read the "Question", "Answer" and "Chunk Index" columns.
data = pd.read_csv('/kaggle/input/q-a-data/chunk_qa_deduplicated.csv')

In [None]:
# 80/10/10 split: hold out 20 % of the rows, then halve the hold-out into
# validation and test. Fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(data, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# preserve_index=False: after shuffling, each frame has a non-default pandas
# index which from_pandas would otherwise store as an extra
# "__index_level_0__" column in every split.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

# Attach the "llama-3" chat template to the tokenizer; the formatting helpers
# in this notebook rely on it when calling apply_chat_template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",
)

def format_conversation(row):
    """Build a two-turn chat from one row.

    Args:
        row: Mapping with "Question" and "Answer" entries.

    Returns:
        A list of two message dicts: the user turn carrying the question,
        followed by the assistant turn carrying the answer.
    """
    user_turn = dict(role="user", content=row["Question"])
    assistant_turn = dict(role="assistant", content=row["Answer"])
    return [user_turn, assistant_turn]

def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs as chat-template strings.

    Args:
        examples: Batched mapping with parallel "Question" and "Answer" lists.

    Returns:
        ``{"text": [...]}`` with one rendered (untokenized) conversation per
        pair, produced by the module-level ``tokenizer``.
    """
    rendered = []
    for question, answer in zip(examples["Question"], examples["Answer"]):
        chat = format_conversation({"Question": question, "Answer": answer})
        # No generation prompt: the assistant answer is already part of the chat.
        rendered.append(
            tokenizer.apply_chat_template(
                chat,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Render every split with one shared mapping call instead of three copies.
# All source columns are dropped (including any index column carried over
# from pandas), so each formatted split contains only "text".
formatted_dataset = DatasetDict({
    split_name: split.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=split.column_names,
    )
    for split_name, split in dataset.items()
})

print("\nSample formatted training example:")
# Index the row first so only one example is read, not the whole "text" column.
print(formatted_dataset["train"][0]["text"])

print("\nDataset structure:")
print(formatted_dataset)

We look at how the conversation is structured for the training example at index 6:

In [None]:
# Display the formatted training example at index 6.
formatted_dataset['train'][6]

<a name="Train"></a>
### Train the model

In [None]:
!pip install wandb

In [None]:
from kaggle_secrets import UserSecretsClient
import wandb

# Read the Weights & Biases API key from Kaggle secrets.
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")

# Authenticate this session with W&B using that key.
wandb.login(key=wandb_api_key)

In [None]:
# Start the W&B run. `config` is logged metadata only: it does not configure
# training, so its values must be kept in step with the model that is loaded
# and with the SFTConfig passed to the trainer.
wandb.init(
    project="energyAi",
    config={
        # Name the checkpoint this notebook actually loads; the previous
        # label ("Llama-3-8B-Instruct") described a different model.
        "model": "unsloth/Llama-3.2-3B-Instruct",
        "dataset": "EV_QA_pairs",
        "lora_rank": 16,
        "per_device_train_batch_size" : 2,
        "gradient_accumulation_steps" : 4,
        "num_train_epochs" : 10,
        "warmup_steps" : 10,
        "learning_rate" : 1e-5,
        "logging_steps" : 1,
        "optim" : "adamw_8bit",
        "weight_decay" : 0.01,
        "lr_scheduler_type" : "linear"
    }
)

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
# Supervised fine-tuning on the rendered "text" column, evaluating on the
# validation split every 10 steps and checkpointing every 20.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset['train'],
    eval_dataset=formatted_dataset['validation'],  # Added evaluation dataset
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,  # Set to True if sequences are similar length
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs = 10,
        warmup_steps=10,
        #max_steps=60,
        learning_rate=1e-5,
        logging_steps=1,
        eval_strategy="steps",
        eval_steps=10,
        save_steps=20,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # Use the string "none" to switch reporting off: passing None falls
        # back to the library default, which can still enable every installed
        # integration (including wandb) even when no run is active.
        report_to="wandb" if wandb.run else "none",
    ),
)


In [None]:
from tqdm import tqdm
!pip install rouge
from rouge import Rouge

In [None]:
# Display the formatted test example at index 1.
formatted_dataset["test"][1]

In [None]:
# @title Show current memory stats
# Bytes-to-GB divisor (1024 cubed), applied to both readings below.
_BYTES_PER_GB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / _BYTES_PER_GB, 3)

# Baseline reserved memory; the final stats cell subtracts it from the peak.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / _BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning; trainer_stats.metrics is read by the final stats cell.
trainer_stats = trainer.train()

In [None]:
from huggingface_hub import login, upload_folder

import os

# Resolve the Hugging Face token at run time. The previous call passed the
# literal placeholder string "hf_token", which is not a credential.
# Lookup order: HF_TOKEN environment variable, then a Kaggle secret with the
# same name (add it under "Add-ons > Secrets" in the Kaggle notebook).
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    from kaggle_secrets import UserSecretsClient
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")

login(token=hf_token)

# TODO: folder_path below is still a placeholder; point it at the directory
# that should be uploaded before running this cell.
upload_folder(
    folder_path="/path/to/your/kaggle/model",
    repo_id="Ziad177/llama-3.2-instruct-fine-tuned",
    repo_type="model"
)

In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
from unsloth import FastLanguageModel
import torch

def evaluate_qa(model, test_dataset, tokenizer, max_samples=None):
    """
    Generate answers for a formatted test split and score them.

    Every entry of ``test_dataset["text"]`` is expected to be a Llama-3 style
    chat transcript. The question and reference answer are parsed back out of
    it using the user/assistant header markers and ``<|eot_id|>``; entries
    without an assistant header are skipped.

    Args:
        model: Causal LM exposing ``eval()`` and ``generate()``. Inputs are
            moved to ``model.device`` ("cuda" is used if the attribute is
            missing).
        test_dataset: Dataset or mapping whose "text" column holds the
            rendered conversations.
        tokenizer: Tokenizer carrying the chat template used for the data.
        max_samples: Evaluate only the first ``max_samples`` parsed pairs.
            ``None`` evaluates everything.

    Returns:
        dict with
          * "rouge": averaged scores from ``rouge.Rouge.get_scores`` over the
            pairs that have scorable text, or ``{}`` if there are none;
          * "accuracy": fraction of evaluated samples whose reference answer
            occurs (case-insensitively) as a substring of the generated
            answer. This is a containment check, not an exact match;
          * "samples": up to five ``(question, prediction, reference)`` tuples.
    """
    # Imported here so the function does not depend on another cell having
    # been run first.
    from rouge import Rouge
    from tqdm import tqdm

    assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
    user_header = "<|start_header_id|>user<|end_header_id|>"
    end_of_turn = "<|eot_id|>"

    def _has_content(text):
        # Rouge raises ValueError when a text has no non-empty sentence.
        return any(segment.strip() for segment in text.split("."))

    rouge = Rouge()
    model.eval()

    # Recover (question, reference answer) pairs from the rendered transcripts.
    qa_pairs = []
    for text in test_dataset["text"]:
        parts = text.split(assistant_header)
        if len(parts) < 2:
            continue
        question = parts[0].split(user_header)[-1].split(end_of_turn)[0].strip()
        reference = parts[1].split(end_of_turn)[0].strip()
        qa_pairs.append((question, reference))

    # `is None` rather than truthiness, so max_samples=0 means "no samples".
    test_subset = qa_pairs if max_samples is None else qa_pairs[:max_samples]

    device = getattr(model, "device", "cuda")
    predictions = []
    references = []
    correct = 0

    for question, true_answer in tqdm(test_subset, desc="Evaluating"):
        # Only the user turn is supplied: add_generation_prompt=True already
        # appends the assistant header. Adding an empty assistant message as
        # well would put a finished, empty answer in front of the model.
        input_text = tokenizer.apply_chat_template(
            [{"role": "user", "content": question}],
            tokenize=False,
            add_generation_prompt=True,
        )

        # The rendered template already carries its special tokens, so the
        # tokenizer must not add another set (e.g. a second BOS).
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            add_special_tokens=False,
        ).to(device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                pad_token_id=tokenizer.eos_token_id,
            )

        # generate() returns prompt + continuation. With
        # skip_special_tokens=True the header markers are gone from the
        # decoded string, so the prompt is removed by token count instead of
        # by splitting on a marker.
        generated_answer = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        ).strip()

        predictions.append(generated_answer)
        references.append(true_answer)

        # Containment check: the reference must appear inside the generation.
        if true_answer.lower() in generated_answer.lower():
            correct += 1

    # Score only pairs Rouge can handle; an empty generation would otherwise
    # abort the whole evaluation.
    scorable = [
        (prediction, reference)
        for prediction, reference in zip(predictions, references)
        if _has_content(prediction) and _has_content(reference)
    ]
    if scorable:
        hypotheses, targets = zip(*scorable)
        rouge_scores = rouge.get_scores(list(hypotheses), list(targets), avg=True)
    else:
        rouge_scores = {}

    accuracy = correct / len(test_subset) if test_subset else 0

    return {
        "rouge": rouge_scores,
        "accuracy": accuracy,
        # First 5 examples
        "samples": [
            (question, prediction, reference)
            for (question, reference), prediction in zip(test_subset, predictions)
        ][:5],
    }

# Run evaluation
test_results = evaluate_qa(
    model=model,
    test_dataset=formatted_dataset["test"],
    tokenizer=tokenizer,
    max_samples=100,  # Set to None for full dataset
)

# Print results
rouge_summary = test_results["rouge"]
if rouge_summary:
    # The ROUGE line is skipped when no scores were produced.
    print(f"\nROUGE-L F1: {rouge_summary['rouge-l']['f']:.3f}")
print(f"Accuracy: {test_results['accuracy']:.2%}")

print("\nSample predictions:")
for example_number, (question, pred, true) in enumerate(test_results["samples"], start=1):
    print(f"\nExample {example_number}:")
    print(f"Question: {question}")
    print(f"Generated: {pred}")
    print(f"True Answer: {true}")

<a name="Save"></a>
### Saving, loading finetuned models


In [None]:
# Save the fine-tuned model and its tokenizer to the local "lora_model" directory.
model.save_pretrained("lora_model")  
tokenizer.save_pretrained("lora_model")
# Disabled alternative: push both to the Hugging Face Hub instead of saving locally.
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

### Saving merged weights (float16 or 4-bit) for vLLM


In [None]:
import os

# SECURITY: a real Hugging Face token used to be hard-coded in this cell
# (in the active call and in a comment). Treat that token as leaked and revoke
# it in the Hugging Face account settings. The token is now resolved at run
# time: HF_TOKEN environment variable first, then a Kaggle secret of the same
# name.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    from kaggle_secrets import UserSecretsClient
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")

# Merge to 16bit (disabled)
# model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
# model.push_to_hub_merged("Ziad177/llama-3.2-instruct-fine-tuned", tokenizer, save_method = "merged_16bit", token = hf_token)

# Merge to 4bit
# model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
# Active path: merge the adapters into the base weights and push the 4-bit result.
model.push_to_hub_merged("Ziad177/llama-3.2-instruct-fine-tuned-4bit", tokenizer, save_method = "merged_4bit_forced", token = hf_token)