# 🚀 QLoRA Fine-tuning for LLMs

This notebook implements QLoRA fine-tuning for:
- Mistral 7B
- Other compatible models

Compatible with:
- Google Colab
- Kaggle
- Local GPU clusters

In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from datasets import load_dataset

# Report whether a CUDA device is visible and, if so, the name of device 0.
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Setup Environment and Data

In [None]:
# Clone repository if not exists
if not os.path.exists('datasets-base'):
    !git clone https://github.com/bentex2006/datasets-base.git

# Load configuration
import json
with open('datasets-base/config/model_configs/mistral_qlora_config.json', 'r') as f:
    config = json.load(f)

# Set model parameters
MODEL_NAME = config['model_name_or_path']
DATA_PATH = 'datasets-base/data/processed/hinglish_mistral.jsonl'
OUTPUT_DIR = 'results/mistral-savage'

## Load and Process Dataset

In [None]:
# Read the processed JSON-lines file as a single 'train' split.
dataset = load_dataset('json', split='train', data_files=DATA_PATH)

# Build the slow (non-fast) tokenizer for the base model, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, padding_side="right")

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Prepare training prompt template
def create_prompt(example):
    """Render one record as a single instruction-formatted training string.

    Args:
        example: Mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        The text ``<s>[INST] instruction\\ninput [/INST] output </s>``.

    Raises:
        KeyError: If any of the three keys is missing.
    """
    instruction, input_text, output = (
        example[field] for field in ('instruction', 'input', 'output')
    )

    # Layout: <s>[INST] instruction + input [/INST] output </s>
    return f"<s>[INST] {instruction}\n{input_text} [/INST] {output} </s>"

# Tokenize dataset
def tokenize_function(examples):
    """Build prompts for a batch of records and tokenize them.

    ``dataset.map(..., batched=True)`` passes a mapping of column name to a
    list of values, not a list of row dicts. Iterating that mapping directly
    yields the column names, so the columns are transposed into per-row dicts
    before formatting. A plain iterable of row dicts is accepted as well.

    Args:
        examples: Either a column-oriented batch (mapping with the keys
            'instruction', 'input' and 'output', each a list of equal length)
            or an iterable of row dicts with those keys.

    Returns:
        The tokenizer output for the formatted prompts, truncated to
        ``config['max_seq_length']`` tokens.
    """
    if hasattr(examples, 'keys'):
        # Column-oriented batch: turn {'col': [v0, v1, ...]} into one dict per row.
        columns = ('instruction', 'input', 'output')
        rows = [
            dict(zip(columns, values))
            for values in zip(*(examples[name] for name in columns))
        ]
    else:
        rows = list(examples)

    prompts = [create_prompt(row) for row in rows]

    # NOTE(review): the prompt template already contains literal <s>/</s>; the
    # tokenizer may add its own BOS on top of that. Confirm, and if so consider
    # add_special_tokens=False here and in the inference prompt together.
    return tokenizer(prompts, truncation=True, max_length=config['max_seq_length'])

# Run the tokenizer over the whole dataset in batches, then report how many rows resulted.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
print(f"Dataset size: {len(tokenized_dataset)} examples")

## Initialize Model with QLoRA

In [None]:
# Quantization configuration
# Run the de-quantized matmuls in bfloat16 when the GPU supports it, so the
# compute dtype lines up with bf16 mixed-precision training; fall back to
# float16 on hardware without bf16 support.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store base weights in 4 bits
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load base model with the 4-bit quantization settings; let the library place
# the weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration: rank, scaling and dropout come from the JSON config.
lora_config = LoraConfig(
    r=config['lora_r'],
    lora_alpha=config['lora_alpha'],
    lora_dropout=config['lora_dropout'],
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the quantized base model with the LoRA adapter configuration.
model = get_peft_model(model, lora_config)

## Training Setup

In [None]:
# Training arguments
# Choose the mixed-precision mode from what the hardware supports instead of
# hard-coding bf16: TrainingArguments rejects bf16=True on GPUs without bf16.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=config['num_train_epochs'],
    per_device_train_batch_size=config['per_device_train_batch_size'],
    gradient_accumulation_steps=config['gradient_accumulation_steps'],
    learning_rate=config['learning_rate'],
    max_grad_norm=config['max_grad_norm'],
    warmup_ratio=config['warmup_ratio'],
    lr_scheduler_type=config['lr_scheduler_type'],
    save_strategy="steps",
    save_steps=50,
    logging_steps=10,
    bf16=use_bf16,
    fp16=use_fp16,
    torch_compile=False,
    save_total_limit=3,
)

# Initialize trainer
from transformers import DataCollatorForLanguageModeling, Trainer

# Causal-LM collator: pads each batch to a common length, builds the attention
# mask and copies input_ids into labels (padding positions are set to -100).
# A collator that only stacks input_ids fails on variable-length sequences and
# provides no labels, so no loss can be computed.
# NOTE(review): pad_token is set to eos_token in this notebook, so the closing
# EOS of every sample is masked out of the loss as well — confirm that is
# acceptable, or switch to a dedicated pad token.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

## Start Training

In [None]:
# Train model
trainer.train()

# Save final model
trainer.save_model()

# The Trainer was not given the tokenizer, so save it explicitly; this keeps
# the pad-token setting next to the model files and makes the output
# directory loadable on its own.
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

## Test the Model

In [None]:
def generate_response(instruction, input_text):
    """Generate a sampled reply from the fine-tuned model for one prompt.

    The prompt uses the same ``[INST] ... [/INST]`` layout as the training
    template, without the answer part.

    Args:
        instruction: Style/task instruction placed first in the prompt.
        input_text: User message placed after the instruction.

    Returns:
        The generated continuation only (prompt excluded), with special tokens
        removed and surrounding whitespace stripped.
    """
    prompt = f"<s>[INST] {instruction}\n{input_text} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # After training the model is still in train mode; switch to eval so that
    # dropout is disabled while generating.
    model.eval()

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,   # budget for the reply itself, independent of prompt length
        do_sample=True,       # temperature/top_p only take effect when sampling
        temperature=0.7,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode only the newly generated tokens instead of splitting the full text
    # on '[/INST]', which breaks if the reply itself contains that marker.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Sample prompts used to try out the fine-tuned model.
test_cases = [
    {"instruction": "playful banter", "input": "Aaj meeting mein so gaya main"},
    {"instruction": "witty response", "input": "Tu bohot smart hai"},
    {"instruction": "neutral conversation", "input": "Coffee peene chalein?"}
]

# Generate one reply per sample and print it together with its prompt parts.
for case in test_cases:
    response = generate_response(case['instruction'], case['input'])
    print(
        f"Instruction: {case['instruction']}",
        f"Input: {case['input']}",
        f"Response: {response}\n",
        sep="\n",
    )