<a href="https://colab.research.google.com/github/avikumart/LLM-GenAI-Transformers-Notebooks/blob/main/Unsloth_finetuning/unsloth_llm_traininig_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install unsloth

In [None]:
import argparse
import os

# Unsloth must be imported before transformers so that its patches are applied.
import unsloth
from unsloth import FastLanguageModel, is_bfloat16_supported

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
# Model/tokenizer loading goes through Unsloth's FastLanguageModel;
# transformers only supplies the training loop classes.
from transformers import TrainingArguments, Trainer

# --- Unsloth / LoRA hyperparameters ---
# Module-level defaults used when loading the base model and attaching adapters.

# Default context length. main() takes its own max_seq_length argument.
MAX_SEQ_LENGTH = 1024

# Compute dtype requested from Unsloth; None lets Unsloth auto-detect.
DTYPE = torch.bfloat16

# Load the base weights 4-bit quantized (QLoRA) to lower memory use.
LOAD_IN_4BIT = True

# LoRA rank and scaling factor.
R = 16
LORA_ALPHA = 16

# Modules that receive adapters, grouped by role.
TARGET_MODULES = (
    ["q_proj", "k_proj", "v_proj", "o_proj"]      # attention projections
    + ["gate_proj", "up_proj", "down_proj"]       # MLP projections
    + ["embed_tokens", "lm_head"]                 # input embeddings and output head
)

# --- Preprocessing Function ---
def preprocess_function(examples, tokenizer, max_length):
    """Tokenize a batch of Input/output pairs for causal language modeling.

    Each pair is rendered as an "Input: ..." line followed by an "Output: ..."
    line terminated by the tokenizer's EOS token, then tokenized with
    truncation and padding to ``max_length``.

    Args:
        examples: Batch mapping with parallel "Input" and "output" sequences.
        tokenizer: Callable tokenizer exposing ``eos_token``; called with
            ``max_length``, ``truncation`` and ``padding`` keyword arguments.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry. Labels mirror
        ``input_ids`` except at padded positions, which are set to -100 so
        the loss ignores them.
    """
    # A tokenizer without an EOS token must not append the literal "None".
    eos = tokenizer.eos_token or ""

    # str() keeps non-string cells (e.g. numeric columns) from crashing .strip().
    text = [
        f"Input: {str(q).strip()}\nOutput: {str(a).strip()}{eos}"
        for q, a in zip(examples["Input"], examples["output"])
    ]

    model_inputs = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding="max_length",  # Fixed-length rows so the default collator can stack them
    )

    if "attention_mask" in model_inputs:
        # Mask by attention_mask rather than by pad token id: the pad token may
        # be the same id as EOS, and the real EOS must still be learned.
        model_inputs["labels"] = [
            [tok if keep else -100 for tok, keep in zip(ids, mask)]
            for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
        ]
    else:
        # No mask available: fall back to plain copies of the input ids.
        model_inputs["labels"] = [list(ids) for ids in model_inputs["input_ids"]]
    return model_inputs

def main(data_path, model_name, output_dir, final_model_dir, learning_rate, train_batch_size, eval_batch_size, num_train_epochs, max_seq_length, device_index):
    """Fine-tune a causal language model with Unsloth LoRA adapters.

    Loads a CSV with "Input" and "output" columns, splits it 80/20, loads the
    model and tokenizer through ``FastLanguageModel``, attaches LoRA adapters,
    tokenizes the data, trains with the Hugging Face ``Trainer``, and saves
    both the adapters and a merged 16-bit model.

    Args:
        data_path: Path to the input CSV file.
        model_name: Model ID passed to ``FastLanguageModel.from_pretrained``.
        output_dir: Directory for Trainer checkpoints and logs.
        final_model_dir: Directory for the final adapters and tokenizer; the
            merged model is written to ``final_model_dir + "_merged"``.
        learning_rate: Optimizer learning rate.
        train_batch_size: Per-device training batch size.
        eval_batch_size: Per-device evaluation batch size.
        num_train_epochs: Number of training epochs.
        max_seq_length: Sequence length used for model loading and tokenization.
        device_index: Accepted for command-line compatibility; not used here.
            Select a GPU with CUDA_VISIBLE_DEVICES outside the script.

    Returns:
        None. Load failures and unusable data are reported with a printed
        message followed by an early return.
    """
    print("Starting Unsloth LoRA fine-tuning script...")

    # 1. Load and prepare data
    print(f"Loading data from {data_path}...")
    try:
        # The 'python' engine with on_bad_lines='warn' skips malformed rows
        # instead of aborting with a ParserError.
        data = pd.read_csv(data_path, engine='python', on_bad_lines='warn')
    except FileNotFoundError:
        print(f"Error: Data file not found at {data_path}")
        return

    # Fail early with a clear message rather than a KeyError during tokenization.
    missing_columns = [col for col in ("Input", "output") if col not in data.columns]
    if missing_columns:
        print(f"Error: Data file is missing required column(s): {missing_columns}")
        return

    data = data.dropna()
    print(f"Loaded {len(data)} rows after dropping nulls.")

    # An 80/20 split needs at least one row on each side.
    if len(data) < 2:
        print("Error: Not enough rows to create a train/test split.")
        return

    # Convert pandas dataframe to Hugging Face DatasetDict
    train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    dataset = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })
    print(f"Dataset split: Train={len(train_dataset)}, Test={len(test_dataset)}")

    # 2. Load Model and Tokenizer using Unsloth's FastLanguageModel
    print(f"Loading Unsloth model and tokenizer: {model_name}...")

    # Decide precision from what the GPU supports. If bfloat16 was requested
    # but is unavailable, pass None so Unsloth selects a dtype itself.
    bf16_ok = is_bfloat16_supported()
    load_dtype = None if (DTYPE == torch.bfloat16 and not bf16_ok) else DTYPE
    use_bf16 = bf16_ok and load_dtype in (None, torch.bfloat16)

    try:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = model_name,
            max_seq_length = max_seq_length,
            dtype = load_dtype,
            load_in_4bit = LOAD_IN_4BIT, # Enables QLoRA
            # If using a gated model, uncomment and provide your Hugging Face token:
            # token = "hf_...",
        )
    except Exception as e:
        print(f"Error loading Unsloth model or tokenizer: {e}")
        return

    # 3. Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r = R,
        target_modules = TARGET_MODULES,
        lora_alpha = LORA_ALPHA,
        lora_dropout = 0, # Unsloth optimized: set to 0
        bias = "none",    # Unsloth optimized: set to "none"
        use_gradient_checkpointing = "unsloth", # Optimized gradient checkpointing
        random_state = 3407,
        use_rslora = False,
    )

    print(f"LoRA adapters applied. Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    # 4. Tokenize Dataset
    print("Tokenizing dataset...")
    def wrapped_preprocess(examples):
        return preprocess_function(examples, tokenizer, max_length=max_seq_length)

    # Drop every source column (including any pandas index column) by asking
    # the dataset for its columns instead of hard-coding their names.
    tokenized_datasets = dataset.map(
        wrapped_preprocess,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    # 5. Setup Training Arguments and Trainer (the standard Hugging Face
    # Trainer is used here rather than TRL's SFTTrainer)
    print("Setting up Trainer...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        # Exactly one mixed-precision mode is enabled, matching the GPU.
        fp16=not use_bf16,
        bf16=use_bf16,
        gradient_accumulation_steps=1, # Adjust if batch size is too small
        report_to="none", # Change to "wandb" or "tensorboard" if needed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
    )

    # 6. Start Training
    print("Starting training...")
    trainer.train()
    print("Training complete!")

    # 7. Save the final LoRA adapters and tokenizer
    print(f"Saving final LoRA adapters and tokenizer to {final_model_dir}...")
    os.makedirs(final_model_dir, exist_ok=True)
    # Saves the LoRA adapters
    trainer.model.save_pretrained(final_model_dir)
    # Saves the Unsloth tokenizer
    tokenizer.save_pretrained(final_model_dir)

    # Also save the merged model (base model + LoRA weights) so it can be
    # used without Unsloth or PEFT.
    print("\n--- OPTIONAL: Merging and saving full model ---")
    merged_model_dir = final_model_dir + "_merged"
    print(f"Merging LoRA adapters into base model and saving to {merged_model_dir}...")
    model.save_pretrained_merged(merged_model_dir, tokenizer, save_method="merged_16bit",)
    print("Full merged model saved successfully.")

    print("Model and tokenizer saved successfully.")

# --- Command Line Argument Parser ---
if __name__ == "__main__":
    # Entry point: build the CLI, parse whatever arguments are present, and run
    # the fine-tuning pipeline. Every option has a default, so the cell also
    # runs unchanged inside a notebook.
    parser = argparse.ArgumentParser(description="Fine-tune a Causal Language Model on a custom dataset using Unsloth LoRA.")

    # Data and model arguments. --data_path is not marked required because it
    # has a usable default; required=True would make that default unreachable.
    parser.add_argument("--data_path", type=str, default="/content/all_medtext.csv", help="Path to the input CSV data file.")
    parser.add_argument("--model_name", type=str, default="unsloth/Llama-3.2-1B", help="Hugging Face model ID.")

    # Training arguments
    parser.add_argument("--output_dir", type=str, default="./results", help="Directory for Trainer checkpoints and logs.")
    parser.add_argument("--final_model_dir", type=str, default="./finetuned_model_unsloth", help="Directory to save the final fine-tuned LoRA adapters and tokenizer.")
    # The previous hard-coded call passed 2e-1, 10,000x this default, which
    # looks like a typo; the declared default is used instead.
    parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate for the optimizer.")
    # Batch-size defaults match the values the notebook previously hard-coded.
    parser.add_argument("--train_batch_size", type=int, default=64, help="Batch size per device for training.")
    parser.add_argument("--eval_batch_size", type=int, default=64, help="Batch size per device for evaluation.")
    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
    parser.add_argument("--max_seq_length", type=int, default=1024, help="Maximum sequence length for tokenization.")

    # GPU selection flag, kept for command-line compatibility.
    parser.add_argument(
        "--device_index",
        type=int,
        default=0,
        help="Index of the GPU device to use for training (Unsloth typically uses the best available GPU or 0 by default)."
    )

    # parse_known_args() ignores arguments the parser does not define, such as
    # the "-f <kernel.json>" pair a notebook launcher puts in sys.argv, so the
    # same code works from a shell and from a Colab cell.
    args, _unknown_args = parser.parse_known_args()

    main(
        data_path=args.data_path,
        model_name=args.model_name,
        output_dir=args.output_dir,
        final_model_dir=args.final_model_dir,
        learning_rate=args.learning_rate,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
        num_train_epochs=args.num_train_epochs,
        max_seq_length=args.max_seq_length,
        device_index=args.device_index,
    )