<a href="https://colab.research.google.com/github/bharathbolla/The-LLM-Cookbook-Practical-Recipes-for-Fine-Tuning-Optimization-and-Deployment/blob/main/Chapter_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Recipe-1: Standard LoRA Fine-Tuning

In [None]:
from huggingface_hub import HfApi
from huggingface_hub import login

api = HfApi()
whoami = api.whoami(token="hf_xxxxxxxxxxxxxxxxxxxxxxxxx")
print(whoami)
login("hf_xxxxxxxxxxxxxxxx")

{'type': 'user', 'id': '65feba1b57cc48d9d30d11cf', 'name': 'kalpasubbaiah', 'fullname': 'Kalpa Subbaiah', 'email': 'kalpa.subbaiah@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/319094e0eb55ce89334d7bd3685ceeb0.svg', 'orgs': [{'type': 'org', 'id': '681b0cb0dba891d54be0773d', 'name': 'mcp-course', 'fullname': 'Hugging Face MCP Course', 'email': None, 'canPay': False, 'periodEnd': None, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/62d648291fa3e4e7ae3fa6e8/itgTDqMrnvgNfJZJ4YmCt.png', 'roleInOrg': 'read', 'isEnterprise': False}], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'hugging_face_token_read', 'role': 'read', 'createdAt': '2025-04-22T09:03:46.223Z'}}}


In [None]:
# --- Recipe: Standard LoRA Fine-Tuning ---
# Goal: Fine-tune a base model using LoRA with the Hugging Face PEFT library.
# Method: Adapts the IFT recipe (Ch 7) using Gemma-2B and Dolly subset.
# Libraries: transformers, datasets, peft, torch, accelerate, bitsandbytes (optional, for bf16/fp16)
# Note: Requires significant VRAM even without quantization for Gemma-2B base.
#       Installs: pip install transformers datasets peft accelerate torch bitsandbytes sentencepiece

import torch
import copy
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# --- Configuration ---
MODEL_CHECKPOINT = "google/gemma-2b" # Base model
DATASET_NAME = "databricks/databricks-dolly-15k" # Instruction dataset
OUTPUT_DIR = "./lora_finetune_output"
# LoRA Config
LORA_R = 16 # LoRA rank (e.g., 8, 16, 32, 64)
LORA_ALPHA = 32 # LoRA alpha (scaling factor, often 2*r)
LORA_DROPOUT =.05
# Specify target modules for LoRA. Common practice for Gemma/Llama-like models:
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# Training Params
NUM_EPOCHS = 1
BATCH_SIZE = 1 # Very small batch size for LoRA on Gemma-2B without QLoRA
GRADIENT_ACCUMULATION_STEPS = 16 # Effective batch size 1*16=16
LEARNING_RATE = 1e-4 # LoRA often uses higher LR than full FT
MAX_LENGTH = 512 # Max sequence length
NUM_SAMPLES_PER_DATASET = 500 # Use small subset for demo

# --- 1. Load Tokenizer & Define Prompt Template (Same as IFT Chapter) ---
print(f"Loading tokenizer for checkpoint: {MODEL_CHECKPOINT}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Set PAD token to EOS token: {tokenizer.pad_token}")

PROMPT_WITH_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)
PROMPT_NO_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n"
)

2025-08-30 09:25:24.896833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756545924.920257     138 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756545924.927528     138 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading tokenizer for checkpoint: google/gemma-2b


In [None]:


# --- 2. Data Loading and Preprocessing Function (Same as IFT Chapter) ---
def format_and_tokenize(example, dataset_type='dolly'): # Defaulting to Dolly structure
    """Formats (Alpaca style), tokenizes, and prepares labels with masking for IFT."""
    instruction = example.get("instruction", "")
    input_context = example.get("context", "") # Dolly uses 'context'
    output = example.get("response", "")      # Dolly uses 'response'

    if input_context and input_context.strip():
        prompt_start = PROMPT_WITH_INPUT_TEMPLATE.format(instruction=instruction, input=input_context)
    else:
        prompt_start = PROMPT_NO_INPUT_TEMPLATE.format(instruction=instruction)

    full_text = prompt_start + output + tokenizer.eos_token
    tokenized_full = tokenizer(full_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tokenized_prompt = tokenizer(prompt_start, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    prompt_length = len(tokenized_prompt["input_ids"])
    labels = copy.deepcopy(tokenized_full["input_ids"])

    # Mask prompt tokens
    for i in range(prompt_length):
         if i < len(labels): labels[i] = -100

    tokenized_full["labels"] = labels
    return tokenized_full

print(f"\n--- Processing Dataset: {DATASET_NAME} ---")
try:
    raw_dataset = load_dataset(DATASET_NAME, split=f"train[:{NUM_SAMPLES_PER_DATASET}]")
    tokenized_dataset = raw_dataset.map(
        format_and_tokenize, remove_columns=raw_dataset.column_names
    )
    split_ds = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    final_datasets = {"train": split_ds["train"], "validation": split_ds["test"]}
    print("Dataset processed.")
except Exception as e:
    print(f"Error processing dataset: {e}")
    exit()

# --- 3. Load Base Model (Not Quantized for standard LoRA) ---
print(f"\nLoading base model: {MODEL_CHECKPOINT}")
print("Loading in bf16/fp16 for memory efficiency...")
# Determine compute dtype
compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_CHECKPOINT,
        torch_dtype=compute_dtype, # Load in lower precision
        device_map="auto" # Distribute across GPUs if available
    )
    # Ensure pad token ID is set
    if model.config.pad_token_id is None:
         model.config.pad_token_id = tokenizer.pad_token_id

    # Optional: Prepare model for k-bit training if using gradient checkpointing with lower precision loading
    # model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) # Set use_gradient_checkpointing based on TrainingArgs

    print(f"Base model loaded in {compute_dtype}. Device map: {model.hf_device_map}")
except Exception as e:
    print(f"Error loading base model: {e}")
    print("Standard LoRA on Gemma-2B might still require > 24GB VRAM. Consider QLoRA.")
    exit()


# --- 4. Configure LoRA ---
print("\nConfiguring LoRA...")
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none", # Typically 'none', 'all', or 'lora_only'
    task_type=TaskType.CAUSAL_LM # Important for Causal LM tasks
)

# --- 5. Wrap Model with PEFT ---
print("Applying PEFT LoRA adapters to the model...")
try:
    model = get_peft_model(model, peft_config)
    print("PEFT model created successfully.")
    model.print_trainable_parameters() # Shows the small percentage of trainable parameters
except Exception as e:
    print(f"Error applying PEFT: {e}")
    exit()

# --- 6. Training Arguments ---
print("\nDefining Training Arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    # Optimizer choice can matter, AdamW 8bit is memory efficient if bitsandbytes installed
    # optim="paged_adamw_8bit", # Or adamw_torch
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    fp16= (compute_dtype == torch.float16), # Enable based on compute_dtype
    bf16= (compute_dtype == torch.bfloat16), # Enable based on compute_dtype
    gradient_checkpointing=True, # Often needed for larger models/LoRA
    push_to_hub=False,
    report_to="none"
)

# --- 7. Data Collator ---
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- 8. Initialize Trainer ---
trainer = Trainer(
    model=model, # Pass the PEFT model
    args=training_args,
    train_dataset=final_datasets["train"],
    eval_dataset=final_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- 9. Train ---
print("\nStarting LoRA fine-tuning...")
try:
    train_result = trainer.train()
    # Note: PEFT saves only the adapter weights by default
    trainer.save_model(OUTPUT_DIR) # Saves adapter config & weights to OUTPUT_DIR
    print(f"LoRA training finished. Adapter saved to {OUTPUT_DIR}")
    # Log metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
except Exception as e:
    print(f"Error during LoRA training: {e}")
    exit()

# --- 10. Evaluate ---
print("\nEvaluating final LoRA model...")
try:
    eval_results = trainer.evaluate()
    print("Evaluation Results (Loss):")
    print(eval_results)
    # perplexity = math.exp(eval_results['eval_loss'])
    # print(f"Perplexity: {perplexity:.2f}")
    trainer.log_metrics("eval", eval_results)
    trainer.save_metrics("eval", eval_results)
except Exception as e:
    print(f"Error during evaluation: {e}")

# --- End of Recipe ---


--- Processing Dataset: databricks/databricks-dolly-15k ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset processed.

Loading base model: google/gemma-2b
Loading in bf16/fp16 for memory efficiency...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded in torch.bfloat16. Device map: {'model.embed_tokens': 0, 'lm_head': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 1, 'model.layers.7': 1, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.norm': 1, 'model.rotary_emb': 1}

Configuring LoRA...
Applying PEFT LoRA adapters to the model...


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


PEFT model created successfully.
trainable params: 19,611,648 || all params: 2,525,784,064 || trainable%: 0.7765

Defining Training Arguments...

Starting LoRA fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Error during LoRA training: element 0 of tensors does not require grad and does not have a grad_fn

Evaluating final LoRA model...


Epoch,Training Loss,Validation Loss
0,No log,2.353364


Evaluation Results (Loss):
{'eval_loss': 2.353363513946533}
***** eval metrics *****
  eval_loss = 2.3534


## Recipe-2: QLoRA Fine-Tuning

In [None]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.1

In [None]:
# --- Recipe: QLoRA on a Budget ---
# Goal: Fine-tune a base model using QLoRA (4-bit quantization + LoRA).
# Method: Uses Gemma-2B, Dolly subset, bitsandbytes for 4-bit loading, and peft.
# Libraries: transformers, datasets, peft, accelerate, torch, bitsandbytes, sentencepiece
# Note: Should fit in GPUs with >= 12-16GB VRAM. Install bitsandbytes: pip install bitsandbytes

import torch
import copy
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig # Needed for quantization config
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# --- Configuration (Similar to LoRA recipe) ---
MODEL_CHECKPOINT = "google/gemma-2b"
DATASET_NAME = "databricks/databricks-dolly-15k"
OUTPUT_DIR = "./qlora_finetune_output"
# LoRA Config
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
# Training Params
NUM_EPOCHS = 1
# Can potentially use larger batch size with QLoRA vs standard LoRA
BATCH_SIZE = 2 # Start small, increase if memory allows
GRADIENT_ACCUMULATION_STEPS = 8 # Effective batch size 2*8=16
LEARNING_RATE = 1e-4 # QLoRA might tolerate slightly higher LR sometimes
MAX_LENGTH = 512
NUM_SAMPLES_PER_DATASET = 500

# --- 1. Load Tokenizer & Define Prompt Template (Same as LoRA recipe) ---
print(f"Loading tokenizer for checkpoint: {MODEL_CHECKPOINT}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

PROMPT_WITH_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)
PROMPT_NO_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n"
)

# --- 2. Data Loading and Preprocessing Function (Same as LoRA recipe) ---
def format_and_tokenize(example, dataset_type='dolly'):
    instruction = example.get("instruction", "")
    input_context = example.get("context", "")
    output = example.get("response", "")
    if input_context and input_context.strip():
        prompt_start = PROMPT_WITH_INPUT_TEMPLATE.format(instruction=instruction, input=input_context)
    else:
        prompt_start = PROMPT_NO_INPUT_TEMPLATE.format(instruction=instruction)
    full_text = prompt_start + output + tokenizer.eos_token
    tokenized_full = tokenizer(full_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tokenized_prompt = tokenizer(prompt_start, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    prompt_length = len(tokenized_prompt["input_ids"])
    labels = copy.deepcopy(tokenized_full["input_ids"])
    for i in range(prompt_length):
         if i < len(labels): labels[i] = -100
    tokenized_full["labels"] = labels
    return tokenized_full

print(f"\n--- Processing Dataset: {DATASET_NAME} ---")
try:
    raw_dataset = load_dataset(DATASET_NAME, split=f"train[:{NUM_SAMPLES_PER_DATASET}]")
    tokenized_dataset = raw_dataset.map(
        format_and_tokenize, remove_columns=raw_dataset.column_names
    )
    split_ds = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    final_datasets = {"train": split_ds["train"], "validation": split_ds["test"]}
    print("Dataset processed.")
except Exception as e:
    print(f"Error processing dataset: {e}")
    exit()

# --- 3. Configure Quantization (BitsAndBytes) ---
print("\nConfiguring 4-bit quantization...")
# Determine compute dtype
compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
print(f"Using compute dtype: {compute_dtype}")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4", # Use NF4 (NormalFloat4) data type for optimal results
    bnb_4bit_compute_dtype=compute_dtype, # Computations done in bf16/fp16
    bnb_4bit_use_double_quant=True, # Optional: Use double quantization for extra memory savings
)

# --- 4. Load Base Model in 4-bit ---
print(f"\nLoading base model ({MODEL_CHECKPOINT}) in 4-bit...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_CHECKPOINT,
        quantization_config=bnb_config, # Apply quantization config
        device_map={'': 0},              #  force all weights to cuda:0
        torch_dtype=compute_dtype
    )
    # Ensure pad token ID is set
    if model.config.pad_token_id is None:
         model.config.pad_token_id = tokenizer.pad_token_id

    print(f"Base model loaded in 4-bit. Device map: {model.hf_device_map}")
except Exception as e:
    print(f"Error loading base model in 4-bit: {e}")
    print("Ensure 'bitsandbytes' is installed correctly for your CUDA version.")
    exit()

# --- 5. Prepare Model for PEFT & Configure LoRA ---
# Prepare for k-bit training must be called BEFORE applying PEFT config
# Enable gradient checkpointing for more memory savings
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
print("Model prepared for k-bit training.")

print("\nConfiguring LoRA...")
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES, # Target modules might differ slightly per model, check docs
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# --- 6. Wrap Model with PEFT ---
print("Applying PEFT LoRA adapters to the 4-bit model...")
try:
    model = get_peft_model(model, peft_config)
    print("PEFT model created successfully.")
    model.print_trainable_parameters() # Shows the small percentage of trainable parameters
except Exception as e:
    print(f"Error applying PEFT: {e}")
    exit()

# --- 7. Training Arguments ---
# Use paged optimizer for more memory efficiency with QLoRA
use_paged_optimizer = True # Requires bitsandbytes >= 0.41.1
optim_choice = "paged_adamw_8bit" if use_paged_optimizer else "adamw_torch"
print(f"Using optimizer: {optim_choice}")

print("\nDefining Training Arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    optim=optim_choice, # Use paged optimizer
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    fp16= (compute_dtype == torch.float16), # Enable based on compute_dtype
    bf16= (compute_dtype == torch.bfloat16), # Enable based on compute_dtype
    gradient_checkpointing=True, # Crucial for QLoRA memory saving
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1
)

# --- 8. Data Collator ---
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- 9. Initialize Trainer ---
trainer = Trainer(
    model=model, # Pass the QLoRA PEFT model
    args=training_args,
    train_dataset=final_datasets["train"],
    eval_dataset=final_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- 10. Train ---
print("\nStarting QLoRA fine-tuning...")
try:
    train_result = trainer.train()
    trainer.save_model(OUTPUT_DIR) # Saves adapter config & weights
    print(f"QLoRA training finished. Adapter saved to {OUTPUT_DIR}")
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
except Exception as e:
    print(f"Error during QLoRA training: {e}")
    exit()

# --- 11. Evaluate ---
print("\nEvaluating final QLoRA model...")
try:
    eval_results = trainer.evaluate()
    print("Evaluation Results (Loss):")
    print(eval_results)
    trainer.log_metrics("eval", eval_results)
    trainer.save_metrics("eval", eval_results)
except Exception as e:
    print(f"Error during evaluation: {e}")

# --- End of Recipe ---


2025-08-30 09:08:22.972850: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756544903.148890      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756544903.200057      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading tokenizer for checkpoint: google/gemma-2b


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]


--- Processing Dataset: databricks/databricks-dolly-15k ---


README.md: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset processed.

Configuring 4-bit quantization...
Using compute dtype: torch.bfloat16

Loading base model (google/gemma-2b) in 4-bit...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Base model loaded in 4-bit. Device map: {'': 0}
Model prepared for k-bit training.

Configuring LoRA...
Applying PEFT LoRA adapters to the 4-bit model...


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


PEFT model created successfully.
trainable params: 19,611,648 || all params: 2,525,784,064 || trainable%: 0.7765
Using optimizer: paged_adamw_8bit

Defining Training Arguments...

Starting QLoRA fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss


## Recipe-3: Accelerated PEFT with UnslothAI

In [None]:
pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.8.10-py3-none-any.whl.metadata (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.8.9 (from unsloth)
  Downloading unsloth_zoo-2025.8.9-py3-none-any.whl.metadata (9.5 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.30-py3-none-any.whl.metadata (11 kB)
Collecting transformers!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,>=4.51.3 (from unsloth)
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading trl-0.22.1-py3-none-any.

In [None]:
# --- Recipe: Turbocharged PEFT with UnslothAI ---
# Goal: Demonstrate accelerated LoRA/QLoRA fine-tuning using UnslothAI.
# Method: Adapts the QLoRA recipe showing minimal code changes for Unsloth.
# Libraries: unsloth, torch, datasets, peft, transformers, sentencepiece
# Note: Requires UnslothAI installation specific to your environment (GPU/PyTorch/CUDA).
#       Example install: pip install "unsloth[pytorch-ampere-torch210]" --extra-index-url https://pypi.unsloth.ai
#       Check UnslothAI GitHub for correct install command.
import unsloth
import torch
import copy
from datasets import load_dataset, Dataset

# --- UnslothAI Import ---
from unsloth import FastLanguageModel # Main import
from unsloth import is_bfloat16_supported
from peft import LoraConfig, TaskType # PEFT imports remain similar

from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig # Still used for quantization config if desired
)


# --- Configuration (Similar to QLoRA recipe) ---
MODEL_CHECKPOINT = "google/gemma-2b" # Unsloth supports many models
DATASET_NAME = "databricks/databricks-dolly-15k"
OUTPUT_DIR = "./unsloth_qlora_finetune_output"
# LoRA Config (Same as before)
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
# Target modules might be automatically inferred by Unsloth for some models,
# but specifying is often still good practice or required. Check Unsloth docs.
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
# Training Params
NUM_EPOCHS = 1
# Unsloth's memory savings might allow larger batch sizes vs standard QLoRA
BATCH_SIZE = 4 # Try increasing this (e.g., 8, 16) if memory allows!
GRADIENT_ACCUMULATION_STEPS = 4 # Adjust so BATCH_SIZE * GRAD_ACCUM is your target effective batch size
LEARNING_RATE = 1e-4
MAX_LENGTH = 512
NUM_SAMPLES_PER_DATASET = 500

# --- 1. Load Tokenizer & Define Prompt Template (Same as LoRA/QLoRA recipe) ---
print(f"Loading tokenizer for checkpoint: {MODEL_CHECKPOINT}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

PROMPT_WITH_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)
PROMPT_NO_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n"
)

# --- 2. Data Loading and Preprocessing Function (Same as LoRA/QLoRA recipe) ---
def format_and_tokenize(example, dataset_type='dolly'):
    instruction = example.get("instruction", "")
    input_context = example.get("context", "")
    output = example.get("response", "")
    if input_context and input_context.strip():
        prompt_start = PROMPT_WITH_INPUT_TEMPLATE.format(instruction=instruction, input=input_context)
    else:
        prompt_start = PROMPT_NO_INPUT_TEMPLATE.format(instruction=instruction)
    full_text = prompt_start + output + tokenizer.eos_token
    tokenized_full = tokenizer(full_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tokenized_prompt = tokenizer(prompt_start, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    prompt_length = len(tokenized_prompt["input_ids"])
    labels = copy.deepcopy(tokenized_full["input_ids"])
    for i in range(prompt_length):
         if i < len(labels): labels[i] = -100
    tokenized_full["labels"] = labels
    return tokenized_full

print(f"\n--- Processing Dataset: {DATASET_NAME} ---")
try:
    raw_dataset = load_dataset(DATASET_NAME, split=f"train[:{NUM_SAMPLES_PER_DATASET}]")
    tokenized_dataset = raw_dataset.map(
        format_and_tokenize, remove_columns=raw_dataset.column_names
    )
    split_ds = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    final_datasets = {"train": split_ds["train"], "validation": split_ds["test"]}
    print("Dataset processed.")
except Exception as e:
    print(f"Error processing dataset: {e}")
    exit()

# --- 3. Load Model with UnslothAI ---
# Unsloth handles quantization and optimizations internally during loading.
print(f"\nLoading model ({MODEL_CHECKPOINT}) with UnslothAI...")
# Determine max_seq_length based on data preprocessing
max_seq_length = MAX_LENGTH
# Determine dtype ("auto", None, torch.float16, torch.bfloat16)
dtype = None # Autodetect
load_in_4bit = True # Enable QLoRA optimizations

try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = MODEL_CHECKPOINT,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # Add token if using gated models like Llama
    )
    print("Unsloth FastLanguageModel loaded successfully.")
     # Ensure pad token ID is set correctly after potential Unsloth modifications
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set PAD token to EOS token: {tokenizer.pad_token}")
    # Unsloth might handle model's pad_token_id internally, but check if needed
    # if model.config.pad_token_id is None: model.config.pad_token_id = tokenizer.pad_token_id

except Exception as e:
    print(f"Error loading model with UnslothAI: {e}")
    print("Ensure UnslothAI is installed correctly for your environment.")
    exit()

# --- 4. Apply PEFT (LoRA) Configuration ---
# Unsloth integrates with PEFT. Model is already prepared.
print("\nApplying PEFT LoRA adapters using Unsloth's optimized method...")
try:
    # Use FastLanguageModel.get_peft_model instead of standard get_peft_model
    model = FastLanguageModel.get_peft_model(
        model, # Pass the Unsloth model object
        r = LORA_R,
        target_modules = LORA_TARGET_MODULES,
        lora_alpha = LORA_ALPHA,
        lora_dropout = LORA_DROPOUT,
        bias = "none", # Or "all" or "lora_only"
        use_gradient_checkpointing = True, # Recommended by Unsloth
        random_state = 3407, # From Unsloth examples
        max_seq_length = max_seq_length, # Optional but good practice
        use_rslora = False,  # Rank Stable LoRA (experimental)
        loftq_config = None, # LoftQ configuration (experimental)
    )
    print("Unsloth PEFT model created successfully.")
    # Unsloth model object might not have print_trainable_parameters, but PEFT is applied.
except Exception as e:
    print(f"Error applying PEFT with Unsloth: {e}")
    exit()

# --- 5. Training Arguments ---
# Arguments are largely the same as standard Trainer
print("\nDefining Training Arguments...")
bf16_supported = is_bfloat16_supported() # Use unsloth's check

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    # Unsloth works with standard optimizers, paged optimizers often still good
    optim="paged_adamw_8bit",
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    fp16=not bf16_supported, # Use fp16 if bf16 not available
    bf16=bf16_supported, # Use bf16 if available
    # gradient_checkpointing is handled by Unsloth's get_peft_model
    push_to_hub=False,
    report_to="none"
)

# --- 6. Data Collator ---
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- 7. Initialize Trainer ---
# Use standard Hugging Face Trainer
trainer = Trainer(
    model=model, # Pass the Unsloth PEFT model
    args=training_args,
    train_dataset=final_datasets["train"],
    eval_dataset=final_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- 8. Train ---
print("\nStarting Unsloth accelerated fine-tuning...")
# Expect faster training and potentially lower memory usage compared to standard QLoRA
try:
    train_result = trainer.train()
    # Unsloth saves adapters in a compatible format
    # Use trainer.save_model() which internally calls model.save_pretrained()
    trainer.save_model(OUTPUT_DIR)
    print(f"Unsloth training finished. Adapter saved to {OUTPUT_DIR}")
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
except Exception as e:
    print(f"Error during Unsloth training: {e}")
    exit()

# --- 9. Evaluate ---
print("\nEvaluating final Unsloth PEFT model...")
try:
    eval_results = trainer.evaluate()
    print("Evaluation Results (Loss):")
    print(eval_results)
    trainer.log_metrics("eval", eval_results)
    trainer.save_metrics("eval", eval_results)
except Exception as e:
    print(f"Error during evaluation: {e}")

# --- End of Recipe ---


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-08-30 09:50:41.025774: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756547441.380821      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756547441.486828      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
Loading tokenizer for checkpoint: google/gemma-2b


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]


--- Processing Dataset: databricks/databricks-dolly-15k ---


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset processed.

Loading model (google/gemma-2b) with UnslothAI...
==((====))==  Unsloth 2025.8.10: Fast Gemma patching. Transformers: 4.56.0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth FastLanguageModel loaded successfully.

Applying PEFT LoRA adapters using Unsloth's optimized method...


Unsloth 2025.8.10 patched 18 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth PEFT model created successfully.

Defining Training Arguments...

Starting Unsloth accelerated fine-tuning...


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 450 | Num Epochs = 1 | Total steps = 15
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,2.5909,2.349695


Unsloth: Not an error, but GemmaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Unsloth training finished. Adapter saved to ./unsloth_qlora_finetune_output
***** train metrics *****
  epoch                    =        1.0
  total_flos               =  2576846GF
  train_loss               =     2.5511
  train_runtime            = 0:03:54.48
  train_samples_per_second =      1.919
  train_steps_per_second   =      0.064

Evaluating final Unsloth PEFT model...


Evaluation Results (Loss):
{'eval_loss': 2.3496947288513184, 'eval_runtime': 10.5971, 'eval_samples_per_second': 4.718, 'eval_steps_per_second': 0.377, 'epoch': 1.0}
***** eval metrics *****
  epoch                   =        1.0
  eval_loss               =     2.3497
  eval_runtime            = 0:00:10.59
  eval_samples_per_second =      4.718
  eval_steps_per_second   =      0.377


## Below recipes are not part of the book

## Recipe: Merging LoRA Adapters
### This recipe not needed

In [None]:
# --- Recipe: Baking the Adapters In (Merging LoRA Adapters) ---
# Goal: Merge trained LoRA adapter weights into the base model for deployment.
# Method: Loads base model and adapter, then uses peft's merge_and_unload().
# Libraries: transformers, peft, torch, bitsandbytes (if merging into quantized)
# Note: Ensure the adapter was trained for the specified base model.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel # Import PeftModel for loading adapters

# --- Configuration ---
# Base model checkpoint MUST match the one used during PEFT training
BASE_MODEL_CHECKPOINT = "google/gemma-2b"
# Path to the saved PEFT adapter weights (e.g., from LoRA, QLoRA, or Unsloth recipe)
ADAPTER_PATH = "./qlora_finetune_output" # Example: Use QLoRA output
# Path to save the merged model
MERGED_MODEL_OUTPUT_DIR = "./merged_qlora_model"

# --- Options for Merging ---
# 1. Merge into original precision model (requires loading base model in fp16/bf16/fp32)
#    - Result: Standard HF model, no PEFT needed at inference. Larger size.
# 2. Merge into quantized model (e.g., 4-bit QLoRA merged into 4-bit base)
#    - Result: Quantized model with adapter baked in. Still requires bitsandbytes at inference. Smaller size.
MERGE_INTO_QUANTIZED = True # Set to False to merge into original precision

print(f"Base Model: {BASE_MODEL_CHECKPOINT}")
print(f"Adapter Path: {ADAPTER_PATH}")
print(f"Merge into Quantized: {MERGE_INTO_QUANTIZED}")

# --- 1. Load Base Model ---
print("\nLoading base model...")
try:
    if MERGE_INTO_QUANTIZED:
        print("Loading base model in 4-bit for merging...")
        compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_CHECKPOINT,
            quantization_config=bnb_config,
            device_map={'':torch.cuda.current_device()}, # Load on GPU if possible
            trust_remote_code=True # If needed for model
        )
        print("Base model loaded in 4-bit.")
    else:
        print("Loading base model in native precision (fp16/bf16)...")
        compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_CHECKPOINT,
            torch_dtype=compute_dtype,
            device_map={'':torch.cuda.current_device()},
            trust_remote_code=True # If needed
        )
        print(f"Base model loaded in {compute_dtype}.")

    # Load tokenizer associated with the base model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT)
    if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
    if base_model.config.pad_token_id is None: base_model.config.pad_token_id = tokenizer.pad_token_id

except Exception as e:
    print(f"Error loading base model: {e}")
    exit()

# --- 2. Load PEFT Adapter ---
print(f"\nLoading PEFT adapter from: {ADAPTER_PATH}")
try:
    # Load the PEFT model - this attaches the adapter weights to the base model
    # Ensure the base_model is already loaded on the correct device(s) via device_map
    model_with_adapter = PeftModel.from_pretrained(
        base_model, # Pass the loaded base model
        ADAPTER_PATH,
        is_trainable=False # Load for inference/merging
    )
    print("PEFT adapter loaded onto the base model.")
except Exception as e:
    print(f"Error loading PEFT adapter: {e}")
    print("Ensure the adapter path is correct and compatible with the base model.")
    exit()

# --- 3. Merge Adapter Weights ---
print("\nMerging adapter weights into the base model...")
try:
    # merge_and_unload() merges weights and removes PEFT layers/hooks
    # If merging into quantized, the result is still quantized but with updated weights
    merged_model = model_with_adapter.merge_and_unload()
    print("Adapter merged successfully.")
    # merged_model is now a standard Hugging Face model (potentially quantized)
    # It no longer requires the `peft` library for inference.
    # If MERGE_INTO_QUANTIZED=True, it still requires `bitsandbytes`.
except Exception as e:
    print(f"Error merging adapter: {e}")
    exit()

# --- 4. Save Merged Model ---
print(f"\nSaving merged model to: {MERGED_MODEL_OUTPUT_DIR}")
try:
    merged_model.save_pretrained(MERGED_MODEL_OUTPUT_DIR)
    tokenizer.save_pretrained(MERGED_MODEL_OUTPUT_DIR) # Save tokenizer with merged model
    print("Merged model and tokenizer saved.")
except Exception as e:
    print(f"Error saving merged model: {e}")

# --- 5. Test Merged Model (Optional) ---
print("\nTesting merged model (optional)...")
try:
    prompt = "Instruction: Say hello.\nResponse:" # Example prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(merged_model.device)
    outputs = merged_model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Merged Model Response: {response}")
except Exception as e:
    print(f"Error during merged model inference test: {e}")


# --- End of Recipe ---


Base Model: google/gemma-2b
Adapter Path: ./qlora_finetune_output
Merge into Quantized: True

Loading base model...
Loading base model in 4-bit for merging...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded in 4-bit.

Loading PEFT adapter from: ./qlora_finetune_output
Error loading PEFT adapter: Can't find 'adapter_config.json' at './qlora_finetune_output'
Ensure the adapter path is correct and compatible with the base model.

Merging adapter weights into the base model...
Error merging adapter: name 'model_with_adapter' is not defined

Saving merged model to: ./merged_qlora_model
Error saving merged model: name 'merged_model' is not defined

Testing merged model (optional)...
Error during merged model inference test: name 'merged_model' is not defined


## Recipe: LoRA vs. Full Fine-Tuning Showdown

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==20

In [None]:
# --- Recipe: LoRA vs. Full Fine-Tuning Showdown ---
# Goal: Compare performance metrics of a LoRA/QLoRA model vs. a fully fine-tuned model.
# Method: Loads saved models from previous recipes (Full FT and LoRA/QLoRA)
#         and evaluates them on the same test set.
# Note: Assumes you have run and saved models from:
#       - A Full FT recipe (e.g., IFT on Gemma-2B, Ch 7 equivalent but full FT)
#       - A LoRA/QLoRA recipe (e.g., ch8_recipe_qlora_ft)
#       Both MUST use the same base model and task/dataset for valid comparison.

import torch
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM, # Or specific task head like SequenceClassification
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding, # Or specific collator
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import PeftModel # Needed to load LoRA adapter
import os
import math

# --- Configuration ---
# --- !!! UPDATE THESE PATHS !!! ---
# Path to the saved FULLY Fine-Tuned model (e.g., from Ch 6/7 equivalent)
FULL_FT_MODEL_PATH = "./instruction_finetune_output_FULL_FT" # NEEDS TO BE CREATED SEPARATELY
# Path to the saved PEFT (LoRA/QLoRA) adapter
PEFT_ADAPTER_PATH = "./qlora_finetune_output" # From ch8_recipe_qlora_ft
# Base model checkpoint - MUST be the same for both models being compared
BASE_MODEL_CHECKPOINT = "google/gemma-2b"
# Dataset and task details - MUST be the same used for training both models
DATASET_NAME = "databricks/databricks-dolly-15k"
TASK_TYPE = "IFT" # Indicate task ('IFT', 'Classification', 'NER', etc.)
NUM_SAMPLES_FOR_EVAL = 200 # Use a subset of test data for quicker evaluation demo
BATCH_SIZE = 4 # Evaluation batch size (adjust based on memory)

# --- Check if models exist ---
if not os.path.exists(FULL_FT_MODEL_PATH):
    print(f"Error: Fully Fine-Tuned model not found at {FULL_FT_MODEL_PATH}")
    print("Please run a full fine-tuning script first and update the path.")
    exit()
if not os.path.exists(PEFT_ADAPTER_PATH):
    print(f"Error: PEFT adapter not found at {PEFT_ADAPTER_PATH}")
    print("Please run the LoRA/QLoRA fine-tuning recipe first and update the path.")
    exit()

# --- 1. Load Test Data and Tokenizer ---
print(f"Loading dataset: {DATASET_NAME}")
try:
    # Load the split used for testing during training runs
    # Assuming a validation split was used, or load 'test' if available/appropriate
    # For Dolly, there's no predefined test split, so we sample from train
    raw_dataset_full = load_dataset(DATASET_NAME, split="train")
    # Create a deterministic test split from the full data
    # IMPORTANT: Ensure this split was NOT used during training of EITHER model
    test_dataset_raw = raw_dataset_full.select(range(NUM_SAMPLES_PER_EVAL)) # Simple subset for demo
    print(f"Using {len(test_dataset_raw)} samples for evaluation.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

print(f"\nLoading tokenizer: {BASE_MODEL_CHECKPOINT}")
try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT)
    if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    exit()

# --- 2. Preprocess Test Data ---
# Use the *exact same* preprocessing as during training
# Reusing IFT preprocessing from previous recipes
MAX_LENGTH = 512 # Must match training
PROMPT_WITH_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)
PROMPT_NO_INPUT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n"
)
def format_and_tokenize_ift(example):
    instruction = example.get("instruction", "")
    input_context = example.get("context", "")
    output = example.get("response", "")
    if input_context and input_context.strip():
        prompt_start = PROMPT_WITH_INPUT_TEMPLATE.format(instruction=instruction, input=input_context)
    else:
        prompt_start = PROMPT_NO_INPUT_TEMPLATE.format(instruction=instruction)
    # For evaluation, we often don't need the full text + EOS, just the prompt part
    # However, for loss calculation (perplexity), we need labels. Tokenize full text.
    full_text = prompt_start + output + tokenizer.eos_token
    tokenized_full = tokenizer(full_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tokenized_prompt = tokenizer(prompt_start, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    prompt_length = len(tokenized_prompt["input_ids"])
    labels = copy.deepcopy(tokenized_full["input_ids"])
    for i in range(prompt_length):
         if i < len(labels): labels[i] = -100
    tokenized_full["labels"] = labels
    return tokenized_full

print("\nTokenizing evaluation dataset...")
try:
    if TASK_TYPE == "IFT":
         tokenized_test_dataset = test_dataset_raw.map(
             format_and_tokenize_ift, remove_columns=test_dataset_raw.column_names
         )
    # Add elif blocks here for preprocessing specific to other tasks (Classification, NER)
    # elif TASK_TYPE == "Classification": ...
    else:
         raise ValueError(f"Preprocessing for TASK_TYPE='{TASK_TYPE}' not implemented in this recipe.")
    tokenized_test_dataset.set_format("torch")
    print("Tokenization complete.")
except Exception as e:
    print(f"Error during tokenization: {e}")
    exit()

# --- 3. Setup Collator, Metrics, Eval Args ---
if TASK_TYPE == "IFT":
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    metric_to_report = "eval_loss" # Use loss for IFT/Causal LM
    metric_for_best = "loss"
    compute_metrics_fn = None # Trainer calculates loss by default
# Add elif blocks for other task collators and metrics
# elif TASK_TYPE == "Classification": ...
else:
     raise ValueError(f"Collator/Metrics for TASK_TYPE='{TASK_TYPE}' not implemented.")

# Minimal args for evaluation
eval_args = TrainingArguments(
    output_dir="./eval_comparison_output", # Temp dir
    per_device_eval_batch_size=BATCH_SIZE,
    do_train=False, do_eval=True, report_to="none",
    fp16=torch.cuda.is_available() and not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)

# --- 4. Evaluate FULLY Fine-Tuned Model ---
print(f"\n--- Evaluating FULLY Fine-Tuned Model ({FULL_FT_MODEL_PATH}) ---")
full_ft_results = {"eval_loss": "Error"} # Default
try:
    # Load the saved full model
    # Adjust dtype/device_map as needed based on how it was saved/resource limits
    compute_dtype = torch.bfloat16 if eval_args.bf16 else (torch.float16 if eval_args.fp16 else torch.float32)
    full_ft_model = AutoModelForCausalLM.from_pretrained(
        FULL_FT_MODEL_PATH,
        torch_dtype=compute_dtype,
        device_map={'':torch.cuda.current_device()}
    )
    print("Fully Fine-Tuned model loaded.")

    full_trainer = Trainer(
        model=full_ft_model, args=eval_args, eval_dataset=tokenized_test_dataset,
        tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics_fn
    )
    print("Running evaluation on Fully Fine-Tuned model...")
    full_ft_results = full_trainer.evaluate()
    print("\nFully Fine-Tuned Model Evaluation Results:")
    print(full_ft_results)

except Exception as e:
    print(f"Error during Fully Fine-Tuned model evaluation: {e}")
    if 'full_ft_model' in locals(): del full_ft_model
    torch.cuda.empty_cache()


# --- 5. Evaluate PEFT (LoRA/QLoRA) Model ---
print(f"\n--- Evaluating PEFT Model ({PEFT_ADAPTER_PATH}) ---")
peft_results = {"eval_loss": "Error"} # Default
try:
    # Load base model (quantized if PEFT was QLoRA)
    is_qlora = "qlora" in PEFT_ADAPTER_PATH.lower() # Simple check based on path name
    bnb_config = None
    if is_qlora:
        print("Loading base model in 4-bit for QLoRA evaluation...")
        bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                        bnb_4bit_compute_dtype=compute_dtype)

    base_model_peft = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_CHECKPOINT,
        quantization_config=bnb_config, # None if not QLoRA
        torch_dtype=compute_dtype if not is_qlora else None, # Load native if not quantized
        device_map={'':torch.cuda.current_device()}
    )
    if base_model_peft.config.pad_token_id is None: base_model_peft.config.pad_token_id = tokenizer.pad_token_id
    print("Base model loaded.")

    # Load the PEFT adapter onto the base model
    peft_model = PeftModel.from_pretrained(base_model_peft, PEFT_ADAPTER_PATH)
    print("PEFT adapter loaded.")

    # Note: For evaluation, merging might be slightly faster if memory allows
    # peft_model = peft_model.merge_and_unload()
    # print("PEFT adapter merged for evaluation.")

    peft_trainer = Trainer(
        model=peft_model, args=eval_args, eval_dataset=tokenized_test_dataset,
        tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics_fn
    )
    print("Running evaluation on PEFT model...")
    peft_results = peft_trainer.evaluate()
    print("\nPEFT Model Evaluation Results:")
    print(peft_results)

except Exception as e:
    print(f"Error during PEFT model evaluation: {e}")
    if 'base_model_peft' in locals(): del base_model_peft
    if 'peft_model' in locals(): del peft_model
    torch.cuda.empty_cache()


# --- 6. Comparison ---
print("\n--- Performance Comparison ---")
# Compare based on the primary metric (loss for IFT)
full_metric = full_ft_results.get(metric_to_report, "N/A")
peft_metric = peft_results.get(metric_to_report, "N/A")

print(f"Task Type: {TASK_TYPE}")
print(f"Metric Compared: {metric_to_report}")
print(f"Fully Fine-Tuned Model: {full_metric:.4f}" if isinstance(full_metric, float) else full_metric)
print(f"PEFT (LoRA/QLoRA) Model: {peft_metric:.4f}" if isinstance(peft_metric, float) else peft_metric)

print("\nObservations:")
print("- Compare the metric values. Lower loss is better for IFT.")
print("- Full FT *might* achieve slightly better peak performance if resources allow.")
print("- PEFT achieves comparable or slightly lower performance with drastically fewer trainable parameters and lower resource usage during training.")
print("- Consider the trade-off: peak performance vs. training/deployment efficiency.")
print("\nResource Usage Comparison (Typical Expectations):")
print("- Training VRAM: Full FT >> LoRA > QLoRA / Unsloth")
print("- Training Time: Full FT > LoRA / QLoRA (Unsloth often fastest)")
print("- Saved Artifact Size: Full FT (Full Model) >> LoRA / QLoRA (Adapter only)")

# --- End of Recipe ---


2025-08-30 10:27:26.627583: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756549646.806144      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756549646.861172      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Error: Fully Fine-Tuned model not found at ./instruction_finetune_output_FULL_FT
Please run a full fine-tuning script first and update the path.
Error: PEFT adapter not found at ./qlora_finetune_output
Please run the LoRA/QLoRA fine-tuning recipe first and update the path.
Loading dataset: databricks/databricks-dolly-15k


README.md: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Error loading dataset: name 'NUM_SAMPLES_PER_EVAL' is not defined

Loading tokenizer: google/gemma-2b


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]


Tokenizing evaluation dataset...
Error during tokenization: name 'test_dataset_raw' is not defined

--- Evaluating FULLY Fine-Tuned Model (./instruction_finetune_output_FULL_FT) ---
Error during Fully Fine-Tuned model evaluation: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './instruction_finetune_output_FULL_FT'.

--- Evaluating PEFT Model (./qlora_finetune_output) ---
Loading base model in 4-bit for QLoRA evaluation...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Base model loaded.
Error during PEFT model evaluation: Can't find 'adapter_config.json' at './qlora_finetune_output'

--- Performance Comparison ---
Task Type: IFT
Metric Compared: eval_loss
Error
Error

Observations:
- Compare the metric values. Lower loss is better for IFT.
- Full FT *might* achieve slightly better peak performance if resources allow.
- PEFT achieves comparable or slightly lower performance with drastically fewer trainable parameters and lower resource usage during training.
- Consider the trade-off: peak performance vs. training/deployment efficiency.

Resource Usage Comparison (Typical Expectations):
- Training VRAM: Full FT >> LoRA > QLoRA / Unsloth
- Training Time: Full FT > LoRA / QLoRA (Unsloth often fastest)
- Saved Artifact Size: Full FT (Full Model) >> LoRA / QLoRA (Adapter only)
