# QLoRA Fine-Tuning on Google Colab (Phi-2 + DialogSum or your custom CSV)

This notebook fine-tunes a **2.7B**-parameter base model (**microsoft/phi-2**) using **QLoRA** (4-bit quantization + LoRA adapters) with Hugging Face **Transformers**, **PEFT**, **bitsandbytes**, **Accelerate**, **TRL**, and **datasets**.

You can:
1. Use the sample dataset (**DialogSum**), or
2. Upload your **own CSV** with columns like `prompt,response` *or* `instruction,input,output`.

**Before running:**
- In Colab, go to **Runtime → Change runtime type → GPU**.
- Make sure you have a Hugging Face account and have **accepted the model license** for `microsoft/phi-2` on the Hub.
- Keep your HF access token handy.


In [1]:
import torch

# Ask CUDA once and reuse the answer for both report lines.
cuda_ready = torch.cuda.is_available()
print("GPU available:", cuda_ready)
# Device 0 is the only GPU on a standard Colab runtime; report "None" on CPU-only runtimes.
device_label = torch.cuda.get_device_name(0) if cuda_ready else "None"
print("GPU name:", device_label)


GPU available: True
GPU name: Tesla T4


In [2]:
%%bash
pip -q install -U bitsandbytes==0.43.3 transformers==4.43.3 peft==0.11.1 accelerate==0.33.0 datasets==2.19.1 einops==0.8.0 scipy evaluate rouge_score trl==0.9.6 sentencepiece huggingface_hub

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.7/43.7 kB 1.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 3.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.5/137.5 MB 8.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.4/9.4 MB 131.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 251.6/251.6 kB 21.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 315.1/315.1 kB 27.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 542.0/542.0 kB 38.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.2/43.2 kB 3.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 245.8/245.8 kB 22.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 kB 7.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 172.0/172.0 kB 16.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.0/18.0 MB 70.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.3.1 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.


In [None]:
import os, time, math, random
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling,
    GenerationConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import login
from functools import partial
import pandas as pd
import numpy as np

# Record the installed PyTorch version in the notebook output for reproducibility/debugging.
print('Torch:', torch.__version__)

In [None]:
# 🔐 Log in to Hugging Face so the notebook can download gated models (like microsoft/phi-2).
# The token is entered at the interactive prompt, so it is never stored in the notebook source.
login()  # Paste your HF token when prompted

In [None]:
# ==================== CONFIG ====================
# Everything a reader is expected to tune lives in this cell.
MODEL_ID = 'microsoft/phi-2'  # base model

# Option A: use the DialogSum dataset from HF
USE_HF_DATASET = True
HF_DATASET_NAME = 'neil-code/dialogsum-test'  # small sample version for quick runs

# Option B: set USE_HF_DATASET = False to upload your own CSV in Colab
# (Runtime ▶ Run all will then pause at the upload prompt in the data-loading cell).
# Expected columns for custom CSV:
#   - Either: prompt,response
#   - Or: instruction,input,output

# Training hyperparameters (safe defaults for a T4)
TRAIN_STEPS = 400  # passed to the trainer as max_steps; increase later (e.g., 1000+) for better results
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 4  # gradient accumulation steps; effective per-device batch = PER_DEVICE_BATCH * GRAD_ACCUM
LR = 2e-4
# Generation setting (not a training hyperparameter): default cap on new tokens in generate_text.
MAX_NEW_TOKENS = 128
SEED = 42
# Timestamped output directory: note that re-running this cell changes the path.
OUTPUT_DIR = f"./peft-qlora-{int(time.time())}"
# Presumably switches off Weights & Biases logging; the trainer is also configured with report_to='none'.
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
# Seed every RNG used in this notebook so the CSV split and sampling are reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if USE_HF_DATASET:
    # Option A: pull the dataset from the Hugging Face Hub.
    ds = load_dataset(HF_DATASET_NAME)
    # Expect: columns: dialogue, summary, topic, id
else:
    # Option B: upload a CSV via the Colab UI.
    try:
        from google.colab import files  # type: ignore
        uploaded = files.upload()
        csv_path = list(uploaded.keys())[0]
    except Exception as e:
        # Chain the original exception so the real cause (not running in Colab,
        # nothing uploaded, ...) stays visible in the traceback.
        raise RuntimeError('Upload your CSV first or set USE_HF_DATASET=True') from e
    df = pd.read_csv(csv_path)
    # Normalize to a common schema similar to DialogSum: columns 'dialogue' and 'summary'.
    if set(['prompt','response']).issubset(df.columns):
        df = df.rename(columns={'prompt':'dialogue','response':'summary'})
    elif set(['instruction','input','output']).issubset(df.columns):
        # Join instruction + input into one 'dialogue' field
        df['dialogue'] = df['instruction'].fillna('') + '\n' + df['input'].fillna('')
        df['summary'] = df['output']
    else:
        raise ValueError('CSV must have either (prompt,response) or (instruction,input,output) columns.')
    df = df[['dialogue','summary']].dropna().reset_index(drop=True)
    # 90/10 split, then 50/50 split of the 10% for val/test
    train_df = df.sample(frac=0.9, random_state=SEED)
    rest_df = df.drop(train_df.index)
    val_df = rest_df.sample(frac=0.5, random_state=SEED)
    test_df = rest_df.drop(val_df.index)
    # Reset the shuffled pandas indices before conversion; otherwise from_pandas stores
    # them as an extra '__index_level_0__' column in every split.
    ds = DatasetDict({
        'train': Dataset.from_pandas(train_df.reset_index(drop=True)),
        'validation': Dataset.from_pandas(val_df.reset_index(drop=True)),
        'test': Dataset.from_pandas(test_df.reset_index(drop=True))
    })

print(ds)

# Summary of below cell:
Choose compute precision → Uses bfloat16 (on GPUs that support it) or float16 (fallback).

Setup quantization config with BitsAndBytesConfig:

Load model in 4-bit precision (saves VRAM).

Use special quantization type "nf4".

No double quantization (keeps things simple).

Load model (AutoModelForCausalLM) with quantization and device_map='auto' so HuggingFace automatically spreads model layers across available GPU(s).

Load tokenizer → matches the model and sets padding.

If tokenizer doesn’t have a padding token, it uses the end-of-sequence token (eos).

Final print confirms successful 4-bit model load.

In short: This cell loads the base language model (phi-2) in memory-efficient 4-bit precision using bitsandbytes, and prepares the tokenizer for text processing.


In [None]:
# bfloat16 is natively supported only on GPUs with compute capability 8.0+ (Ampere or newer).
# Colab's default Tesla T4 is older than that, so fall back to float16 there
# (and when no GPU is present at all).
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if supports_bf16 else torch.float16

# 4-bit NF4 quantization; matmuls are computed in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
device_map = 'auto'
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, padding_side='left', use_fast=False)
# Reuse EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print('Loaded model in 4-bit with bitsandbytes ✅')

# Summary:

generate_text(...)

Puts the model in evaluation mode (not training).

Tokenizes the prompt into tensors.

Creates a generation configuration:

do_sample=True → random sampling instead of always picking most likely token.

temperature=0.7 → balances creativity vs determinism.

top_p=0.9 → nucleus sampling (choose from top 90% probability mass).

Stops at EOS token, pads with EOS if needed.

Runs model generation (model.generate).

Decodes tokens back to readable text.
 Basically: Runs the model to generate natural language output for a given input text.

format_dialogsum_prompt(...)

Prepares a standardized prompt for the DialogSum dataset.

Takes raw conversation text and wraps it into an instruction style prompt:

```text
Instruct: Summarize the following conversation.
<dialogue here>
Output:
```


This way, the model clearly understands the task.

In short: These helper functions allow you to (1) generate text with controlled randomness, and (2) format conversations into a prompt that the model can summarize.

In [None]:
def generate_text(model, tokenizer, prompt, max_new_tokens=MAX_NEW_TOKENS, temperature=0.7, top_p=0.9):
    """Sample a completion for ``prompt`` from ``model``.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and decode the output.
        prompt: Text to condition on.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids with special
        tokens skipped. Callers in this notebook split the text on ``'Output:\\n'``,
        which suggests the decoded text still contains the prompt.

    Note:
        Switches ``model`` to evaluation mode as a side effect.
    """
    model.eval()
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
    # EOS doubles as the padding id during generation.
    sampling_options = {
        'do_sample': True,
        'max_new_tokens': max_new_tokens,
        'temperature': temperature,
        'top_p': top_p,
        'eos_token_id': tokenizer.eos_token_id,
        'pad_token_id': tokenizer.eos_token_id,
    }
    generation_config = GenerationConfig(**sampling_options)
    with torch.no_grad():
        generated_ids = model.generate(**encoded, generation_config=generation_config)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded

def format_dialogsum_prompt(dialogue_text):
    """Wrap a raw dialogue in the instruction-style prompt used for inference.

    The result has three newline-separated parts (the instruction line, the
    dialogue, and an ``Output:`` marker) followed by a trailing newline.
    """
    header = "Instruct: Summarize the following conversation."
    return "\n".join([header, f"{dialogue_text}", "Output:", ""])

# This cell:

Picks one example dialogue + its human-written summary.

Formats the conversation into a prompt.

Runs the base model (zero-shot, no fine-tuning).

Prints:

The input prompt

The human reference summary

The model’s generated summary

In [None]:
# Take the preview example from the test split when present, otherwise from validation.
idx = 0
preview_split = 'test' if 'test' in ds else 'validation'
preview_row = ds[preview_split][idx]
sample_dialogue = preview_row['dialogue']
sample_summary = preview_row['summary']

# Zero-shot generation with the (not yet fine-tuned) base model.
prompt = format_dialogsum_prompt(sample_dialogue)
base_out = generate_text(base_model, tokenizer, prompt)[0]

rule = '-'*80
print(rule)
print('INPUT PROMPT:\n', prompt)
print(rule)
print('BASELINE HUMAN SUMMARY:\n', sample_summary)
print(rule)
# Show only what follows the last 'Output:' marker in the decoded text.
base_summary = base_out.split('Output:\n')[-1]
print('MODEL (zero-shot) OUTPUT:\n', base_summary)

# This cell:

Builds instruction-style prompts (with special markers).

Figures out model’s maximum sequence length.

Tokenizes those formatted prompts so the model can be trained on them.

Purpose: To transform raw dataset (dialogue + summary) → into structured, tokenized input/output pairs for supervised fine-tuning.

In [None]:
# Building blocks of the supervised fine-tuning prompt template.
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
INSTRUCTION_KEY = '### Instruct: Summarize the below conversation.'
RESPONSE_KEY = '### Output:'
END_KEY = '### End'

def create_prompt_formats(sample):
    """Attach the full training prompt to ``sample`` under the ``'text'`` key.

    The prompt consists of the intro blurb (preceded by a newline), the
    instruction key, the dialogue (omitted when missing or empty), the response
    key followed by the summary, and the end key, all joined by blank lines.

    Args:
        sample: Mapping with a ``'summary'`` entry and an optional ``'dialogue'`` entry.

    Returns:
        The same ``sample`` object, mutated in place to include ``'text'``.
    """
    dialogue = sample.get('dialogue')
    sections = ["\n" + INTRO_BLURB, INSTRUCTION_KEY]
    if dialogue:
        sections.append(f"{dialogue}")
    sections.append(f"{RESPONSE_KEY}\n{sample['summary']}")
    sections.append(END_KEY)
    sample['text'] = '\n\n'.join(sections)
    return sample

def get_max_length(model, default=2048):
    """Return the maximum sequence length advertised by ``model.config``.

    The config is probed for ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` (in that order); the first truthy value wins.

    Args:
        model: Object with a ``config`` attribute.
        default: Length to use when none of the attributes is set (or all are
            ``None``/0). Defaults to 2048.

    Returns:
        int: The detected maximum length, or ``default``.
    """
    config = model.config
    for attr in ('n_positions', 'max_position_embeddings', 'seq_length'):
        value = getattr(config, attr, None)
        if value:
            return int(value)
    # Nothing usable on the config. Return the default directly instead of re-reading
    # max_position_embeddings, which would raise on an attribute explicitly set to None.
    return default

def tokenize_batch(batch, tokenizer, max_length):
    """Tokenize the ``'text'`` entries of ``batch``, truncating to ``max_length``.

    Args:
        batch: Mapping with a ``'text'`` entry (a list of strings when used with a batched map).
        tokenizer: Callable tokenizer accepting ``max_length`` and ``truncation``.
        max_length: Maximum number of tokens to keep per example.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

# This code:

Gets the model’s max sequence length.

Loops over dataset splits.

Formats each sample into instruction-response style text.

Tokenizes into IDs.

Removes too-long samples.

Saves everything into processed (a fully training-ready dataset).

End result:
processed = a clean, tokenized dataset, ready to feed into HuggingFace’s Trainer.

In [None]:
max_len = get_max_length(base_model)
print('Using max_length =', max_len)

# Bind the tokenizer and length once; the same callable is reused for every split.
tokenize_fn = partial(tokenize_batch, tokenizer=tokenizer, max_length=max_len)

processed = DatasetDict()
for split, raw_split in ds.items():
    # Drop every original column after tokenization; only 'text' plus the tokenizer outputs remain.
    drop_cols = [c for c in raw_split.column_names if c != 'text']
    formatted = raw_split.map(create_prompt_formats)
    tokenized = formatted.map(tokenize_fn, batched=True, remove_columns=drop_cols)
    # Keep only examples strictly shorter than max_len (this discards anything that hit the truncation limit).
    processed[split] = tokenized.filter(lambda s: len(s['input_ids']) < max_len)
    print(split, processed[split])

# This cell:

Adjusts the 4-bit quantized model so it’s safe for fine-tuning (prepare_model_for_kbit_training).

Turns on gradient checkpointing to save memory.

Prepares the base model for QLoRA training (parameter-efficient fine-tuning on limited GPU resources).

In [None]:
# Run PEFT's preparation helper on the 4-bit model and rebind base_model to the result.
# Because the name is reassigned, re-running this cell prepares an already-prepared model.
base_model = prepare_model_for_kbit_training(base_model)
# Turn on gradient checkpointing (recompute activations in the backward pass to save memory).
# NOTE(review): the PEFT helper above may already enable this by default — confirm for the pinned version.
base_model.gradient_checkpointing_enable()
print('Model prepared for QLoRA training ✅')

# This cell:

Defines the LoRA adapter configuration (where and how they’re added).

Inserts LoRA adapters into the base quantized model.

Prints how many parameters will actually be trained (usually <1% of the full model).

In [None]:
# LoRA adapter hyperparameters for causal language modelling.
lora_cfg = LoraConfig(
    r=32,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling numerator; equal to r here
    target_modules=['q_proj','k_proj','v_proj','dense'],  # works well for Phi-2
    bias='none',  # leave bias terms untouched
    lora_dropout=0.05,
    task_type='CAUSAL_LM',
)
# Wrap the prepared base model with the adapters described by lora_cfg.
peft_model = get_peft_model(base_model, lora_cfg)

def print_trainable_parameters(m):
    """Print how many of ``m``'s parameters require gradients.

    Args:
        m: Object exposing ``named_parameters()`` whose parameters provide
            ``numel()`` and ``requires_grad``.

    Prints a single line with the trainable count, the total count and the
    trainable percentage (four decimals).
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in m.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(f'Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.4f}%)')

# Report what fraction of the adapter-wrapped model's parameters will receive gradients.
print_trainable_parameters(peft_model)

# This cell:

Configures how training will run (steps, optimizer, logging, evaluation).

Prepares a data collator for batching tokenized text.

Creates a Hugging Face Trainer with model + datasets.

Disables caching so gradient checkpointing works.

After this cell runs, the model is ready to start training by calling `trainer.train()`.

In [None]:
# Only schedule periodic evaluation when a non-empty validation split exists: requesting
# step-wise evaluation without an eval dataset makes the training run fail.
eval_dataset = processed.get('validation', None)
has_eval = eval_dataset is not None and len(eval_dataset) > 0
if not has_eval:
    eval_dataset = None

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    max_steps=TRAIN_STEPS,
    warmup_steps=1,
    logging_steps=25,
    save_steps=100,
    evaluation_strategy='steps' if has_eval else 'no',
    eval_steps=100 if has_eval else None,
    optim='paged_adamw_8bit',
    group_by_length=True,
    report_to='none',
    gradient_checkpointing=True,
    seed=SEED,  # keep the trainer's RNG in sync with the notebook-wide seed
)
# Causal-LM collation (mlm=False): batches are padded and labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=processed['train'],
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
# The generation KV cache is switched off for training with gradient checkpointing.
peft_model.config.use_cache = False
print('Trainer ready ✅')

In [None]:
# Run the fine-tuning loop configured on `trainer` (long-running cell).
trainer.train()

# This cell:

Creates a save directory for the LoRA adapter.

Saves only the LoRA-trained weights (tiny, efficient).

Saves the tokenizer so the model can be used later.

Prints the save path for confirmation.

In [None]:
# The adapter weights go into an 'adapter' sub-directory of the run's output directory.
save_dir = os.path.join(OUTPUT_DIR, 'adapter')
peft_model.save_pretrained(save_dir)
# Note: the tokenizer is written to OUTPUT_DIR itself, not to save_dir.
tokenizer.save_pretrained(OUTPUT_DIR)
print('Saved LoRA adapter to:', save_dir)

# This cell:

Reloads the original pretrained model with 4-bit quantization.

Loads and attaches the fine-tuned LoRA adapter weights.

Produces a final model (ft_model) ready for inference.

In [None]:
# Load a fresh copy of the base model with the same 4-bit quantization config as before.
reload_base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,
)
from peft import PeftModel
# Attach the saved adapter from save_dir in inference-only mode (is_trainable=False).
# NOTE(review): PeftModel.from_pretrained appears to inject the adapter layers into
# reload_base in place — verify before treating reload_base as an adapter-free baseline.
ft_model = PeftModel.from_pretrained(reload_base, save_dir, is_trainable=False)
print('Adapter loaded for inference ✅')

# This cell tests your fine-tuned LoRA adapter on a sample dialogue.

You can compare:

The raw input (prompt)

The human-written summary (sample_summary)

The model-generated summary (ft_out)

In [None]:
# Generate with the adapter-equipped model on the same example used for the zero-shot preview.
prompt = format_dialogsum_prompt(sample_dialogue)
ft_out = generate_text(ft_model, tokenizer, prompt)[0]

rule = '-'*80
print(rule)
print('INPUT PROMPT:\n', prompt)
print(rule)
print('BASELINE HUMAN SUMMARY:\n', sample_summary)
print(rule)
# Keep the text after the first 'Output:' marker and cut at the first '###' marker.
after_marker = ft_out.split('Output:\n')[1]
ft_summary = after_marker.split('###')[0]
print('QLoRA (fine-tuned) OUTPUT:\n', ft_summary)

# This cell evaluates how well the baseline and fine-tuned models summarize dialogues compared to humans.

ROUGE gives a numeric score; higher values mean closer to human summaries.

This is how you measure the effect of QLoRA fine-tuning.

In [None]:
import evaluate

rouge = evaluate.load('rouge')

# Evaluate on the raw (unfiltered) split so that prompts and references stay index-aligned.
eval_split = ds['test'] if 'test' in ds else ds['validation']
N = min(10, len(eval_split))
human_refs = eval_split[:N]['summary']


def _extract_summary(generated):
    """Return the text after the last 'Output:' marker, cut at the first '###' marker.

    The fine-tuning template ends each example with '### End', so the tuned model tends
    to emit that marker; trimming it (for both models alike) keeps the comparison fair.
    """
    return generated.split('Output:\n')[-1].split('###')[0].strip()


# generate_text samples, so seed the RNG to make the reported scores reproducible.
torch.manual_seed(SEED)

orig_summaries, ft_summaries = [], []
for i in range(N):
    raw_dialogue = eval_split[i]['dialogue']
    p = format_dialogsum_prompt(raw_dialogue)
    # Loading the adapter injects LoRA layers into the wrapped base model in place, so that
    # model is no longer a clean baseline. Switch the adapter off temporarily to get genuine
    # base-model outputs instead.
    with ft_model.disable_adapter():
        o = generate_text(ft_model, tokenizer, p)[0]
    f = generate_text(ft_model, tokenizer, p)[0]
    orig_summaries.append(_extract_summary(o))
    ft_summaries.append(_extract_summary(f))

orig_scores = rouge.compute(predictions=orig_summaries, references=human_refs, use_stemmer=True)
ft_scores = rouge.compute(predictions=ft_summaries, references=human_refs, use_stemmer=True)
print('\nOriginal model ROUGE:', orig_scores)
print('Fine-tuned model ROUGE:', ft_scores)