In [None]:
!pip install -q "transformers>=4.35.0" accelerate bitsandbytes peft trl datasets sentencepiece evaluate rouge_score bert_score

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# =====================================
# Cell 2 — Imports & setup
# =====================================
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from transformers import logging
logging.set_verbosity_error()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [None]:
# =====================================
# Cell 3 — Define model & tokenizer
# =====================================
MODEL_NAME = "mistralai/Mistral-7B-v0.3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# =====================================
# Cell 4 — Prepare model for LoRA training
# =====================================
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


trainable params: 54,525,952 || all params: 7,302,549,504 || trainable%: 0.7467


In [None]:
# =====================================
# Cell 5 — Load and preprocess dataset
# =====================================
dataset = load_dataset("coqa")

def format_example(example):
    # convert CoQA format -> instruction + answer text
    q = example["questions"][-1]
    a = example["answers"]["input_text"][-1] # Corrected from 'text' to 'input_text'
    return {"text": f"Question: {q}\nAnswer: {a}"}

dataset = dataset.map(format_example)

tokenized_dataset = dataset.map(
    lambda e: tokenizer(e["text"], truncation=True, padding="max_length", max_length=512),
    batched=True,
)

print(tokenized_dataset)

Map:   0%|          | 0/7199 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7199 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'story', 'questions', 'answers', 'text', 'input_ids', 'attention_mask'],
        num_rows: 7199
    })
    validation: Dataset({
        features: ['source', 'story', 'questions', 'answers', 'text', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})


In [None]:
# =====================================
# Cell 6 — Validation loss before fine-tuning (BASE MODEL)
# =====================================
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import math

def eval_loss_and_ppl(model, tokenizer, hf_dataset, batch_size=4, max_length=512):
    model.eval()
    model.to(device)

    # Remove columns that are not 'input_ids' or 'attention_mask'
    hf_dataset_for_collator = hf_dataset.remove_columns([col for col in hf_dataset.column_names if col not in ['input_ids', 'attention_mask']])

    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")
    loader = DataLoader(hf_dataset_for_collator, batch_size=batch_size, shuffle=False, collate_fn=lambda examples: collator(examples))
    total_loss, total_tokens = 0.0, 0
    with torch.no_grad():
        for batch in tqdm(loader, desc="Eval loss"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            tokens = batch["input_ids"].ne(tokenizer.pad_token_id).sum().item()
            total_loss += loss.item() * tokens
            total_tokens += tokens
    avg_loss = total_loss / total_tokens
    ppl = math.exp(avg_loss)
    return {"avg_loss": avg_loss, "ppl": ppl}

base_metrics = eval_loss_and_ppl(base_model, tokenizer, tokenized_dataset["validation"].select(range(200)))
print("Before fine-tuning:", base_metrics)

Eval loss:   0%|          | 0/50 [00:00<?, ?it/s]

Before fine-tuning: {'avg_loss': 3.7333735501769505, 'ppl': 41.81995216532704}


In [None]:
# Eval helper: loss & perplexity (re-defining is safe)
import math
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import DataCollatorForLanguageModeling

def eval_loss_and_ppl(model, tokenizer, hf_dataset, batch_size=4):
    model.eval()
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # Remove columns that are not 'input_ids' or 'attention_mask'
    hf_dataset_for_collator = hf_dataset.remove_columns([col for col in hf_dataset.column_names if col not in ['input_ids', 'attention_mask']])

    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")
    loader = DataLoader(hf_dataset_for_collator, batch_size=batch_size, shuffle=False, collate_fn=lambda examples: collator(examples))
    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in tqdm(loader, desc="Eval loss"):
            batch = {k: v.to(model.device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            token_count = int(batch.get("attention_mask").sum().item()) if "attention_mask" in batch else batch["input_ids"].ne(tokenizer.pad_token_id).sum().item()
            total_loss += float(loss.item()) * token_count
            total_tokens += token_count
    avg_loss = total_loss / max(1, total_tokens)
    ppl = math.exp(avg_loss)
    return {"avg_loss": avg_loss, "ppl": ppl}


In [None]:
import os, math
from torch.optim import AdamW
from transformers import get_scheduler
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from tqdm.auto import tqdm

# --- Hyperparameters (change as needed) ---
num_epochs = 1
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
learning_rate = 2e-4
train_subset = 800   # set to None to use full train split
save_dir = "./mistral_sft_lora"

# --- Prepare train data loader ---
train_data = tokenized_dataset["train"].select(range(min(train_subset, len(tokenized_dataset["train"])))) if train_subset else tokenized_dataset["train"]
# Remove columns that are not 'input_ids' or 'attention_mask' for the collator
train_data_for_collator = train_data.remove_columns([col for col in train_data.column_names if col not in ['input_ids', 'attention_mask']])
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")
train_loader = DataLoader(train_data_for_collator, batch_size=per_device_train_batch_size, shuffle=True, collate_fn=lambda exs: collator(exs))

# --- Optimizer: only train parameters that require_grad (LoRA params) ---
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

# --- Scheduler ---
num_update_steps_per_epoch = max(1, math.ceil(len(train_loader) / gradient_accumulation_steps))
max_train_steps = num_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=max_train_steps)

# --- Accelerator (auto mixed precision) ---
if torch.cuda.is_available():
    mixed_precision = "bf16" if torch.cuda.is_bf16_supported() else "fp16"
else:
    mixed_precision = "no"

accelerator = Accelerator(mixed_precision=mixed_precision)
print("Accelerator mixed precision:", mixed_precision)

# Prepare objects
model, optimizer, train_loader, lr_scheduler = accelerator.prepare(model, optimizer, train_loader, lr_scheduler)

global_step = 0
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(range(len(train_loader)), desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(train_loader):
        with accelerator.accumulate(model):
            # ensure batch on device
            batch = {k: v.to(accelerator.device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            # optimizer step handled by accelerator accumulation
            if accelerator.sync_gradients:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        global_step += 1
        if (step + 1) % 50 == 0:
            try:
                print(f"[Epoch {epoch+1}] step {step+1} loss: {loss.item():.4f}")
            except:
                print(f"[Epoch {epoch+1}] step {step+1} loss: (couldn't read loss scalar)")

        progress_bar.update(1)

# unwrap model (for saving/eval)
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
print("Finished training. Global steps:", global_step)


Accelerator mixed precision: bf16


Epoch 1:   0%|          | 0/800 [00:00<?, ?it/s]

[Epoch 1] step 50 loss: 3.0450
[Epoch 1] step 100 loss: 2.4590
[Epoch 1] step 150 loss: 2.4009
[Epoch 1] step 200 loss: 2.8688
[Epoch 1] step 250 loss: 2.0788
[Epoch 1] step 300 loss: 2.5621
[Epoch 1] step 350 loss: 2.0653
[Epoch 1] step 400 loss: 2.2547
[Epoch 1] step 450 loss: 2.9567
[Epoch 1] step 500 loss: 1.4165
[Epoch 1] step 550 loss: 2.5119
[Epoch 1] step 600 loss: 2.9169
[Epoch 1] step 650 loss: 2.2854
[Epoch 1] step 700 loss: 2.3622
[Epoch 1] step 750 loss: 2.5478
[Epoch 1] step 800 loss: 2.5076
Finished training. Global steps: 800


In [None]:
# Save PEFT adapter + tokenizer
os.makedirs(save_dir, exist_ok=True)
# `unwrapped_model` is PeftModel wrapping the base; save_pretrained writes the adapter
try:
    unwrapped_model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print("Saved LoRA adapter + tokenizer to:", save_dir)
except Exception as e:
    # fallback if save_pretrained fails on this wrapper
    print("Primary save failed:", e)
    try:
        # If `unwrapped_model` is PeftModel, this should work
        from peft import PeftModel
        if isinstance(unwrapped_model, PeftModel):
            unwrapped_model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)
            print("Saved via PeftModel fallback to:", save_dir)
    except Exception as e2:
        print("Fallback save also failed:", e2)


Saved LoRA adapter + tokenizer to: ./mistral_sft_lora


In [None]:
# Load adapter onto a fresh base model (keeps base_model unchanged)
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
)

# load base in 4-bit again (device_map="auto")
base_for_eval = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_cfg, device_map="auto")
base_for_eval.config.use_cache = False

ft_model = PeftModel.from_pretrained(base_for_eval, save_dir)
ft_model.eval()
print("Loaded fine-tuned PEFT model from", save_dir)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loaded fine-tuned PEFT model from ./mistral_sft_lora


In [None]:
# Small subset for speed; pick a larger slice for final evaluation
val_sample = tokenized_dataset["validation"].select(range(min(200, len(tokenized_dataset["validation"]))))
ft_metrics = eval_loss_and_ppl(ft_model, tokenizer, val_sample, batch_size=2)
print("Fine-tuned model metrics (subset):", ft_metrics)

# if you saved base metrics earlier, compare them here:
try:
    print("Previously computed base metrics:", base_metrics)
except NameError:
    print("No in-memory base_metrics found (you can re-run base eval if needed).")


Eval loss:   0%|          | 0/100 [00:00<?, ?it/s]

Fine-tuned model metrics (subset): {'avg_loss': 2.398970152808463, 'ppl': 11.01183003690711}
Previously computed base metrics: {'avg_loss': 3.7333735501769505, 'ppl': 41.81995216532704}


In [None]:
# If you still have the original `base_model`, use it; otherwise re-load base similarly as above.
def generate_responses(model, tokenizer, prompts, max_new_tokens=64, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    outs = []
    for p in prompts:
        inputs = tokenizer(p, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            gen = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
        outs.append(tokenizer.decode(gen[0], skip_special_tokens=True))
    return outs

# Build small prompt list from your raw dataset (or use pre-made prompts)
prompts = [ex["text"].split("\n")[0] for ex in tokenized_dataset["validation"].select(range(10))]

# Use base_for_eval, which is a freshly loaded base model instance
base_preds = generate_responses(base_for_eval, tokenizer, prompts)
ft_preds = generate_responses(ft_model, tokenizer, prompts)

for i, p in enumerate(prompts):
    print(f"\nPROMPT: {p}")
    print("BASE : ", base_preds[i])
    print("FINE : ", ft_preds[i])



PROMPT: Question: Did they want Cotton to change the color of her fur?
BASE :  Question: Did they want Cotton to change the color of her fur?
Answer: No.
Question: Did they want Cotton to change her personality?
Answer: No.
Question: Did they want Cotton to change her name?
Answer: No.
Question: Did they want Cotton to change her age?
Answer: No.

FINE :  Question: Did they want Cotton to change the color of her fur?
Answer: No.
Question: Did they want Cotton to change her personality?
Answer: No.
Question: Did they want Cotton to change her name?
Answer: No.
Question: Did they want Cotton to change her age?
Answer: No.


PROMPT: Question: were they excited
BASE :  Question: were they excited?
Answer: yes.
Question: what did they do?
Answer: they went to the beach.
Question: what did they do there?
Answer: they played in the sand.
Question: what did they do after that?
Answer: they went home.
Question
FINE :  Question: were they excited?
Answer: yes.
Question: what did they do?
Answer

In [None]:
# compute BLEU/ROUGE/BERTScore and a paired bootstrap CI for BLEU
import evaluate
import random
import numpy as np

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

refs = [ex["text"] for ex in tokenized_dataset["validation"].select(range(10))]  # adjust selection to match prompts size

def compute_generation_metrics(references, predictions):
    res = {}
    res["bleu"] = bleu.compute(predictions=predictions, references=references)["bleu"]
    r = rouge.compute(predictions=predictions, references=references)
    res["rougeL"] = r.get("rougeL", None)
    bs = bertscore.compute(predictions=predictions, references=references, lang="en")
    res["bertscore_f1"] = float(np.mean(bs["f1"]))
    return res

metrics_base = compute_generation_metrics(refs, base_preds)
metrics_ft = compute_generation_metrics(refs, ft_preds)
print("Base metrics:", metrics_base)
print("FT metrics  :", metrics_ft)

# Paired bootstrap for BLEU diff
def paired_bootstrap_metric_diff(references, preds_a, preds_b, metric_fn, n_bootstrap=1000, seed=42):
    rng = random.Random(seed)
    n = len(references)
    diffs = []
    for _ in range(n_bootstrap):
        idxs = [rng.randrange(n) for _ in range(n)]
        refs_s = [references[i] for i in idxs]
        a_s = [preds_a[i] for i in idxs]
        b_s = [preds_b[i] for i in idxs]
        val_a = metric_fn(refs_s, a_s)
        val_b = metric_fn(refs_s, b_s)
        diffs.append(val_b - val_a)
    diffs = np.array(diffs)
    return float(np.mean(diffs)), (float(np.percentile(diffs, 2.5)), float(np.percentile(diffs, 97.5)))

def bleu_metric_fn(refs, preds):
    return bleu.compute(predictions=preds, references=refs)["bleu"]

mean_diff, ci = paired_bootstrap_metric_diff(refs, base_preds, ft_preds, bleu_metric_fn, n_bootstrap=500)
print("BLEU mean diff (FT - BASE):", mean_diff, "95% CI:", ci)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Base metrics: {'bleu': 0.15910334818940228, 'rougeL': np.float64(0.30377361536623115), 'bertscore_f1': 0.8966879725456238}
FT metrics  : {'bleu': 0.15910334818940228, 'rougeL': np.float64(0.30377361536623115), 'bertscore_f1': 0.8966879725456238}
BLEU mean diff (FT - BASE): 0.0 95% CI: (0.0, 0.0)
