# Fine-Tuning vs Baseline (Gemma 2 2B Instruct)

This notebook compares a baseline instruction-tuned LLM against a fine-tuned version on time-sensitive Python release questions.

Sections:
- Baseline: load the same model used in your RAG notebook and ask the 6 demo questions
- Fine-tuning: train a LoRA/QLoRA adapter using your JSONL dataset and re-evaluate the same questions

Notes:
- Target model: `google/gemma-2-2b-it` (same as your RAG notebook)
- Dataset: `data/processed/fine-tuning-training-data.v4.cleaned.jsonl` (or upload via Colab)
- Designed for Google Colab (T4/L4/A100); runs with 4-bit quantization



In [None]:
# If running on Colab, install/upgrade required packages
pip -q install -U transformers accelerate peft trl bitsandbytes datasets sentencepiece protobuf pandas



In [None]:
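# Hugging Face login: required when the model is gated, i.e. the Hub asks you to accept
# a license and authenticate before download (NOTE: Gemma checkpoints are typically gated).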
# from huggingface_hub import login
# login(token="<hf_token>")



In [None]:
import os, sys, json, math
from typing import List, Dict, Optional
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

# Probe for optional 4-bit quantization support.
# _bnb_available is True only when the transformers config class can be
# imported AND the bitsandbytes package itself can be located.
import importlib.util

try:
    from transformers import BitsAndBytesConfig
    # Importing the config class alone does not prove that the quantization
    # backend is installed, so look the package up explicitly as well.
    _bnb_available = importlib.util.find_spec("bitsandbytes") is not None
except Exception:
    # Deliberately broad: this is a best-effort capability probe, and any
    # failure here simply means "load the model without quantization".
    _bnb_available = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "google/gemma-2-2b-it"  # same baseline as RAG notebook

print("Device:", DEVICE)
print("Model:", MODEL_ID)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

if DEVICE == "cuda" and _bnb_available:
    # 4-bit NF4 loading with double quantization.
    # Pick the compute dtype from what the GPU reports instead of assuming
    # bf16: this branch only runs on CUDA, so testing `is_available()` again
    # would always select bf16, including on GPUs that do not report bf16 support.
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        # Keep the non-quantized parameters in the same dtype as the 4-bit compute path.
        torch_dtype=compute_dtype,
    )
else:
    # Unquantized fallback: full precision on CPU, half precision on GPU.
    dtype = torch.float32 if DEVICE == "cpu" else torch.float16
    base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=dtype)
    base_model.to(DEVICE)

base_model.eval()
print("Baseline model loaded.")



In [None]:
# Default decoding settings; used as the default arguments of generate_from_messages.
GEN_CFG = dict(
    max_new_tokens=600,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.1,
)

# System instruction placed in front of every question unless the caller overrides it.
SYSTEM_PROMPT = "You are a Python programming assistant."


def _format_chat(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> Dict[str, torch.Tensor]:
    """Turn a list of chat messages into tokenized model inputs on DEVICE.

    Args:
        messages: Chat turns as ``{"role": ..., "content": ...}`` dicts. A leading
            ``system`` message is folded into the first user turn before the chat
            template is applied (the template is never handed a ``system`` role).
        add_generation_prompt: Whether the prompt should end with the marker that
            opens the assistant's reply.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``, ...) with every
        tensor moved to ``DEVICE``.
    """
    has_template = (
        hasattr(tokenizer, "apply_chat_template")
        and getattr(tokenizer, "chat_template", None) is not None
    )

    if has_template:
        effective_messages = messages
        if messages and messages[0].get("role") == "system":
            system_text = messages[0]["content"]
            # Copy so the caller's list is never mutated.
            effective_messages = list(messages[1:])
            if effective_messages and effective_messages[0].get("role") == "user":
                effective_messages[0] = {
                    "role": "user",
                    "content": f"{system_text}\n\n{effective_messages[0]['content']}"
                }
            else:
                # No user turn to merge into: send the system text as its own
                # user turn, but keep any turns that follow instead of dropping them.
                effective_messages.insert(0, {"role": "user", "content": system_text})
        prompt_text = tokenizer.apply_chat_template(
            effective_messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt
        )
        # The rendered template already carries its own special tokens, so the
        # tokenizer must not add them a second time (that would duplicate BOS).
        inputs = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False)
    else:
        # No chat template available: build a plain "Role: text" transcript.
        sys_msg = ""
        if messages and messages[0].get("role") == "system":
            sys_msg = f"System: {messages[0]['content']}\n"
            user_msgs = messages[1:]
        else:
            user_msgs = messages
        convo = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in user_msgs])
        prompt_text = (sys_msg + convo + ("\nAssistant:" if add_generation_prompt else ""))
        # Plain text has no special tokens yet, so let the tokenizer add them.
        inputs = tokenizer(prompt_text, return_tensors="pt")

    return {k: v.to(DEVICE) for k, v in inputs.items()}

@torch.inference_mode()
def generate_from_messages(
    model,
    messages: List[Dict[str, str]],
    max_new_tokens: int = GEN_CFG["max_new_tokens"],
    temperature: float = GEN_CFG["temperature"],
    top_p: float = GEN_CFG["top_p"],
    repetition_penalty: float = GEN_CFG["repetition_penalty"],
) -> str:
    """Generate the assistant reply for a chat conversation.

    Args:
        model: Any object exposing ``generate`` (base model or adapter-wrapped model).
        messages: Chat turns as ``{"role": ..., "content": ...}`` dicts.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. ``0``, a negative value or ``None``
            selects greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold (only used when sampling).
        repetition_penalty: Penalty passed through to ``generate``.

    Returns:
        The decoded completion only (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    inputs = _format_chat(messages, add_generation_prompt=True)
    input_len = inputs["input_ids"].shape[-1]

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a strictly positive temperature; treat <= 0 as a
        # request for deterministic (greedy) decoding rather than failing.
        gen_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **gen_kwargs)
    # generate() returns prompt + completion; keep only the newly generated part.
    gen_ids = outputs[0][input_len:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
    return text.strip()


def ask_baseline(question: str, system_prompt: Optional[str] = SYSTEM_PROMPT, **gen_kwargs) -> str:
    """Answer ``question`` with ``base_model``.

    A system turn is sent only when ``system_prompt`` is truthy; extra keyword
    arguments are forwarded unchanged to ``generate_from_messages``.
    """
    user_turn = {"role": "user", "content": question}
    if system_prompt:
        conversation = [{"role": "system", "content": system_prompt}, user_turn]
    else:
        conversation = [user_turn]
    return generate_from_messages(base_model, conversation, **gen_kwargs)



In [None]:
# The six time-sensitive questions asked of both the baseline and the fine-tuned model.
DEMO_QUESTIONS = [
    "What was added in Python 3.12.2 released in March 2024?",
    "What specific bug fixes and security advisories were included in Python 3.12.3 (April 2024)?",
    "Which issues and regressions were addressed in Python 3.13.1 and 3.13.2, and on what dates were they released?",
    # "mid-2024" previously contained mis-decoded bytes that were sent to the model verbatim.
    "Which CVEs were fixed in Python 3.12.x during mid-2024, and which modules were impacted?",
    "Which PEPs targeting Python 3.14 changed status between alpha and beta, and what wording changed in their Accepted texts?",
    "What were the documented release blockers and notable open issues listed before the Python 3.13.1 release, and which were resolved by that release?",
]

# Ask every demo question once with the untouched baseline model and print the answers.
for number, question in enumerate(DEMO_QUESTIONS, start=1):
    print(f"\nQ{number}: {question}\n")
    print(ask_baseline(question))



---

## Fine-tuning with your JSONL dataset (QLoRA)
We will fine-tune the same baseline model using your dataset:
- Preferred path: `data/processed/fine-tuning-training-data.v4.cleaned.jsonl`
- If not found, you can upload the file when running in Colab.



In [None]:
# Locate dataset (adjust this if running locally vs Colab)
from pathlib import Path

# Candidate dataset locations, probed in order: repo-relative path first, then Colab paths.
DATA_PATHS = [
    Path("data/processed/fine-tuning-training-data.v4.cleaned.jsonl"),
    Path("/content/data/processed/fine-tuning-training-data.v4.cleaned.jsonl"),
    Path("/content/fine-tuning-training-data.v4.cleaned.jsonl"),
]

# First candidate that exists on disk (as a string), or None when none of them does.
DATA_PATH = next(
    (str(candidate) for candidate in DATA_PATHS if candidate.exists()),
    None,
)

if DATA_PATH is not None:
    print("Using dataset:", DATA_PATH)
else:
    print("Dataset not found at default paths. Upload the JSONL file or mount drive and set DATA_PATH manually.")



In [None]:
# Load JSONL chat-style dataset with datasets
from datasets import load_dataset

# Fail fast when no dataset file was located; otherwise read the JSONL as the "train" split.
dataset = None
if DATA_PATH is None:
    raise FileNotFoundError("Please set DATA_PATH to your JSONL file.")

dataset = load_dataset("json", data_files=DATA_PATH, split="train")
print(dataset)



In [None]:
# Formatting function: convert messages -> chat template string
# We keep it simple for demo: train on full conversation (prompt + answer)
# For production, you can mask inputs using TRL's response_template.

def format_example(example):
    """Render one dataset record as a single training string.

    Args:
        example: Mapping with a ``"messages"`` list of
            ``{"role": ..., "content": ...}`` dicts.

    Returns:
        The conversation rendered with the tokenizer's chat template followed by
        an EOS marker, or ``""`` when the record has no messages. If the template
        cannot be applied, a plain ``role: content`` transcript is used instead
        and a warning is emitted.
    """
    import warnings

    msgs = example.get("messages")
    if not msgs:
        return ""

    # Fold a leading system message into the first user turn, mirroring how
    # prompts are built at inference time in this notebook, so training and
    # inference prompts share one format and the template never sees a
    # "system" role.
    template_msgs = list(msgs)
    if template_msgs[0].get("role") == "system":
        system_text = template_msgs[0].get("content", "")
        template_msgs = template_msgs[1:]
        if template_msgs and template_msgs[0].get("role") == "user":
            template_msgs[0] = {
                "role": "user",
                "content": f"{system_text}\n\n{template_msgs[0].get('content', '')}",
            }
        else:
            template_msgs.insert(0, {"role": "user", "content": system_text})

    try:
        text = tokenizer.apply_chat_template(
            template_msgs,
            tokenize=False,
            add_generation_prompt=False,
        )
    except Exception as exc:
        # Best-effort fallback, but make it visible: silently training on a
        # different prompt format than the one used at inference is hard to debug.
        warnings.warn(
            f"Chat template could not be applied ({exc!r}); "
            "falling back to plain 'role: content' formatting.",
            RuntimeWarning,
        )
        text = "\n".join(
            f"{m.get('role', 'user')}: {m.get('content', '')}" for m in msgs
        )
    else:
        # The rendered template may already start with BOS. When the tokenizer
        # is configured to add BOS itself, tokenizing this text later would
        # duplicate it, so drop the literal one here.
        # NOTE(review): assumes the trainer tokenizes "text" with special tokens enabled — confirm.
        bos = getattr(tokenizer, "bos_token", None)
        if bos and getattr(tokenizer, "add_bos_token", False) and text.startswith(bos):
            text = text[len(bos):]

    # Ensure an EOS to bound samples
    eos = tokenizer.eos_token or "</s>"
    return text + eos

from datasets import Dataset

def formatting_func(examples):
    """Batched map function: render every conversation in the batch to text.

    ``examples["messages"]`` holds one message list per row; the result is a
    dict with a single ``"text"`` column of the same length.
    """
    rendered = [
        format_example({"messages": conversation})
        for conversation in examples["messages"]
    ]
    return {"text": rendered}

# Replace all original columns with one rendered "text" column for SFTTrainer
original_columns = dataset.column_names
processed = dataset.map(
    formatting_func,
    batched=True,
    remove_columns=original_columns,
)
print(processed)



In [None]:
# QLoRA configuration and SFT training (TRL + PEFT)
from trl import SFTTrainer
from peft import LoraConfig, PeftModel
from transformers import TrainingArguments

OUTPUT_DIR = "outputs/gemma2-2b-it-lora"

# LoRA on all attention and MLP projection matrices.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj",
    ],
)

# Mixed-precision flags must match the hardware: bf16 only when the GPU reports
# support for it, fp16 only on a GPU without bf16, and neither on CPU
# (requesting half-precision training on CPU is rejected by TrainingArguments).
_use_cuda = torch.cuda.is_available()
_use_bf16 = _use_cuda and torch.cuda.is_bf16_supported()
_use_fp16 = _use_cuda and not _use_bf16

# The paged 8-bit optimizer comes from bitsandbytes and needs a GPU; otherwise
# use the standard PyTorch AdamW so the cell still runs on the CPU path.
_optim = "paged_adamw_8bit" if (_use_cuda and _bnb_available) else "adamw_torch"

# Training arguments (tune for your GPU)
train_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,               # increase to 2-3 if you have time
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,    # effective batch size = 2 * 8 per device
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    bf16=_use_bf16,
    fp16=_use_fp16,
    optim=_optim,
    gradient_checkpointing=True,
    report_to=[],                     # no external experiment trackers
)

# Use the already-quantized base_model when on CUDA+bnb
# NOTE(review): passing peft_config makes the trainer wrap base_model with LoRA;
# PEFT presumably injects the adapter layers into base_model in place — confirm
# before reusing base_model as an untouched baseline afterwards.
trainer = SFTTrainer(
    model=base_model,
    tokenizer=tokenizer,
    train_dataset=processed,
    args=train_args,
    dataset_text_field="text",
    packing=False,
    peft_config=lora_config,
    max_seq_length=2048,
)

trainer.train()

# Save the LoRA adapter
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved adapter to:", OUTPUT_DIR)



In [None]:
# Load the fine-tuned adapter for inference
from peft import PeftModel

# Wrap base_model with the adapter weights that were saved to OUTPUT_DIR.
# NOTE(review): base_model was already passed to SFTTrainer together with a
# peft_config in the training cell; PEFT presumably injected LoRA layers into it
# in place, so it may no longer be a pristine baseline at this point — confirm.
ft_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
ft_model.eval()  # switch the wrapped model to evaluation mode before generating
print("Loaded LoRA adapter for inference.")


def ask_finetuned(question: str, system_prompt: Optional[str] = SYSTEM_PROMPT, **gen_kwargs) -> str:
    """Answer ``question`` with the adapter-wrapped ``ft_model``.

    The system turn is included only for a truthy ``system_prompt``; any extra
    keyword arguments go straight to ``generate_from_messages``.
    """
    system_turns = [{"role": "system", "content": system_prompt}] if system_prompt else []
    conversation = system_turns + [{"role": "user", "content": question}]
    return generate_from_messages(ft_model, conversation, **gen_kwargs)



In [None]:
# Re-ask the same demo questions: baseline vs fine-tuned.
# ft_model wraps base_model, and PEFT places the LoRA layers inside the wrapped
# model's module tree, so calling base_model directly at this point would run
# with the adapter active. Temporarily disabling the adapter gives a true
# baseline answer to compare against.
for i, q in enumerate(DEMO_QUESTIONS, 1):
    print("\n" + "="*80)
    print(f"Q{i}: {q}")
    print("-"*80)
    with ft_model.disable_adapter():
        base = ask_baseline(q)
    print("Baseline:\n", base)
    print("-"*80)
    ft = ask_finetuned(q)
    print("Fine-tuned:\n", ft)

