# Fine-Tuning vs Baseline (Gemma 2 2B Instruct)

This notebook compares a baseline instruction-tuned LLM against a fine-tuned version on time-sensitive Python release questions.

Sections:
- Baseline: load the same model used in your RAG notebook and ask the demo questions (the `DEMO_QUESTIONS` list)
- Fine-tuning: train a LoRA/QLoRA adapter using your JSONL dataset and re-evaluate the same questions

Notes:
- Target model: `google/gemma-2-2b-it` (same as your RAG notebook)
- Dataset: `data/processed/fine-tuning-training-data.v4.cleaned.jsonl` (or upload via Colab)
- Designed for Google Colab (T4/L4/A100); runs with 4-bit quantization



In [None]:
# Environment setup – core packages pinned for reproducibility (bitsandbytes, datasets, sentencepiece and pandas are left unpinned)

%pip install -q \
  "numpy==1.26.4" \
  "protobuf==4.25.3" \
  "transformers==4.43.3" \
  "accelerate==0.29.3" \
  "peft==0.11.1" \
  "trl==0.9.6" \
  bitsandbytes datasets sentencepiece pandas

import numpy as np
import torch, transformers, datasets, peft, trl, google.protobuf

# Report the versions that actually got imported, one "<label>: <version>" line each,
# so the run can be matched against the pins in the install command.
for label, module in (
    ("NumPy", np),
    ("PyTorch", torch),
    ("transformers", transformers),
    ("datasets", datasets),
    ("peft", peft),
    ("trl", trl),
    ("protobuf", google.protobuf),
):
    print(f"{label}:", module.__version__)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/294.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m286.7/294.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 4.25.3 which is incompatible.
ydf 0.13.0 requires protobuf<7.0.0,>=5.29.1, but you have protobuf 4.25.3 which is incompatible.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.3 which is incompatible.[0m[31m
[0mNumPy: 1.26.4
PyTorch: 2.9.0+cu126
transformers: 4.43.3
datasets: 4.4.1
peft: 0.11.1
trl: 0.9.6
protobuf: 4.25.3


In [None]:
# Optional: Hugging Face login (only if your model is gated)
import os
from getpass import getpass

from huggingface_hub import login

# Never paste a token into the notebook source: it would be saved and shared with
# the file. Prefer the HF_TOKEN environment variable and fall back to a prompt
# that does not echo what is typed.
hf_token = (os.environ.get("HF_TOKEN") or "").strip()
if not hf_token:
    hf_token = getpass("Hugging Face access token (leave blank to skip): ").strip()

if hf_token:
    login(token=hf_token)
else:
    print("No Hugging Face token provided; skipping login.")



In [None]:
import os, sys, json, math
from typing import List, Dict, Optional
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

import importlib.util

# 4-bit loading needs both the transformers config class and the bitsandbytes
# package itself, so check for the package explicitly instead of relying on the
# config import alone.
try:
    from transformers import BitsAndBytesConfig
    _bnb_available = importlib.util.find_spec("bitsandbytes") is not None
except Exception:
    # Best effort: any import problem simply disables the quantized path.
    _bnb_available = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "google/gemma-2-2b-it"  # same baseline as RAG notebook

print("Device:", DEVICE)
print("Model:", MODEL_ID)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

if DEVICE == "cuda" and _bnb_available:
    # Quantized path: NF4 weights with double quantization. Use bf16 compute only
    # when the GPU reports support for it; otherwise fall back to fp16.
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=compute_dtype,  # keep non-quantized weights in the compute dtype
    )
else:
    # Unquantized path: full precision on CPU, half precision on a GPU without bitsandbytes.
    dtype = torch.float32 if DEVICE == "cpu" else torch.float16
    base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=dtype)
    base_model.to(DEVICE)

# Inference only from here on.
base_model.eval()
print("Baseline model loaded.")



Device: cuda
Model: google/gemma-2-2b-it


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Baseline model loaded.


In [None]:
# Default decoding settings; they become the keyword defaults of generate_from_messages.
GEN_CFG = dict(
    max_new_tokens=600,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.1,
)

# System message prepended to every question unless the caller overrides it.
SYSTEM_PROMPT = "You are a Python programming assistant."


def _format_chat(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> Dict[str, torch.Tensor]:
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
        effective_messages = messages
        if messages and messages[0].get("role") == "system":
            system_text = messages[0]["content"]
            effective_messages = messages[1:]
            if effective_messages and effective_messages[0].get("role") == "user":
                effective_messages = effective_messages.copy()
                effective_messages[0] = {
                    "role": "user",
                    "content": f"{system_text}\n\n{effective_messages[0]['content']}"
                }
            else:
                effective_messages = [{"role": "user", "content": system_text}]
        prompt_text = tokenizer.apply_chat_template(
            effective_messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt
        )
    else:
        sys_msg = ""
        if messages and messages[0].get("role") == "system":
            sys_msg = f"System: {messages[0]['content']}\n"
            user_msgs = messages[1:]
        else:
            user_msgs = messages
        convo = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in user_msgs])
        prompt_text = (sys_msg + convo + ("\nAssistant:" if add_generation_prompt else ""))

    inputs = tokenizer(prompt_text, return_tensors="pt")
    return {k: v.to(DEVICE) for k, v in inputs.items()}

@torch.inference_mode()
def generate_from_messages(
    model,
    messages: List[Dict[str, str]],
    max_new_tokens: int = GEN_CFG["max_new_tokens"],
    temperature: float = GEN_CFG["temperature"],
    top_p: float = GEN_CFG["top_p"],
    repetition_penalty: float = GEN_CFG["repetition_penalty"],
) -> str:
    """Generate one reply for a chat and return only the newly generated text.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        messages: Chat turns as ``{"role": ..., "content": ...}`` dicts.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. ``None`` or a value <= 0 switches to
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.
        repetition_penalty: Repetition penalty passed through to ``generate``.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    inputs = _format_chat(messages, add_generation_prompt=True)
    input_len = inputs["input_ids"].shape[-1]

    generation_kwargs = {
        "repetition_penalty": repetition_penalty,
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a strictly positive temperature; treat 0/None as greedy.
        generation_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens; keep only what follows.
    gen_ids = outputs[0][input_len:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
    return text.strip()


def ask_baseline(question: str, system_prompt: Optional[str] = SYSTEM_PROMPT, **gen_kwargs) -> str:
    """Ask `base_model` a single question and return its reply.

    Args:
        question: Text of the user turn.
        system_prompt: Optional system message placed before the question;
            a falsy value leaves it out.
        **gen_kwargs: Passed through unchanged to generate_from_messages.
    """
    user_turn = {"role": "user", "content": question}
    if system_prompt:
        conversation = [{"role": "system", "content": system_prompt}, user_turn]
    else:
        conversation = [user_turn]
    return generate_from_messages(base_model, conversation, **gen_kwargs)



In [None]:
DEMO_QUESTIONS = [
    "What was added in Python 3.12.2 released in March 2024?",
]

# Put each demo question to the baseline model and print the numbered question
# followed by the model's answer.
for idx, question in enumerate(DEMO_QUESTIONS, start=1):
    print(f"\nQ{idx}: {question}\n")
    print(ask_baseline(question))




Q1: What was added in Python 3.12.2 released in March 2024?

I do not have access to real-time information, including specific release notes for software updates like Python 3.12.2.  

**To find the details about what was added in Python 3.12.2, I recommend checking these resources:**

* **The official Python website:** https://www.python.org/
* **Python's Release Notes page:** https://docs.python.org/3/whatsnew/
* **The Python Enhancement Proposals (PEP) repository:** https://peps.python.org/


These sources will provide you with the most accurate and up-to-date information on the changes made in Python 3.12.2.


---

## Fine-tuning with your JSONL dataset (QLoRA)
We will fine-tune the same baseline model using your dataset:
- Preferred path: `data/processed/fine-tuning-training-data.v4.cleaned.jsonl`
- If not found, you can upload the file when running in Colab.



In [None]:
# Locate dataset (adjust this if running locally vs Colab)
from pathlib import Path

# Candidate locations, checked in order; the first one that exists is used.
DATA_PATHS = [
    Path("data/processed/fine-tuning-training-data.v4.cleaned.jsonl"),
    Path("/content/data/processed/fine-tuning-training-data.v4.cleaned.jsonl"),
    Path("/content/fine-tuning-training-data.v4.cleaned.jsonl"),
]

# String path of the first existing candidate, or None when none exists.
DATA_PATH = next(
    (str(candidate) for candidate in DATA_PATHS if candidate.exists()),
    None,
)

if DATA_PATH is not None:
    print("Using dataset:", DATA_PATH)
else:
    print("Dataset not found at default paths. Upload the JSONL file or mount drive and set DATA_PATH manually.")



Using dataset: /content/fine-tuning-training-data.v4.cleaned.jsonl


In [None]:
# Load JSONL chat-style dataset with datasets
from datasets import load_dataset

# Fail early with a clear message when the lookup cell found no file.
dataset = None
if DATA_PATH is None:
    raise FileNotFoundError("Please set DATA_PATH to your JSONL file.")

# The whole file is read as the single "train" split.
dataset = load_dataset("json", data_files=DATA_PATH, split="train")
print(dataset)



Dataset({
    features: ['messages', 'source_sheet'],
    num_rows: 1289
})


In [None]:
# Formatting function: convert messages -> chat template string
# We keep it simple for demo: train on full conversation (prompt + answer)
# For production, you can mask inputs using TRL's response_template.

def format_example(example):
    """Turn one dataset row into a single training string.

    Args:
        example: Mapping with a ``"messages"`` list of
            ``{"role": ..., "content": ...}`` chat turns.

    Returns:
        The conversation rendered with the tokenizer's chat template and followed
        by the EOS token, or ``""`` when the row has no messages.

    A leading system message is folded into the first user turn, the same way
    ``_format_chat`` builds prompts at inference time, so training samples and
    inference prompts share one layout. If the chat template still fails, the
    row falls back to a plain "role: content" transcript and a warning is
    emitted so the mismatch does not go unnoticed.
    """
    msgs = example.get("messages")
    if not msgs:
        return ""

    # Work on copies so the dataset row is never modified.
    turns = [dict(m) for m in msgs]
    if turns[0].get("role") == "system":
        system_text = turns[0].get("content", "")
        turns = turns[1:]
        if turns and turns[0].get("role") == "user":
            turns[0] = {**turns[0], "content": f"{system_text}\n\n{turns[0].get('content', '')}"}
        else:
            turns = [{"role": "user", "content": system_text}] + turns

    try:
        text = tokenizer.apply_chat_template(
            turns,
            tokenize=False,
            add_generation_prompt=False,
        )
    except Exception as exc:
        import warnings

        warnings.warn(
            f"Chat template failed ({exc!r}); using the naive 'role: content' format for this sample."
        )
        # Fallback: naive concatenation
        parts = []
        for m in msgs:
            role = m.get("role", "user")
            parts.append(f"{role}: {m.get('content','')}")
        text = "\n".join(parts)
    # Ensure an EOS to bound samples
    eos = tokenizer.eos_token or "</s>"
    return text + eos

from datasets import Dataset

def formatting_func(examples):
    """Batched map function: render every conversation in the batch to text.

    Args:
        examples: Batch mapping whose ``"messages"`` entry holds one message
            list per row.

    Returns:
        ``{"text": [...]}`` with one string per row, each produced by format_example.
    """
    return {
        "text": [
            format_example({"messages": conversation})
            for conversation in examples["messages"]
        ]
    }

# Replace every original column with a single "text" column holding the rendered chat.
source_columns = dataset.column_names
processed = dataset.map(
    formatting_func,
    batched=True,
    remove_columns=source_columns,
)
print(processed)



Dataset({
    features: ['text'],
    num_rows: 1289
})


In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

MODEL_NAME = "google/gemma-2-2b-it"   # same as before
OUTPUT_DIR = "outputs/gemma2-2b-it-lora"
MAX_SEQ_LEN = 1024                    # was 2048; reduced to save memory

# Pick the mixed-precision mode the hardware supports: bf16 where the GPU reports
# it, fp16 on other GPUs, and neither without CUDA.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16
if use_bf16:
    compute_dtype = torch.bfloat16
elif use_fp16:
    compute_dtype = torch.float16
else:
    compute_dtype = torch.float32

# 1. 4-bit quantization config (QLoRA style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# 2. Reload base model cleanly in 4-bit.
#    NOTE: this rebinds `base_model`; from here on the name refers to the model
#    being trained, not to the untouched baseline loaded earlier.
#    Eager attention is what the Gemma2 training warning recommends over sdpa.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    attn_implementation="eager",
)

# Disable cache for training
base_model.config.use_cache = False

# Prepare model for k-bit training (sets up gradients correctly)
base_model = prepare_model_for_kbit_training(base_model)

# 3. LoRA config: adapters on every attention and MLP projection
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# 4. Wrap with LoRA – this creates trainable adapter params
base_model = get_peft_model(base_model, lora_config)
base_model.print_trainable_parameters()

# 5. Tokenizer
#    Only borrow EOS as the pad token when the tokenizer has none of its own: the
#    LM collator ignores every label equal to the pad id, so pad == EOS would
#    also hide the end-of-sequence tokens from the loss.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# 6. Tokenize your dataset (processed must have "text" column)
def tokenize_function(examples):
    """Tokenize a batch of rendered conversations without padding.

    Rows are truncated to MAX_SEQ_LEN and left unpadded; the data collator pads
    each batch to its own longest row. Text rendered by a chat template may
    already start with BOS, so a doubled leading BOS is collapsed to one.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LEN,
    )
    bos_id = tokenizer.bos_token_id
    input_ids, attention_mask = [], []
    for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"]):
        if bos_id is not None and len(ids) >= 2 and ids[0] == bos_id and ids[1] == bos_id:
            ids, mask = ids[1:], mask[1:]
        input_ids.append(ids)
        attention_mask.append(mask)
    return {"input_ids": input_ids, "attention_mask": attention_mask}


tokenized_dataset = processed.map(
    tokenize_function,
    batched=True,
    remove_columns=processed.column_names,
)

# 7. Data collator for causal LM (pads per batch and builds the labels)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# 8. TrainingArguments – memory friendly settings
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=1,    # was 2
    gradient_accumulation_steps=16,   # keep effective batch size similar
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    bf16=use_bf16,
    fp16=use_fp16,
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,      # ON now to save memory
    report_to="none",
)

# 9. Clear any leftover cache before starting
torch.cuda.empty_cache()

# 10. Trainer
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# 11. Train
trainer.train()

# 12. Save adapter + tokenizer
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved adapter to:", OUTPUT_DIR)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 20,766,720 || all params: 2,635,108,608 || trainable%: 0.7881


It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
  return fn(*args, **kwargs)


Step,Training Loss
20,1.4733
40,0.279
60,0.1532
80,0.1124


Saved adapter to: outputs/gemma2-2b-it-lora


In [None]:
# Load the fine-tuned adapter for inference
import gc

from peft import PeftModel


def _load_inference_base():
    """Load a pristine 4-bit copy of MODEL_NAME for generation.

    Non-quantized weights are kept in the quantization compute dtype.
    """
    return AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=bnb_config.bnb_4bit_compute_dtype,
    )


# Release cached GPU blocks left over from training before loading more weights.
gc.collect()
torch.cuda.empty_cache()

# The training cell rebound `base_model` to the LoRA-wrapped model that went through
# prepare_model_for_kbit_training. Using it as the "baseline" would compare the
# adapter with itself, and generating with it is what raised the Half/Float dtype
# error. Restore an untouched baseline for ask_baseline.
base_model = _load_inference_base()
base_model.eval()

# PeftModel.from_pretrained injects the adapter layers into the model it is given,
# so the fine-tuned model gets its own copy and the baseline stays adapter-free.
ft_model = PeftModel.from_pretrained(_load_inference_base(), OUTPUT_DIR)
ft_model.eval()
print("Loaded LoRA adapter for inference.")


def ask_finetuned(question: str, system_prompt: Optional[str] = SYSTEM_PROMPT, **gen_kwargs) -> str:
    """Ask the fine-tuned model (`ft_model`) a single question and return its reply.

    Args:
        question: Text of the user turn.
        system_prompt: Optional system message placed before the question;
            a falsy value leaves it out.
        **gen_kwargs: Passed through unchanged to generate_from_messages.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": question})
    return generate_from_messages(ft_model, messages, **gen_kwargs)



Loaded LoRA adapter for inference.


In [None]:
# Re-ask the same demo questions: baseline vs fine-tuned
HEAVY_RULE = "=" * 80
LIGHT_RULE = "-" * 80

for idx, question in enumerate(DEMO_QUESTIONS, start=1):
    # Header for this question.
    print("\n" + HEAVY_RULE)
    print(f"Q{idx}: {question}")
    print(LIGHT_RULE)

    # Each answer is printed as soon as it is generated, so the baseline is
    # visible while the fine-tuned answer is still being produced.
    baseline_answer = ask_baseline(question)
    print("Baseline:\n", baseline_answer)
    print(LIGHT_RULE)

    finetuned_answer = ask_finetuned(question)
    print("Fine-tuned:\n", finetuned_answer)




Q1: What was added in Python 3.12.2 released in March 2024?
--------------------------------------------------------------------------------


RuntimeError: Index put requires the source and destination dtypes match, got Half for the destination and Float for the source.