# Fine-Tuning vs Baseline (Gemma 2 2B Instruct) — v6

This notebook implements the fine-tuning of the same Baseline model with a custom curated dataset to enable the LLM to answer time sensitive questions.


In [1]:
# === Colab bootstrap
import os, sys, subprocess

REQ = {
    "transformers": "4.43.3",
    "accelerate": "0.29.3",
    "peft": "0.11.1",
    "trl": "0.9.6",
    "datasets": None,
    "sentencepiece": None,
    "bitsandbytes": None,
}

def _get_ver(pkg_name: str):
    try:
        import importlib.metadata as md
        return md.version(pkg_name)
    except Exception:
        return None

def _needs_install():
    for pkg, ver in REQ.items():
        v = _get_ver(pkg)
        if v is None:
            return True
        if ver is not None and v != ver:
            return True
    return False

def _pip_install():
    pkgs = []
    for pkg, ver in REQ.items():
        pkgs.append(f"{pkg}=={ver}" if ver else pkg)
    cmd = [sys.executable, "-m", "pip", "install", "-q", "-U", "--no-cache-dir"] + pkgs
    subprocess.check_call(cmd)


if _needs_install():
    print("Installing / fixing environment...")
    _pip_install()


    if not os.environ.get("COLAB_BOOTSTRAP_RESTARTED"):
        os.environ["COLAB_BOOTSTRAP_RESTARTED"] = "1"
        print("Restarting runtime once for a clean import state...")
        os._exit(0)

print("Environment OK.")
print("Python:", sys.version.split()[0])
print("transformers:", _get_ver("transformers"))
print("accelerate:", _get_ver("accelerate"))
print("peft:", _get_ver("peft"))
print("trl:", _get_ver("trl"))
print("datasets:", _get_ver("datasets"))
print("bitsandbytes:", _get_ver("bitsandbytes"))



Environment OK.
Python: 3.12.12
transformers: 4.43.3
accelerate: 0.29.3
peft: 0.11.1
trl: 0.9.6
datasets: 4.4.2
bitsandbytes: 0.49.0


In [None]:
from huggingface_hub import login
login(token="acess token")



In [3]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

try:
    from transformers import BitsAndBytesConfig
    _bnb_available = True
except Exception:
    _bnb_available = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "google/gemma-2-2b-it"

print("Device:", DEVICE)
print("Model:", MODEL_ID)

# Tokenizer
baseline_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Prefer fp16 compute on CUDA (T4 safe), no bf16 assumption
if DEVICE == "cuda" and _bnb_available:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    baseline_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
    )
else:
    dtype = torch.float32 if DEVICE == "cpu" else torch.float16
    baseline_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=dtype)
    baseline_model.to(DEVICE)

baseline_model.eval()
print("Baseline model loaded.")



Device: cuda
Model: google/gemma-2-2b-it


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Baseline model loaded.


In [4]:
# Generation helpers for Gemma chat
from typing import List, Dict, Optional

SYSTEM_PROMPT = "You are a Python programming assistant."


def build_prompt(tokenizer, question: str, system_prompt: Optional[str] = SYSTEM_PROMPT) -> str:
    if system_prompt:
        user_content = system_prompt.strip() + "\n\n" + question.strip()
    else:
        user_content = question.strip()
    messages = [{"role": "user", "content": user_content}]
    return baseline_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )


@torch.inference_mode()
def ask_model(model, tokenizer, question: str, max_new_tokens: int = 400) -> str:
    prompt = build_prompt(tokenizer, question)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()



In [5]:
# Manual query: Baseline model
question = "What were the documented release blockers and notable open issues listed before the Python 3.13.1 release, and which were resolved by that release?"
if not question:
    raise ValueError("Please enter a non-empty question.")
print("=== BASELINE RESPONSE ===")
print("Question:",question)
print("A: ",ask_model(baseline_model, baseline_tokenizer, question))



=== BASELINE RESPONSE ===
Question: What were the documented release blockers and notable open issues listed before the Python 3.13.1 release, and which were resolved by that release?
A:  I do not have access to real-time information, including specific release notes for software like Python. 

To find the documented release blockers and open issues for Python 3.13.1, I recommend checking the official Python website and the Python 3.13.1 release notes. 

Here's where you can find that information:

* **Python's Official Website:** https://www.python.org/
* **Python 3.13.1 Release Notes:** https://www.python.org/downloads/release/3.13.1/

You'll likely find:

* **Release Blockers:** These are issues that prevented the release of the version. They are usually high-priority bugs that need to be fixed before a release.
* **Open Issues:** These are issues that are still being worked on and haven't been resolved yet. 

By reviewing these sections, you'll get the most accurate and up-to-date 

## Fine-Tuning using LORA

In [6]:
# Locate dataset
from pathlib import Path

DATA_PATHS = [
    Path("/content/fine_tuning_train-v5.jsonl")
]

DATA_PATH = None
for p in DATA_PATHS:
    if p.exists():
        DATA_PATH = str(p)
        break

if DATA_PATH is None:
    raise FileNotFoundError("Dataset not found. Upload fine_tuning_train-v5.jsonl or mount drive and set DATA_PATH.")
else:
    print("Using dataset:", DATA_PATH)



Using dataset: /content/fine_tuning_train-v5.jsonl


In [7]:
# Load JSONL chat-style dataset
from datasets import load_dataset

raw_ds = load_dataset("json", data_files=DATA_PATH, split="train")
print(raw_ds)



Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['messages'],
    num_rows: 600
})


In [8]:
# Map messages -> single chat template string for supervised LM
from datasets import Dataset

def format_example(example):
    msgs = example.get("messages")
    if not msgs:
        return ""
    try:
        text = baseline_tokenizer.apply_chat_template(
            msgs,
            tokenize=False,
            add_generation_prompt=False,
        )
    except Exception:
        # Fallback: naive concatenation
        parts = []
        for m in msgs:
            role = m.get("role", "user")
            parts.append(f"{role}: {m.get('content','')}")
        text = "\n".join(parts)
    eos = baseline_tokenizer.eos_token or "</s>"
    return text + eos


def formatting_func(batch):
    texts = []
    for msgs in batch["messages"]:
        texts.append(format_example({"messages": msgs}))
    return {"text": texts}

processed = raw_ds.map(formatting_func, batched=True, remove_columns=raw_ds.column_names)
print(processed)
print("Sample:\n", processed[0]["text"].split("\n")[:6])



Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 600
})
Sample:
 ['<bos><start_of_turn>user', 'You are a Python programming assistant.', '', 'Precisely, Was Python 3.12.3 a feature release or a bugfix release? Please include the release date.<end_of_turn>', '<start_of_turn>model', '- Python 3.12.3 is a maintenance (bugfix) release.']


In [9]:
# QLoRA setup and Trainer
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

MODEL_NAME = MODEL_ID
OUTPUT_DIR = "outputs/gemma2-2b-it-lora-v5"

# 1) 4-bit quantization config (T4-safe compute dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# 2) Reload a clean base model for training in 4-bit
train_base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
train_base.config.use_cache = False  # disable cache during training
train_base = prepare_model_for_kbit_training(train_base)

# 3) LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# 4) Wrap with LoRA
train_model = get_peft_model(train_base, lora_config)
train_model.print_trainable_parameters()

# 5) Tokenizer (pad to eos)
train_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_tokenizer.pad_token = train_tokenizer.eos_token

# 6) Tokenize dataset
MAX_LEN = 1024

def tokenize_function(examples):
    return train_tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length",
    )

tokenized_dataset = processed.map(
    tokenize_function,
    batched=True,
    remove_columns=processed.column_names,
)

# 7) Data collator
collator = DataCollatorForLanguageModeling(
    tokenizer=train_tokenizer,
    mlm=False,
)

# 8) TrainingArguments — train longer, safe dtypes for T4
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    bf16=False,
    fp16=torch.cuda.is_available(),
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
    report_to="none",
)

# 9) Train
if torch.cuda.is_available():
    torch.cuda.empty_cache()

trainer = Trainer(
    model=train_model,
    args=args,
    train_dataset=tokenized_dataset,
    data_collator=collator,
)

trainer.train()

# 10) Save adapter + tokenizer
trainer.model.save_pretrained(OUTPUT_DIR)
train_tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved adapter to:", OUTPUT_DIR)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 20,766,720 || all params: 2,635,108,608 || trainable%: 0.7881


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
  return fn(*args, **kwargs)


Step,Training Loss
20,2.5577
40,0.291
60,0.0835
80,0.0529
100,0.0509


Saved adapter to: outputs/gemma2-2b-it-lora-v5


In [10]:
# Load fine-tuned adapter for inference (fresh base -> attach adapter)
from peft import PeftModel

# Fresh base (same quantization as training)
inf_base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

ft_model = PeftModel.from_pretrained(inf_base, OUTPUT_DIR)
ft_model.eval()

ft_tokenizer = train_tokenizer  # reuse tokenizer saved during training
print("Fine-tuned adapter loaded.")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fine-tuned adapter loaded.


In [12]:
# Manual query: Fine-tuned model
question = "Which CVEs were fixed in Python 3.12.x during mid‑2024, and which modules were impacted?"
if not question:
    raise ValueError("Please enter a non-empty question.")
print("=== FINE-TUNED RESPONSE ===")
print("Question: ",question)
print("A: ",ask_model(ft_model, ft_tokenizer, question))



=== FINE-TUNED RESPONSE ===
Question:  Which CVEs were fixed in Python 3.12.x during mid‑2024, and which modules were impacted?
A:  Here are representative CVE fixes affecting Python 3.12.x around mid‑2024, with impacted standard-library areas:
- CVE-2024-4032: impacted `ipaddress`; scope/logic bypass risk via incorrect is_private/is_global classification. Fixed in 3.12.4 (and other maintained branches).
- CVE-2024-6232: impacted `tarfile`; ReDoS/DoS via crafted inputs. Fixed in 3.12.6-era security work.
- CVE-2024-7592: impacted `http.cookies`; DoS via inefficient parsing. Fixed in 3.12.6-era security work.
- CVE-2024-8088: impacted `zipfile.Path`; DoS. Fixed in 3.12.6-era security work.
- libexpat CVEs (multiple): impacted `xml.parsers.expat / bundled libexpat`; varies; addressed by updating bundled libexpat. Fixed in 3.12.6-era security work.
Note: exact “fixed in version” can vary by branch; for a thesis/demo dataset, keep a consistent mapping aligned to the release notes/advisorie