# 02_train — Fine-tune (LoRA/QLoRA) for Preference Extraction

Goal: train an instruction-tuned LLM to convert Turkish free-text laptop requests into Laprop `preferences` JSON.

This notebook:
- loads the JSONL dataset generated by `01_dataset.ipynb`
- fine-tunes an instruct model with **4-bit QLoRA** (works on Colab T4)
- saves the LoRA adapter to Google Drive (`colab/artifacts/...`)
- runs a small generation sanity check on the test set

You must enable a GPU runtime before running: `Runtime -> Change runtime type -> GPU`.


In [None]:
# --- 0) Drive mount + paths (Colab) ---
from pathlib import Path

from google.colab import drive

# Mount Google Drive; every path defined below lives under this mount point.
drive.mount("/content/drive")

# TODO: adjust to your Drive project folder
DRIVE_PROJECT_DIR = Path("/content/drive/MyDrive/laprop")
COLAB_DIR = DRIVE_PROJECT_DIR.joinpath("colab")

# DATASET_DIR holds the train/val/test JSONL files read later in the notebook;
# ARTIFACTS_DIR receives the training outputs and is created if it is missing.
DATASET_DIR = COLAB_DIR.joinpath("data", "prefs_dataset_v1")
ARTIFACTS_DIR = COLAB_DIR.joinpath("artifacts", "prefs_extractor")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

for _label, _path in (("DATASET_DIR:", DATASET_DIR), ("ARTIFACTS_DIR:", ARTIFACTS_DIR)):
    print(_label, _path)


In [None]:
# --- 1) Clone/pull the repo into the Colab runtime ---
REPO_DIR = Path("/content/laprop-recommender")
REPO_URL = "https://github.com/ahmedberatAI/laprop-recommender.git"

if not REPO_DIR.exists():
    !git clone --depth 1 {REPO_URL} /content/laprop-recommender
else:
    !git -C /content/laprop-recommender pull

%cd /content/laprop-recommender


In [None]:
# --- 2) Install dependencies ---
# Quietly (-q) install the Colab requirements file that ships inside the cloned repo.
%pip install -q -r /content/laprop-recommender/colab/requirements_colab.txt
# Install the repo itself in editable mode (-e).
%pip install -q -e /content/laprop-recommender


In [None]:
# --- 3) GPU sanity check ---
!nvidia-smi

import torch

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    cap = torch.cuda.get_device_capability(0)
    print("capability:", cap)
else:
    cap = (0, 0)

# bf16 is typically supported on Ampere+ (A100/L4 etc). T4 is fp16 only.
USE_BF16 = bool(torch.cuda.is_available() and cap[0] >= 8)
print("USE_BF16:", USE_BF16)


In [None]:
# --- 4) Imports ---
import json
import os
import random
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, List

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    set_seed,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


In [None]:
# --- 5) Training config (edit these) ---
set_seed(42)

# A safe default that fits Colab T4 with QLoRA.
# You can switch to a larger model if you have more VRAM.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

MAX_LEN = 512  # token budget per example (prompt + completion + EOS)

# LoRA hyperparams
LORA_R = 16          # passed to LoraConfig as r
LORA_ALPHA = 32      # passed to LoraConfig as lora_alpha
LORA_DROPOUT = 0.05  # passed to LoraConfig as lora_dropout

# Training hyperparams
NUM_EPOCHS = 2
LEARNING_RATE = 2e-4
BATCH_SIZE = 8       # per-device batch size, used for both train and eval
GRAD_ACCUM = 4       # gradient accumulation steps
WARMUP_RATIO = 0.03

# Each run writes into its own folder: <model tag>_<timestamp>.
RUN_ID = time.strftime("%Y%m%d_%H%M%S")
MODEL_TAG = MODEL_NAME.rsplit("/", 1)[-1].replace(".", "_").lower()
OUT_DIR = ARTIFACTS_DIR.joinpath(f"{MODEL_TAG}_{RUN_ID}")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("OUT_DIR:", OUT_DIR)


In [None]:
# --- 6) Load dataset (from Drive) ---
# One JSONL file per split; note the validation split is stored as val.jsonl.
data_files = {
    split: str(DATASET_DIR / f"{stem}.jsonl")
    for split, stem in (("train", "train"), ("validation", "val"), ("test", "test"))
}
ds = load_dataset("json", data_files=data_files)

print(ds)
print("columns:", ds["train"].column_names)
print("example:")
# Last expression of the cell: the first training row is shown as the cell output.
ds["train"][0]


In [None]:
# --- 7) Load tokenizer + base model (4-bit) ---
import torch

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)

if tokenizer.eos_token is None:
    # Most models define EOS; this is a fallback.
    tokenizer.eos_token = "</s>"
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

# Compute dtype for the 4-bit layers: bf16 when USE_BF16 is set, otherwise fp16.
if USE_BF16:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Training setup: turn the generation cache off and enable gradient checkpointing.
model.config.use_cache = False
model.gradient_checkpointing_enable()

print("pad_token_id:", tokenizer.pad_token_id)
print("eos_token_id:", tokenizer.eos_token_id)


In [None]:
# --- 8) Build supervised fine-tuning examples with prompt-masking ---
# We concatenate:  prompt + completion + EOS
# and mask labels for the prompt part so the model is trained only on the JSON completion.


def find_target_modules(m) -> List[str]:
    """Choose LoRA target module names by inspecting the model's submodules.

    The last dotted component of every name yielded by ``m.named_modules()``
    is compared against two candidate groups, in priority order: the common
    attention/MLP projection names first, then a fallback group for other
    architectures. The first group with at least one match wins.

    Args:
        m: Any object exposing ``named_modules()`` that yields
            ``(name, module)`` pairs.

    Returns:
        Alphabetically sorted matching leaf names from the winning group, or
        an empty list when neither group matches anything.
    """
    leaf_names = {qualified.split(".")[-1] for qualified, _ in m.named_modules()}

    candidate_groups = (
        # common projection module names
        ("q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"),
        # fallback for other architectures
        ("query_key_value", "dense", "fc1", "fc2"),
    )
    for group in candidate_groups:
        hits = leaf_names.intersection(group)
        if hits:
            return sorted(hits)
    return []


# Resolve the LoRA target layers once; the list is reused for the adapter
# config and recorded in the saved metadata.
TARGET_MODULES = find_target_modules(model)
if not TARGET_MODULES:
    # An empty list would leave LoRA with no layers to adapt; stop here with an
    # actionable message instead of failing later inside the adapter setup.
    raise ValueError(
        f"No LoRA target modules detected in {MODEL_NAME!r}; "
        "set TARGET_MODULES manually for this architecture."
    )
print("TARGET_MODULES:", TARGET_MODULES)


def encode_example(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Turn one dataset row into a prompt-masked causal-LM training example.

    The row's ``prompt`` and ``completion`` are tokenized separately without
    special tokens, and the tokenizer's EOS id is appended to the completion.
    If the pair exceeds ``MAX_LEN`` tokens the prompt is shortened from the
    left (its tail is kept) so the completion survives; if the completion
    alone fills the budget the prompt is dropped entirely and the completion
    is cut to ``MAX_LEN``.

    Args:
        ex: Mapping with string fields ``"prompt"`` and ``"completion"``.

    Returns:
        Dict with ``input_ids``, ``labels`` (``-100`` on every prompt
        position so only completion tokens contribute to the loss) and an
        all-ones ``attention_mask``.
    """
    prompt_text, completion_text = ex["prompt"], ex["completion"]

    def _ids(text):
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    prompt_ids = _ids(prompt_text)
    comp_ids = _ids(completion_text) + [tokenizer.eos_token_id]

    # keep completion; truncate prompt first if needed
    overflow = len(prompt_ids) + len(comp_ids) - MAX_LEN
    if overflow > 0:
        room_for_prompt = MAX_LEN - len(comp_ids)
        if room_for_prompt > 0:
            prompt_ids = prompt_ids[-room_for_prompt:]
        else:
            # NOTE: if the completion is longer than MAX_LEN, this slice also
            # cuts off the trailing EOS.
            prompt_ids = []
            comp_ids = comp_ids[:MAX_LEN]

    input_ids = prompt_ids + comp_ids
    return {
        "input_ids": input_ids,
        "labels": [-100] * len(prompt_ids) + comp_ids,
        "attention_mask": [1] * len(input_ids),
    }


# Encode every split; the original columns are removed so only the fields
# returned by encode_example remain.
raw_columns = ds["train"].column_names
tokenized = ds.map(encode_example, remove_columns=raw_columns)

print(tokenized)
first_train_len = len(tokenized["train"][0]["input_ids"])
print("tokenized example lens:", first_train_len)


In [None]:
# --- 9) Data collator (pads input_ids/labels) ---
import torch


@dataclass
class DataCollatorForCausalLM:
    """Right-pad a list of tokenized examples into equal-length long tensors.

    Every feature must provide list-valued ``input_ids``, ``attention_mask``
    and ``labels``. The batch length is the longest ``input_ids`` in the
    batch, rounded up to a multiple of ``pad_to_multiple_of`` when that is
    non-zero. Padding uses the tokenizer's pad id for ``input_ids``, ``0``
    for ``attention_mask`` and ``-100`` for ``labels``.
    """

    # Object exposing `pad_token_id`.
    tokenizer: Any
    # Round the batch length up to a multiple of this; a falsy value disables rounding.
    pad_to_multiple_of: int = 8

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate ``features`` into a dict of ``torch.long`` tensors."""
        target_len = max(len(feat["input_ids"]) for feat in features)
        multiple = self.pad_to_multiple_of
        if multiple:
            # round up to the next multiple
            target_len = ((target_len + multiple - 1) // multiple) * multiple

        fill_values = {
            "input_ids": int(self.tokenizer.pad_token_id),
            "attention_mask": 0,
            "labels": -100,
        }
        columns = {key: [] for key in fill_values}
        for feat in features:
            # The pad count is derived from input_ids and applied to all three fields.
            n_pad = target_len - len(feat["input_ids"])
            for key, fill in fill_values.items():
                columns[key].append(feat[key] + [fill] * n_pad)

        return {key: torch.tensor(rows, dtype=torch.long) for key, rows in columns.items()}


# Collator instance handed to the Trainer; pad_to_multiple_of keeps its default of 8.
collator = DataCollatorForCausalLM(tokenizer=tokenizer)


In [None]:
# --- 10) Attach LoRA adapter (QLoRA) ---
# Prepare the quantized base model for training before wrapping it.
model = prepare_model_for_kbit_training(model)

# Adapter settings come from the config cell and the detected target modules.
lora_kwargs = dict(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)
lora_cfg = LoraConfig(**lora_kwargs)

# Wrap the model with the adapter and report how many parameters are trainable.
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


In [None]:
# --- 11) Train ---
# Exactly one mixed-precision flag is on: bf16 when USE_BF16 is set, else fp16.
use_fp16 = not USE_BF16

training_args = TrainingArguments(
    output_dir=str(OUT_DIR),
    # optimisation
    optim="paged_adamw_8bit",
    learning_rate=float(LEARNING_RATE),
    lr_scheduler_type="cosine",
    warmup_ratio=float(WARMUP_RATIO),
    num_train_epochs=float(NUM_EPOCHS),
    # batching
    per_device_train_batch_size=int(BATCH_SIZE),
    per_device_eval_batch_size=int(BATCH_SIZE),
    gradient_accumulation_steps=int(GRAD_ACCUM),
    # precision
    fp16=use_fp16,
    bf16=USE_BF16,
    # logging, evaluation and checkpointing
    logging_steps=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    # the dataset already holds exactly the fields the custom collator reads
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# --- 12) Save adapter to Drive ---
adapter_dir = OUT_DIR / "adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)

# The trained model and the tokenizer are written side by side.
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

# Record the hyperparameters of this run next to the adapter.
lora_meta = dict(
    r=LORA_R,
    alpha=LORA_ALPHA,
    dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
)
train_meta = dict(
    epochs=NUM_EPOCHS,
    lr=LEARNING_RATE,
    batch_size=BATCH_SIZE,
    grad_accum=GRAD_ACCUM,
    warmup_ratio=WARMUP_RATIO,
)
meta = dict(model_name=MODEL_NAME, max_len=MAX_LEN, lora=lora_meta, train=train_meta)

meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
meta_path = adapter_dir / "meta.json"
meta_path.write_text(meta_json, encoding="utf-8")

print("saved adapter:", adapter_dir)


In [None]:
# --- 13) Quick sanity check on test set (small sample) ---
model.eval()


def generate_completion(prompt: str, max_new_tokens: int = 180) -> str:
    """Greedy-decode the model's continuation of ``prompt`` and return it as text.

    Args:
        prompt: Prompt text, in the same form as the dataset's ``prompt`` field.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens are sliced off), with
        special tokens removed and surrounding whitespace stripped.
    """
    # Tokenize the prompt the same way encode_example does for training
    # (add_special_tokens=False), so a tokenizer that adds BOS/other special
    # tokens by default does not create a train/inference mismatch.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            # Greedy decoding. Sampling knobs (temperature / top_p) are not
            # passed: they are unused without sampling, and 0.0 is not a valid
            # sampling temperature.
            do_sample=False,
            # model.config.use_cache was switched off for training; request the
            # KV cache explicitly so decoding does not recompute the whole
            # prefix at every step.
            use_cache=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Keep only the tokens produced after the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    gen = out[0][prompt_len:]
    return tokenizer.decode(gen, skip_special_tokens=True).strip()


def extract_json_obj(text: str) -> Any:
    """Parse the first JSON object embedded in ``text``.

    Decoding starts at the first ``{`` and stops at the end of that object, so
    any text before it and any trailing text (including further braces or a
    second object) is ignored. A greedy "first ``{`` to last ``}``" match
    would instead fail whenever the trailing text contains a brace.

    Args:
        text: Raw model output; ``None`` and empty strings are accepted.

    Returns:
        The decoded object (a ``dict``), or ``None`` when ``text`` contains no
        ``{`` or the text starting at the first ``{`` is not valid JSON.
    """
    s = (text or "").strip()
    start = s.find("{")
    if start < 0:
        return None
    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates whatever follows it.
        obj, _end = json.JSONDecoder().raw_decode(s, start)
    except (ValueError, RecursionError):
        # Malformed (json.JSONDecodeError is a ValueError) or pathologically
        # nested output counts as "no JSON".
        return None
    return obj


sample_n = 30
test_split = ds["test"]

# Evaluate at most sample_n rows. n_eval is the number actually evaluated, so
# the rates below stay correct when the test split has fewer rows than sample_n.
n_eval = min(sample_n, len(test_split))
# Dedicated seeded RNG: the same subset is picked on every run, independent of
# the global `random` state.
idxs = random.Random(42).sample(range(len(test_split)), n_eval)

ok_json = 0   # predictions that parsed to a JSON object
ok_usage = 0  # parsed predictions whose usage_key equals the gold usage_key

for j, i in enumerate(idxs):
    ex = test_split[i]
    pred_text = generate_completion(ex["prompt"])
    pred_obj = extract_json_obj(pred_text)
    gold = ex.get("target_prefs") or {}

    # extract_json_obj is typed Any; only a dict supports the .get lookup below.
    if isinstance(pred_obj, dict):
        ok_json += 1
        if pred_obj.get("usage_key") == gold.get("usage_key"):
            ok_usage += 1

    # Show the first few examples in full for eyeballing.
    if j < 3:
        print("---")
        print("input:", ex["input_text"])
        print("gold:", gold)
        print("pred_raw:", pred_text)
        print("pred_obj:", pred_obj)

print("\nmetrics (sample):")
if n_eval:
    print(f"- json_parse_rate: {ok_json}/{n_eval} = {ok_json/n_eval:.3f}")
    print(f"- usage_key_acc  : {ok_usage}/{n_eval} = {ok_usage/n_eval:.3f}")
else:
    print("- test split is empty; nothing to evaluate")
