# 01_dataset — Preference Extraction Dataset Builder

This notebook generates a **synthetic** dataset for training an LLM to convert Turkish free-text laptop requests into Laprop's structured `preferences` JSON.

**Outputs** (JSONL):
- `train.jsonl`
- `val.jsonl`
- `test.jsonl`

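Each row contains:
- `id` / `scenario_id`: row identifier (`<scenario_id>_vNN`) and the scenario the row was generated from
- `usage_key`: usage category copied from the target, for quick distribution checks
- `input_text`: synthetic user request (Turkish)
- `target_prefs`: structured preferences dict
- `target_json`: canonical JSON string of `target_prefs` (keys sorted)
- `prompt` / `completion`: a simple instruction-tuning pair (instruction + user text, answered by `target_json`)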

Notes:
- Text generation uses both ASCII and Turkish characters (via `\u....` escapes) to simulate real input.
- Targets are based on the existing scenario registry (`laprop.recommend.scenarios`).


In [None]:
# --- 0) Drive mount + paths (Colab) ---
# Colab-only setup: when running elsewhere, comment out the `google.colab`
# import and the `drive.mount(...)` call below.
from pathlib import Path

from google.colab import drive

drive.mount("/content/drive")

# TODO: change this to your own Drive project directory
# Example:
#   MyDrive/laprop/colab  (where you copied the repo's colab/ folder)
DRIVE_PROJECT_DIR = Path("/content/drive/MyDrive/laprop")
COLAB_DIR = DRIVE_PROJECT_DIR.joinpath("colab")

# Directory that receives the generated JSONL splits; created on first run.
OUT_DIR = COLAB_DIR.joinpath("data", "prefs_dataset_v1")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("COLAB_DIR:", COLAB_DIR)
print("OUT_DIR:", OUT_DIR)


In [None]:
# --- 1) Clone/pull the repo in the Colab runtime ---
from pathlib import Path

REPO_DIR = Path("/content/laprop-recommender")
REPO_URL = "https://github.com/ahmedberatAI/laprop-recommender.git"

if not REPO_DIR.exists():
    !git clone --depth 1 {REPO_URL} /content/laprop-recommender
else:
    !git -C /content/laprop-recommender pull

%cd /content/laprop-recommender


In [None]:
# --- 2) Install dependencies ---
# Colab best practice: use %pip (it installs into the current kernel environment).
# Step 1: install the requirements file shipped in the cloned repo's colab/ folder.
%pip install -q -r /content/laprop-recommender/colab/requirements_colab.txt
# Step 2: install the cloned repo itself in editable mode (-e); presumably this is
# what makes the `laprop` package importable in the imports cell below.
%pip install -q -e /content/laprop-recommender


In [None]:
# --- 3) Imports ---
import json
import random
import re
from collections import Counter
from typing import Any, Dict, List, Optional

from laprop.recommend.scenarios import SCENARIOS
from laprop.config.rules import GAMING_TITLE_SCORES
from laprop.app.nlp import normalize_and_complete_preferences, parse_free_text_to_preferences

# Report how many entries each imported registry provides.
for _label, _registry in (
    ("scenarios:", SCENARIOS),
    ("game_titles:", GAMING_TITLE_SCORES),
):
    print(_label, len(_registry))


In [None]:
# --- 4) Config ---
# Seed for the single random.Random instance created below.
SEED = 42
# Number of text variants rendered for every scenario in SCENARIOS.
VARIANTS_PER_SCENARIO = 25  # e.g. 100 scenarios * 25 = 2500 samples (assumes the registry holds 100 scenarios)
# Split fractions for train and val; the test split receives whatever remains.
TRAIN_FRAC = 0.90
VAL_FRAC = 0.05

# Shared generator passed to the text renderer and used to shuffle the rows,
# so a fixed SEED yields the same sequence of random choices.
rng = random.Random(SEED)


In [None]:
# --- 5) Helpers: canonical JSON + preference schema filtering ---

# Whitelist of preference fields that may appear in a training target.
# Anything outside this set is dropped by the cleaning helper.
ALLOWED_KEYS = {
    # budget range
    "max_budget",
    "min_budget",
    # usage category and its single-value detail fields
    "usage_key",
    "dev_mode",
    "productivity_profile",
    "screen_max",
    # gaming-specific fields
    "gaming_min_gpu",
    "gaming_titles",
    "min_gpu_score_required",
    # design-specific fields
    "design_gpu_hint",
    "design_min_ram_hint",
    "design_profiles",
}


def _clean_prefs(p: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a raw preference dict to the JSON-friendly training target.

    Steps, in order:
      1. keep only keys listed in ``ALLOWED_KEYS``;
      2. coerce values to JSON-friendly types (scalars and ``None`` pass
         through, lists/tuples become lists of strings, anything else is
         stringified);
      3. cast budgets that are set to ``int``;
      4. run ``normalize_and_complete_preferences`` as a best-effort step;
      5. filter by ``ALLOWED_KEYS`` again, since normalization may add keys.

    Args:
        p: Raw preferences; ``None`` or an empty dict yields an empty result
            before normalization.

    Returns:
        A new dict containing only allowed keys.

    Raises:
        ValueError: If a budget is a value that cannot be parsed as a number
            (e.g. a non-numeric string).
    """
    # Local import keeps this block self-contained without touching the
    # notebook's import cell.
    import warnings

    out: Dict[str, Any] = {}
    for k, v in (p or {}).items():
        if k not in ALLOWED_KEYS:
            continue
        # JSON-friendly
        if v is None or isinstance(v, (int, float, str, bool)):
            out[k] = v
        elif isinstance(v, (list, tuple)):
            # Tuples are treated like lists; str(tuple) would produce an
            # unusable "('a', 'b')" string in the target.
            out[k] = [str(x) for x in v]
        else:
            out[k] = str(v)

    # Budgets should be ints when present. None passes the filter above and
    # means "not set", so it is left untouched instead of crashing in float().
    for budget_key in ("min_budget", "max_budget"):
        if out.get(budget_key) is not None:
            out[budget_key] = int(float(out[budget_key]))

    # Normalize and fill derived fields where applicable. This stays
    # best-effort (the un-normalized prefs are kept on failure), but the
    # failure is surfaced instead of being silently swallowed.
    try:
        out = normalize_and_complete_preferences(out)
    except Exception as exc:
        warnings.warn(
            f"normalize_and_complete_preferences failed ({exc!r}); "
            "keeping un-normalized preferences",
            RuntimeWarning,
            stacklevel=2,
        )

    # Keep only allowed keys after normalization
    return {k: v for k, v in out.items() if k in ALLOWED_KEYS}


def _dumps_target(p: Dict[str, Any]) -> str:
    return json.dumps(p, ensure_ascii=False, sort_keys=True)


In [None]:
# --- 6) Text generation templates ---

def _tr_thousands(n: int) -> str:
    # 35000 -> "35.000"
    return f"{int(n):,}".replace(",", ".")


def _budget_phrase(min_b: int, max_b: int, rng: random.Random) -> str:
    """Render a budget range as one randomly chosen Turkish phrase.

    The "k"/"bin" shorthand can only express whole thousands. It is therefore
    offered only when both bounds are exact multiples of 1000; otherwise the
    text would state a rounded budget that contradicts the exact target label.
    For round budgets the candidate list, its order and the single
    ``rng.choice`` call are the same as before.

    Args:
        min_b: Lower budget bound in TL.
        max_b: Upper budget bound in TL.
        rng: Random generator used to pick the phrasing.

    Returns:
        A phrase such as "35-40k" or "35.500-40.000 TL".
    """
    min_b, max_b = int(min_b), int(max_b)
    exact = f"{_tr_thousands(min_b)}-{_tr_thousands(max_b)} TL"

    if min_b % 1000 or max_b % 1000:
        # Non-round budget: only phrasings that carry the exact amounts.
        return rng.choice(
            [
                exact,
                f"b\u00fct\u00e7e {exact}",
                f"{exact} aras\u0131 laptop",
            ]
        )

    kmin = min_b // 1000
    kmax = max_b // 1000
    opts = [
        f"{kmin}-{kmax}k",
        f"{kmin}k-{kmax}k TL",
        exact,
        f"{kmin} bin ile {kmax} bin aras\u0131",
        f"b\u00fct\u00e7e {kmin}-{kmax}k",
        f"{kmin}-{kmax}k aras\u0131 laptop",
    ]
    return rng.choice(opts)


# Phrase and keyword tables used by the text renderer.
# NOTE: entries are picked with rng.choice / rng.sample, so reordering a list
# changes which phrase a given seed selects.

# Generic phrases that signal each usage_key; one is chosen per rendered text.
USAGE_PHRASES = {
    "gaming": [
        "oyun i\u00e7in",
        "gaming",
        "oyun oynayaca\u011f\u0131m",
        "y\u00fcksek ayar oyun",
    ],
    "portability": [
        "ta\u015f\u0131nabilir olsun",
        "hafif ve ince",
        "pil \u00f6nemli",
        "ultrabook tarz\u0131",
    ],
    "productivity": [
        "ofis i\u015fleri",
        "excel / rapor / sunum",
        "\u00fcretkenlik",
        "g\u00fcnl\u00fck kullan\u0131m",
    ],
    "design": [
        "tasar\u0131m i\u015fleri",
        "grafik / video",
        "render / 3d",
        "tasar\u0131m odakl\u0131",
    ],
    "dev": [
        "yaz\u0131l\u0131m geli\u015ftirme",
        "coding i\u015fi",
        "programlama",
        "dev laptop",
    ],
}

# Application names mentioned in the text for each entry of `design_profiles`.
DESIGN_APPS = {
    "graphic": ["photoshop", "illustrator", "figma"],
    "video": ["premiere", "after effects", "davinci resolve"],
    "3d": ["blender", "maya", "3ds max"],
    "cad": ["autocad", "revit", "solidworks"],
}

# Keywords hinting at each `dev_mode`; the "general" list is also the fallback
# the renderer uses for an unknown or missing dev_mode.
DEV_MODE_KWS = {
    "web": ["web", "backend", "api", "django", "flask", "node", "react"],
    "ml": ["ml", "ai", "yapay zeka", "pytorch", "tensorflow", "cuda"],
    "mobile": ["android", "ios", "xcode", "swift", "kotlin"],
    "gamedev": ["unity", "unreal", "oyun motoru", "3d engine"],
    "general": ["cs", "okul", "ders", "coding"],
}

# Keywords hinting at each `productivity_profile`.
PROD_PROFILE_KWS = {
    "office": ["office", "dok\u00fcman", "sunum", "word"],
    "data": ["excel", "analiz", "rapor", "tablo"],
    "multitask": ["\u00e7oklu g\u00f6rev", "\u00e7ok pencere", "multitask"],
    "light_dev": ["hafif yaz\u0131l\u0131m", "script", "scripting"],
}


def _join_items(items: List[str], rng: random.Random) -> str:
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    sep = rng.choice([", ", " + ", " ve "])
    return sep.join(items)


def _gaming_details(prefs: Dict[str, Any], rng: random.Random) -> List[str]:
    """Gaming: list the requested titles, sometimes followed by a performance hint."""
    details: List[str] = []
    titles = list(prefs.get("gaming_titles") or [])
    if titles:
        details.append(_join_items(titles, rng))
    # sometimes add a generic performance hint
    if rng.random() < 0.30:
        details.append(rng.choice(["ak\u0131c\u0131 olsun", "y\u00fcksek fps", "\u0131s\u0131nmas\u0131n"]))
    return details


def _portability_details(prefs: Dict[str, Any], rng: random.Random) -> List[str]:
    """Portability: usually state the screen-size cap, sometimes a carry/travel remark."""
    details: List[str] = []
    smax = prefs.get("screen_max")
    if smax is not None and rng.random() < 0.85:
        try:
            smax_f = float(smax)
        except (TypeError, ValueError):
            # Unparseable size: skip the phrase without drawing from rng.
            pass
        else:
            details.append(rng.choice([f"{smax_f:g} in\u00e7", f"ekran {smax_f:g} in\u00e7 \u00fcst\u00fc olmas\u0131n"]))
    if rng.random() < 0.30:
        details.append(rng.choice(["\u00e7antada ta\u015f\u0131yaca\u011f\u0131m", "okul i\u00e7in", "seyahat"]))
    return details


def _productivity_details(prefs: Dict[str, Any], rng: random.Random) -> List[str]:
    """Productivity: one keyword for the profile, sometimes a durability/noise remark."""
    details: List[str] = []
    prof = prefs.get("productivity_profile")
    kws = PROD_PROFILE_KWS.get(str(prof), [])
    if kws:
        details.append(rng.choice(kws))
    if rng.random() < 0.25:
        details.append(rng.choice(["uzun \u00f6m\u00fcrl\u00fc olsun", "sessiz olsun"]))
    return details


def _design_details(prefs: Dict[str, Any], rng: random.Random) -> List[str]:
    """Design: name 1-3 apps matching the design profiles, usually plus the RAM floor."""
    details: List[str] = []
    profs = list(prefs.get("design_profiles") or [])
    if profs:
        # mention relevant apps (order-preserving de-duplication)
        apps: List[str] = []
        for prof in profs:
            apps += DESIGN_APPS.get(str(prof), [])
        apps = list(dict.fromkeys(apps))
        if apps:
            how_many = min(len(apps), rng.randint(1, 3))
            details.append(_join_items(rng.sample(apps, k=how_many), rng))
    min_ram = prefs.get("design_min_ram_hint")
    if min_ram is not None and rng.random() < 0.85:
        # Guarded like the screen_max conversion: an unparseable hint drops the
        # phrase instead of aborting the whole dataset build.
        try:
            ram_gb = int(float(min_ram))
        except (TypeError, ValueError, OverflowError):
            pass
        else:
            details.append(f"en az {ram_gb}gb ram")
    return details


def _dev_details(prefs: Dict[str, Any], rng: random.Random) -> List[str]:
    """Dev: one keyword for the dev mode; ml/gamedev usually add a GPU requirement."""
    details: List[str] = []
    dev_mode = str(prefs.get("dev_mode") or "general")
    kws = DEV_MODE_KWS.get(dev_mode, DEV_MODE_KWS["general"])
    details.append(rng.choice(kws))
    if dev_mode in ("ml", "gamedev") and rng.random() < 0.80:
        details.append(rng.choice(["cuda \u015fart", "nvidia olsun", "gpu gerekli"]))
    return details


# usage_key -> helper producing the usage-specific text fragments.
_USAGE_DETAIL_BUILDERS = {
    "gaming": _gaming_details,
    "portability": _portability_details,
    "productivity": _productivity_details,
    "design": _design_details,
    "dev": _dev_details,
}


def render_text(prefs: Dict[str, Any], rng: random.Random) -> str:
    """Render one synthetic Turkish laptop request for *prefs*.

    The text is assembled from a budget phrase, a generic usage phrase and
    usage-specific details; the fragments are shuffled, optionally prefixed
    with a wrapper phrase and occasionally followed by a short filler word.

    Random draws happen in a fixed order (budget, usage phrase, details,
    prefix, shuffle, noise), so the output is determined by the state of
    *rng* on entry.

    Args:
        prefs: Cleaned preference dict (the training target).
        rng: Random generator driving every choice.

    Returns:
        The rendered request. It may be empty when *prefs* has neither a
        budget nor a known ``usage_key`` and no prefix/noise is drawn.
    """
    usage_key = prefs.get("usage_key")
    min_b = int(prefs.get("min_budget", 0) or 0)
    max_b = int(prefs.get("max_budget", 0) or 0)

    parts: List[str] = []

    if min_b and max_b:
        parts.append(_budget_phrase(min_b, max_b, rng))
    elif max_b:
        # Only an upper bound is set: still mention it, otherwise the target's
        # max_budget could not be recovered from the text.
        parts.append(f"en fazla {_tr_thousands(max_b)} TL")
    elif min_b:
        # Only a lower bound is set.
        parts.append(f"en az {_tr_thousands(min_b)} TL")

    if usage_key in USAGE_PHRASES:
        parts.append(rng.choice(USAGE_PHRASES[usage_key]))

    # Usage-specific details
    build_details = _USAGE_DETAIL_BUILDERS.get(usage_key)
    if build_details is not None:
        parts.extend(build_details(prefs, rng))

    # Generic wrapper text
    prefix = rng.choice([
        "laptop \u00f6ner",
        "bana laptop laz\u0131m",
        "notebook bak\u0131yorum",
        "laptop tavsiye",
        "",
    ]).strip()

    # Randomize order a bit
    rng.shuffle(parts)
    body = ", ".join(p for p in parts if p)
    # strip(": ") removes the dangling separator when the body is empty.
    text = (prefix + ": " + body).strip(": ") if prefix else body

    # Small noise
    if rng.random() < 0.15:
        text += rng.choice([" l\u00fctfen", " acil", " ya"])

    return text


In [None]:
# --- 7) Build dataset rows ---

# Instruction (Turkish) prepended to every prompt: convert the user's request
# into the preferences JSON and return JSON only.
SYSTEM_INSTRUCTION = (
    "Kullan\u0131c\u0131n\u0131n laptop iste\u011fini Laprop tercih format\u0131na (preferences JSON) \u00e7evir. "
    "Sadece JSON d\u00f6nd\u00fcr. "
    "Ana alanlar: min_budget, max_budget, usage_key ve amaca g\u00f6re opsiyonel alanlar (gaming_titles, dev_mode, design_profiles, productivity_profile, screen_max...)."
)

rows: List[Dict[str, Any]] = []
for sc in SCENARIOS:
    sid = sc.get("scenario_id")
    base = _clean_prefs(dict(sc.get("prefs") or {}))
    # Every variant of a scenario shares the same target, so serialize it once
    # per scenario instead of once per variant.
    target_json = _dumps_target(base)

    for i in range(int(VARIANTS_PER_SCENARIO)):
        # Only the input text varies between variants.
        text = render_text(base, rng)
        target = dict(base)

        prompt = f"{SYSTEM_INSTRUCTION}\nKullan\u0131c\u0131: {text}\nJSON:"

        rows.append(
            {
                "id": f"{sid}_v{i:02d}",
                "scenario_id": sid,
                "usage_key": target.get("usage_key"),
                "input_text": text,
                "target_prefs": target,
                "target_json": target_json,
                "prompt": prompt,
                "completion": target_json,
            }
        )

rng.shuffle(rows)
print("rows:", len(rows))
print("usage distribution:", Counter(r.get("usage_key") for r in rows))
# Notebook preview of the first row; guarded so an empty scenario registry
# does not raise IndexError.
rows[0] if rows else None


In [None]:
# --- 8) Train/val/test split + write JSONL ---

def write_jsonl(path: Path, items: List[Dict[str, Any]]) -> None:
    """Write *items* to *path* as UTF-8 JSON Lines, one object per line.

    Missing parent directories are created and an existing file is
    overwritten. Non-ASCII characters are written as-is rather than escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        # Lazy generator: records are serialized and written one at a time.
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )


n = len(rows)

# Split by scenario, not by row. Every variant of a scenario carries the same
# target JSON (and may even repeat the same text), so a row-level split would
# place already-seen targets in val/test and inflate evaluation scores.
# Scenario order follows first appearance in the already-shuffled rows, so the
# assignment is random but reproducible for a fixed seed.
_scenario_ids = list(dict.fromkeys(r.get("scenario_id") for r in rows))

if None in _scenario_ids:
    # Rows without a scenario id cannot be grouped: keep the plain row-level split.
    print("warning: rows without scenario_id found; falling back to a row-level split")
    _n_train_rows = int(n * float(TRAIN_FRAC))
    _n_val_rows = int(n * float(VAL_FRAC))
    train_rows = rows[:_n_train_rows]
    val_rows = rows[_n_train_rows : _n_train_rows + _n_val_rows]
    test_rows = rows[_n_train_rows + _n_val_rows :]
else:
    _n_train_groups = int(len(_scenario_ids) * float(TRAIN_FRAC))
    _n_val_groups = int(len(_scenario_ids) * float(VAL_FRAC))
    _train_ids = set(_scenario_ids[:_n_train_groups])
    _val_ids = set(_scenario_ids[_n_train_groups : _n_train_groups + _n_val_groups])
    _seen_ids = _train_ids | _val_ids
    train_rows = [r for r in rows if r.get("scenario_id") in _train_ids]
    val_rows = [r for r in rows if r.get("scenario_id") in _val_ids]
    # Test receives every remaining scenario.
    test_rows = [r for r in rows if r.get("scenario_id") not in _seen_ids]

# Kept for any later cell that reads the split sizes.
n_train = len(train_rows)
n_val = len(val_rows)

print("split:", len(train_rows), len(val_rows), len(test_rows))

train_path = OUT_DIR / "train.jsonl"
val_path = OUT_DIR / "val.jsonl"
test_path = OUT_DIR / "test.jsonl"

write_jsonl(train_path, train_rows)
write_jsonl(val_path, val_rows)
write_jsonl(test_path, test_rows)

print("wrote:")
print("-", train_path)
print("-", val_path)
print("-", test_path)


In [None]:
# --- 9) Quick sanity check: can the rule-based parser recover the basics? (optional) ---

def _get(p: Dict[str, Any], key: str):
    v = p.get(key)
    if isinstance(v, list):
        return tuple(v)
    return v


KEYS_TO_CHECK = ["usage_key", "dev_mode", "productivity_profile"]

sample_eval = rows[: min(300, len(rows))]
# Matches over all sampled rows. A key missing on both sides counts as a match.
ok = {k: 0 for k in KEYS_TO_CHECK}
# Same counts restricted to rows whose gold target actually sets the key, so
# rows where the key does not apply cannot inflate the rate.
applicable = {k: 0 for k in KEYS_TO_CHECK}
ok_applicable = {k: 0 for k in KEYS_TO_CHECK}

for r in sample_eval:
    # `or {}`: defensive in case the parser returns None for text it cannot
    # handle (not verified here).
    pred = parse_free_text_to_preferences(r["input_text"]) or {}
    gold = r["target_prefs"]
    for k in KEYS_TO_CHECK:
        matched = _get(pred, k) == _get(gold, k)
        if matched:
            ok[k] += 1
        if gold.get(k) is not None:
            applicable[k] += 1
            if matched:
                ok_applicable[k] += 1

if not sample_eval:
    # No rows: avoid the division by zero and the sample_eval[0] lookup.
    print("rule-based match rates: skipped (no rows were generated)")
else:
    print("rule-based match rates (subset):")
    for k in KEYS_TO_CHECK:
        line = f"- {k}: {ok[k]}/{len(sample_eval)} = {ok[k]/len(sample_eval):.3f}"
        if applicable[k]:
            line += (
                f" | where gold sets it: {ok_applicable[k]}/{applicable[k]}"
                f" = {ok_applicable[k]/applicable[k]:.3f}"
            )
        print(line)

    print("\nexample:")
    ex = sample_eval[0]
    print("text:", ex["input_text"])
    print("gold:", ex["target_prefs"])
    print("pred:", parse_free_text_to_preferences(ex["input_text"]))
