# In [None]:
"""
This script mirrors the /pca-2d-embeddings data preparation flow but is completely
self-contained. It:
1. Loads QP metadata directly from data/qp-listings.csv
2. Builds a descriptive text blob for every job
3. Invokes an embedding provider (Gemini by default or a local SentenceTransformer)
4. Stores each embedding as <QP_ID>.npy under the requested output directory

Usage (from repo root):
    python standalone_scripts/generate_embeddings.py --output-dir data/embeddings

Set the GEMINI_API_KEY environment variable if you use the default Gemini backend,
or pass --local-model all-MiniLM-L6-v2 to run with sentence-transformers instead.
"""
from __future__ import annotations

import argparse
import json
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Iterable, List, Optional

import numpy as np
import pandas as pd

try:
    import google.generativeai as genai  # type: ignore
except ImportError:  # pragma: no cover - optional dependency
    genai = None


# ------------------------------------------------------------------------------------
# Data models and loaders (lifted from src.qps.utils but embedded for standalone use)
# ------------------------------------------------------------------------------------


@dataclass
class SubSector:
    """A sub-sector entry nested inside a ``Sector``."""

    # Sub-sector identifier; ``_parse_sector`` stores it as a string, or None
    # when the source value is missing/falsy.
    id: Optional[str]
    # Sub-sector name taken from the "subSectorName" key; may be None.
    name: Optional[str]


@dataclass
class Sector:
    """Sector a QP belongs to, together with its optional sub-sectors."""

    # Sector identifier; ``_parse_sector`` stores it as a string, or None when
    # the source value is missing/falsy.
    id: Optional[str]
    # Sector name taken from the "sectorName" key; may be None.
    name: Optional[str]
    # Sub-sectors of this sector; every instance gets its own fresh list.
    sub_sectors: List[SubSector] = field(default_factory=list)


@dataclass
class Occupation:
    """Occupation metadata attached to a QP."""

    # Value of the "occupationID" key; ``_parse_occupation`` passes it through
    # unchanged (it is not converted to a string there).
    id: Optional[str]
    # Value of the "occupationCode" key; may be None.
    code: Optional[str]
    # Value of the "occupationDesc" key; may be None.
    description: Optional[str]


@dataclass
class QP:
    """One QP record as assembled by ``load_qps`` from a CSV row."""

    # Raw "qpCode" column value; may contain "/" characters.
    code: str
    # "version" column value, or None when the CSV has no version for the row.
    version: Optional[float]
    # "jobRole" column value.
    job_role: str
    # "jobRoleDesc" column value.
    job_role_description: str
    # Parsed "sectors" column.
    sector: Sector
    # Parsed "occupation" column.
    occupation: Occupation
    # Flag derived from the "qpParamOne" column.
    technical: bool
    # "paramDesc" of the "qpParamTwo" column, if any.
    economic_sector: Optional[str]
    # "paramDesc" of the "qpParamThree" column, if any.
    economic_sector_type: Optional[str]

    @property
    def _id(self) -> str:
        """Return the identifier used to name this QP's output file.

        With a version, the id is ``<code with "/" replaced by "_">_<version>``.
        Without a version, the code is returned exactly as stored (slashes
        are left untouched in that case).
        """
        if self.version is None:
            return self.code
        flattened_code = self.code.replace("/", "_")
        return f"{flattened_code}_{self.version}"


def _ensure_dict(value) -> Optional[dict]:
    """Coerce *value* into a dict, or return None when that is not possible.

    Args:
        value: An existing dict (returned as-is), a JSON document (str, bytes
            or bytearray), or anything else.

    Returns:
        The dict itself, the decoded JSON object when the document's top-level
        value is an object, otherwise None (missing input, empty string,
        unparsable text, unsupported input type, or non-object JSON).
    """
    if isinstance(value, dict):
        return value
    if value is None or value == "":
        return None
    try:
        parsed = json.loads(value)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes/bytearray.
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return None
    # Valid JSON that is not an object (list, number, string, ...) is rejected.
    return parsed if isinstance(parsed, dict) else None


def _parse_sector(raw: Optional[str]) -> Sector:
    """Build a ``Sector`` from the raw "sectors" cell of a CSV row.

    Args:
        raw: JSON text (or an already-decoded dict) expected to carry the keys
            "sectorID", "sectorName" and "subSectors". Anything that
            ``_ensure_dict`` cannot turn into a dict yields an empty Sector.

    Returns:
        A Sector whose ids are stringified; missing or falsy ids become None.
        Sub-sector entries that are not JSON objects are skipped rather than
        aborting the whole load.
    """
    data = _ensure_dict(raw) or {}

    entries = data.get("subSectors")
    if not isinstance(entries, (list, tuple)):
        # Missing key, null, or an unexpected shape: treat as "no sub-sectors".
        entries = []

    sub_sectors = []
    for entry in entries:
        if not isinstance(entry, dict):
            # e.g. a null or bare string inside the array; nothing to read.
            continue
        sub_id = entry.get("subSectorID")
        sub_sectors.append(
            SubSector(
                id=str(sub_id) if sub_id else None,
                name=entry.get("subSectorName"),
            )
        )

    sector_id = data.get("sectorID")
    return Sector(
        id=str(sector_id) if sector_id else None,
        name=data.get("sectorName"),
        sub_sectors=sub_sectors,
    )


def _parse_occupation(raw: Optional[str]) -> Occupation:
    """Build an ``Occupation`` from the raw "occupation" cell of a CSV row.

    Values are copied from the keys "occupationID", "occupationCode" and
    "occupationDesc" without conversion; absent keys (or input that cannot be
    decoded into a dict) give None for the corresponding field.
    """
    payload = _ensure_dict(raw)
    if not payload:
        payload = {}

    occupation_id = payload.get("occupationID")
    occupation_code = payload.get("occupationCode")
    occupation_desc = payload.get("occupationDesc")
    return Occupation(
        id=occupation_id,
        code=occupation_code,
        description=occupation_desc,
    )


def _parse_param_desc(raw: Optional[str]) -> Optional[str]:
    """Return the "paramDesc" value of a qpParam cell, or None.

    None is returned when the cell cannot be decoded into a dict, decodes to
    an empty dict, or has no "paramDesc" key.
    """
    parsed = _ensure_dict(raw)
    return parsed.get("paramDesc") if parsed else None


def _parse_technical(raw: Optional[str]) -> bool:
    """Return True when the cell's "paramDesc" is "technical".

    The comparison ignores surrounding whitespace and letter case; a missing
    or empty description counts as not technical.
    """
    description = _parse_param_desc(raw)
    if not description:
        return False
    normalized = description.strip().lower()
    return normalized == "technical"


def _load_qp_records(csv_path: Path) -> list[dict]:
    """Read the QP listing CSV and return one dict per retained row.

    Rows are ordered by "version" descending and de-duplicated on "qpCode",
    so the first (highest-version) row of each code is kept. Missing values
    are replaced with None, and rows whose "matched_filename" is missing or
    empty are dropped afterwards.
    """
    frame = (
        pd.read_csv(csv_path)
        .sort_values(by="version", ascending=False)
        .drop_duplicates(subset=["qpCode"])
        .replace({pd.NA: None, np.nan: None})
    )

    filenames = frame["matched_filename"]
    has_filename = filenames.notna() & (filenames != "")
    retained = frame[has_filename]

    records = []
    for _, row in retained.iterrows():
        records.append(row.to_dict())
    return records


def load_qps(qp_csv: Path = Path("data/qp-listings.csv")) -> list[QP]:
    """Load the QP listing CSV and convert every retained row into a ``QP``.

    Args:
        qp_csv: Location of the listing CSV.

    Returns:
        QP objects in the order produced by ``_load_qp_records``.
    """
    return [
        QP(
            code=record.get("qpCode"),
            version=record.get("version"),
            job_role=record.get("jobRole"),
            job_role_description=record.get("jobRoleDesc"),
            sector=_parse_sector(record.get("sectors")),
            occupation=_parse_occupation(record.get("occupation")),
            technical=_parse_technical(record.get("qpParamOne")),
            economic_sector=_parse_param_desc(record.get("qpParamTwo")),
            economic_sector_type=_parse_param_desc(record.get("qpParamThree")),
        )
        for record in _load_qp_records(qp_csv)
    ]


def _sanitize_qp_id(qp_id: str) -> str:
    """Return *qp_id* with every character outside ``A-Z a-z 0-9 . _ -`` replaced by ``_``."""
    unsafe_character = re.compile(r"[^A-Za-z0-9._-]")
    return unsafe_character.sub("_", qp_id)


def _format_embedding_text(qp: QP) -> str:
    """Serialize a QP into a descriptive prompt for the embedding model.

    Args:
        qp: The record to describe. Only attribute access is used
            (job_role, job_role_description, sector, occupation, technical,
            economic_sector, economic_sector_type).

    Returns:
        A newline-terminated, multi-line text block. Missing sector name,
        occupation description, economic sector and economic sector type are
        rendered as "Unknown". The "Sub-Sectors" line is emitted only when at
        least one sub-sector has a name.
    """
    sector = qp.sector

    sub_sectors = ""
    if sector and sector.sub_sectors:
        # Sub-sector names are optional; str.join() raises TypeError on None,
        # so unnamed entries are left out instead of aborting the run.
        names = ", ".join(sub.name for sub in sector.sub_sectors if sub.name)
        if names:
            sub_sectors = f"\n    Sub-Sectors: {names}"

    # Fall back to "Unknown" both when the object is absent and when the
    # attribute itself is None, so the literal text "None" is never embedded.
    sector_name = (sector.name if sector else None) or "Unknown"
    occupation = (qp.occupation.description if qp.occupation else None) or "Unknown"
    econ_sector = qp.economic_sector or "Unknown"
    econ_type = qp.economic_sector_type or "Unknown"
    technical = "Technical" if qp.technical else "Non-Technical"

    return (
        f"Job Title: {qp.job_role}\n"
        f"Job Description: {qp.job_role_description}\n"
        f"Sector: {sector_name}{sub_sectors}\n"
        f"Occupation: {occupation}\n"
        f"Technical/Non-Technical: {technical}\n"
        f"Economic Sector: {econ_sector}\n"
        f"Type (Organized/Unorganized): {econ_type}\n"
    )


def _build_gemini_generator(model_name: str) -> Callable[[str], Iterable[float]]:
    """Return a callable that embeds text with the given Gemini model.

    Args:
        model_name: Model identifier forwarded to ``genai.embed_content``.

    Returns:
        A function mapping a text string to the "embedding" entry of the
        API result.

    Raises:
        RuntimeError: If the google-generativeai package could not be
            imported, or GEMINI_API_KEY is unset or empty.
    """
    if genai is None:
        raise RuntimeError("google-generativeai package is required for Gemini embeddings.")

    key = os.getenv("GEMINI_API_KEY")
    if not key:
        raise RuntimeError("Set the GEMINI_API_KEY environment variable to use Gemini embeddings.")

    genai.configure(api_key=key)

    def _embed(text: str) -> Iterable[float]:
        response = genai.embed_content(
            model=model_name,
            content=text,
            task_type="semantic_similarity",
        )
        return response["embedding"]

    return _embed


def _build_local_generator(model_name: str) -> Callable[[str], Iterable[float]]:
    """Return a callable that embeds text with a local SentenceTransformer.

    torch and sentence-transformers are imported here, so they are only
    required when this backend is selected. The model is moved to CUDA when
    ``torch.cuda.is_available()`` reports a device, otherwise to the CPU.

    Args:
        model_name: Name passed to the ``SentenceTransformer`` constructor.

    Returns:
        A function mapping a text string to the encoded vector as a list.
    """
    import torch  # Local model path requires torch + sentence-transformers
    from sentence_transformers import SentenceTransformer

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    target_device = torch.device(device_name)
    encoder = SentenceTransformer(model_name).to(target_device)

    def _embed(text: str) -> Iterable[float]:
        vector = encoder.encode(text)
        return vector.tolist()

    return _embed


def generate_embeddings(
    output_dir: Path,
    provider: Callable[[str], Iterable[float]],
    limit: int | None = None,
    force: bool = False,
) -> None:
    """Iterate over all QPs and persist embeddings to disk.

    Args:
        output_dir: Directory that receives one ``<sanitized QP id>.npy`` file
            per QP; created (with parents) when missing.
        provider: Callable turning the descriptive text of a QP into an
            iterable of floats.
        limit: Process at most this many QPs, taken from the start of the
            loaded list. None means all; zero or negative values process none.
        force: Recompute and overwrite embeddings whose file already exists.

    Each vector is stored as a float32 array. The file is first written under
    a temporary name and then renamed, so an interrupted run does not leave a
    truncated ``.npy`` that later runs would skip as "already exists".
    """
    qps = load_qps()
    # Clamp at zero: a negative limit would otherwise slice from the end of
    # the list and show a negative total in the progress output.
    total = len(qps) if limit is None else max(0, min(limit, len(qps)))
    output_dir.mkdir(parents=True, exist_ok=True)

    for idx, qp in enumerate(qps[:total], start=1):
        safe_id = _sanitize_qp_id(qp._id)
        out_path = output_dir / f"{safe_id}.npy"
        if out_path.exists() and not force:
            print(f"[{idx}/{total}] Skipping {qp._id} (already exists)")
            continue

        text = _format_embedding_text(qp)
        # list() materializes generators; np.asarray would not iterate them.
        embedding = np.asarray(list(provider(text)), dtype=np.float32)

        # Write through an open handle so numpy does not append ".npy" to the
        # temporary name, then move the finished file into place.
        tmp_path = out_path.with_name(f"{out_path.name}.tmp")
        with open(tmp_path, "wb") as handle:
            np.save(handle, embedding)
        tmp_path.replace(out_path)
        print(f"[{idx}/{total}] Saved embedding for {qp._id} -> {out_path}")


def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script.

    Returns:
        Namespace with ``output_dir`` (Path), ``model`` (str),
        ``local_model`` (str or None), ``limit`` (int or None) and
        ``force`` (bool).
    """
    cli = argparse.ArgumentParser(description="Generate QP embeddings.")
    add_option = cli.add_argument

    add_option(
        "--output-dir",
        type=Path,
        default=Path("data/embeddings"),
        help="Directory where <QP_ID>.npy files will be written.",
    )
    add_option(
        "--model",
        default="models/embedding-001",
        help="Gemini embedding model name.",
    )
    add_option(
        "--local-model",
        help="If provided, use the given sentence-transformers model instead of Gemini.",
    )
    add_option(
        "--limit",
        type=int,
        help="Stop after processing N QPs (useful for smoke testing).",
    )
    add_option(
        "--force",
        action="store_true",
        help="Recompute embeddings even when the .npy file already exists.",
    )

    parsed = cli.parse_args()
    return parsed


def main() -> None:
    """Entry point: choose the embedding backend and generate all embeddings.

    The local sentence-transformers backend is used when ``--local-model`` is
    given; otherwise the Gemini backend is built from ``--model``.
    """
    options = _parse_args()

    if not options.local_model:
        provider = _build_gemini_generator(options.model)
        print(f"Using Gemini model: {options.model}")
    else:
        provider = _build_local_generator(options.local_model)
        print(f"Using local sentence-transformers model: {options.local_model}")

    generate_embeddings(
        output_dir=options.output_dir,
        provider=provider,
        limit=options.limit,
        force=options.force,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
