# 02 — RAGAS Evaluation of Text Retrieval

This notebook stubs RAGAS-based evaluation for Picosearch's text retrieval components.

## Metrics tracked
| Metric | What it measures |
|---|---|
| `context_precision` | Are retrieved docs actually relevant? |
| `context_recall` | Were all relevant docs retrieved? |
| `faithfulness` | Does the answer stick to retrieved context? |
| `answer_relevancy` | Does the answer address the question? |

## Evaluation dataset
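Golden Q&A pairs mapping search queries → expected asset IDs, each with a short free-text description of the expected results (passed to RAGAS as `ground_truth`).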


In [None]:
# Dependencies
# %pip install ragas datasets langchain-anthropic python-dotenv httpx pandas

In [None]:
import os
from dotenv import load_dotenv

# Load variables from the backend's .env file into the process environment.
load_dotenv("../backend/.env")

# Default to "" when the variable is missing so the check below is a plain
# truthiness test.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Report only whether a key is present; the key itself is never printed.
print("Anthropic key loaded:", bool(ANTHROPIC_API_KEY))

In [None]:
# ── STUB: Golden evaluation dataset ─────────────────────────────────────────
# Each golden example is written as (query, expected asset IDs, description of
# the expected results) and expanded into the dict shape used by later cells.
GOLDEN_DATASET = [
    {
        "question": query,
        "ground_truth_ids": expected_ids,
        "ground_truth": expected_description,
    }
    for query, expected_ids, expected_description in (
        (
            "team brainstorming session",
            ["asset-003", "asset-007"],
            "Images showing collaborative ideation with sticky notes or whiteboards",
        ),
        (
            "minimalist workspace for remote work",
            ["asset-004"],
            "Clean desk setup with laptop and minimal props",
        ),
        (
            "diverse marketing professionals",
            ["asset-001", "asset-005"],
            "Photos of mixed-background teams in professional settings",
        ),
    )
]

print(f"[STUB] {len(GOLDEN_DATASET)} golden examples loaded")

In [None]:
# ── STUB: Run retrieval and collect contexts ──────────────────────────────────
import httpx

API_URL = "http://localhost:8000"

def retrieve(query: str, mode: str = "hybrid") -> list[dict]:
    """Return search hits for ``query`` (currently a hard-coded stub).

    Args:
        query: Search text. Ignored while the stub is active.
        mode: Retrieval mode that the commented-out live call forwards to the
            API. Ignored while the stub is active.

    Returns:
        A fresh single-element list holding one canned hit with ``id``,
        ``description`` and ``score`` keys.
    """
    # STUB: replace the canned hit below with the live call once the API is running
    # resp = httpx.post(f"{API_URL}/search", json={"query": query, "mode": mode})
    # return resp.json()["results"]
    canned_hit = {"id": "asset-003", "description": "Team brainstorm", "score": 0.85}
    return [canned_hit]

def _build_eval_row(example: dict) -> dict:
    """Run retrieval for one golden example and shape the result as an eval row.

    The retrieved descriptions become the row's contexts. The first context
    doubles as the "answer" stub, or "" when nothing was retrieved.
    """
    hits = retrieve(example["question"])
    hit_descriptions = [hit["description"] for hit in hits]
    return {
        "question": example["question"],
        "contexts": hit_descriptions,
        "ground_truth": example["ground_truth"],
        "answer": hit_descriptions[0] if hit_descriptions else "",  # use top result as "answer" stub
    }

# One eval row per golden example, in dataset order.
eval_rows = [_build_eval_row(example) for example in GOLDEN_DATASET]

print(f"[STUB] Collected {len(eval_rows)} eval rows")

In [None]:
# ── STUB: RAGAS evaluation ───────────────────────────────────────────────────
# The evaluation below is intentionally commented out. When enabled, it wraps
# `eval_rows` in a `datasets.Dataset` and scores it on the four tracked metrics,
# with a Claude model passed to `evaluate()` as the LLM.
#
# NOTE(review): answer_relevancy also relies on an embeddings model, and ragas
# appears to fall back to OpenAI embeddings unless `embeddings=` is passed to
# evaluate() — confirm against the installed ragas version before running.
#
# from datasets import Dataset
# from ragas import evaluate
# from ragas.metrics import context_precision, context_recall, faithfulness, answer_relevancy
# from langchain_anthropic import ChatAnthropic
#
# llm = ChatAnthropic(model="claude-haiku-4-5-20251001")
#
# ds = Dataset.from_list(eval_rows)
# result = evaluate(
#     ds,
#     metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
#     llm=llm,
# )
# result.to_pandas()

# Placeholder output so the cell visibly runs while the evaluation is disabled.
print("[STUB] RAGAS evaluation — uncomment and run with a live API + real golden data")

In [None]:
# ── STUB: Score summary ───────────────────────────────────────────────────────
# Paste result.to_pandas() output here after a real run
import pandas as pd

# One empty slot per tracked RAGAS metric; every value stays None until a real
# run supplies scores.
PLACEHOLDER_SCORES = dict.fromkeys(
    ("context_precision", "context_recall", "faithfulness", "answer_relevancy")
)

# Single-row frame, left as the cell's last expression so the notebook renders it.
pd.DataFrame([PLACEHOLDER_SCORES])