# GraphRAG MVP Evaluation

This notebook evaluates two retrieval routes on the built index under `data/index` (set the `INDEX_DIR` environment variable to use a different location):

- **vector_only**: vector retrieval only (graph expansion disabled)
- **hybrid**: vector + graph paths + fusion (default)

It records latency, evidence validity, and a small Hit@k score, then produces a few matplotlib plots.

Prerequisite:

```bash
python -m ingestion.build_index --data_dir data/sources --out_dir data/index
```

Tip:
- If you have `OPENAI_API_KEY` set and want a purely offline evaluation, unset it before running this notebook.



In [None]:
from __future__ import annotations

import os
from pathlib import Path

# Location of the built index; the INDEX_DIR environment variable overrides the default.
INDEX_DIR = Path(os.environ.get("INDEX_DIR", "data/index"))
# setdefault only fills in variables that are not set yet, so explicit overrides are kept.
os.environ.setdefault("INDEX_DIR", str(INDEX_DIR))
os.environ.setdefault("CHROMA_DIR", str(INDEX_DIR / "chroma"))
os.environ.setdefault("GRAPH_PATH", str(INDEX_DIR / "graph.json.gz"))

# Prefer offline behavior by default (no model downloads). You can override via env.
os.environ.setdefault("ALLOW_MODEL_DOWNLOAD", "0")

# Artifacts that must exist under INDEX_DIR before the evaluation can run.
required_files = [
    INDEX_DIR / "chunks.jsonl",
    INDEX_DIR / "graph.json.gz",
    INDEX_DIR / "manifest.json",
]
required_dirs = [INDEX_DIR / "chroma"]

missing = [p for p in required_files if not p.exists()] + [p for p in required_dirs if not p.exists()]
if missing:
    print("Index artifacts not found. Missing:")
    for p in missing:
        print(f"- {p.as_posix()}")
    print("\nPlease run:")
    # Point the rebuild hint at the directory that was actually checked, so the
    # command stays correct when INDEX_DIR is overridden.
    print(f"  python -m ingestion.build_index --data_dir data/sources --out_dir {INDEX_DIR.as_posix()}")
    # Stop here: every later cell depends on these artifacts.
    raise SystemExit(1)

print("Index looks ready:", INDEX_DIR.as_posix())



In [None]:
from __future__ import annotations

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from agents.graph import answer_query
from app.settings import Settings
from ingestion.graph_index import load_graph
from ingestion.utils import read_jsonl

settings = Settings.from_env()

# Index the chunk records by chunk_id (used for citation validation and hit@k).
# Records whose chunk_id is missing or empty are skipped.
chunks_records = read_jsonl(settings.chunks_path)
chunks_by_id = {}
for record in chunks_records:
    if record.get("chunk_id"):
        chunks_by_id[record["chunk_id"]] = record

graph: nx.MultiDiGraph = load_graph(settings.graph_path)

# Quick sanity report of what was loaded.
print("Loaded:")
for label, count in (
    ("- chunks:", len(chunks_by_id)),
    ("- graph nodes:", graph.number_of_nodes()),
    ("- graph edges:", graph.number_of_edges()),
):
    print(label, count)



In [None]:
from __future__ import annotations

# Evaluation questions as (id, text) pairs; expanded below into the
# {"id": ..., "query": ...} records the rest of the notebook consumes.
_QUERY_PAIRS = (
    ("Q1", "What is the prerequisite chain to DL301?"),
    ("Q2", "Is AI101 a prerequisite of ML201? Explain briefly."),
    ("Q3", "AI101 -> ML201 -> DL301: confirm this chain using the handbook."),
    ("Q4", "In 2025, which courses depend on DL301?"),
    ("Q5", "What projects are mentioned for DL301 (year 2024)?"),
    ("Q6", "Which year is NLP310 and what is its prerequisite?"),
    ("Q7", "Which courses are in the AI Track program in 2024?"),
    ("Q8", "What are the prerequisites for CAP400?"),
    ("Q9", "AI101 的先修课是什么？"),
    ("Q10", "DL301 的先修链是什么？"),
    ("Q11", "2025 年的 capstone 项目说明是什么？"),
    ("Q12", "MLOps320: what is it about and what is its prerequisite?"),
)
QUERIES = [{"id": query_id, "query": text} for query_id, text in _QUERY_PAIRS]

# Manual labels for a tiny Hit@k evaluation (at least 3 queries).
# Each keyword is either a course code (e.g., "DL301") or a source identifier
# (e.g., "courses_mock.csv").
HIT_LABELS = {
    query_id: {"keyword": keyword}
    for query_id, keyword in (
        ("Q1", "DL301"),
        ("Q6", "NLP310"),
        ("Q8", "CAP400"),
        ("Q7", "courses_mock.csv"),
    )
}

pd.DataFrame(QUERIES)



In [None]:
from __future__ import annotations

from typing import Any, Dict, List


def citation_valid_rate(citations: List[Dict[str, Any]]) -> float:
    """Return the fraction of citations that point at a known chunk.

    A citation counts as valid when its ``chunk_id`` (converted to ``str``) is
    non-empty and is a key of the notebook-level ``chunks_by_id`` mapping.
    An empty (or ``None``) citation list yields ``0.0``.
    """
    if not citations:
        return 0.0
    chunk_ids = (str(entry.get("chunk_id") or "") for entry in citations)
    valid = sum(1 for chunk_id in chunk_ids if chunk_id and chunk_id in chunks_by_id)
    return valid / float(len(citations))


def is_graph_path_valid(path_obj: Dict[str, Any]) -> bool:
    """Check that every step of a returned path exists in the loaded ``graph``.

    ``path_obj["path"]`` is read as a list of steps, each carrying ``source`` and
    ``target``. The path is valid when it has at least one step and, for every
    step, both endpoints are non-empty, both are nodes of ``graph``, and
    ``graph.has_edge(source, target)`` holds. Steps are checked independently;
    whether consecutive steps connect to each other is not verified.
    """
    hops = path_obj.get("path", []) or []

    def hop_exists(hop: Dict[str, Any]) -> bool:
        src, dst = hop.get("source"), hop.get("target")
        if not (src and dst):
            return False
        return graph.has_node(src) and graph.has_node(dst) and graph.has_edge(src, dst)

    # An empty path is treated as invalid rather than vacuously valid.
    return bool(hops) and all(hop_exists(hop) for hop in hops)


def path_valid_rate(paths: List[Dict[str, Any]]) -> float:
    """Return the share of ``paths`` accepted by ``is_graph_path_valid``.

    An empty (or ``None``) list yields ``0.0``.
    """
    if not paths:
        return 0.0
    valid_paths = [candidate for candidate in paths if is_graph_path_valid(candidate)]
    return len(valid_paths) / float(len(paths))


def run_one(query: str, *, route: str | None, brief: bool, top_k: int, k_hop: int) -> Dict[str, Any]:
    # route: force with "vector_only"; set to None to use the default routing.
    return answer_query(
        query=query,
        brief=brief,
        top_k=top_k,
        k_hop=k_hop,
        route=route,
        settings=settings,
    )


def run_eval(queries: List[Dict[str, str]], *, top_k: int, k_hop: int, brief: bool) -> pd.DataFrame:
    """Run every query in both modes and collect one metrics row per (query, mode).

    Args:
        queries: Records with an ``"id"`` and a ``"query"`` key.
        top_k, k_hop, brief: Forwarded unchanged to ``run_one``.

    Returns:
        A DataFrame with the columns ``id``, ``query``, ``mode``, ``route``,
        ``error``, ``total_latency_ms``, ``num_citations``, ``num_graph_paths``,
        ``citation_valid_rate``, ``path_valid_rate`` plus the raw ``citations``,
        ``graph_paths`` and ``debug`` objects.

    A run that raises is recorded as a row with empty evidence, NaN latency and
    the exception text in ``error`` (empty string for successful runs), so a
    single failure does not abort the evaluation. Such rows score 0.0 on both
    validity rates.
    """
    rows: List[Dict[str, Any]] = []
    for q in queries:
        qid = q["id"]
        qtext = q["query"]
        for mode in ("vector_only", "hybrid"):
            # "hybrid" is the default routing, requested by passing route=None.
            route = "vector_only" if mode == "vector_only" else None
            try:
                result = run_one(qtext, route=route, brief=brief, top_k=top_k, k_hop=k_hop)
            except Exception as exc:
                # Keep the evaluation loop running even if one query fails.
                result = {
                    "answer": "",
                    "citations": [],
                    "graph_paths": [],
                    "debug": {
                        "error": str(exc),
                        "route": (route or "hybrid"),
                        "latency_ms": {"total": float("nan")},
                    },
                }

            citations = list(result.get("citations", []) or [])
            graph_paths = list(result.get("graph_paths", []) or [])
            debug = dict(result.get("debug", {}) or {})

            # The latency payload is read outside the try block above, so it must
            # tolerate a missing/None/non-dict "latency_ms" and a missing or
            # non-numeric "total"; all of these are reported as NaN.
            latency = debug.get("latency_ms")
            raw_total = latency.get("total") if isinstance(latency, dict) else None
            try:
                total_latency = float(raw_total)
            except (TypeError, ValueError):
                total_latency = float("nan")
            route_used = str(debug.get("route") or "")

            rows.append(
                {
                    "id": qid,
                    "query": qtext,
                    "mode": mode,
                    "route": route_used,
                    # Surface failures in the table instead of hiding them inside "debug".
                    "error": str(debug.get("error") or ""),
                    "total_latency_ms": total_latency,
                    "num_citations": len(citations),
                    "num_graph_paths": len(graph_paths),
                    "citation_valid_rate": citation_valid_rate(citations),
                    "path_valid_rate": path_valid_rate(graph_paths),
                    # keep the full objects for later analysis
                    "citations": citations,
                    "graph_paths": graph_paths,
                    "debug": debug,
                }
            )

    return pd.DataFrame(rows)



In [None]:
from __future__ import annotations

# Evaluation knobs passed to run_eval for every query.
TOP_K = 5
K_HOP = 2
BRIEF = True

df = run_eval(QUERIES, top_k=TOP_K, k_hop=K_HOP, brief=BRIEF)

# Preview only the scalar metric columns; the raw citations/graph_paths/debug
# payload columns are left out of the display.
preview_columns = [
    "id",
    "mode",
    "route",
    "total_latency_ms",
    "num_citations",
    "num_graph_paths",
    "citation_valid_rate",
    "path_valid_rate",
]
df[preview_columns].head(10)



In [None]:
from __future__ import annotations

def _p95(latencies):
    """Return the 0.95 quantile of a latency series as a plain float."""
    return float(latencies.quantile(0.95))


# One row per mode: latency statistics plus the mean of each evidence metric.
per_mode = df.groupby("mode")
summary = per_mode.agg(
    mean_latency_ms=("total_latency_ms", "mean"),
    p95_latency_ms=("total_latency_ms", _p95),
    mean_num_citations=("num_citations", "mean"),
    mean_num_graph_paths=("num_graph_paths", "mean"),
    mean_citation_valid_rate=("citation_valid_rate", "mean"),
    mean_path_valid_rate=("path_valid_rate", "mean"),
).sort_index()
summary



In [None]:
from __future__ import annotations

import numpy as np

order = [q["id"] for q in QUERIES]
width = 0.38
# Grouped bars: each mode is drawn half a bar width to the left/right of its tick.
mode_offsets = (("vector_only", -width / 2), ("hybrid", width / 2))

# 1) Latency comparison per query (vector_only vs hybrid)
lat = df.pivot(index="id", columns="mode", values="total_latency_ms").reindex(order)
x = np.arange(len(lat.index))

fig, ax = plt.subplots(figsize=(12, 4))
for mode_name, offset in mode_offsets:
    # Missing latencies are drawn as zero-height bars.
    ax.bar(x + offset, lat[mode_name].fillna(0.0), width, label=mode_name)
ax.set_xticks(x)
ax.set_xticklabels(lat.index)
ax.set_ylabel("total_latency_ms")
ax.set_title("Latency per query: vector_only vs hybrid")
ax.legend()
plt.tight_layout()
plt.show()

# 2) Validity metrics comparison (mean)
metrics = ["citation_valid_rate", "path_valid_rate"]
valid = df.groupby("mode")[metrics].mean().reindex(["vector_only", "hybrid"])

x2 = np.arange(len(metrics))
fig, ax = plt.subplots(figsize=(7, 4))
for mode_name, offset in mode_offsets:
    ax.bar(x2 + offset, valid.loc[mode_name, metrics].values, width, label=mode_name)
ax.set_xticks(x2)
ax.set_xticklabels(metrics)
ax.set_ylim(0.0, 1.05)
ax.set_title("Validity metrics (mean)")
ax.legend()
plt.tight_layout()
plt.show()

# 3) Route distribution (optional): how often each reported route was used
route_counts = df["route"].value_counts()
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(route_counts.index.astype(str), route_counts.values)
ax.set_title("Route distribution")
ax.set_ylabel("count")
plt.tight_layout()
plt.show()



In [None]:
from __future__ import annotations


def citation_haystack(citation: Dict[str, Any]) -> str:
    """Build the lower-cased text that keyword matching searches for one citation.

    The citation's ``chunk_id`` is looked up in ``chunks_by_id``; the result is
    the chunk's ``metadata["source"]`` and its ``text`` joined by a newline.
    Missing pieces (unknown chunk, no metadata, no text) contribute empty strings.
    """
    chunk_id = str(citation.get("chunk_id") or "")
    record = chunks_by_id.get(chunk_id, {})
    metadata = dict(record.get("metadata", {}) or {})
    parts = [str(metadata.get("source") or ""), str(record.get("text") or "")]
    return "\n".join(parts).lower()


def hit_at_k(row: pd.Series, keyword: str, k: int = 5) -> bool:
    """Tell whether ``keyword`` occurs in any of the first ``k`` citations of ``row``.

    The keyword is lower-cased and stripped, then searched as a substring of
    ``citation_haystack(citation)`` for each of the first ``k`` entries of
    ``row["citations"]``. A blank keyword or an empty citation list never hits.
    """
    needle = keyword.lower().strip()
    top_citations = list(row.get("citations", []) or [])[:k]
    # `needle and ...` short-circuits, so a blank keyword never builds a haystack.
    return any(needle and needle in citation_haystack(entry) for entry in top_citations)


def _eval_row(query_id, mode_name):
    """Return the first row of ``df`` recorded for this query id and mode."""
    mask = (df["id"] == query_id) & (df["mode"] == mode_name)
    return df[mask].iloc[0]


# One record per labelled query and mode, flagging whether the label keyword
# shows up in the first 5 citations of that run.
hit_rows: List[Dict[str, Any]] = [
    {
        "id": query_id,
        "mode": mode_name,
        "keyword": label["keyword"],
        "hit": hit_at_k(_eval_row(query_id, mode_name), keyword=label["keyword"], k=5),
    }
    for query_id, label in HIT_LABELS.items()
    for mode_name in ("vector_only", "hybrid")
]

hit_df = pd.DataFrame(hit_rows)
hit_df



In [None]:
from __future__ import annotations

# Mean of the boolean "hit" column per mode, i.e. the hit rate over the labelled queries.
hit_rate_by_mode = hit_df.groupby("mode")["hit"].mean()
hit_summary = hit_rate_by_mode.to_dict()
hit_summary



In [None]:
from __future__ import annotations

# Narrative summary printed from the metrics computed in the earlier cells.

hybrid = summary.loc["hybrid"]
vector_only = summary.loc["vector_only"]

report_lines = [
    "=== Summary ===",
    f"Queries evaluated: {len(QUERIES)}",
    "",
    "Latency:",
    f"- vector_only mean: {vector_only['mean_latency_ms']:.1f} ms",
    f"- hybrid      mean: {hybrid['mean_latency_ms']:.1f} ms",
    "",
    "Explainability (graph_paths):",
    f"- vector_only mean graph_paths: {vector_only['mean_num_graph_paths']:.2f}",
    f"- hybrid      mean graph_paths: {hybrid['mean_num_graph_paths']:.2f}",
    "",
    "Hit@5 (manual labels):",
    f"- vector_only: {hit_summary.get('vector_only', 0.0):.2f}",
    f"- hybrid:      {hit_summary.get('hybrid', 0.0):.2f}",
    "",
]
print("\n".join(report_lines))


def _hybrid_hit(query_id):
    """Return the recorded hit flag of the hybrid run for ``query_id`` (first match in hit_df)."""
    return hit_df[(hit_df["id"] == query_id) & (hit_df["mode"] == "hybrid")]["hit"].iloc[0]


# Failure cases (1-2): hybrid runs that produced no graph paths come first,
# followed by labelled queries whose keyword was not hit.
hybrid_rows = df[df["mode"] == "hybrid"].set_index("id")
no_path = hybrid_rows[hybrid_rows["num_graph_paths"] == 0].index.tolist()
missed_keyword = [qid for qid in HIT_LABELS.keys() if not bool(_hybrid_hit(qid))]

# dict.fromkeys keeps the first occurrence of each id, giving a unique, stable order.
fail_ids: List[str] = list(dict.fromkeys(no_path + missed_keyword))

if not fail_ids:
    print("No obvious failure cases detected in this small run.")
else:
    print("Failure cases (up to 2):")
    for qid in fail_ids[:2]:
        qtext = next(q["query"] for q in QUERIES if q["id"] == qid)
        row = hybrid_rows.loc[qid]
        # Only labelled queries have a hit flag to report.
        hit_label = _hybrid_hit(qid) if qid in HIT_LABELS else 'n/a'
        print(f"- {qid}: {qtext}")
        print(
            f"  citations={int(row['num_citations'])}, graph_paths={int(row['num_graph_paths'])}, "
            f"hit@5={hit_label}")
        print("  Analysis: This is usually caused by missing entities in the query, sparse graph coverage, or weak retrieval.")

print("\nTakeaway:")
print(
    "Hybrid retrieval typically improves explainability by producing prerequisite paths, "
    "and can improve hit rates when the graph adds evidence chunks not surfaced by vector similarity alone."
)

