In [1]:
# Cell 0 — imports; config; logging
import requests
import os, json, time, hashlib, warnings
from collections import defaultdict

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from beir import util as beir_util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from sentence_transformers import SentenceTransformer


In [2]:
# parameters 
DATASET_SPEC = "beir:fiqa" 
CACHE_DIR = "./cache"
REPORT_DIR = "./reports"
K_VALUES = [1, 3, 10]

# LLM-judge parameters
JUDGE_ENABLED   = True
OLLAMA_URL      = "http://localhost:11434/api/generate"
JUDGE_MODEL     = "llama3.1:8b-instruct-q8_0" 
JUDGE_TOP_K     = 5                        
JUDGE_MAX_Q     = 10   

# Retrievers
MODEL_REGISTRY = [
    {"name": "DPR",   "hf_id": "facebook-dpr-question_encoder-single-nq-base",      "type": "dense"},
    {"name": "E5",    "hf_id": "intfloat/e5-base-v2",                               "type": "dense"},
    {"name": "MPNet", "hf_id": "sentence-transformers/multi-qa-mpnet-base-dot-v1",  "type": "dense"},
]
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)


In [3]:
# Data loader
def load_beir_dataset(spec: str, base_dir="./datasets"):
    assert spec.startswith("beir:"), "This cell handles BEIR only; see custom loader later."
    name = spec.split(":", 1)[1]
    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{name}.zip"
    data_path = beir_util.download_and_unzip(url, base_dir)          # -> ./datasets/<name>
    corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
    return data_path, corpus, queries, qrels

def normalize_corpus_texts(corpus: dict):
    
    ids, texts = [], []
    for did, obj in corpus.items():
        t = (obj.get("title") or "").strip()
        x = (obj.get("text") or "").strip()
        txt = (t + ". " + x).strip() if t and x else (t or x)
        ids.append(did); texts.append(txt)
    return ids, texts

DATA_PATH, CORPUS, QUERIES, QRELS = load_beir_dataset(DATASET_SPEC)
DOC_IDS, DOC_TEXTS = normalize_corpus_texts(CORPUS)

# Subset sizes
N_DOCS = 200     
N_QUERIES = 10 
K_VALUES = [1, 3, 10]


doc_ids_all = list(CORPUS.keys())
doc_ids_sub = set(doc_ids_all[:N_DOCS])
corpus_sub = {did: CORPUS[did] for did in doc_ids_all[:N_DOCS]}

qrels_sub = {}
for qid, drels in QRELS.items():
    keep = {did: rel for did, rel in drels.items() if did in doc_ids_sub}
    if keep:
        qrels_sub[qid] = keep

query_ids_labeled = list(qrels_sub.keys())[:N_QUERIES]
queries_sub = {qid: QUERIES[qid] for qid in query_ids_labeled}
qrels_sub = {qid: qrels_sub[qid] for qid in query_ids_labeled}

# encoding
doc_texts_sub = []
doc_ids_seq = []
for did, obj in corpus_sub.items():
    t = (obj.get("title") or "").strip()
    x = (obj.get("text") or "").strip()
    doc_texts_sub.append((t + ". " + x).strip() if t and x else (t or x))
    doc_ids_seq.append(did)

print(f"Subset → docs={len(doc_ids_seq)}; queries={len(queries_sub)} (all with labels)")



100%|██████████| 57638/57638 [00:00<00:00, 219600.82it/s]

Subset → docs=200; queries=7 (all with labels)





In [4]:
# helpers
def model_key(hf_id: str) -> str:
    return hashlib.md5(hf_id.encode()).hexdigest()[:10]

def load_st_model(hf_id: str):
    return SentenceTransformer(hf_id, trust_remote_code=True)

def cache_paths(cache_dir, dataset_path, model_id):
    base = os.path.join(cache_dir, f"{os.path.basename(dataset_path)}__{model_id}")
    return base + ".docs.npy", base + ".meta.json"

def encode_docs_cached(model, texts, cache_dir, dataset_path, model_id):
    emb_path, meta_path = cache_paths(cache_dir, dataset_path, model_id)
    if os.path.exists(emb_path):
        emb_docs = np.load(emb_path)
        return emb_docs, {"cached": True}
    t0 = time.time()
    emb_docs = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
    np.save(emb_path, emb_docs)
    meta = {"cached": False, "encode_secs": round(time.time() - t0, 3), "n_docs": len(texts)}
    with open(meta_path, "w") as f: json.dump(meta, f)
    return emb_docs, meta

def evaluate_model(model, doc_ids, emb_docs, queries, qrels, k_values):
    results = {}
    latencies = []
    for qid, q in queries.items():
        t0 = time.time()
        qvec = model.encode(q, normalize_embeddings=True)
        scores = emb_docs @ qvec
        latencies.append(time.time() - t0)
        results[qid] = {doc_ids[i]: float(scores[i]) for i in range(len(doc_ids))}
    retr = EvaluateRetrieval("cos_sim")
    ndcg, _map, recall, precision = retr.evaluate(qrels, results, k_values=k_values)
    return {
        "nDCG": ndcg,
        "MAP": _map,
        "Recall": recall,
        "Precision": precision,
        "latency_avg_ms": round(1000 * (sum(latencies) / max(1, len(latencies))), 2)
    }

def recommend(models_metrics: dict, primary="NDCG@10"):
    # rank by quality first, break ties by coverage then latency. suject to change
    def key(m):
        s = models_metrics[m]
        return (
            s["nDCG"].get(primary, 0.0),
            s["Recall"].get("Recall@10", 0.0),
            -s["latency_avg_ms"]
        )
    return sorted(models_metrics.keys(), key=key, reverse=True)


In [5]:
# text map for judge and printing
DOC_TEXT_BY_ID = dict(zip(doc_ids_seq, doc_texts_sub))

def rank_topk(model, emb_docs, doc_ids, queries, topk=5):
    ranked = {}
    for qid, q in queries.items():
        qvec = model.encode(q, normalize_embeddings=True)
        scores = emb_docs @ qvec
        order = np.argsort(scores)[::-1][:topk]
        ranked[qid] = [(doc_ids[i], float(scores[i])) for i in order]
    return ranked

# ---- LLM judge ----
_JUDGE_RUBRIC = """You are an impartial retrieval judge for financial Q&A.
For each user query and the candidate passages, assign integer scores 0-5 for:
- relevance (semantic match),
- specificity (directness to the question),
- faithfulness (no contradiction within the snippet; ignore outside knowledge),
- overall (holistic usefulness; not an average).
Return STRICT JSON with schema:
{
  "per_passage":[
    {"doc_id":"<id>","relevance":0,"specificity":0,"faithfulness":0,"overall":0,"justification":"<=350 chars"}
  ],
  "best_doc_id":"<id>",
  "reason_best":"<=200 chars"
}
"""

def _ollama_generate(prompt: str, model: str = JUDGE_MODEL, url: str = OLLAMA_URL, max_retries: int = 3):
    payload = {"model": model, "prompt": prompt, "options": {"temperature": 0.2}, "stream": False}
    last = None
    for a in range(max_retries):
        r = requests.post(url, json=payload, timeout=120)
        last = r
        if r.ok:
            return r.json()["response"]
        time.sleep(1.2 * (a + 1))
    raise RuntimeError(f"Ollama request failed: {getattr(last,'text','<no response>')}")

def _safe_score(x):
    try:
        v = float(x)
    except Exception:
        return np.nan
    return max(1.0, min(5.0, v))

def judge_query(query_text: str, ranked_docs: list) -> dict:
    cands = []
    for did, _ in ranked_docs:
        txt = (DOC_TEXT_BY_ID.get(did, "") or "").replace("\n", " ").strip()
        cands.append(f"[{did}] {txt[:900]}")
    prompt = (
        f"Query:\n{query_text}\n\n"
        f"Candidate passages:\n" + "\n\n".join(cands) + "\n\n"
        f"Instructions:\n" + _JUDGE_RUBRIC
    )

    raw = _ollama_generate(prompt)
    raw = raw.strip().strip("`")
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError:
        i, j = raw.find("{"), raw.rfind("}")
        obj = json.loads(raw[i:j+1])

    # --- normalize/clamp all numeric fields to [1,5] ---
    if "per_passage" in obj and isinstance(obj["per_passage"], list):
        norm = []
        for p in obj["per_passage"]:
            p = dict(p)  # copy
            p["relevance"]   = _safe_score(p.get("relevance", 0))
            p["specificity"] = _safe_score(p.get("specificity", 0))
            p["faithfulness"]= _safe_score(p.get("faithfulness", 0))
            p["overall"]     = _safe_score(p.get("overall", 0))
            norm.append(p)
        obj["per_passage"] = norm
    return obj


def summarize_judge(per_model_rankings: dict, queries: dict, max_q: int, topk: int):
    """
    per_model_rankings: {model_name: {qid: [(doc_id, score), ...]}}
    returns: (df_per_query, df_summary, pairwise_wins_matrix)
    """
    qids = list(queries.keys())[:max_q]
    rows = []
    models = list(per_model_rankings.keys())
    wins = {m: {n: 0 for n in models} for m in models}

    for qid in qids:
        avgs = {}
        for m in models:
            judged = judge_query(queries[qid], per_model_rankings[m][qid][:topk])
            scores = [ _safe_score(p.get("overall", 0)) for p in judged["per_passage"] ]
            scores = [s for s in scores if not np.isnan(s)]
            overall = float(np.mean(scores)) if scores else np.nan

            avgs[m] = overall
            rows.append({"qid": qid, "model": m, "overall_avg_topK": overall,
                         "best_doc_id": judged.get("best_doc_id", "")})
        best = max(avgs.values())
        winners = [m for m,v in avgs.items() if np.isclose(v, best)]
        for w in winners:
            for o in models:
                if o != w:
                    wins[w][o] += 1

    df_perq = pd.DataFrame(rows)
    df_summary = df_perq.groupby("model")["overall_avg_topK"].mean().sort_values(ascending=False).to_frame("JudgeOverall@topK")
    win_mat = pd.DataFrame(wins).T
    return df_perq, df_summary, win_mat



In [None]:
# evaluation loop
all_metrics = {}
ranked_per_model = {}

for m in MODEL_REGISTRY:
    name, hf_id = m["name"], m["hf_id"]
    print(f"Evaluating {name} …")
    model = load_st_model(hf_id)
    emb_docs, meta = encode_docs_cached(model, doc_texts_sub, CACHE_DIR, DATA_PATH, model_key(hf_id))
    metrics = evaluate_model(model, doc_ids_seq, emb_docs, queries_sub, qrels_sub, k_values=K_VALUES)
    all_metrics[name] = metrics

    ranked_per_model[name] = rank_topk(model, emb_docs, doc_ids_seq, queries_sub, topk=JUDGE_TOP_K)

# Tabular summary
rows = []
for name, m in all_metrics.items():
    row = {
        "Model": name,
        "nDCG@1": round(m["nDCG"].get("NDCG@1", 0.0), 4),
        "nDCG@3": round(m["nDCG"].get("NDCG@3", 0.0), 4),
        "nDCG@10": round(m["nDCG"].get("NDCG@10", 0.0), 4),
        "Recall@10": round(m["Recall"].get("Recall@10", 0.0), 4),
        "P@10": round(m["Precision"].get("P@10", 0.0), 4),
        "Avg Latency (ms)": m["latency_avg_ms"]
    }
    rows.append(row)

df = pd.DataFrame(rows).sort_values("nDCG@10", ascending=False).reset_index(drop=True)
df


Evaluating DPR …
Evaluating E5 …
Evaluating MPNet …


Unnamed: 0,Model,nDCG@1,nDCG@3,nDCG@10,Recall@10,P@10,Avg Latency (ms)
0,E5,1.0,1.0,1.0,1.0,0.1,28.91
1,MPNet,1.0,1.0,1.0,1.0,0.1,28.08
2,DPR,0.5714,0.7517,0.7947,1.0,0.1,31.92


In [7]:
if JUDGE_ENABLED:
    print("Running LLM-judge …")
    df_judge_perq, df_judge_summary, df_winmat = summarize_judge(
        ranked_per_model, queries_sub, max_q=JUDGE_MAX_Q, topk=JUDGE_TOP_K
    )
    
    print("\nLLM-Judge summary (higher is better):")
    print(df_judge_summary.round(3))
    print("\nPairwise wins (queries judged):")
    print(df_winmat)
else:
    df_judge_perq = df_judge_summary = df_winmat = None


Running LLM-judge …

LLM-Judge summary (higher is better):
       JudgeOverall@topK
model                   
MPNet              3.800
E5                 3.543
DPR                3.429

Pairwise wins (queries judged):
       DPR  E5  MPNet
DPR      0   3      3
E5       3   0      3
MPNet    5   5      0


In [8]:
order = recommend(all_metrics, primary="NDCG@10")

report = {
    "dataset": DATASET_SPEC,
    "models": MODEL_REGISTRY,
    "k_values": K_VALUES,
    "metrics": all_metrics,
    "recommendation_order": order,
    "llm_judge": {
        "enabled": bool(JUDGE_ENABLED),
        "model": JUDGE_MODEL if JUDGE_ENABLED else None,
        "top_k": JUDGE_TOP_K,
        "max_queries": JUDGE_MAX_Q,
        "per_query": (df_judge_perq.to_dict(orient="records") if df_judge_perq is not None else None),
        "summary": (df_judge_summary["JudgeOverall@topK"].to_dict() if df_judge_summary is not None else None),
        "pairwise_wins": (df_winmat.to_dict() if df_winmat is not None else None)
    }
}
out_path = os.path.join(REPORT_DIR, "fiqa_report.json")
with open(out_path, "w") as f: json.dump(report, f, indent=2)

print("Recommendation (BEIR):", " > ".join(order))
if df_judge_summary is not None:
    print("Judge ranking:", " > ".join(list(df_judge_summary.index)))
print("Report:", out_path)
df

 

Recommendation (BEIR): MPNet > E5 > DPR
Judge ranking: MPNet > E5 > DPR
Report: ./reports\fiqa_report.json


Unnamed: 0,Model,nDCG@1,nDCG@3,nDCG@10,Recall@10,P@10,Avg Latency (ms)
0,E5,1.0,1.0,1.0,1.0,0.1,28.91
1,MPNet,1.0,1.0,1.0,1.0,0.1,28.08
2,DPR,0.5714,0.7517,0.7947,1.0,0.1,31.92
