
# 🧪 ImpliRet Evaluation Notebook

This tutorial lets you:

1. **Load** any of the six ImpliRet subsets.
2. **Index & evaluate** the built-in retrievers (BM25, ColBERT-v2, Contriever, Dragon+, HippoRAG 2, ReasonIR-8B).
3. **Run long-context or RAG-style readers** (Llama-3, GPT-4.1, etc.) against the retrieved docs.
4. **Export** metrics to `metrics/latest.json` so the README badges stay in sync.

> **Tip**: Run on a ⚡ Colab GPU for the fastest turnaround.


In [None]:
# @title Install dependencies (≈ 1 min)
# collapse-hide
# Device string forwarded as the `device` argument of index_and_save in later cells.
DEVICE = "cuda"  #@param ["cuda", "cpu"]
# Notebook shell escape (not plain Python): installs the requirements file
# fetched from the main branch of the ImpliRet GitHub repository.
!pip -q install -r https://raw.githubusercontent.com/ZeinabTaghavi/ImpliRet/main/requirements.txt

import os, json, pathlib, itertools, pprint, time


## 1 · Load a subset

In [None]:
from datasets import load_dataset

# Subset selection (Colab form fields). STYLE is passed as the dataset
# configuration name and SPLIT as the split to load.
SPLIT = "arithmetic"       #@param ["arithmetic", "wknow", "temporal"]
STYLE = "multispeaker"     #@param ["multispeaker", "unispeaker"]

ds = load_dataset("zeinabTaghavi/ImpliRet", name=STYLE, split=SPLIT)
print(f"Loaded {len(ds):,} examples  |  Columns → {list(ds.features.keys())}")

# Show the first example as a sanity check; the document is truncated to its
# first 200 characters to keep the output short.
first_example = ds[0]
print("\nSample question →", first_example["question"])
print("Implicit document snippet →", first_example["pos_document"][:200], "...")

## 2 · Run a single retriever

In [None]:
from Retrieval.retrieve_indexing import index_and_save
from Retrieval.reporting import evaluate_run, save_metrics_table

# Folder handed to index_and_save as `output_folder`; created here if missing.
OUTPUT = pathlib.Path("Retrieval/results")
OUTPUT.mkdir(parents=True, exist_ok=True)

RETRIEVER = "bm25"      #@param ["bm25", "colbert", "contriever", "dragon_plus", "hipporag2", "reasonir8b"]

# Collect the indexing arguments for the subset selected in the earlier cells.
index_args = {
    "output_folder": str(OUTPUT),
    "category": SPLIT,
    "discourse": STYLE,
    "retriever_name": RETRIEVER,
    "device": DEVICE,
}
run_file = index_and_save(**index_args)

# Evaluate the value returned by index_and_save and pretty-print the metrics.
metrics = evaluate_run(run_file)
pprint.pp(metrics)

## 3 · Benchmark all built-ins (optional, takes time)

In [None]:
# Run every built-in retriever on the currently selected subset and collect
# one nDCG@10 score per retriever name.
ALL_RETRIEVERS = ["bm25", "colbert", "contriever", "dragon_plus", "hipporag2", "reasonir8b"]
table = {}

for r in ALL_RETRIEVERS:
    # Use the same keyword arguments as the single-retriever cell so this
    # call does not depend on the positional order of index_and_save.
    run_file = index_and_save(
        output_folder=str(OUTPUT),
        category=SPLIT,
        discourse=STYLE,
        retriever_name=r,
        device=DEVICE,
    )
    table[r] = evaluate_run(run_file)["ndcg@10"]

# Last expression of the cell: the notebook displays the score table.
table

### Visualise nDCG@10

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-retriever nDCG@10 scores collected in `table`.
# The dict views are materialised as lists so the bar labels and heights are
# plain sequences.
retriever_names = list(table.keys())
ndcg_scores = list(table.values())

plt.figure(figsize=(6,3))
plt.bar(retriever_names, ndcg_scores)
plt.ylabel("nDCG@10 ↑")
plt.xticks(rotation=45, ha="right")
plt.title(f"{STYLE}-{SPLIT}")
plt.show()

## 4 · RAG / Long-context Reader

In [None]:
from RAG_Style.scripts.sync.sync_run_tests import run_single_experiment

# Experiment configuration to run, given as a relative path.
CONFIG = "RAG_Style/experiment_configs/bm/A_Multi_llama_bm_10.yaml"

# The Hugging Face token is read from the HF_TOKEN environment variable;
# an empty string is passed when the variable is not set.
result_path = run_single_experiment(CONFIG, hf_token=os.getenv("HF_TOKEN", ""))
print("Saved generations →", result_path)

### Compute ROUGE-1 recall

In [None]:
from RAG_Style.scripts.reporting import rouge_report

# Build the ROUGE report for `result_path`, the value returned by
# run_single_experiment in the reader cell. The export cell later reads the
# "rouge1" entry of this result.
rouge_metrics = rouge_report(result_path)
print(rouge_metrics)

## 5 · Export metrics

In [None]:
# Summary written to disk: the mean nDCG@10 over all benchmarked retrievers
# in `table`, plus the "rouge1" entry of the ROUGE report.
latest = {
    "retrieval": {"ndcg10": {"avg": sum(table.values()) / len(table)}},
    "rag": {"rouge1": {"avg": rouge_metrics["rouge1"]}},
}

# Opening a file for writing does not create missing parent folders, so make
# sure `metrics/` exists first (a fresh checkout or Colab session may lack it).
metrics_path = pathlib.Path("metrics/latest.json")
metrics_path.parent.mkdir(parents=True, exist_ok=True)
with metrics_path.open("w", encoding="utf-8") as f:
    json.dump(latest, f, indent=2)
print("Wrote metrics/latest.json")