In [1]:
from datasets import load_dataset, Dataset
from ragas import evaluate
from ragas.metrics import context_recall, context_precision, answer_correctness
from src.retriever import get_relevant_chunks
import os

In [2]:
# Load the first 50 rows of the HotpotQA "distractor" validation split.
hotpot = load_dataset("hotpot_qa", "distractor", split="validation[:50]")

In [None]:
# Peek at one raw record: index 49 is the last row of the 50-row slice.
hotpot[49]

In [3]:
# Pull out the three columns used by the rest of the notebook:
#   "question" -> query text
#   "answer"   -> the correct (gold) answer
#   "context"  -> per-example dict with "title" and "sentences" entries
questions, ground_truths, contexts_block = (
    hotpot[column] for column in ("question", "answer", "context")
)

In [4]:
# Join every title's sentences into one paragraph and write the unique
# paragraphs to documents/hotpot/*.txt.
out_dir = "documents/hotpot"
os.makedirs(out_dir, exist_ok=True)

# Flatten all examples into [(title, paragraph_str), ...].
paras = [
    (title, " ".join(sents).strip())
    for block in contexts_block
    for title, sents in zip(block["title"], block["sentences"])
]

# De-duplicate on the paragraph text; the first occurrence (and its title) wins.
seen = set()
uniq = []
for pair in paras:
    text = pair[1]
    if text in seen:
        continue
    seen.add(text)
    uniq.append(pair)

for idx, (title, para) in enumerate(uniq):
    # File name: 4-digit running index + title with every non-alphanumeric
    # character replaced by "_", capped at 30 characters.
    safe_title = "".join(ch if ch.isalnum() else "_" for ch in title)[:30]
    target = "{}/{:04d}_{}.txt".format(out_dir, idx, safe_title)
    with open(target, mode="w", encoding="utf-8") as handle:
        handle.write(para)

print("Wrote {} paragraphs to {}".format(len(uniq), out_dir))

Wrote 491 paragraphs to documents/hotpot


In [5]:
# Hand-rolled top-k retrieval check: for every question, fetch the TOP_K most
# similar chunks and count a hit when the gold answer string appears verbatim
# in at least one of them.
# NOTE(review): `gt in c` is a case-sensitive substring test, so very short
# answers (e.g. "yes"/"no") may match unrelated words — treat the score as a
# rough sanity check only.
TOP_K = 3  # chunks retrieved per question; also drives the printed labels

contexts_list, manual_hits = [], []

print(f"=== Top-{TOP_K} 检索 & 手动检查 ===")
for i, (q, gt) in enumerate(zip(questions, ground_truths)):
    # Per the original note this is a global search for the most similar chunks
    # (behavior of get_relevant_chunks is not visible here — confirm in src.retriever).
    ctxs = get_relevant_chunks(q, k=TOP_K)
    contexts_list.append(ctxs)
    hit = any(gt in c for c in ctxs)
    manual_hits.append(hit)
    print(f"[{i:02d}] hit={hit}  retrieved={len(ctxs)}")

# With no questions there is nothing to average: report NaN instead of
# crashing with ZeroDivisionError.
manual_recall = sum(manual_hits) / len(manual_hits) if manual_hits else float("nan")
print(f"\nManual Recall@{TOP_K}: {manual_recall:.3f}")

=== Top-3 检索 & 手动检查 ===
[00] hit=False  retrieved=3
[01] hit=False  retrieved=3
[02] hit=True  retrieved=3
[03] hit=False  retrieved=3
[04] hit=False  retrieved=3
[05] hit=True  retrieved=3
[06] hit=True  retrieved=3
[07] hit=True  retrieved=3
[08] hit=True  retrieved=3
[09] hit=False  retrieved=3
[10] hit=True  retrieved=3
[11] hit=True  retrieved=3
[12] hit=False  retrieved=3
[13] hit=True  retrieved=3
[14] hit=False  retrieved=3
[15] hit=False  retrieved=3
[16] hit=False  retrieved=3
[17] hit=False  retrieved=3
[18] hit=False  retrieved=3
[19] hit=True  retrieved=3
[20] hit=True  retrieved=3
[21] hit=False  retrieved=3
[22] hit=False  retrieved=3
[23] hit=True  retrieved=3
[24] hit=True  retrieved=3
[25] hit=False  retrieved=3
[26] hit=False  retrieved=3
[27] hit=False  retrieved=3
[28] hit=True  retrieved=3
[29] hit=True  retrieved=3
[30] hit=True  retrieved=3
[31] hit=True  retrieved=3
[32] hit=False  retrieved=3
[33] hit=True  retrieved=3
[34] hit=True  retrieved=3
[35] hit=False

In [6]:
# Assemble the RAGAS input and run the evaluation.
# Placeholder answers: each question's retrieved chunks joined with spaces
# (swap in real LLM generations here).
answers = [" ".join(chunks) for chunks in contexts_list]

data = dict(
    question=questions,
    contexts=contexts_list,
    answer=answers,
    ground_truth=ground_truths,
)
# Sanity check: every column should have the same number of rows.
print("Data lengths:", {name: len(column) for name, column in data.items()})

eval_ds = Dataset.from_dict(data)
ragas_metrics = [context_recall, context_precision, answer_correctness]
scores = evaluate(dataset=eval_ds, metrics=ragas_metrics)
df = scores.to_pandas()

print("\n=== RAGAS 评估结果 ===")
print(df)

# Mean of each metric column across all evaluated rows.
metric_columns = ["context_recall", "context_precision", "answer_correctness"]
avg = df[metric_columns].mean()
print("\n=== 平均指标 ===")
print(avg.to_string())

Data lengths: {'question': 50, 'contexts': 50, 'answer': 50, 'ground_truth': 50}


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]


=== RAGAS 评估结果 ===
                                           user_input  \
0   Were Scott Derrickson and Ed Wood of the same ...   
1   What government position was held by the woman...   
2   What science fantasy young adult series, told ...   
3   Are the Laleli Mosque and Esma Sultan Mansion ...   
4   The director of the romantic comedy "Big Stone...   
5   2014 S/S is the debut album of a South Korean ...   
6   Who was known by his stage name Aladin and hel...   
7   The arena where the Lewiston Maineiacs played ...   
8     Who is older, Annie Morton or Terry Richardson?   
9   Are Local H and For Against both from the Unit...   
10  What is the name of the fight song of the univ...   
11  What screenwriter with credits for "Evolution"...   
12  What year did Guns N Roses perform a promo for...   
13  Are Random House Tower and 888 7th Avenue both...   
14  The football manager who recruited David Beckh...   
15  Brown State Fishing Lake is in a country that ...   
16  The Ver

In [None]:
# Inspect the n-th sample: question, gold answer, retrieved chunks and answer.
n = 2
PREVIEW_CHARS = 200  # maximum characters shown per retrieved chunk

print(f"--- Sample {n} ---")
print("Q :", questions[n])
print("GT:", ground_truths[n])
print("Ctxs:")
for j, c in enumerate(contexts_list[n]):
    preview = c[:PREVIEW_CHARS]
    # Only append the ellipsis when the chunk was actually cut off, so a
    # short chunk is not mistaken for a truncated one.
    if len(c) > PREVIEW_CHARS:
        print(f"  ({j})", preview, "…")
    else:
        print(f"  ({j})", preview)
print("\nAns:", answers[n])

--- Sample 2 ---
Q : What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
GT: Animorphs
Ctxs:
  (0) K. A. Applegate, and published by Scholastic.  It is told in first person, with all six main characters taking turns narrating the books through their own perspectives.  Horror, war, dehumanization, …
  (1) and "The Andalite Chronicles".  The book is introduced by Tobias, who flies to the valley of the free Hork-Bajir, where Jara Hamee tells him the story of how the Yeerks enslaved the Hork-Bajir, and …
  (2) Animorphs is a science fantasy series of young adult books written by Katherine Applegate and her husband Michael Grant, writing together under the name K. A. Applegate, and published by Scholastic. …

Ans: K. A. Applegate, and published by Scholastic.  It is told in first person, with all six main characters taking turns narrating the books through their own perspectives.  Horror, war