In [1]:
from datasets import load_dataset, Dataset
from ragas import evaluate
from ragas.metrics import context_recall, context_precision, answer_correctness
from src.retriever import get_relevant_chunks
import os

  _embeddings = HuggingFaceEmbeddings(model_name=settings.EMBEDDING_MODEL)


In [2]:
# Load the first 300 validation examples of HotpotQA (distractor configuration).
hotpot = load_dataset(
    path="hotpot_qa",
    name="distractor",
    split="validation[:300]",
)

In [3]:
# Display one raw record to inspect its fields (question, answer, supporting_facts, context, ...).
hotpot[23]

{'id': '5ae2070a5542994d89d5b313',
 'question': 'Which performance act has a higher instrument to person ratio, Badly Drawn Boy or Wolf Alice? ',
 'answer': 'Badly Drawn Boy',
 'type': 'comparison',
 'level': 'hard',
 'supporting_facts': {'title': ['Badly Drawn Boy', 'Wolf Alice'],
  'sent_id': [0, 1]},
 'context': {'title': ['Wolf Alice',
   'Tom Rothrock',
   'Badly Drawn Boy discography',
   'Something to Talk About (Badly Drawn Boy song)',
   'About a Boy (film)',
   'Silent Sigh',
   'Badly Drawn Boy',
   'The Hour of Bewilderbeast',
   'About a Boy (soundtrack)',
   'Under an Hour'],
  'sentences': [['Wolf Alice are a four-piece alternative rock band from North London, formed initially as a two-person band in 2010.',
    ' Its members since 2012 are Ellie Rowsell (vocals, guitar), Joff Oddie (guitars, vocals), Theo Ellis (bass), and Joel Amey (drums, vocals).'],
   ['Tom Rothrock is an international record producer, composer, musician and owner of Bong Load Records.',
    ' Tom R

In [4]:
# Columns used downstream: the question text, the gold answer, and the
# per-question context (each entry is a dict with "title" and "sentences").
questions, ground_truths, contexts_block = (
    hotpot[column] for column in ("question", "answer", "context")
)

In [5]:
# Join each (title, sentences) pair into a single paragraph and write it to
# documents/hotpot/*.txt
out_dir = "documents/hotpot"
os.makedirs(out_dir, exist_ok=True)

# Flatten every question's context into (title, paragraph_str) pairs.
paras = [
    (title, " ".join(sents).strip())
    for block in contexts_block
    for title, sents in zip(block["title"], block["sentences"])
]

# De-duplicate on paragraph text, keeping the first title seen for each paragraph.
seen, uniq = set(), []
for pair in paras:
    text = pair[1]
    if text in seen:
        continue
    seen.add(text)
    uniq.append(pair)

# One file per unique paragraph: zero-padded index + sanitized title (max 30 chars).
for idx, (title, para) in enumerate(uniq):
    sanitized = "".join(ch if ch.isalnum() else "_" for ch in title)
    fname = f"{idx:04d}_{sanitized[:30]}.txt"
    with open(f"{out_dir}/{fname}", "w", encoding="utf-8") as fh:
        fh.write(para)

print(f"Wrote {len(uniq)} paragraphs to {out_dir}")

Wrote 2963 paragraphs to documents/hotpot


In [6]:
# Hand-rolled top-k retrieval check.
# TOP_K is the single source of truth for both the retriever call and the
# printed labels, so the reported "Recall@k" always matches the k actually used.
TOP_K = 5
contexts_list, manual_hits = [], []

print(f"=== Top-{TOP_K} 检索 & 手动检查 ===")
for i, (q, gt) in enumerate(zip(questions, ground_truths)):
    ctxs = get_relevant_chunks(q, k=TOP_K)
    contexts_list.append(ctxs)
    # A "hit" means the gold answer string occurs verbatim in some retrieved chunk.
    # NOTE(review): this is a case-sensitive substring test; very short answers
    # (e.g. "yes"/"no") may match incidentally — confirm this is acceptable.
    hit = any(gt in c for c in ctxs)
    manual_hits.append(hit)
    print(f"[{i:02d}] hit={hit}  retrieved={len(ctxs)}")

# Guard against an empty question list so the summary line never divides by zero.
manual_recall = sum(manual_hits) / len(manual_hits) if manual_hits else 0.0
print(f"\nManual Recall@{TOP_K}: {manual_recall:.3f}")

=== Top-3 检索 & 手动检查 ===
[Retriever] Intersection count: 11
[00] hit=False  retrieved=5
[Retriever] Intersection count: 8
[01] hit=False  retrieved=5
[Retriever] Intersection count: 28
[02] hit=True  retrieved=5
[Retriever] Intersection count: 27
[03] hit=False  retrieved=5
[Retriever] Intersection count: 16
[04] hit=False  retrieved=5
[Retriever] Intersection count: 22
[05] hit=True  retrieved=5
[Retriever] Intersection count: 14
[06] hit=True  retrieved=5
[Retriever] Intersection count: 26
[07] hit=True  retrieved=5
[Retriever] Intersection count: 13
[08] hit=True  retrieved=5
[Retriever] Intersection count: 8
[09] hit=False  retrieved=5
[Retriever] Intersection count: 36
[10] hit=True  retrieved=5
[Retriever] Intersection count: 20
[11] hit=True  retrieved=5
[Retriever] Intersection count: 19
[12] hit=True  retrieved=5
[Retriever] Intersection count: 29
[13] hit=True  retrieved=5
[Retriever] Intersection count: 28
[14] hit=True  retrieved=5
[Retriever] Intersection count: 30
[15] hit

In [8]:
from ragas.llms import LangchainLLMWrapper
from ragas.run_config import RunConfig
from langchain_openai import ChatOpenAI
from config import settings

# Assemble the RAGAS dataset and evaluate retrieval quality.
# Placeholder answers: the retrieved chunks joined together (replace with real
# LLM-generated answers when available).
answers = [" ".join(c) for c in contexts_list]

data = {
    "question":     questions,
    "contexts":     contexts_list,
    "answer":       answers,
    "ground_truth": ground_truths,
}
print("Data lengths:", {k: len(v) for k, v in data.items()})

eval_ds = Dataset.from_dict(data)

# Throttle the evaluation: an unthrottled run hit OpenAI 429 rate limits
# (both requests-per-minute and tokens-per-minute), so use fewer parallel
# workers and allow more retries with a longer back-off ceiling.
run_config = RunConfig(max_workers=4, max_retries=15, max_wait=90)

scores = evaluate(
    dataset=eval_ds,
    metrics=[context_recall, context_precision],
    llm=LangchainLLMWrapper(ChatOpenAI(model=settings.CHAT_MODEL)),
    run_config=run_config,
)
df = scores.to_pandas()

print("\n=== RAGAS 评估结果 ===")
print(df)

metric_cols = ["context_recall", "context_precision"]

# .mean() skips NaN silently, so surface how many rows are missing a score
# (presumably jobs that still failed after retries — verify against the log).
n_incomplete = int(df[metric_cols].isna().any(axis=1).sum())
if n_incomplete:
    print(f"\nWarning: {n_incomplete} of {len(df)} rows have a missing metric; averages exclude them.")

avg = df[metric_cols].mean()
print("\n=== 平均指标 ===")
print(avg.to_string())

Data lengths: {'question': 300, 'contexts': 300, 'answer': 300, 'ground_truth': 300}


Evaluating:   0%|          | 0/600 [00:00<?, ?it/s]

Exception raised in Job[128]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-SoTb0R93gMK1wmRJjsVnpJIm on requests per min (RPM): Limit 500, Used 500, Requested 1. Please try again in 120ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[164]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-SoTb0R93gMK1wmRJjsVnpJIm on tokens per min (TPM): Limit 200000, Used 198992, Requested 1333. Please try again in 97ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[244]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-SoTb0R93gMK1wmRJjsVnpJIm on tokens per min (TPM): Limit 200000,


=== RAGAS 评估结果 ===
                                            user_input  \
0    Were Scott Derrickson and Ed Wood of the same ...   
1    What government position was held by the woman...   
2    What science fantasy young adult series, told ...   
3    Are the Laleli Mosque and Esma Sultan Mansion ...   
4    The director of the romantic comedy "Big Stone...   
..                                                 ...   
295  What army did the namesake of the ship launche...   
296  The Church of the Guanche People was founded i...   
297  What officially ended the first phase of the m...   
298  The mass killing that took place at Oakland, C...   
299  When did the rock band that sang "All Join Han...   

                                    retrieved_contexts  \
0    [Scott Derrickson (born July 16, 1966) is an A...   
1    [Kiss and Tell is a 1945 American comedy film ...   
2    [Animorphs is a science fantasy series of youn...   
3    [The Esma Sultan Mansion (Turkish: "Esma Sulta

In [9]:
# Inspect the n-th sample: question, gold answer, retrieved chunks, and answer.
n = 5
print(f"--- Sample {n} ---")
print(f"Q : {questions[n]}")
print(f"GT: {ground_truths[n]}")
print("Ctxs:")
for rank, chunk in enumerate(contexts_list[n]):
    preview = chunk[:1000]  # show at most the first 1000 characters of each chunk
    print(f"  ({rank}) {preview} …")
print(f"\nAns: {answers[n]}")

--- Sample 5 ---
Q : 2014 S/S is the debut album of a South Korean boy group that was formed by who?
GT: YG Entertainment
Ctxs:
  (0) 2014 S/S is the debut album of South Korean group WINNER.  It was released on August 12, 2014 by the group's record label, YG Entertainment.  The members were credited for writing the lyrics and composing the majority of the album's songs. …
  (1) Madtown (Hangul: 매드타운 ), often stylized as MADTOWN, is a South Korean boy group formed in 2014 by J. Tune Camp.  The group consists of Moos, Daewon, Lee Geon, Jota, Heo Jun, Buffy and H.O.  Their debut album, "Mad Town", was released on October 6, 2014.  Two of the members, Moos and Buffy, …
  (2) Winner (Hangul: 위너), often stylized as WINNER, is a South Korean boy group formed in 2013 by YG Entertainment and debuted in 2014.  It currently consists of four members, Jinwoo, Seunghoon, Mino and Seungyoon.  Originally a five-piece group with Taehyun, who later departed from the group in November …
  (3) History (K