In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Expose the parent of the working directory on sys.path so the project
# packages imported below can be resolved.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder, one level below the project root - confirm for other launchers.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex

import aux
import metrics

In [3]:
# Prepare the evaluation dataset via the project helper, requesting 2000
# queries and 100000 documents. With force_reload=False the recorded run
# reused an existing copy ("Dataset already exists. Skipping download.").
# NOTE(review): the indexing progress bar further down reports 101937
# documents rather than 100000 - presumably the sampler adds documents beyond
# n_docs (e.g. the relevant ones for the sampled queries); confirm in aux.
aux.download_and_sample(force_reload=False, n_queries=2000, n_docs=100000)

Dataset already exists. Skipping download.


In [4]:
import numpy as np
from tqdm import tqdm

def recall_at_candidates(queries_df, qrels, searcher, int_to_str_id, k=1000):
    """Measure how often the searcher's candidate pool contains a relevant document.

    For every query that has a non-empty entry in ``qrels``, the pool returned
    by ``searcher.candidates(text)`` is translated to external ids through
    ``int_to_str_id``. The query counts as a hit when at least one of its
    relevant documents is in that pool. For queries with a single relevant
    document this is the same as checking that one document.

    Args:
        queries_df: DataFrame with ``query_id`` and ``text`` columns.
        qrels: Mapping from integer query id to an iterable of relevant
            external document ids.
        searcher: Object exposing ``candidates(text)``, which returns a
            collection of internal document ids.
        int_to_str_id: Indexable (dict or list) mapping an internal id to the
            external id used in ``qrels``.
        k: Used only in the progress-bar and report labels. The pool is not
            truncated to ``k`` here; any size cap is applied by the searcher.

    Returns:
        Fraction of evaluated queries whose pool contains a relevant document,
        or ``0.0`` when no query could be evaluated.
    """
    hits = 0
    total = 0
    cand_sizes = []

    # Iterate over the two needed columns directly; this avoids building a
    # Series per row as iterrows() does.
    query_pairs = zip(queries_df["query_id"], queries_df["text"])
    for raw_qid, text in tqdm(query_pairs, total=len(queries_df), desc=f"Recall@{k} (candidates)"):
        qid = int(raw_qid)
        if qid not in qrels:
            continue

        # Use every relevant id rather than an arbitrary first element: the
        # iteration order of a set of strings is not stable across kernels,
        # and an empty entry would otherwise raise StopIteration.
        relevant = set(qrels[qid])
        if not relevant:
            continue

        cand = searcher.candidates(text)  # collection of internal ids
        cand_sizes.append(len(cand))

        cand_str = {int_to_str_id[i] for i in cand}
        hits += int(not relevant.isdisjoint(cand_str))
        total += 1

    recall = hits / total if total else 0.0
    print(f"\nRecall@{k} (candidate pool): {recall:.4f} ({hits}/{total})")
    if cand_sizes:
        print(f"Candidate size: mean={np.mean(cand_sizes):.1f}, median={np.median(cand_sizes):.1f}, p95={np.percentile(cand_sizes, 95):.1f}")
    return recall


In [5]:
# Load the sampled corpus, queries and relevance judgements, then index the
# documents; build_index also returns the internal-to-external id mapping
# that the recall computation needs.
docs_df, queries_df, qrels = aux.load_data()
index, int_to_str_id = aux.build_index(docs_df)

# NOTE(review): the meaning of the [0.0, 1.0] argument is not visible in this
# notebook (presumably scoring weights) - confirm against Searcher.
searcher = Searcher(index, [0.0, 1.0])

# Kept as the cell's last expression so the recall value is displayed.
recall_at_candidates(
    queries_df=queries_df,
    qrels=qrels,
    searcher=searcher,
    int_to_str_id=int_to_str_id,
    k=1000,
)


Indexing...


100%|██████████| 101937/101937 [01:35<00:00, 1063.70it/s]
Recall@1000 (candidates): 100%|██████████| 2000/2000 [01:03<00:00, 31.49it/s]


Recall@1000 (candidate pool): 0.8765 (1753/2000)
Candidate size: mean=847.2, median=1000.0, p95=1000.0





0.8765