In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to project source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Append the parent of the current working directory to sys.path (once) so
# that packages located there become importable.
# NOTE(review): this relies on the kernel's working directory being a direct
# subdirectory of the project root; presumably `minisearch` lives in that
# parent directory - confirm. Launching the notebook from elsewhere would put
# the wrong directory on the path.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex

import aux
import metrics

In [3]:
# Prepare the evaluation dataset through the project-local `aux` helper.
# NOTE(review): presumably force_reload=False reuses an existing local copy
# (the recorded output reports that the download was skipped) and
# n_queries / n_docs control the sample sizes - confirm in aux.
aux.download_and_sample(force_reload=False, n_queries=2000, n_docs=100000)

Dataset already exists. Skipping download.


In [4]:
import numpy as np
from tqdm import tqdm

def recall_at_candidates(queries_df, qrels, searcher, int_to_str_id, k=1000):
    """Measure how often the candidate pool contains a relevant document.

    Every query in ``queries_df`` that has a non-empty entry in ``qrels`` is
    sent to ``searcher.candidates``. The returned internal ids are translated
    through ``int_to_str_id`` and the query counts as a hit when at least one
    of its judged-relevant documents is in the translated pool. Checking all
    relevant documents (instead of a single arbitrarily chosen one) keeps the
    result independent of the iteration order of the relevance collection.

    Args:
        queries_df: DataFrame with ``query_id`` and ``text`` columns.
        qrels: Mapping from integer query id to a collection of relevant
            document ids, in the same id space as the values of
            ``int_to_str_id``.
        searcher: Object exposing ``candidates(text)`` that returns a sized
            iterable of internal document ids.
        int_to_str_id: Lookup indexable by internal id that yields the
            external document id.
        k: Only used in the progress-bar and report labels; the pool returned
            by ``searcher.candidates`` is not truncated to ``k`` here.
            NOTE(review): presumably the pool size is bounded by the
            searcher's own configuration - confirm it matches ``k``.

    Returns:
        Fraction of evaluated queries whose candidate pool contains a relevant
        document, or ``0.0`` when no query could be evaluated. A summary
        (recall plus candidate-pool size statistics) is also printed.
    """
    hits = 0
    total = 0
    cand_sizes = []

    # Iterating the two columns directly avoids building a Series per row.
    query_rows = zip(queries_df["query_id"], queries_df["text"])
    for raw_qid, text in tqdm(query_rows, total=len(queries_df), desc=f"Recall@{k} (candidates)"):
        qid = int(raw_qid)
        if qid not in qrels:
            continue

        relevant = set(qrels[qid])
        if not relevant:
            # No judged-relevant documents: the query cannot be scored.
            continue

        cand = searcher.candidates(text)
        cand_sizes.append(len(cand))

        cand_str = {int_to_str_id[i] for i in cand}
        hits += int(not relevant.isdisjoint(cand_str))
        total += 1

    recall = hits / total if total else 0.0
    print(f"\nRecall@{k} (candidate pool): {recall:.4f} ({hits}/{total})")
    if cand_sizes:
        print(f"Candidate size: mean={np.mean(cand_sizes):.1f}, median={np.median(cand_sizes):.1f}, p95={np.percentile(cand_sizes, 95):.1f}")
    return recall


In [5]:
from minisearch.fast_ranking import FastRanker
from minisearch.quorum import QuorumCandidateGenerator, QuorumConfig
from minisearch.index import tokenize

# Load the sampled data, then build the index together with the lookup that
# maps internal integer ids to the original document ids.
docs_df, queries_df, qrels = aux.load_data()
index, int_to_str_id = aux.build_index(docs_df)

# Ranker: weight vector handed to FastRanker.
# NOTE(review): the meaning of each position is defined by FastRanker and is
# not visible here - document the slots or name them.
weights = [0, 1.0, 0, 1.0, 0.6]
ranker = FastRanker(index, weights)

# Quorum candidate generator, configured separately for readability.
quorum_config = QuorumConfig(
    activate_if_candidates_lt=1000,
    target=1000,
    cap=5000,
)
quorum = QuorumCandidateGenerator(index, tokenize_fn=tokenize, config=quorum_config)

# Searcher combining the index, the ranker and the quorum generator.
searcher = Searcher(index, ranker, quorum)

Indexing...


100%|██████████| 101937/101937 [01:28<00:00, 1154.93it/s]


In [6]:
# Evaluate the candidate pool over the sampled queries. The function prints a
# summary and returns the recall value, which is displayed as the cell output.
# Here k only labels the progress bar and the printed report.
recall_at_candidates(queries_df, qrels, searcher, int_to_str_id, k=1000)


Recall@1000 (candidates): 100%|██████████| 2000/2000 [01:00<00:00, 33.29it/s]


Recall@1000 (candidate pool): 0.8765 (1753/2000)
Candidate size: mean=847.2, median=1000.0, p95=1000.0





0.8765