In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up on the next cell run without restarting the
# kernel.
%load_ext autoreload
%autoreload 2

In [8]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Make the parent of the notebook's working directory importable.
# NOTE(review): presumably this parent is the repository root that holds the
# `minisearch` package imported below -- confirm against the repo layout.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]

# Append only once so re-running the cell does not grow sys.path.
already_registered = any(entry == project_root for entry in sys.path)
if not already_registered:
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex, tokenize

from minisearch.fast_ranking import FastRanker
from minisearch.quorum import QuorumCandidateGenerator, QuorumConfig

import aux
import metrics

In [3]:
# Prepare the evaluation dataset with the requested sample sizes.
# With force_reload=False the recorded run reused the data already on disk
# ("Dataset already exists. Skipping download.").
aux.download_and_sample(
    force_reload=False,
    n_queries=2_000,
    n_docs=100_000,
)

Dataset already exists. Skipping download.


In [4]:
# Load the dataset pieces (documents, queries, relevance judgments) and build
# the search index over the documents.
# `int_to_str_id` is used by the evaluation cell to translate the ids returned
# by the searcher back into the ids that `qrels` is compared against.
# Indexing is the slow step: about a minute and a half in the recorded run.
docs_df, queries_df, qrels = aux.load_data()
index, int_to_str_id = aux.build_index(docs_df)

Indexing...


100%|██████████| 101937/101937 [01:28<00:00, 1158.25it/s]


In [5]:
# Build the ranker on top of the index with a fixed weight vector.
# NOTE(review): the meaning of each of the five weight positions is defined by
# FastRanker and is not visible in this notebook -- document the feature order
# here once confirmed. Two positions are set to 0.
weights = [0, 1.0, 0, 1.0, 0.6]
ranker = FastRanker(index, weights)

In [9]:
# Candidate generation settings for the quorum generator.
# NOTE(review): the parameter names suggest quorum matching activates when
# fewer than 1000 candidates are found, aims for 1000 and caps at 5000 --
# confirm the exact semantics in minisearch.quorum.
quorum_config = QuorumConfig(
    activate_if_candidates_lt=1000,
    target=1000,
    cap=5000,
)
quorum = QuorumCandidateGenerator(index, tokenize_fn=tokenize, config=quorum_config)

# Wire the index, ranker and candidate generator into the searcher that the
# evaluation cell queries.
searcher = Searcher(index, ranker, quorum)

In [10]:
# Evaluate the searcher on every query that has relevance judgments,
# collecting one MRR and one NDCG value per evaluated query.
TOP_K = 10  # ranking cutoff shared by both metrics and the report labels

mrr_scores = []
ndcg_scores = []

print("Evaluating...")
# Iterate the two needed columns directly; iterrows() would build a full
# Series object for every row just to read two fields.
query_pairs = zip(queries_df["query_id"], queries_df["text"])
for qid, text in tqdm(query_pairs, total=len(queries_df)):
    # Queries without relevance judgments cannot be scored, so skip them.
    if qid not in qrels:
        continue
    targets = qrels[qid]

    # The searcher returns pairs whose first element is the doc id; only the
    # ids are needed, translated into the id space used by `qrels`.
    results_int = [doc_id for doc_id, _ in searcher.search(text)]
    results_str = [int_to_str_id[i] for i in results_int]

    mrr_scores.append(metrics.mrr(results_str, targets, k=TOP_K))
    ndcg_scores.append(metrics.ndcg(results_str, targets, k=TOP_K))

print("\nResult Metrics:")
if mrr_scores:
    print(f"MRR@{TOP_K}:  {np.mean(mrr_scores):.4f}")
    print(f"NDCG@{TOP_K}: {np.mean(ndcg_scores):.4f}")
else:
    # np.mean([]) returns nan and emits a RuntimeWarning; report the real
    # cause instead of printing a misleading "nan" score.
    print("No queries with relevance judgments were evaluated.")

Evaluating...


100%|██████████| 2000/2000 [01:21<00:00, 24.58it/s]


Result Metrics:
MRR@10:  0.6048
NDCG@10: 0.6479



