In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Put the parent of the working directory on the import path, presumably so that
# the project-local imports that follow resolve when the notebook is run from
# its own folder.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]
if not sys.path.count(project_root):
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex

import aux
import metrics

In [3]:
# Fetch and subsample the dataset (going by the helper's name; its implementation
# is not visible here). With force_reload=False the recorded output shows the
# download being skipped because the dataset already exists.
aux.download_and_sample(
    force_reload=False,
    n_queries=1_000,
    n_docs=100_000,
)

Dataset already exists. Skipping download.


In [4]:
# Rank cutoff shared by both metrics and by the labels printed below, so the
# reported "@k" can never drift away from the k that was actually evaluated.
TOP_K = 10

# Load documents, queries and relevance judgments, then index the documents.
docs_df, queries_df, qrels = aux.load_data()

index, int_to_str_id = aux.build_index(docs_df)
searcher = Searcher(index)

mrr_scores = []
ndcg_scores = []

print("Evaluating...")
# Only two columns are needed, so iterate over them directly instead of
# materialising a Series per row with iterrows().
query_pairs = zip(queries_df["query_id"], queries_df["text"])
for qid, text in tqdm(query_pairs, total=len(queries_df)):
    # Queries without relevance judgments cannot be scored; they are left out
    # of the averages (the progress bar still counts them).
    if qid not in qrels:
        continue
    targets = qrels[qid]

    # search() yields pairs whose first element is an internal document id;
    # int_to_str_id maps it back to the external id (presumably the id space
    # used by qrels -- confirm against aux.build_index).
    results_str = [int_to_str_id[doc_id] for doc_id, _ in searcher.search(text)]

    mrr_scores.append(metrics.mrr(results_str, targets, k=TOP_K))
    ndcg_scores.append(metrics.ndcg(results_str, targets, k=TOP_K))

print("\nResult Metrics:")
if mrr_scores:
    print(f"MRR@{TOP_K}:  {np.mean(mrr_scores):.4f}")
    print(f"NDCG@{TOP_K}: {np.mean(ndcg_scores):.4f}")
else:
    # np.mean([]) would emit a RuntimeWarning and print "nan"; say why instead.
    print("No query had relevance judgments; metrics are undefined.")

Indexing...


100%|██████████| 100956/100956 [01:24<00:00, 1193.60it/s]


Building BM25 matrix...
BM25 Matrix built: 100956 docs x 1799422 terms
Evaluating...


100%|██████████| 1000/1000 [07:20<00:00,  2.27it/s] 


Result Metrics:
MRR@10:  0.4388
NDCG@10: 0.4570



