In [1]:
# Enable the autoreload extension (mode 2) so that edits to imported .py
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Make the parent of the working directory importable (added at most once) so
# that the `minisearch` package imported further down can be resolved.
# NOTE(review): this presumes the notebook is launched from a directory that
# sits directly under the project root -- confirm if it is run from elsewhere.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]  # head of the split == dirname
if project_root not in sys.path:
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex

import aux
import metrics

In [3]:
# Prepare the evaluation sample. Judging by the recorded output, an existing
# local copy is reused when force_reload is False (TODO confirm in `aux`).
aux.download_and_sample(
    n_docs=10_000,
    n_queries=500,
    force_reload=False,
)

Dataset already exists. Skipping download.


In [4]:
# Evaluate the search engine on the sampled queries and report MRR / NDCG.

# Rank cutoff shared by both metrics and by the labels printed at the end.
TOP_K = 10

docs_df, queries_df, qrels = aux.load_data()

index, int_to_str_id = aux.build_index(docs_df)
searcher = Searcher(index)

mrr_scores = []
ndcg_scores = []

print("Evaluating...")
# Iterate over the two needed columns directly: this avoids building a Series
# per row and does not bind the notebook-global `_`, which IPython uses for the
# last cell output.
query_pairs = zip(queries_df["query_id"], queries_df["text"])
for qid, text in tqdm(query_pairs, total=len(queries_df)):
    # Skip queries that have no relevant documents in our sample.
    if qid not in qrels:
        continue
    targets = qrels[qid]

    # Search: each result is a pair and only its first element (the internal
    # doc id) is used here.
    results_int = [doc_id for doc_id, _ in searcher.search(text)]

    # Convert the internal int ids back to the string MS MARCO ids.
    results_str = [int_to_str_id[i] for i in results_int]

    # Compute the metrics for this query.
    mrr_scores.append(metrics.mrr(results_str, targets, k=TOP_K))
    ndcg_scores.append(metrics.ndcg(results_str, targets, k=TOP_K))

if mrr_scores:
    print("\nResult Metrics:")
    print(f"MRR@{TOP_K}:  {np.mean(mrr_scores):.4f}")
    print(f"NDCG@{TOP_K}: {np.mean(ndcg_scores):.4f}")
else:
    # np.mean([]) would emit a RuntimeWarning and print "nan"; say what
    # actually happened instead.
    print("\nNo query in the sample has relevance judgments; nothing to evaluate.")

Indexing...


100%|██████████| 10499/10499 [00:09<00:00, 1166.47it/s]


Building sparse matrix for ranking...
Matrix built: 10499 docs x 364893 terms
Evaluating...


100%|██████████| 500/500 [02:04<00:00,  4.00it/s]


Result Metrics:
MRR@10:  0.2929
NDCG@10: 0.3423



