In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Make the parent of the kernel's working directory importable.
# Presumably that parent is the repository root holding the `minisearch`
# package imported right below — this relies on the kernel being started
# from the notebook's own folder.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]
if sys.path.count(project_root) == 0:
    # Appended only once, so re-running the cell does not grow sys.path.
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex

import aux
import metrics

In [3]:
# Prepare the evaluation dataset via the project helper, requesting a sample
# of 2000 queries and 100000 documents.
# NOTE(review): force_reload=False presumably reuses an existing local copy
# (the recorded output reports the download being skipped) — confirm in aux.
# NOTE(review): n_docs does not look like an exact cap; the recorded indexing
# run further down counts 101937 documents — verify how the sample is built.
aux.download_and_sample(force_reload=False, n_queries=2000, n_docs=100000)

Dataset already exists. Skipping download.


In [None]:
# Load the documents, the queries and the relevance judgments.
# `qrels` is looked up by query id in the evaluation cell below.
docs_df, queries_df, qrels = aux.load_data()

# Build the search index over the documents. `int_to_str_id` is used below to
# translate the integer doc ids returned by search into the ids found in qrels.
# NOTE(review): the "weights experiment" cell repeats this load + build step.
index, int_to_str_id = aux.build_index(docs_df)

# Baseline searcher with weights [1.0, 0].
# NOTE(review): what each of the two weights controls is not visible here —
# confirm against minisearch.search.Searcher.
searcher = Searcher(index, [1.0, 0])

In [None]:
# Score the baseline `searcher` on every query that has relevance judgments,
# collecting one MRR@10 and one NDCG@10 value per query.
mrr_scores = []
ndcg_scores = []

print("Evaluating...")
for _, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
    qid, text = row["query_id"], row["text"]

    # A query without judgments cannot be scored, so it is left out of the
    # averages entirely rather than counted as a zero.
    if qid not in qrels:
        continue
    targets = qrels[qid]

    # search() yields pairs whose first element is an integer doc id (the
    # second element is unused here); map it to the id space used by qrels.
    results_str = [int_to_str_id[doc_id] for doc_id, _ in searcher.search(text)]

    mrr_scores.append(metrics.mrr(results_str, targets, k=10))
    ndcg_scores.append(metrics.ndcg(results_str, targets, k=10))

print("\nResult Metrics:")
if mrr_scores:
    print(f"MRR@10:  {np.mean(mrr_scores):.4f}")
    print(f"NDCG@10: {np.mean(ndcg_scores):.4f}")
else:
    # np.mean([]) would print "nan" and raise a RuntimeWarning; say what
    # actually happened instead.
    print("No query had relevance judgments; nothing to report.")

**Weights experiment:** compare two `Searcher` weight settings, `[1.0, 0]` vs `[0, 1.0]`, on the same queries.

In [4]:
# Load the documents, the queries and the relevance judgments.
# NOTE(review): this repeats the load + index build of the baseline setup
# cell above; if that cell has already run, this work is redundant.
docs_df, queries_df, qrels = aux.load_data()

# Build the search index. `int_to_str_id` is used in the evaluation cell to
# translate the integer doc ids returned by search into the ids found in qrels.
index, int_to_str_id = aux.build_index(docs_df)

# Two searchers over the same index that differ only in their weight vector:
# all weight on the first component vs. all weight on the second.
# NOTE(review): what each component controls is not visible here — confirm
# against minisearch.search.Searcher.
searcher1 = Searcher(index, [1.0, 0])
searcher2 = Searcher(index, [0, 1.0])

Indexing...


100%|██████████| 101937/101937 [01:19<00:00, 1288.84it/s]


In [5]:
def _score_query(searcher, text, targets, k=10):
    """Run one query through `searcher` and score the returned ranking.

    Args:
        searcher: object whose ``search(text)`` yields pairs with an integer
            doc id as first element (the second element is ignored).
        text: the query string.
        targets: the relevance judgments for this query, as stored in qrels.
        k: cutoff forwarded to both metrics.

    Returns:
        A ``(mrr, ndcg)`` tuple as computed by ``metrics.mrr`` and
        ``metrics.ndcg`` on the ranking expressed in qrels' id space.
    """
    ranked_ids = [int_to_str_id[doc_id] for doc_id, _ in searcher.search(text)]
    return (
        metrics.mrr(ranked_ids, targets, k=k),
        metrics.ndcg(ranked_ids, targets, k=k),
    )


mrr1, ndcg1 = [], []
mrr2, ndcg2 = [], []

print("Evaluating (2 searchers)...")
for _, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
    qid, text = row["query_id"], row["text"]

    # A query without judgments cannot be scored; skipping it for both
    # searchers keeps the four score lists aligned query by query.
    if qid not in qrels:
        continue
    targets = qrels[qid]

    # Searcher 1
    mrr_value, ndcg_value = _score_query(searcher1, text, targets)
    mrr1.append(mrr_value)
    ndcg1.append(ndcg_value)

    # Searcher 2
    mrr_value, ndcg_value = _score_query(searcher2, text, targets)
    mrr2.append(mrr_value)
    ndcg2.append(ndcg_value)

print("\nResult Metrics:")
if mrr1:
    # Compute each mean once and reuse it for the table and the deltas.
    mean_mrr1, mean_ndcg1 = np.mean(mrr1), np.mean(ndcg1)
    mean_mrr2, mean_ndcg2 = np.mean(mrr2), np.mean(ndcg2)

    print(f"Searcher1 | MRR@10:  {mean_mrr1:.4f} | NDCG@10: {mean_ndcg1:.4f}")
    print(f"Searcher2 | MRR@10:  {mean_mrr2:.4f} | NDCG@10: {mean_ndcg2:.4f}")

    # Both searchers were scored on exactly the same queries, so the
    # difference of the means equals the mean per-query difference.
    print("\nDeltas (Searcher2 - Searcher1):")
    print(f"ΔMRR@10:  {(mean_mrr2 - mean_mrr1):.4f}")
    print(f"ΔNDCG@10: {(mean_ndcg2 - mean_ndcg1):.4f}")
else:
    # np.mean([]) would print "nan" and raise a RuntimeWarning; say what
    # actually happened instead.
    print("No query had relevance judgments; nothing to report.")


Evaluating (2 searchers)...


100%|██████████| 2000/2000 [01:59<00:00, 16.71it/s]


Result Metrics:
Searcher1 | MRR@10:  0.1697 | NDCG@10: 0.2089
Searcher2 | MRR@10:  0.4420 | NDCG@10: 0.4616

Deltas (Searcher2 - Searcher1):
ΔMRR@10:  0.2723
ΔNDCG@10: 0.2527



