In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to the project's .py files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Put the parent of the working directory on the import path so the project
# imports that follow can resolve.
# NOTE(review): assumes the kernel's working directory is a direct child of
# the project root -- confirm if the notebook is launched from elsewhere.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]
if project_root not in sys.path:
    # Appended (not prepended), so existing sys.path entries keep precedence.
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex

import aux
import metrics

In [3]:
# Prepare the local dataset sample used by the rest of the notebook.
# With force_reload=False the recorded run reported that the dataset already
# existed and skipped the download.
aux.download_and_sample(
    force_reload=False,
    n_queries=2000,
    n_docs=100_000,
)

Dataset already exists. Skipping download.


In [4]:
# Load the evaluation data. `queries_df` is iterated row by row below and read
# through its "query_id" and "text" columns; `qrels` is looked up by query id
# to obtain the relevant targets for a query.
docs_df, queries_df, qrels = aux.load_data()

# Build the search index over the documents. `int_to_str_id` translates the
# ids returned by the searcher into the ids compared against `qrels`.
# NOTE(review): in the recorded run this step took roughly a minute and a half
# -- consider keeping it in its own cell so re-running the evaluation does not
# rebuild the index.
index, int_to_str_id = aux.build_index(docs_df)

# Candidate scoring configurations: (label, 4-element weight vector) pairs in
# the order they are evaluated. Each vector is passed as-is to Searcher.
# NOTE(review): the labels suggest the component order is
# (overlap, bm25, coverage, title) -- confirm against Searcher.
weight_sets = list(
    {
        "bm25_only": [0.0, 1.0, 0.0, 0.0],
        "overlap_only": [1.0, 0.0, 0.0, 0.0],
        "all_1": [1.0, 1.0, 1.0, 1.0],
        "bm25_cov": [0.0, 1.0, 0.5, 0.0],
        "bm25_title": [0.0, 1.0, 0.0, 0.5],
        "bm25_cov_title": [0.0, 1.0, 0.5, 0.5],
        "bm25_cov_strong": [0.0, 1.0, 1.0, 0.0],
        "bm25_title_strong": [0.0, 1.0, 0.0, 1.0],
        "bm25_small_bonus": [0.1, 1.0, 0.2, 0.2],
    }.items()
)

def evaluate_weights(index, int_to_str_id, weights, judged_queries, k=10, desc=None):
    """Score one weight configuration over a set of judged queries.

    Args:
        index: The index object handed to ``Searcher``.
        int_to_str_id: Lookup from the ids returned by the searcher to the
            ids that the relevance judgments are expressed in.
        weights: Weight vector passed unchanged to ``Searcher``.
        judged_queries: Sequence of ``(query_text, targets)`` pairs.
        k: Rank cutoff forwarded to both ``metrics.mrr`` and ``metrics.ndcg``.
        desc: Optional label shown on the progress bar.

    Returns:
        ``(mean_mrr, mean_ndcg)`` as floats; ``0.0`` for both when
        ``judged_queries`` is empty.
    """
    searcher = Searcher(index, weights)

    mrr_scores, ndcg_scores = [], []
    for text, targets in tqdm(judged_queries, desc=desc):
        # search() yields pairs; only the first element (the doc id) is used.
        ranked_ids = [int_to_str_id[doc_id] for doc_id, _ in searcher.search(text)]
        mrr_scores.append(metrics.mrr(ranked_ids, targets, k=k))
        ndcg_scores.append(metrics.ndcg(ranked_ids, targets, k=k))

    mean_mrr = float(np.mean(mrr_scores)) if mrr_scores else 0.0
    mean_ndcg = float(np.mean(ndcg_scores)) if ndcg_scores else 0.0
    return mean_mrr, mean_ndcg


# Queries without relevance judgments are skipped. Filtering and looking up
# the targets does not depend on the weights, so it is done once here rather
# than once per configuration. As a consequence the progress bar total counts
# judged queries only.
judged_queries = [
    (text, qrels[qid])
    for qid, text in zip(queries_df["query_id"], queries_df["text"])
    if qid in qrels
]

# One (name, weights, mean MRR, mean NDCG) tuple per configuration.
results = []
for name, w in weight_sets:
    mrr, ndcg = evaluate_weights(index, int_to_str_id, w, judged_queries, desc=name)
    results.append((name, w, mrr, ndcg))

# Rank configurations best-first by mean NDCG (element 3 of each result
# tuple). The sort is in place, so `results` stays ordered afterwards.
results.sort(key=lambda x: x[3], reverse=True)

# Pad labels to the longest one being printed, never below the original
# 16-character column. A fixed width of 16 is too narrow for labels such as
# "bm25_title_strong" and pushes that row out of alignment.
name_width = max([16] + [len(entry[0]) for entry in results[:10]])

print("\nTop configs by NDCG@10:")
for name, w, mrr, ndcg in results[:10]:
    print(f"{name:{name_width}s} w={w} | MRR@10={mrr:.4f} NDCG@10={ndcg:.4f}")


Indexing...


100%|██████████| 101937/101937 [01:26<00:00, 1183.69it/s]
bm25_only: 100%|██████████| 2000/2000 [01:03<00:00, 31.47it/s]
overlap_only: 100%|██████████| 2000/2000 [01:03<00:00, 31.36it/s]
all_1: 100%|██████████| 2000/2000 [01:04<00:00, 31.23it/s]
bm25_cov: 100%|██████████| 2000/2000 [01:03<00:00, 31.42it/s]
bm25_title: 100%|██████████| 2000/2000 [01:03<00:00, 31.48it/s]
bm25_cov_title: 100%|██████████| 2000/2000 [01:07<00:00, 29.83it/s]
bm25_cov_strong: 100%|██████████| 2000/2000 [01:03<00:00, 31.40it/s]
bm25_title_strong: 100%|██████████| 2000/2000 [01:06<00:00, 29.88it/s]
bm25_small_bonus: 100%|██████████| 2000/2000 [01:07<00:00, 29.79it/s]


Top configs by NDCG@10:
bm25_title_strong w=[0.0, 1.0, 0.0, 1.0] | MRR@10=0.6032 NDCG@10=0.6461
bm25_title       w=[0.0, 1.0, 0.0, 0.5] | MRR@10=0.5980 NDCG@10=0.6419
bm25_cov_title   w=[0.0, 1.0, 0.5, 0.5] | MRR@10=0.5960 NDCG@10=0.6401
bm25_small_bonus w=[0.1, 1.0, 0.2, 0.2] | MRR@10=0.5922 NDCG@10=0.6365
bm25_only        w=[0.0, 1.0, 0.0, 0.0] | MRR@10=0.5911 NDCG@10=0.6355
bm25_cov         w=[0.0, 1.0, 0.5, 0.0] | MRR@10=0.5896 NDCG@10=0.6341
bm25_cov_strong  w=[0.0, 1.0, 1.0, 0.0] | MRR@10=0.5896 NDCG@10=0.6339
all_1            w=[1.0, 1.0, 1.0, 1.0] | MRR@10=0.5784 NDCG@10=0.6212
overlap_only     w=[1.0, 0.0, 0.0, 0.0] | MRR@10=0.1746 NDCG@10=0.2214



