In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
from tqdm import tqdm

# Register the parent of the working directory on sys.path (only once), so the
# project imports that follow can be resolved from there.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]
if project_root not in sys.path:
    sys.path.append(project_root)

from minisearch.search import Searcher 
from minisearch.index import PositionalIndex, tokenize

from minisearch.fast_ranking import FastRanker
from minisearch.quorum import QuorumCandidateGenerator, QuorumConfig

import aux
import metrics

In [3]:
# Prepare the sampled dataset. With force_reload=False an existing copy is
# reused (the recorded output shows the download being skipped).
aux.download_and_sample(
    force_reload=False,
    n_queries=2_000,
    n_docs=100_000,
)

Dataset already exists. Skipping download.


In [4]:
# Load the documents, the queries and qrels (the per-query targets passed to
# the metric calls later in this notebook), then build the index.
# build_index also returns int_to_str_id, which later cells use to map the
# searcher's integer doc ids back to the string ids found in qrels.
docs_df, queries_df, qrels = aux.load_data()
index, int_to_str_id = aux.build_index(docs_df)

Indexing...


100%|██████████| 101937/101937 [01:25<00:00, 1187.51it/s]


In [5]:
# Build the ranker shared by every searcher configured below.
# NOTE(review): the meaning of each position in `weights` is defined inside
# FastRanker, which is not visible here — confirm the feature order before
# changing any of these values.
weights = [0, 1.0, 0, 1.0, 0.6]
ranker = FastRanker(index, weights)

**Big quorum**

In [6]:
# Quorum candidate generation with eight settings spelled out explicitly.
cfg = QuorumConfig(
    activate_if_candidates_lt=2_000,
    target=5_000,
    cap=50_000,
    anchor_pool=8,
    max_df_frac=0.2,
    k_frac=0.35,
    min_k=2,
    max_universe=200_000,
)

# Rebinds the notebook-wide `quorum` and `searcher`; any cell run afterwards
# works with these objects until another section replaces them.
quorum = QuorumCandidateGenerator(index, config=cfg, tokenize_fn=tokenize)
searcher = Searcher(index, ranker, quorum)

In [7]:
# Evaluate the current `searcher` at cut-offs 1, 10 and 100 and print every
# returned metric with four decimals.
scores = aux.eval_ranking(
    searcher,
    queries_df,
    qrels,
    int_to_str_id,
    ks=(1, 10, 100),
)
for metric_name, metric_value in scores.items():
    print(f"{metric_name}: {metric_value:.4f}")

Evaluating: 100%|██████████| 2000/2000 [03:05<00:00, 10.80it/s]

MRR@1: 0.5115
NDCG@1: 0.5115
MRR@10: 0.6052
NDCG@10: 0.6487
MRR@100: 0.6094
NDCG@100: 0.6688





**Small quorum**

In [8]:
# Quorum variant that overrides three settings only; every other field is left
# at whatever QuorumConfig defaults to.
cfg = QuorumConfig(
    activate_if_candidates_lt=1_000,
    target=1_000,
    cap=5_000,
)

# Rebinds the notebook-wide `quorum` and `searcher` for the cells that follow.
quorum = QuorumCandidateGenerator(index, config=cfg, tokenize_fn=tokenize)
searcher = Searcher(index, ranker, quorum)

In [9]:
# Same evaluation as for the other configurations: metrics at cut-offs 1, 10
# and 100 for the current `searcher`, printed with four decimals.
scores = aux.eval_ranking(
    searcher,
    queries_df,
    qrels,
    int_to_str_id,
    ks=(1, 10, 100),
)
for metric_name, metric_value in scores.items():
    print(f"{metric_name}: {metric_value:.4f}")


Evaluating: 100%|██████████| 2000/2000 [01:36<00:00, 20.71it/s]

MRR@1: 0.5115
NDCG@1: 0.5115
MRR@10: 0.6048
NDCG@10: 0.6479
MRR@100: 0.6089
NDCG@100: 0.6668





**Dummy quorum**

In [6]:
# Quorum variant with a much smaller target (50) than the other two
# configurations; fields not listed keep QuorumConfig's defaults.
cfg = QuorumConfig(
    activate_if_candidates_lt=1_000,
    target=50,
    cap=2_000,
)

# Rebinds the notebook-wide `quorum` and `searcher` for the cells that follow.
quorum = QuorumCandidateGenerator(index, config=cfg, tokenize_fn=tokenize)
searcher = Searcher(index, ranker, quorum)

In [7]:
# Evaluate the current `searcher` at cut-offs 1, 10 and 100 and print each
# metric on its own line, rounded to four decimals.
scores = aux.eval_ranking(
    searcher,
    queries_df,
    qrels,
    int_to_str_id,
    ks=(1, 10, 100),
)
for metric_name, metric_value in scores.items():
    print(f"{metric_name}: {metric_value:.4f}")


Evaluating: 100%|██████████| 2000/2000 [01:07<00:00, 29.60it/s]

MRR@1: 0.4920
NDCG@1: 0.4920
MRR@10: 0.5641
NDCG@10: 0.5939
MRR@100: 0.5649
NDCG@100: 0.5972





**R2 model (cross-encoder reranking)**

In [13]:
# Second-stage (R2) evaluation: take the first-stage results from `searcher`,
# let a cross-encoder reorder them, then report MRR@1 and MRR@10.
#
# NOTE(review): `searcher` is rebound by every quorum section of this notebook,
# so this cell scores whichever configuration was built most recently in the
# running kernel. Rebuild the intended searcher right before running it.
from sentence_transformers import CrossEncoder  # used by this cell only

RETRIEVE_K = 100  # first-stage results requested per query
RERANK_K = 10     # forwarded as rerank_k to aux.rerank_with_cross_encoder

ce = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
doc_text = aux.make_doc_text_map(docs_df, id_col="doc_id", text_col="body")

mrr1 = []
mrr10 = []

# tqdm, np and metrics come from the import cell at the top of the notebook.
# Iterating over the two columns directly avoids binding `_` at notebook level,
# which would shadow IPython's "last output" variable.
query_pairs = zip(queries_df["query_id"], queries_df["text"])
for raw_qid, text in tqdm(query_pairs, total=len(queries_df)):
    qid = int(raw_qid)
    if qid not in qrels:
        # No targets recorded for this query, so it cannot be scored.
        continue
    targets = qrels[qid]

    # First stage: keep only the doc ids from the (doc_id, score) pairs.
    results_int = [doc_id for doc_id, _score in searcher.search(text, top_k=RETRIEVE_K)]
    # Second stage: cross-encoder reranking of those candidates.
    results_int = aux.rerank_with_cross_encoder(
        searcher, text, results_int, int_to_str_id, doc_text, ce, rerank_k=RERANK_K
    )

    # qrels holds string ids, so translate before computing the metric.
    results_str = [int_to_str_id[i] for i in results_int]
    mrr1.append(metrics.mrr(results_str, targets, k=1))
    mrr10.append(metrics.mrr(results_str, targets, k=10))

# Fail loudly instead of printing nan when no query could be scored.
assert mrr1, "no query in queries_df has an entry in qrels; nothing to average"

print("MRR@1:", float(np.mean(mrr1)))
print("MRR@10:", float(np.mean(mrr10)))


100%|██████████| 2000/2000 [03:22<00:00,  9.87it/s]

MRR@1: 0.6055
MRR@10: 0.6759392857142857



