In [1]:
from pathlib import Path
import sys
sys.path.append('../')
from quantized_fast_forward.fast_forward.index import FaissPQIndex, Mode
from quantized_fast_forward.fast_forward.encoder import QueryEncoder
from quantized_fast_forward.fast_forward.ranking import Ranking
from pyserini.search.faiss import AggretrieverQueryEncoder
import numpy as np
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# define the query encoders
class FFAggretrieverQueryEncoder(QueryEncoder):
    """``QueryEncoder`` that delegates to pyserini's ``AggretrieverQueryEncoder``."""

    def __init__(self, model_name, device="cpu"):
        """Build the wrapped pyserini encoder.

        Args:
            model_name: Forwarded as the first argument of
                ``AggretrieverQueryEncoder``.
            device: Forwarded as the ``device`` keyword; defaults to ``"cpu"``.
        """
        self._enc = AggretrieverQueryEncoder(model_name, device=device)

    # TODO: implement batch encoding
    def encode(self, queries):
        """Encode the queries one at a time and return them as a single array.

        Per the original author's note, pyserini flattens its outputs, which
        is why each query is encoded individually instead of as one batch.
        """
        encoded_queries = []
        for query in queries:
            encoded_queries.append(self._enc.encode(query))
        return np.array(encoded_queries)

In [3]:
# load the queries
import ir_datasets

# Dataset handle for "msmarco-passage/trec-dl-2020"; also used later for qrels.
psg20 = ir_datasets.load("msmarco-passage/trec-dl-2020")

# Lookup table: query_id -> query text.
queries_psg20 = dict(
    (query.query_id, query.text) for query in psg20.queries_iter()
)

In [4]:
# load the ranking
bm25_ranking = Ranking.from_file(Path("/home/catalinlup/MyWorkspace/MasterThesis/datasets/run_files/msmarco_psg_bm25_rankings.tsv"))
print(f"loaded run with {len(bm25_ranking)} queries")

# Check that every query id in the run has an entry in queries_psg20.
# All offending ids are gathered first so that a failure names them, rather
# than stopping at the first one with a message-less AssertionError.
missing_q_ids = [q_id for q_id in bm25_ranking.q_ids if q_id not in queries_psg20]
assert not missing_q_ids, (
    f"{len(missing_q_ids)} query ids in the run have no query text: {missing_q_ids}"
)

loaded run with 200 queries


In [5]:
# # run the re-ranking experiment without any interpolation
# alpha2 = 0.0
# result2 = index.get_scores(
#     bm25_ranking,
#     queries_psg20,
#     alpha=alpha2,
# )

In [6]:

# print(
#     "BM25",
#     calc_aggregate(METRICS, psg20.qrels_iter(), bm25_ranking.run),
# )
# print(
#     f"BM25, Aggretriever (alpha={alpha2})",
#     calc_aggregate(METRICS, psg20.qrels_iter(), result2[alpha2].run),
# )

# Experiments

In [5]:
# Experiment name -> path of the pickled quantized index. Every index lives in
# the same directory and is stored as "<experiment name>.pickle", so the
# mapping is derived from the list of names.
INDICES = {
    index_name: f'/home/catalinlup/MyWorkspace/MasterThesis/datasets/quantized_indices/{index_name}.pickle'
    for index_name in (
        'agg_m_6_k_256_10000',
        'agg_m_6_k_256_200000',
        # 'agg_m_6_k_512_200000',
        # 'agg_m_6_k_1024_200000',
        # 'agg_m_6_k_2048_200000',
        'agg_m_6_k_4096_200000',
    )
}

In [6]:
from ir_measures import calc_aggregate, nDCG, AP, RR
# Measures handed to calc_aggregate when results are printed.
METRICS = [
    nDCG @ 10,
    AP(rel=2) @ 1000,
    RR(rel=2) @ 10,
]

# Values passed to index.get_scores as `alpha`; results are later looked up
# by these same values.
ALPHA_RANGE = [0, 0.1, 0.3, 0.5, 0.7]

def perform_experiment(
    index_path: str,
    model_name: str = "castorini/aggretriever-cocondenser",
    alpha=None,
):
    """Load one quantized index from disk and score the BM25 run against it.

    Args:
        index_path: Location of the serialized index; wrapped in ``Path`` and
            handed to ``FaissPQIndex.from_disk``.
        model_name: Name given to ``FFAggretrieverQueryEncoder``. Defaults to
            the checkpoint that used to be hard-coded in this function.
        alpha: Value(s) forwarded to ``get_scores`` as ``alpha``. ``None``
            (the default) means the notebook-level ``ALPHA_RANGE``.

    Returns:
        Whatever ``index.get_scores`` returns for ``bm25_ranking`` and
        ``queries_psg20``.
    """
    # Resolve the default at call time so edits to ALPHA_RANGE are picked up
    # without re-running this definition.
    if alpha is None:
        alpha = ALPHA_RANGE

    index = FaissPQIndex.from_disk(
        Path(index_path),
        FFAggretrieverQueryEncoder(model_name),
    )

    return index.get_scores(
        bm25_ranking,
        queries_psg20,
        alpha=alpha,
    )

def run_experiments(experiment_dict: dict) -> dict:
    """Run ``perform_experiment`` once for every entry of ``experiment_dict``.

    Args:
        experiment_dict: Maps an experiment name to the index path to evaluate.

    Returns:
        Dict mapping each experiment name to the value ``perform_experiment``
        returned for its path, in the iteration order of ``experiment_dict``.
    """
    # Take the path from the argument itself. The earlier version looked it up
    # in the global INDICES, which ignored the values passed in and raised
    # KeyError for any experiment name that INDICES did not contain.
    return {
        experiment_name: perform_experiment(index_path)
        for experiment_name, index_path in experiment_dict.items()
    }


def print_experimental_result(experimental_results: dict):
    """Print the aggregate metrics for every experiment at every alpha.

    Args:
        experimental_results: Maps an experiment name to an object that is
            indexed by each value of ``ALPHA_RANGE`` and yields something with
            a ``.run`` attribute.

    One line is printed per (experiment, alpha) pair: a label followed by the
    result of ``calc_aggregate`` over ``METRICS`` and ``psg20``'s qrels.
    """
    for experiment_name, per_alpha in experimental_results.items():
        for alpha in ALPHA_RANGE:
            label = f"BM25, Aggretriever (quant={experiment_name} ,alpha={alpha})"
            aggregate = calc_aggregate(METRICS, psg20.qrels_iter(), per_alpha[alpha].run)
            print(label, aggregate)
        

In [7]:
# One perform_experiment call per entry of INDICES.
results = run_experiments(experiment_dict=INDICES)

100%|█████████████████████████████████████████| 200/200 [00:19<00:00, 10.18it/s]
100%|█████████████████████████████████████████| 200/200 [00:21<00:00,  9.37it/s]
100%|█████████████████████████████████████████| 200/200 [00:22<00:00,  8.71it/s]


In [8]:
# Persist the results. The `with` block closes the file handle once the dump
# is done (the bare open() call never closed it), and the target directory is
# created first so the write does not fail when `results/` is missing.
results_path = Path('results/bm25_agg_pq_interpolation_results.pickle')
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open('wb') as results_file:
    pickle.dump(results, results_file)

In [9]:
# Report the aggregate metrics for every experiment / alpha combination.
print_experimental_result(experimental_results=results)

BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0) {RR(rel=2)@10: 0.42116402116402113, nDCG@10: 0.31777260168661636, AP(rel=2)@1000: 0.19007536867518368}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.1) {RR(rel=2)@10: 0.536111111111111, nDCG@10: 0.40659782260008326, AP(rel=2)@1000: 0.23576199774190162}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.3) {RR(rel=2)@10: 0.6116255144032922, nDCG@10: 0.48139347389733217, AP(rel=2)@1000: 0.28775042034605847}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.5) {RR(rel=2)@10: 0.6513374485596709, nDCG@10: 0.5330075624541892, AP(rel=2)@1000: 0.3309835914053803}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.7) {RR(rel=2)@10: 0.6536522633744857, nDCG@10: 0.5409773554316752, AP(rel=2)@1000: 0.3419102659176829}
BM25, Aggretriever (quant=agg_m_6_k_256_200000 ,alpha=0) {RR(rel=2)@10: 0.49212228101116995, nDCG@10: 0.36089018324647426, AP(rel=2)@1000: 0.20934656069624788}
BM25, Aggretriever (quant=agg_m_6_k_256_200000