In [1]:
from pathlib import Path
import sys
sys.path.append('../')
from quantized_fast_forward.fast_forward.index import FaissPQIndex, Mode
from quantized_fast_forward.fast_forward.encoder import QueryEncoder
from quantized_fast_forward.fast_forward.ranking import Ranking
from pyserini.search.faiss import AggretrieverQueryEncoder
import numpy as np
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# define the query encoder
class FFAggretrieverQueryEncoder(QueryEncoder):
    """Query encoder that delegates to pyserini's ``AggretrieverQueryEncoder``."""

    def __init__(self, model_name, device="cpu"):
        """Create the wrapped pyserini encoder.

        Args:
            model_name: Forwarded as the first argument to ``AggretrieverQueryEncoder``.
            device: Forwarded as the ``device`` keyword (defaults to ``"cpu"``).
        """
        self._enc = AggretrieverQueryEncoder(model_name, device=device)

    def encode(self, queries):
        """Encode every query on its own and collect the results in one numpy array.

        Args:
            queries: Iterable of queries; each item is handed to the wrapped encoder.

        Returns:
            ``np.array`` built from the list of per-query encodings, in input order.
        """
        # Queries are encoded one at a time: per the original author's note,
        # pyserini flattens its outputs, so a per-query call is needed here.
        # TODO: implement batch encoding
        encoded_queries = []
        for query in queries:
            encoded_queries.append(self._enc.encode(query))
        return np.array(encoded_queries)

In [3]:
# load the queries
import ir_datasets

psg20 = ir_datasets.load("msmarco-passage/trec-dl-2020")
# map query ID -> query text for every query yielded by the dataset
queries_psg20 = dict((query.query_id, query.text) for query in psg20.queries_iter())

In [5]:
# load the ranking
# NOTE(review): the variable is named `bm25_ranking`, but the run file name says SPLADE
# (`ms_marco_psg_splade_ranking.tsv`); the name is kept because other cells refer to it.
bm25_ranking = Ranking.from_file(Path("/home/catalinlup/MyWorkspace/MasterThesis/datasets/run_files/ms_marco_psg_splade_ranking.tsv"))
print(f"loaded run with {len(bm25_ranking)} queries")

# Every query ID in the run must be present in the loaded query set.
# Collect all missing IDs first so a failure reports them instead of a bare AssertionError.
missing_q_ids = [q_id for q_id in bm25_ranking.q_ids if q_id not in queries_psg20]
assert not missing_q_ids, (
    f"{len(missing_q_ids)} query IDs of the run are missing from queries_psg20, "
    f"e.g. {missing_q_ids[:10]}"
)

loaded run with 200 queries


# Experiments

In [9]:
# Directory that holds the quantized index files.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
INDEX_DIR = '/home/catalinlup/MyWorkspace/MasterThesis/datasets/quantized_indices'

# Names of the indices to evaluate; each one is stored as `<INDEX_DIR>/<name>.pickle`.
# Commented-out names are configurations that are currently skipped.
INDEX_NAMES = [
    'agg_m_6_k_256_10000',
    'agg_m_6_k_256_200000',
    # 'agg_m_6_k_512_200000',
    # 'agg_m_6_k_1024_200000',
    # 'agg_m_6_k_2048_200000',
    'agg_m_6_k_4096_200000',
]

# experiment name -> path of its index file (plain strings, in the order listed above)
INDICES = {name: f'{INDEX_DIR}/{name}.pickle' for name in INDEX_NAMES}

In [10]:
from ir_measures import calc_aggregate, nDCG, AP, RR
# Evaluation measures handed to `calc_aggregate` when the results are printed.
METRICS = [nDCG@10, AP(rel=2)@1000, RR(rel=2)@10]

# Values passed as `alpha` to `index.get_scores`; the returned results are looked up by these values.
ALPHA_RANGE = [0, 0.1, 0.3, 0.5, 0.7]

def perform_experiment(index_path: str):
    """Score the loaded ranking against a single index file.

    Loads a ``FaissPQIndex`` from ``index_path`` with an
    ``FFAggretrieverQueryEncoder`` for "castorini/aggretriever-cocondenser",
    then calls its ``get_scores`` on the module-level ``bm25_ranking`` and
    ``queries_psg20`` with ``alpha=ALPHA_RANGE``.

    Args:
        index_path: Location of the index file on disk.

    Returns:
        Whatever ``get_scores`` returns; other code in this notebook indexes it
        by alpha value and reads ``.run`` from each entry.
    """
    index_file = Path(index_path)
    query_encoder = FFAggretrieverQueryEncoder("castorini/aggretriever-cocondenser")
    pq_index = FaissPQIndex.from_disk(index_file, query_encoder)

    return pq_index.get_scores(bm25_ranking, queries_psg20, alpha=ALPHA_RANGE)

def run_experiments(experiment_dict: dict) -> dict:
    """Run ``perform_experiment`` for every entry of ``experiment_dict``.

    Args:
        experiment_dict: Maps an experiment name to the path of the index file
            to evaluate.

    Returns:
        A dict mapping each experiment name to the value ``perform_experiment``
        returned for its path, in the iteration order of ``experiment_dict``.
    """
    # Paths are taken from the argument itself rather than from the global INDICES,
    # so any name -> path mapping can be passed in.
    return {
        experiment_name: perform_experiment(index_path)
        for experiment_name, index_path in experiment_dict.items()
    }


def print_experimental_result(experimental_results: dict):
    """Print the aggregated metrics for every experiment and every alpha value.

    For each experiment name in ``experimental_results`` and each value in
    ``ALPHA_RANGE``, one line is printed: a label naming the experiment and the
    alpha, followed by the result of ``calc_aggregate`` for ``METRICS`` over the
    ``psg20`` qrels and the ``.run`` of the entry stored under that alpha.

    Args:
        experimental_results: Maps an experiment name to an object that can be
            indexed by alpha value; each entry must expose a ``run`` attribute.
    """
    for experiment_name, per_alpha in experimental_results.items():
        for ALPHA in ALPHA_RANGE:
            label = f"SPLADE, Aggretriever (quant={experiment_name} ,alpha={ALPHA})"
            # a fresh qrels iterator is requested for every aggregate computation
            aggregated = calc_aggregate(METRICS, psg20.qrels_iter(), per_alpha[ALPHA].run)
            print(label, aggregated)
        

In [11]:
# Evaluate every index listed in INDICES; the returned dict is keyed by experiment name.
results = run_experiments(INDICES)

100%|█████████████████████████████████████████| 200/200 [00:21<00:00,  9.34it/s]
100%|█████████████████████████████████████████| 200/200 [00:19<00:00, 10.01it/s]
100%|█████████████████████████████████████████| 200/200 [00:23<00:00,  8.48it/s]


In [12]:
# Persist the raw results; the context manager closes the file even if pickling fails.
with open('results/splade_agg_pq_interpolation_results.pickle', 'wb') as results_file:
    pickle.dump(results, results_file)

In [13]:
# Print the aggregated metrics for every evaluated index and every alpha value.
print_experimental_result(results)

BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0) {nDCG@10: 0.25162016387562613, AP(rel=2)@1000: 0.13821564177450968, RR(rel=2)@10: 0.36285273368606696}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.1) {nDCG@10: 0.7074406777640486, AP(rel=2)@1000: 0.4873874111446551, RR(rel=2)@10: 0.8125220458553791}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.3) {nDCG@10: 0.7129086115669578, AP(rel=2)@1000: 0.4916819458380693, RR(rel=2)@10: 0.842283950617284}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.5) {nDCG@10: 0.710379817642511, AP(rel=2)@1000: 0.4902840514285131, RR(rel=2)@10: 0.8330246913580247}
BM25, Aggretriever (quant=agg_m_6_k_256_10000 ,alpha=0.7) {nDCG@10: 0.7098922120022008, AP(rel=2)@1000: 0.48974215557441425, RR(rel=2)@10: 0.8299382716049382}
BM25, Aggretriever (quant=agg_m_6_k_256_200000 ,alpha=0) {nDCG@10: 0.293018495991052, AP(rel=2)@1000: 0.1715424941569156, RR(rel=2)@10: 0.4103762492651381}
BM25, Aggretriever (quant=agg_m_6_k_256_200000 ,alpha=