General Setup for all the datasets

In [1]:
import pyterrier as pt

# Initialise PyTerrier only if it has not been started yet, so re-running
# this cell in a live kernel does not call pt.init() a second time.
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


Evaluation metrics used for all the datasets

In [2]:
from pyterrier.measures import RR, nDCG, MAP

# Metrics shared by every experiment in this notebook: RR and nDCG at
# cutoff 10, and MAP at cutoff 100 (the latter is labelled "AP@100" in the
# result tables printed below).
eval_metrics = [RR @ 10, nDCG @ 10, MAP @ 100]

Create the query encoder, which will run on the CPU. This encoder is used for embedding all the datasets/queries.

In [3]:
from gte_base_en_encoder import GTEBaseDocumentEncoder

# Build the encoder from the "Alibaba-NLP/gte-base-en-v1.5" model identifier.
# The resulting object is passed to the dense-index helpers in later cells.
# NOTE(review): the class is named "...DocumentEncoder" but is used here as
# the query encoder — confirm that this is intended.
q_encoder = GTEBaseDocumentEncoder("Alibaba-NLP/gte-base-en-v1.5")


## NFCorpus

In [17]:
from experiments_helper import load_sparse_index_from_disk

# Short dataset key; it is reused further down when the dense index is loaded.
dataset_name = "nfcorpus"

# Sparse (first-stage) retriever for this dataset, obtained from the project
# helper. It is the left-most stage of every pipeline built in this section.
retriever = load_sparse_index_from_disk(dataset_name)

Testing the sparse retrieval

In [18]:
from experiments_helper import run_single_experiment_name

dataset_test_name = "irds:beir/nfcorpus/test"
# Baseline run: evaluate the sparse retriever on its own against the test
# split. "BM25" is the label shown in the "name" column of the result table.
run_single_experiment_name(retriever, dataset_test_name, eval_metrics, "BM25")

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BM25,0.534378,0.322219,0.143582


Retrieve the dense index (already loaded into memory)

In [19]:
from experiments_helper import load_dense_index_from_disk

# Load the dense index for the dataset selected above (dataset_name), handing
# the shared encoder to the helper.
dense_index = load_dense_index_from_disk(dataset_name, q_encoder)

100%|██████████| 3633/3633 [00:00<00:00, 923901.44it/s]


In [14]:
from fast_forward.util.pyterrier import FFInterpolate, FFScore

# Dense scoring stage backed by the dense index loaded in the previous cell.
ff_score = FFScore(dense_index)

# Interpolation stage. alpha=0.05 is only the starting value: the tuning cell
# that follows searches over several alpha candidates using this same object.
ff_int = FFInterpolate(alpha=0.05)

Find the optimal alpha from the default set [0.25, 0.05, 0.1, 0.5, 0.9]

In [15]:
from experiments_helper import find_optimal_alpha_name

dev_set_name = "irds:beir/nfcorpus/dev"

# Tuning pipeline: cut the sparse ranking at 100 results per query, then run
# the dense scoring stage followed by the interpolation stage.
pipeline_find_alpha = (
    retriever % 100
    >> ff_score
    >> ff_int
)

# Tune alpha on the dev split.
# NOTE(review): presumably the helper varies ff_int's alpha and leaves ff_int
# set to the best value afterwards — confirm, because the test pipeline below
# reuses the same ff_int object.
find_optimal_alpha_name(
    pipeline_find_alpha,
    ff_int,
    dev_set_name,
)

[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [5ms]
GridScan: 100%|██████████| 5/5 [00:41<00:00,  8.30s/it]

Best map is 0.124061
Best setting is ['<fast_forward.util.pyterrier.FFInterpolate object at 0x7fbb8f896e00> alpha=0.05']





Create pipeline with 1000 docs retrieved per query

In [16]:
from experiments_helper import run_single_experiment_name

dataset_test_name = "irds:beir/nfcorpus/test"

# Evaluation pipeline: the same three stages as the tuning pipeline, but with
# the sparse ranking cut at 1000 results per query instead of 100.
pipeline = (
    retriever % 1000
    >> ff_score
    >> ff_int
)

# Evaluate on the test split; the last argument is the label shown in the
# "name" column of the result table.
run_single_experiment_name(
    pipeline,
    dataset_test_name,
    eval_metrics,
    "BM25 >> gte-base-en-v1.5",
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BM25 >> gte-base-en-v1.5,0.553675,0.343964,0.15507


Same experiment as above, using the `default_complete_test_pipeline_name` function

In [None]:
from experiments_helper import default_complete_test_pipeline_name

# Dataset key plus the dev and test dataset identifiers for NFCorpus.
dataset_name = "nfcorpus"
dev_set_name = "irds:beir/nfcorpus/dev"
dataset_test_name = "irds:beir/nfcorpus/test"

# Single-call version of the NFCorpus experiment.
# NOTE(review): presumably the helper loads both indexes, tunes alpha on the
# dev set and evaluates on the test set — confirm against experiments_helper.
default_complete_test_pipeline_name(
    dataset_name,
    dev_set_name,
    dataset_test_name,
    q_encoder,
    eval_metrics,
)

Run the pipeline for the FiQA dataset

In [4]:
from experiments_helper import default_complete_test_pipeline_name

# Dataset key plus the dev and test dataset identifiers for FiQA.
dataset_name = "fiqa"
dev_set_name = "irds:beir/fiqa/dev"
dataset_test_name = "irds:beir/fiqa/test"

# Run the complete experiment for this dataset in a single call, using the
# shared encoder and metric list defined at the top of the notebook.
default_complete_test_pipeline_name(
    dataset_name,
    dev_set_name,
    dataset_test_name,
    q_encoder,
    eval_metrics,
)

100%|██████████| 57638/57638 [00:00<00:00, 1695905.25it/s]
GridScan: 100%|██████████| 5/5 [01:51<00:00, 22.23s/it]


Best map is 0.290816
Best setting is ['<fast_forward.util.pyterrier.FFInterpolate object at 0x7f086c477250> alpha=0.05']


Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,fiqa: BM25 >> gte-base-en-v1.5,0.399449,0.335966,0.274557


## For the SciFact dataset, which lacks a dev set, the train set was used for tuning the alpha value.

In [5]:
from experiments_helper import default_complete_test_pipeline_name

dataset_name = "scifact"
# The train split is passed in the dev-set slot for this dataset.
dev_set_name = "irds:beir/scifact/train"
dataset_test_name = "irds:beir/scifact/test"

# NOTE(review): the output saved with this cell shows repeated
# "no vectors for <id>" messages followed by a KeyboardInterrupt, so no
# metrics were recorded for this dataset — re-run before relying on it.
default_complete_test_pipeline_name(
    dataset_name,
    dev_set_name,
    dataset_test_name,
    q_encoder,
    eval_metrics,
)

100%|██████████| 5136/5136 [00:00<00:00, 977136.23it/s]
[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [52ms]
GridScan:   0%|          | 0/5 [00:00<?, ?it/s]no vectors for 195680777
no vectors for 195689316
no vectors for 195689757
no vectors for 195683603
no vectors for 145383432
no vectors for 155200372
no vectors for 154243324
no vectors for 154050141
no vectors for 143868995
no vectors for 117907685
no vectors for 140907540
no vectors for 145335387
no vectors for 116075383
no vectors for 196664003
no vectors for 116556376
no vectors for 108886332
no vectors for 145416918
no vectors for 140098548
no vectors for 109946221
no vectors for 109795294
no vectors for 195317463
no vectors for 121581019
no vectors for 146653163
no vectors for 168265642
no vectors for 104143831
no vectors for 143381103
no vectors for 198309074
no vectors for 167469018
no vectors for 167944455
no vectors for 198133135
no vectors for 153755807
no vectors for 120385993
no vectors for 15454

KeyboardInterrupt: 

Given that the scidocs dataset offers only a single set (no separate dev/test splits), we will split it into a dev set and a test set. More precisely, we will split the topics, because that is what we are testing against. This dataset offers two topic categories; I chose the 'text' topics.

In [None]:
from experiments_helper import (
    default_complete_test_pipeline,
    split_dev_test,
)

dataset_name = "scidocs"
dataset = pt.get_dataset("irds:beir/scidocs")
# Use the 'text' variant of the topics.
topics = dataset.get_topics("text")

# Divide the topics into a dev part and a test part.
# NOTE(review): presumably test_size=0.8 assigns 80% of the topics to the test
# part — confirm, and check whether the split is seeded so it is reproducible.
dev_topics, test_topics = split_dev_test(topics, test_size=0.8)

# Both topic subsets are evaluated against the dataset's complete qrels.
default_complete_test_pipeline(
    dataset_name,
    dataset.get_qrels(),
    dev_topics,
    test_topics,
    q_encoder,
    eval_metrics,
)


100%|██████████| 25657/25657 [00:00<00:00, 1241156.78it/s]
GridScan: 100%|██████████| 5/5 [00:46<00:00,  9.37s/it]


Best map is 0.024039
Best setting is ['<fast_forward.util.pyterrier.FFInterpolate object at 0x7fecf5038f70> alpha=0.05']


A similar approach is also followed for the "arguana" dataset.

In [None]:
from experiments_helper import (
    default_complete_test_pipeline,
    split_dev_test,
)

dataset_name = "arguana"
dataset = pt.get_dataset("irds:beir/arguana")
# No topic variant is requested here, unlike the scidocs cell.
topics = dataset.get_topics()

# Divide the topics into a dev part and a test part.
# NOTE(review): presumably test_size=0.8 assigns 80% of the topics to the test
# part — confirm, and check whether the split is seeded so it is reproducible.
dev_topics, test_topics = split_dev_test(topics, test_size=0.8)

# Both topic subsets are evaluated against the dataset's complete qrels.
default_complete_test_pipeline(
    dataset_name,
    dataset.get_qrels(),
    dev_topics,
    test_topics,
    q_encoder,
    eval_metrics,
)


A similar approach is also followed for the "cqadupstack/english" dataset.

In [None]:
# NOTE(review): default_complete_test_pipeline is imported but not used in
# this cell; only the "_nogrid" variant is called.
from experiments_helper import (
    default_complete_test_pipeline,
    default_complete_test_pipeline_nogrid,
    split_dev_test,
)

dataset_name = "cqadupstack/english"
dataset = pt.get_dataset("irds:beir/cqadupstack/english")
# Use the 'text' variant of the topics.
topics = dataset.get_topics("text")

# Divide the topics into a dev part and a test part.
# NOTE(review): presumably test_size=0.8 assigns 80% of the topics to the test
# part — confirm, and check whether the split is seeded so it is reproducible.
dev_topics, test_topics = split_dev_test(topics, test_size=0.8)

# Unlike the scidocs and arguana cells, this one calls the "_nogrid" helper.
# NOTE(review): presumably that variant skips the alpha grid search — confirm
# which alpha it uses instead.
default_complete_test_pipeline_nogrid(
    dataset_name,
    dataset.get_qrels(),
    dev_topics,
    test_topics,
    q_encoder,
    eval_metrics,
)

100%|██████████| 40221/40221 [00:00<00:00, 1703601.12it/s]
