In [2]:
import pyterrier as pt

if not pt.started():
    pt.init(tqdm="notebook")


PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### Create Sparse Index on disk -> can be skipped if index is created

In [2]:
dataset = pt.get_dataset("irds:beir/fiqa")

index_path = "./sparse_index_fiqa"
indexer = pt.IterDictIndexer(index_path)
index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])


beir/fiqa documents:   0%|          | 0/57638 [00:00<?, ?it/s]

ValueError: Index already exists at ./sparse_index_fiqa/data.properties

### Load sparse index in memory

In [3]:
index_path = "./sparse_index_fiqa"
# Load index to memory
index = pt.IndexFactory.of(index_path, memory=True)

bm25 = pt.BatchRetrieve(index, wmodel="BM25")

From pyterrier docc: NB: BatchRetrieve will accept anything “index-like”, i.e. a string location of an index, an IndexRef or an Index.

In [8]:
from pyterrier.measures import RR, nDCG, MAP

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
testset = pt.get_dataset("irds:beir/fiqa/test")
pt.Experiment(
    [~bm25],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,Cache(BR(BM25)),0.310271,0.252589,0.20864


### Create the GTE-base-en-v1.5 encoders

In [4]:
from gte_base_en_encoder import GTEBaseDocumentEncoder
import torch

q_encoder = GTEBaseDocumentEncoder("Alibaba-NLP/gte-base-en-v1.5")
d_encoder = GTEBaseDocumentEncoder(
        "Alibaba-NLP/gte-base-en-v1.5",
        device="cuda:0" if torch.cuda.is_available() else "cpu",
    )

### Retrieve dense index from disk and load it to memory

In [5]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

ff_index = OnDiskIndex.load(
    Path("./dense_index_fiqa_GTE-base/ffindex_fiqa_gte-base-en-v1.5.h5"), query_encoder=q_encoder, mode=Mode.MAXP
).to_memory()

100%|██████████| 57638/57638 [00:00<00:00, 274211.02it/s]


In [6]:
from fast_forward.util.pyterrier import FFScore
from fast_forward.util.pyterrier import FFInterpolate

ff_score = FFScore(ff_index)
ff_int = FFInterpolate(alpha=0.05) #added the optimal version from the beginning in order to avoid running GridsSearch again

### Experiment with 500 candidates that eventually halts

In [10]:
pt.Experiment(
    [~bm25 % 500 >> ff_score >> ff_int],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25 >> FF"],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BM25 >> FF,0.399449,0.335966,0.274293


### Experiment with 1000 candidates that loops forever -> tried both on disk and mem indexes

In [None]:
pt.Experiment(
    [~bm25 % 1000 >> ff_score >> ff_int],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25 >> FF"],
)

In [None]:
indexer_mem = pt.IterDictIndexer(
    str(Path.cwd()),  # this will be ignored
    type=pt.index.IndexingType.MEMORY,
)
index_ref_2 = indexer_mem.index(dataset.get_corpus_iter(), fields=["text"])
bm25_from_mem = pt.BatchRetrieve(index_ref_2, wmodel="BM25")

In [None]:
pt.Experiment(
    [~bm25_from_mem % 1000 >> ff_score >> ff_int],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25 >> FF"],
)