In [19]:
import math
import os
import time
from collections import defaultdict

import faiss
import h5py
import ir_datasets
import numpy as np
import pandas as pd
from tqdm import tqdm, trange

In [48]:
# Notebook-wide configuration; every constant below is read by later cells.

# Dimensionality of the vectors; first argument to faiss.IndexPQ.
DIM=768
# Number of PQ sub-quantizers; second argument to faiss.IndexPQ.
M=96
# Centroids per sub-quantizer; converted to log2(K) bits when the index is created.
K=256
# Number of vectors sampled from the dataset to train the index (None would use the whole dataset).
SAMPLE_SIZE=200000
# SAMPLE_SIZE=1000  # smaller alternative, presumably for quick test runs
# HDF5 file whose 'vectors' dataset holds the document vectors.
DATASET_PATH='/home/catalinlup/MyWorkspace/MasterThesis/datasets/h5_indices/faiss.msmarco-v1-passage.tct_colbert.h5'
# Where the trained (still empty) index is written.
INDEX_SAVE_PATH=f'./saved_indices/pq_index_{M}_{K}.faiss'
# Where the index is written after all dataset vectors have been added.
INDEX_SAVE_PATH_FULL=f'./saved_indices_full/pq_index_{M}_{K}.faiss'
# .npy file with the encoded query vectors.
QUERY_DATASET_PATH='/home/catalinlup/MyWorkspace/MasterThesis/datasets/encoded_test_set_queries/msmarco-test-2020-queries.npy'
# .npy file with the query ids, paired by position with the query vectors.
QIDS_DATASET_PATH='/home/catalinlup/MyWorkspace/MasterThesis/datasets/encoded_test_set_queries/qids_2020.npy'
# ir_datasets identifier used to load the relevance judgments.
QREL_PATH='msmarco-passage/trec-dl-2020'
# Number of vectors read from the HDF5 file per access (sampling and index population).
BATCH_SIZE = 1000
# Number of results retrieved per query.
TOP_K=5000

# Index Training

In [21]:
# Product-quantization index over DIM-dimensional vectors: M sub-quantizers,
# each encoded with log2(K) bits (i.e. K centroids per sub-quantizer).
nbits = int(math.log2(K))
# int() truncates, so a K that is not a power of two would silently shrink the codebook.
assert 2 ** nbits == K, f'K must be a power of two, got {K}'

# Named constant instead of the magic value 0 (inner-product similarity).
index = faiss.IndexPQ(DIM, M, nbits, faiss.METRIC_INNER_PRODUCT)

In [22]:
def sample_from_dataset_vectors(dataset: h5py.Dataset, sample_size, batch_size=None, seed=None) -> np.ndarray:
    """
    Takes a random sample (without replacement) of rows from the provided h5 dataset.
    If the sample size is None, returns the entire dataset.

    Args:
        dataset: h5 dataset to sample from; rows are selected along the first axis.
        sample_size: number of rows to draw, or None to return the entire dataset.
        batch_size: number of sampled rows read per h5 access. Defaults to the
            notebook-level BATCH_SIZE.
        seed: optional seed for a dedicated random generator, for reproducible
            samples. When None, the global numpy random state is used.

    Returns:
        Array holding the sampled rows, ordered by ascending row id.

    Raises:
        ValueError: if sample_size is larger than the number of rows (raised by the sampler).
    """

    if sample_size is None:
        return dataset[:, :]

    if batch_size is None:
        batch_size = BATCH_SIZE

    # np.random (global state) and a seeded Generator both expose the same choice() call.
    rng = np.random if seed is None else np.random.default_rng(seed)
    random_ids = rng.choice(dataset.shape[0], size=sample_size, replace=False)
    # h5py fancy indexing expects the selected indices in increasing order.
    random_ids.sort()

    num_sampled = random_ids.shape[0]
    if num_sampled == 0:
        # np.concatenate rejects an empty list; return an empty slice instead.
        return dataset[0:0]

    # Read the sampled rows in batches to keep each h5 selection small.
    random_samples = []
    for index_start in trange(0, num_sampled, batch_size):
        random_id_batch = random_ids[index_start:index_start + batch_size]
        random_samples.append(dataset[random_id_batch])

    return np.concatenate(random_samples)

In [23]:
# train the index
with h5py.File(DATASET_PATH, 'r') as dataset:
    training_sample = sample_from_dataset_vectors(dataset['vectors'], SAMPLE_SIZE)

# The sample is an in-memory array, so the HDF5 file is closed before the long training step.
print('Dataset sampled')

# perf_counter is monotonic, so the measured duration cannot be skewed by wall-clock adjustments.
time_start = time.perf_counter()
index.train(training_sample)
time_end = time.perf_counter()

print(f'Index trained. Time: {time_end - time_start}')

100%|█████████████████████████████████████████| 200/200 [00:14<00:00, 14.28it/s]


Dataset sampled
Index trained. Time: 833.1374173164368


In [24]:
# save the index
from faiss import write_index

# Create the target directory first so the write does not fail when it does not exist yet.
os.makedirs(os.path.dirname(INDEX_SAVE_PATH) or '.', exist_ok=True)
write_index(index, INDEX_SAVE_PATH)

# Add data to the index

In [25]:
# load index
from faiss import read_index

index = read_index(INDEX_SAVE_PATH)
# The index must use inner-product similarity (named constant instead of the magic value 0).
assert index.metric_type == faiss.METRIC_INNER_PRODUCT, \
    f'unexpected metric type: {index.metric_type}'

In [26]:
# Add every dataset vector to the trained index, one batch at a time.
# Re-running this cell on an already populated index would silently add every
# vector a second time, so fail fast instead.
assert index.ntotal == 0, 'index already contains vectors; reload the trained index before adding'

with h5py.File(DATASET_PATH, 'r') as dataset:
    # Look the dataset up once instead of on every loop iteration.
    vectors = dataset['vectors']
    num_vectors = vectors.shape[0]
    num_batches = math.ceil(num_vectors / BATCH_SIZE)

    for bi in trange(num_batches):

        index_start = bi * BATCH_SIZE
        index_end = min((bi + 1) * BATCH_SIZE, num_vectors)

        batch = vectors[index_start:index_end]
        index.add(batch)

100%|███████████████████████████████████████| 8842/8842 [40:58<00:00,  3.60it/s]


In [27]:
# save the index
from faiss import write_index

# Create the target directory first so the write does not fail after the long add step.
os.makedirs(os.path.dirname(INDEX_SAVE_PATH_FULL) or '.', exist_ok=True)
write_index(index, INDEX_SAVE_PATH_FULL)

# Running experiments

In [49]:
# load index
from faiss import read_index

index = read_index(INDEX_SAVE_PATH_FULL)

print(index.ntotal)

# Expected size of the full index (the MS MARCO v1 passage collection, judging by DATASET_PATH).
expected_num_vectors = 8841823
assert index.ntotal == expected_num_vectors, \
    f'expected {expected_num_vectors} vectors in the index, found {index.ntotal}'
# The index must use inner-product similarity (named constant instead of the magic value 0).
assert index.metric_type == faiss.METRIC_INNER_PRODUCT, \
    f'unexpected metric type: {index.metric_type}'

8841823


In [50]:
# load query dataset
query_vectors = np.load(QUERY_DATASET_PATH)
qids = np.load(QIDS_DATASET_PATH)

# Row i of query_vectors is paired with qids[i] during the search, so both must have the same length.
assert query_vectors.shape[0] == qids.shape[0], \
    f'{query_vectors.shape[0]} query vectors but {qids.shape[0]} query ids'

print(QUERY_DATASET_PATH)
print(QIDS_DATASET_PATH)

/home/catalinlup/MyWorkspace/MasterThesis/datasets/encoded_test_set_queries/msmarco-test-2020-queries.npy
/home/catalinlup/MyWorkspace/MasterThesis/datasets/encoded_test_set_queries/qids_2020.npy


In [51]:
# perform the search and build the runfile
# run maps query id -> {document id -> score}, with both ids stored as strings.

run = defaultdict(dict)

for query_idx in trange(query_vectors.shape[0]):
    # Touching the entry up front keeps a key for every query, even one without hits.
    query_scores = run[str(qids[query_idx])]

    # Slicing (instead of indexing) keeps the (1, dim) shape passed to index.search.
    D, I = index.search(query_vectors[query_idx:query_idx + 1], TOP_K)

    # Distinct loop names: the original inner loop reused `i` and shadowed the query index.
    for doc_id, doc_score in zip(I[0].tolist(), D[0].tolist()):
        # faiss pads the result list with id -1 when fewer than TOP_K hits are found.
        if doc_id < 0:
            continue
        query_scores[str(doc_id)] = float(doc_score)


100%|█████████████████████████████████████████| 200/200 [00:46<00:00,  4.33it/s]


In [52]:
# load the ir_datasets dataset identified by QREL_PATH; its qrels are used to evaluate the run
psgs = ir_datasets.load(QREL_PATH)

In [53]:
# peek at the first qrel record to inspect its fields
print(next(psgs.qrels_iter()))

TrecQrel(query_id='23849', doc_id='1020327', relevance=2, iteration='0')


In [54]:
# evaluate the run against the qrels
from ir_measures import calc_aggregate, nDCG, AP, RR

# NOTE(review): rel=2 presumably sets the minimum relevance grade counted as relevant
# for AP and RR — confirm against the ir_measures documentation.
METRICS = [nDCG@10, AP(rel=2)@1000, RR(rel=2)@10]

# compute the aggregated value of each metric for `run` against the qrels
calc_aggregate(METRICS, psgs.qrels_iter(), run)

{nDCG@10: 0.5436612068654874,
 AP(rel=2)@1000: 0.33754363265102855,
 RR(rel=2)@10: 0.7305555555555556}