In [1]:
import faiss
import numpy as np

In [2]:
# Loaders for the SIFT datasets

def bvecs_read(fname):
    """
    Load a .bvecs file into a 2-D uint8 array, one vector per row.

    The file is treated as a sequence of fixed-size records: a 4-byte int32
    dimension header followed by that many uint8 components. The dimension is
    taken from the first record only and applied to every record.

    Args:
        fname: Path of the file to read.

    Returns:
        np.ndarray of dtype uint8 with shape (#vectors, dimension); the
        per-record header bytes are stripped.
    """
    # Only the leading int32 is needed to learn the vector dimension.
    header = np.fromfile(fname, dtype=np.int32, count=1)
    raw_bytes = np.fromfile(fname, dtype=np.uint8)
    dim = header[0]
    # Each record occupies 4 header bytes plus `dim` payload bytes.
    records = raw_bytes.reshape(-1, dim + 4)
    payload = records[:, 4:]
    # Copy so the result does not keep the header-padded buffer alive.
    return payload.copy()


def ivecs_read(fname):
    """
    Load an .ivecs file into a 2-D int32 array, one vector per row.

    The file is read as a flat int32 stream in which every record is a
    dimension header followed by that many components. The dimension is
    taken from the first record only and applied to every record.

    Args:
        fname: Path of the file to read.

    Returns:
        np.ndarray of dtype int32 with shape (#vectors, dimension); the
        leading header column is stripped.
    """
    flat = np.fromfile(fname, dtype=np.int32)
    dim = flat[0]
    # One header slot plus `dim` components per record.
    table = flat.reshape(-1, dim + 1)
    # Drop the header column and detach the result from the raw buffer.
    return table[:, 1:].copy()


def fvecs_read(fname):
    """
    Load an .fvecs file into a 2-D float32 array, one vector per row.

    The file is parsed with ivecs_read (same record layout), then the 32-bit
    components are reinterpreted in place as float32 without converting values.

    Args:
        fname: Path of the file to read.

    Returns:
        np.ndarray of dtype float32 with shape (#vectors, dimension).
    """
    as_int32 = ivecs_read(fname)
    # Reinterpret the bits; this is a view, not a numeric cast.
    return as_int32.view(np.float32)

In [3]:
def recall_at_r(I, gt, r):
    """
    Compute Recall@r averaged over all queries.

    For every query, the first ground-truth id is compared against the first
    r retrieved ids; the total number of matches is divided by the number of
    queries.

    Args:
        I (np.ndarray): Retrieval result with shape (#queries, ANY), integer.
                        Each entry is the index of a database item.
        gt (np.ndarray): Ground truth with shape (#queries, ANY), integer.
                         Only the first column, gt[:, 0], is used.
        r (int): Number of leading retrieved items to inspect per query.

    Returns:
        The average recall@r over all queries.

    Raises:
        AssertionError: If r exceeds the number of columns of I, or if I and
                        gt do not contain the same number of queries.
    """
    n_queries, n_retrieved = I.shape[0], I.shape[1]
    assert r <= n_retrieved
    assert len(I) == len(gt)
    # Keep gt as a column (#queries, 1) so it broadcasts across the top-r ids.
    nearest = gt[:, :1]
    top_r = I[:, :r]
    hits = (top_r == nearest).sum()
    return hits / float(n_queries)

In [4]:
# Directory holding the siftsmall files; change this one constant to point the
# notebook at another copy of the dataset instead of editing every path.
DATA_DIR = "/home/catalinlup/MyWorkspace/MasterThesis/ThesisCodebase/big_data/siftsmall"

# Vectors that get added to the index.
base = fvecs_read(f"{DATA_DIR}/siftsmall_base.fvecs")
# Vectors used to train the index.
training_data = fvecs_read(f"{DATA_DIR}/siftsmall_learn.fvecs")
# Query vectors and, per query, the ids of the ground-truth neighbours.
queries = fvecs_read(f"{DATA_DIR}/siftsmall_query.fvecs")
ground_truth = ivecs_read(f"{DATA_DIR}/siftsmall_groundtruth.ivecs")

In [5]:
# Product-quantization index settings, passed to faiss.IndexPQ(d, M, nbits).
VEC_DIM = base.shape[1]  # vector dimensionality, read from the loaded base set
M = 8      # number of sub-quantizers
NBITS = 8  # bits per sub-quantizer code
index = faiss.IndexPQ(VEC_DIM, M, NBITS)

In [6]:
# Train the index on the training vectors, then stop the notebook here
# if the index does not report itself as trained.
index.train(training_data)
assert index.is_trained

In [7]:
# Add the base vectors to the index so they can be searched.
index.add(base)

In [8]:
# Number of ground-truth neighbours stored per query (columns of ground_truth).
top_k = ground_truth.shape[1]
results = []
# NOTE(review): the search uses a literal k=10 rather than top_k, and neither
# top_k nor results is used in the cells visible here — confirm intent.
# Retrieve 10 neighbours per query; I holds the retrieved item ids that are
# later passed to recall_at_r, distances the corresponding distances.
distances, I = index.search(queries, k=10)


In [9]:
# Recall@10: how often each query's first ground-truth neighbour appears
# among its 10 retrieved ids, averaged over all queries.
recall_at_r(I, ground_truth, 10)

0.88