In [2]:
import numpy as np
import faiss

#### Prepare the data

In [3]:
# Problem size
N = 100_000     # how many vectors go into the database
D = 128         # length of every vector
NQ = 5          # how many query vectors to generate

# Fix the global NumPy seed so the synthetic data is reproducible.
np.random.seed(42)

# Standard-normal samples, stored as float32.
# The database is drawn first, then the queries, from the same random stream.
xb = np.random.standard_normal((N, D)).astype(np.float32)    # database
xq = np.random.standard_normal((NQ, D)).astype(np.float32)   # queries

#### Define the hyperparameters

In [4]:
# IVF settings: total number of clusters, and how many of them are searched per query
nlist, nprobe = 1024, 10

# PQ settings: number of sub-vectors, and bits per sub-vector (2**8 = 256 centroids)
M, nbits = 16, 8

#### Training

In [5]:
# Flat L2 index over D-dimensional vectors; it serves as the coarse quantizer below.
quantizer = faiss.IndexFlatL2(D)

# IVF-PQ index. Positional arguments, in order:
#   coarse quantizer, dimensionality, IVF clusters, PQ sub-vectors, bits per sub-vector
index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)

# Fit the index on the database vectors before anything is added to it.
index.train(xb)

# Report whether the index now considers itself trained.
print(index.is_trained)

True


#### Indexing

In [8]:
# Make this cell safe to re-run: `add` appends, so executing the cell twice
# without re-running the training cell would store every database vector twice
# (ntotal == 200000, duplicated neighbours in search results).
# `reset` drops the stored vectors only; the index stays trained.
index.reset()

# Encode and store all database vectors in the index.
index.add(xb)

# Number of vectors now held by the index (should equal the number of rows in xb).
print(index.ntotal)

100000


#### Retrieval

In [9]:
k = 5  # how many nearest neighbours to return for each query

# Tell the index how many IVF clusters to scan per query, then run the search.
index.nprobe = nprobe
distances, indices = index.search(xq, k)

# Both result arrays have one row per query and one column per neighbour: (NQ, k)
print(distances.shape, indices.shape, sep="\n")

(5, 5)
(5, 5)


In [None]:
# Which clusters were probed?
# Ask the coarse quantizer directly for the `nprobe` nearest centroids of each
# query vector; the call returns (distances, ids), one row per query.
# NOTE(review): this mirrors what `index.search` scanned only if `index.nprobe`
# still equals `nprobe` — confirm nothing changed it in between.
centroid_distances, centroid_ids = index.quantizer.search(xq, nprobe)