In [1]:
import faiss
import numpy as np

#### Hyperparameters of PQ

In [11]:
d = 4  # dimension of the vectors we want to index
M = 2  # number of sub spaces
nbits = 4  # number of bits per sub vector (Run K means with 2^nbits clusters)
metric = faiss.METRIC_L2 # metric to cluster the Sub Vectors

n_train = 1000 # number of training vectors

#### Build the PQ Index

In [12]:
# X = np.random.random((n_train, d)).astype('float32')
X = np.array([
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
    [1, 2, 3, 4],
    [2, 1, 4, 3],
    [10, 11, 12, 13],
    [11, 10, 13, 12],
])

n_train = X.shape[0]
d = X.shape[1]

# Size of X in KB
print(f"Size of X of shape {n_train} x {d} in KB: {X.nbytes / 1024} Kilobytes")

index = faiss.IndexPQ(
    d,        # dimension of the vectors we want to index
    M,        # number of sub spaces
    nbits,    # number of bits per sub vector
    metric    # metric to cluster the Sub Vectors

)

# This will Train the Product Quantizer to learn the codebooks (cluster centroids)
index.train(X)

# Analyse the learned codebooks
codebooks = faiss.vector_to_array(index.pq.centroids).reshape(M, 2**nbits, d // M)
print("Learned Codebooks of Shape:", codebooks.shape)

# Print the codebooks
for m in range(M):
    print(f"Codebook for Subspace {m}:")
    print(codebooks[m])
    print("\n")


# Add the training vectors to the index
index.add(X) # This will encode the vectors using the learned codebooks


# Extract PQ codes
codes = faiss.vector_to_array(index.codes)
print(f"PQ Codes Shape: {codes.shape} with Minimum Code Value: {codes.min()} and Maximum Code Value: {codes.max()}")
# 0 Represent: 0000_0000 (i.e., first cluster in both subspaces: Total Size is M * nbits = 2 * 4 = 8 bits = 1 byte)
# 255 Represent: 1111_1111 (i.e., last cluster in both subspaces: Total Size is M * nbits = 2 * 4 = 8 bits = 1 byte)

# Total Size of PQ codes in KB
print(f"Size of PQ codes in KB: {codes.nbytes / 1024} Kilobytes")


Size of X of shape 88 x 4 in KB: 2.75 Kilobytes
Learned Codebooks of Shape: (2, 16, 2)
Codebook for Subspace 0:
[[ 0.9990225   2.0019512 ]
 [10.989258   10.009766  ]
 [ 1.0009756   1.998045  ]
 [10.9999895   9.99999   ]
 [ 1.0029325   1.9941463 ]
 [ 1.9980469   1.0009766 ]
 [ 1.0009756   1.998045  ]
 [ 9.970732   11.032258  ]
 [ 0.99999905  1.9999981 ]
 [11.021495    9.980478  ]
 [ 9.99999    10.9999895 ]
 [ 9.99999    10.9999895 ]
 [ 0.9980478   2.0039082 ]
 [ 2.0019531   0.99902344]
 [10.019541   10.978526  ]
 [ 9.990225   11.010732  ]]


Codebook for Subspace 1:
[[ 2.9970675  4.0039024]
 [12.987305  12.011719 ]
 [ 3.0029268  3.99609  ]
 [12.999988  11.999989 ]
 [ 3.0087976  3.9882927]
 [ 3.9960938  3.0029297]
 [ 3.0029268  3.99609  ]
 [11.964878  13.038123 ]
 [ 2.9999971  3.9999962]
 [13.025403  11.976574 ]
 [11.999989  12.999988 ]
 [11.999989  12.999988 ]
 [ 2.9941435  4.0078163]
 [ 4.0039062  2.9970703]
 [12.023449  12.974622 ]
 [11.98827   13.012683 ]]


PQ Codes Shape: (88,) wit



#### Querying from PQ Index

In [13]:
X_query = np.array([[2, 2, 4, 4]])
top_k = 1

distances, indices = index.search(X_query, top_k)

print("Indices of Nearest Neighbors:\n", indices)
print("Distances to Nearest Neighbors:\n", distances)

Indices of Nearest Neighbors:
 [[1]]
Distances to Nearest Neighbors:
 [[1.9922161]]
