# FAISS Raw Implementation

## Indexing

#### Step 1: Define the Dataset

In [5]:
import numpy as np

# Expected size of the toy dataset: number of vectors and their dimensionality
n_samples = 10
dim = 3

# Two visually obvious groups: four points near (1.5, 1.5, 1) and six near (8.5, 8.5, 8.3).
# FAISS works on float32 arrays, hence the explicit dtype.
X = np.array([
    [1,1,1], [1,2,1], [2,1,1], [2,2,1],
    [8,8,8], [8,9,8], [9,8,8], [9,9,8],
    [8,8,9], [9,9,9]
], dtype="float32")

# Fail fast if the literal above drifts away from the declared size
assert X.shape == (n_samples, dim), f"expected {(n_samples, dim)}, got {X.shape}"
X.shape

(10, 3)

#### Step 2: Create the Coarse Quantiser (i.e., cluster the vectors)

In [None]:
import faiss

k = 2             # Number of Clusters
d = X.shape[1]    # Dimension of Data Points
nredo = 2         # Number of independent k-means runs (FAISS keeps the best one)
niter = 5         # Number of Iterations per run

# Initialise the Model
kmeans = faiss.Clustering(d, k)

# Set parameters
kmeans.nredo = nredo
kmeans.niter = niter
kmeans.verbose = False
kmeans.seed = 1234   # FAISS's default seed, set explicitly so the run is visibly reproducible

# Index defines distance metric (exact L2) used to assign vectors to centroids
index = faiss.IndexFlatL2(d)

# Train the model to get centroids and labels
kmeans.train(X, index)

# Centroids: FAISS stores them as one flat float vector of length k * d
centroids = faiss.vector_to_array(kmeans.centroids).reshape(k, d)

# Labels: `index` is searched for each vector's single nearest entry,
# which is used here as that vector's cluster id
_, cluster_assignments = index.search(X, 1)
labels = cluster_assignments.flatten()

# Sanity checks: one centroid per cluster, one label per input vector
assert centroids.shape == (k, d)
assert labels.shape == (X.shape[0],)

centroids, labels



(array([[8.5     , 8.5     , 8.333334],
        [1.5     , 1.5     , 1.      ]], dtype=float32),
 array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]))

In [9]:
import pandas as pd

# Tabular overview: one row per vector, its cluster id, and that cluster's centroid
df = pd.DataFrame(X, columns=["Feature_1", "Feature_2", "Feature_3"]).assign(
    Cluster_Label=labels,
    # Each cell of this column holds the full centroid coordinate array of the row's cluster
    Centroid=lambda frame: frame["Cluster_Label"].apply(lambda label: centroids[label]),
)
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Cluster_Label,Centroid
0,1.0,1.0,1.0,1,"[1.5, 1.5, 1.0]"
1,1.0,2.0,1.0,1,"[1.5, 1.5, 1.0]"
2,2.0,1.0,1.0,1,"[1.5, 1.5, 1.0]"
3,2.0,2.0,1.0,1,"[1.5, 1.5, 1.0]"
4,8.0,8.0,8.0,0,"[8.5, 8.5, 8.333334]"
5,8.0,9.0,8.0,0,"[8.5, 8.5, 8.333334]"
6,9.0,8.0,8.0,0,"[8.5, 8.5, 8.333334]"
7,9.0,9.0,8.0,0,"[8.5, 8.5, 8.333334]"
8,8.0,8.0,9.0,0,"[8.5, 8.5, 8.333334]"
9,9.0,9.0,9.0,0,"[8.5, 8.5, 8.333334]"


## Retrieval

#### Step 1: Search Among the Centroids

In [16]:
# Example query: a single row with the same dimensionality and dtype as X
query_vector = np.asarray([[2, 1, 1]], dtype="float32")

# Number of closest clusters that will be inspected for this query
n_probes = 1

# Step 1: ask `index` for the ids of the n_probes entries closest to the query
# (the returned distances are not needed here)
_, nearest_centroid = index.search(query_vector, n_probes)

# Display the selected centroid ids next to their coordinates
nearest_centroid, centroids[nearest_centroid]

(array([[1]]), array([[[1.5, 1.5, 1. ]]], dtype=float32))

#### Step 2: Search Among the Vectors of the Nearest Centroid

In [17]:
# Get the vectors of every probed cluster.
# nearest_centroid has one row per query and n_probes columns; the previous version
# only looked at column 0, so n_probes > 1 was silently ignored.
probed_clusters = nearest_centroid[0]
cluster_label = probed_clusters[0]   # closest cluster, kept for backward compatibility
# FAISS pads missing results with -1; -1 never equals a real label, so np.isin skips it.
cluster_vectors = X[np.isin(labels, probed_clusters)]

# Define Top K vectors to retrieve
top_k = 2

# Find the top K nearest vectors among the candidates with an exact L2 search
index_cluster = faiss.IndexFlatL2(d)
index_cluster.add(cluster_vectors)
_, top_k_indices = index_cluster.search(query_vector, top_k)

# When fewer than top_k candidates exist the result is padded with -1.
# Used as an index, -1 would silently return the last candidate, so drop it.
found_positions = top_k_indices.flatten()
found_positions = found_positions[found_positions >= 0]

top_k_vectors = cluster_vectors[found_positions]
top_k_vectors

array([[2., 1., 1.],
       [1., 1., 1.]], dtype=float32)

# FAISS Direct Implementation

## Indexing

#### Step 1: Define the Dataset

In [18]:
import numpy as np

# Declared size of the toy dataset (number of vectors, vector dimensionality)
n_samples = 10
dim = 3

# Same ten points as in the raw implementation so both approaches can be compared.
# FAISS expects float32 input, hence the explicit dtype.
X = np.array([
    [1,1,1], [1,2,1], [2,1,1], [2,2,1],
    [8,8,8], [8,9,8], [9,8,8], [9,9,8],
    [8,8,9], [9,9,9]
], dtype="float32")

# Guard against the data literal and the declared size getting out of sync
assert X.shape == (n_samples, dim), f"expected {(n_samples, dim)}, got {X.shape}"
X.shape

(10, 3)

#### Step 2: Create the Index

In [None]:
import faiss  # imported here too so this section also runs on its own

d = X.shape[1] # Dimension of Vector
k = 2 # Number of clusters

# Coarse quantizer: a flat L2 index used to decide which cluster a vector falls into
quantizer = faiss.IndexFlatL2(d)

# IVF Flat index built on top of the quantizer, with k clusters
index = faiss.IndexIVFFlat(quantizer, d, k, faiss.METRIC_L2)

# Why pass METRIC_L2 when the quantizer is already L2?
# The two settings are used at different stages of a query:
#
# Query
#  ├──► Coarse quantizer (which cluster?) -> faiss.IndexFlatL2(d)  # To use L2 distance
#  │       uses: quantizer
#  │
#  └──► Fine search (exact distances) -> faiss.METRIC_L2
#          uses: METRIC_L2 or METRIC_INNER_PRODUCT

# Training
index.train(X)
assert index.is_trained, "IVF index must be trained before vectors are added"

# Add vectors
index.add(X)
assert index.ntotal == X.shape[0], "every dataset vector should be stored in the index"



## Retrieval

In [26]:
n_probes = 1  # Number of clusters to search
index.nprobe = n_probes

# Top K vector to retrieve
top_k = 2

# Query Vector Example
q = np.array([[2,1,1]], dtype="float32")

# Search
D, I = index.search(q, k=top_k) # D: distances, I: indices

# If the probed clusters hold fewer than top_k vectors, FAISS pads I with -1.
# X[-1] would silently return the last dataset row, so drop the padding first.
retrieved_ids = I.flatten()
retrieved_ids = retrieved_ids[retrieved_ids >= 0]

# Vectors: the remaining ids are used as row positions in X
retrieved_vectors = X[retrieved_ids]
retrieved_vectors

array([[2., 1., 1.],
       [1., 1., 1.]], dtype=float32)