# Benchmarking ANN

This benchmark focuses ONLY on recall.

### Notebook hangs, Kernel restarts?
* It may happen as a result of HIGH memory/cpu usage
* Upload to Google Colab and try it there

In [1]:
## Needed for notebook on Google Colab
# !pip install faiss-gpu datasets -q

In [2]:
import faiss
from datasets import load_dataset
import pandas as pd
import numpy as np
import time

## 1. Load dataset acloudfan/embedded_movies_small

In [3]:
# Hugging Face dataset that ships pre-computed plot embeddings
movies_dataset_name = 'acloudfan/embedded_movies_small'
movies_dataset = load_dataset(movies_dataset_name)

# 'train' split: the vectors that get indexed; rows are cross referenced for movie details
movies_dataset_train = movies_dataset['train']
# FAISS needs the embeddings as a float32 numpy array
movies_dataset_train_np = np.asarray(movies_dataset_train['plot_embedding'], dtype=np.float32)
print("movies_dataset_train_np.shape = ", movies_dataset_train_np.shape)

# 'test' split: the vectors used as search queries
movies_dataset_test = movies_dataset['test']
movies_dataset_test_np = np.asarray(movies_dataset_test['plot_embedding'], dtype=np.float32)
print("movies_dataset_test_np.shape = ", movies_dataset_test_np.shape)

movies_dataset_train_np.shape =  (1017, 1536)
movies_dataset_test_np.shape =  (434, 1536)


In [5]:
# Embedding width = number of columns in the (rows x dimension) test matrix
embedding_dimension = movies_dataset_test_np.shape[1]

embedding_dimension

1536

## 2. Utility methods for index creation and training


### FlatL2

https://github.com/facebookresearch/faiss/blob/main/tutorial/python/1-Flat.py

In [None]:
# Create the index
def create_index_flatl2():
    """Build a flat L2 index holding every training embedding.

    Reads the notebook-level `embedding_dimension` and
    `movies_dataset_train_np`.

    Returns:
        The populated `faiss.IndexFlatL2` instance.
    """
    exact_index = faiss.IndexFlatL2(embedding_dimension)

    # Same train-then-add sequence the other index builders use
    exact_index.train(movies_dataset_train_np)
    exact_index.add(movies_dataset_train_np)

    return exact_index

### IVFFlat

https://github.com/facebookresearch/faiss/blob/main/tutorial/python/2-IVFFlat.py

##### If returned search indexes = -1
Read the documentation : https://github.com/facebookresearch/faiss/wiki/FAQ#what-does-it-mean-when-a-search-returns--1-ids

In [None]:

# Change to this number will change the performance & recall
# nlist_ivfflat = 200

def create_index_ivfflat(nlist=200):
    """Build an IVF-Flat index over the training embeddings.

    Reads the notebook-level `embedding_dimension` and
    `movies_dataset_train_np`.

    Args:
        nlist: Third argument handed to `faiss.IndexIVFFlat`.

    Returns:
        The trained and populated `faiss.IndexIVFFlat` instance.
    """
    # A flat L2 index serves as the quantizer of the IVF index
    coarse_quantizer = faiss.IndexFlatL2(embedding_dimension)
    index = faiss.IndexIVFFlat(coarse_quantizer, embedding_dimension, nlist)

    # Train first, then add the very same vectors
    index.train(movies_dataset_train_np)
    index.add(movies_dataset_train_np)
    return index

### PQ

https://github.com/facebookresearch/faiss/blob/main/tutorial/python/3-IVFPQ.py

In [None]:
def create_index_ivfpq(nlist=200, number_subspaces=8, number_bits_per_centroid=8):
    """Build an IVF-PQ index over the training embeddings.

    Reads the notebook-level `embedding_dimension` and
    `movies_dataset_train_np`.

    Args:
        nlist: Third argument handed to `faiss.IndexIVFPQ`.
        number_subspaces: Fourth argument handed to `faiss.IndexIVFPQ`.
        number_bits_per_centroid: Fifth argument handed to `faiss.IndexIVFPQ`.

    Returns:
        The trained and populated `faiss.IndexIVFPQ` instance.
    """
    # Same flat L2 quantizer as the IVF-Flat builder
    coarse_quantizer = faiss.IndexFlatL2(embedding_dimension)
    index = faiss.IndexIVFPQ(
        coarse_quantizer,
        embedding_dimension,
        nlist,
        number_subspaces,
        number_bits_per_centroid,
    )

    index.train(movies_dataset_train_np)
    index.add(movies_dataset_train_np)
    return index

### HNSW Flat

https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexHNSWFlat.html

In [None]:
def create_index_hnsw_flat(M = 32):
    """Build an HNSW-Flat index over the training embeddings.

    Reads the notebook-level `embedding_dimension` and
    `movies_dataset_train_np`.

    Args:
        M: Second argument handed to `faiss.IndexHNSWFlat`.

    Returns:
        The populated `faiss.IndexHNSWFlat` instance.
    """
    graph_index = faiss.IndexHNSWFlat(embedding_dimension, M)

    # Same train-then-add sequence the other index builders use
    graph_index.train(movies_dataset_train_np)
    graph_index.add(movies_dataset_train_np)
    return graph_index

### HNSW Scalar

https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexHNSWSQ.html

In [None]:
def create_index_hnsw_sq(scalar_quantizer=faiss.ScalarQuantizer.QT_8bit, M=16):
    """Build an HNSW index with a scalar quantizer over the training embeddings.

    Reads the notebook-level `embedding_dimension` and
    `movies_dataset_train_np`.

    Args:
        scalar_quantizer: Quantizer type handed to `faiss.IndexHNSWSQ`.
        M: Third argument handed to `faiss.IndexHNSWSQ`.

    Returns:
        The trained and populated `faiss.IndexHNSWSQ` instance.
    """
    graph_index = faiss.IndexHNSWSQ(embedding_dimension, scalar_quantizer, M)

    # Train first, then add the very same vectors
    graph_index.train(movies_dataset_train_np)
    graph_index.add(movies_dataset_train_np)
    return graph_index

## 4. ANN search & compare utility functions
The search utility function returns the results as a JSON-style array (a list of dictionaries, one per query).

In [None]:
# Runs search using the test dataset
# Returns result as a JSON array
def run_against_index(index_faiss, dataset_np, top_k=None):
    """Search the index once per query embedding and collect the neighbours.

    Args:
        index_faiss: Object exposing `search(queries, k)` that returns a
            `(distances, ids)` pair of 2-D arrays, one row per query.
        dataset_np: 2-D numpy array of query embeddings, one row per query.
        top_k: Number of neighbours to request per query. When omitted, the
            notebook-level variable `k` is used, so existing two-argument
            calls keep working; pass it explicitly to avoid depending on a
            variable defined in another cell.

    Returns:
        A list with one dict per query row, in row order, of the form
        `{"distances": [...], "similar_indexes": [...]}`.
    """
    if top_k is None:
        # Backward compatibility: fall back to the notebook-level `k`
        top_k = k

    results_array = []
    for row in range(dataset_np.shape[0]):
        # Slicing keeps the 2-D (1, dimension) shape that search() expects
        query = np.ascontiguousarray(dataset_np[row:row + 1])
        distances, neighbour_ids = index_faiss.search(query, top_k)
        results_array.append(
            {
                "distances": distances[0].tolist(),
                "similar_indexes": neighbour_ids[0].tolist(),
            }
        )

    return results_array
    

In [None]:
# Calculate Recall
# Function gets 2 JSON for comparison
def calculate_recall(exact_result, ann_result):
    """Compare ANN search results against exact-search ground truth.

    Both arguments are lists with one dict per query; each dict carries the
    neighbour ids under the key 'similar_indexes'. The two lists must be
    aligned: entry i of both refers to the same query.

    Recall is set-based: a ground-truth id counts as found when it appears
    anywhere in the ANN result for that query, regardless of rank. Every
    query contributes to the average, including perfectly recalled ones.
    Ground-truth ids equal to -1 are ignored (FAISS uses -1 to pad results
    when fewer neighbours than requested are available).

    Args:
        exact_result: Ground-truth results (e.g. from the flat index).
        ann_result: Results from the index under test.

    Returns:
        A tuple `(perfect_recalls, average_recall)` where `perfect_recalls`
        is the number of queries whose ground-truth ids were all found and
        `average_recall` is matches / ground-truth ids over all queries,
        rounded to 2 decimals (1 when there is nothing to compare).

    Raises:
        ValueError: If the two result lists differ in length.
    """
    if len(exact_result) != len(ann_result):
        raise ValueError(
            "exact_result and ann_result must contain one entry per query: "
            f"got {len(exact_result)} and {len(ann_result)}"
        )

    perfect_recalls = 0
    count_matches_found = 0
    total_comparisons = 0

    for exact_entry, ann_entry in zip(exact_result, ann_result):
        # Drop FAISS padding ids from the ground truth
        exact_ids = [idx for idx in exact_entry['similar_indexes'] if idx != -1]
        # A set gives constant-time membership checks
        ann_ids = set(ann_entry['similar_indexes'])

        matches = sum(1 for idx in exact_ids if idx in ann_ids)

        total_comparisons += len(exact_ids)
        count_matches_found += matches
        if matches == len(exact_ids):
            perfect_recalls += 1

    # To avoid divide by 0: nothing to compare is reported as perfect recall
    if total_comparisons == 0:
        return perfect_recalls, 1

    return perfect_recalls, round(count_matches_found / total_comparisons, 2)

## 5. Create index
Adjust the parameters for algorithms as needed

In [None]:
# Create the indexes using the functions defined in earlier cells.
# The flat index provides the exact results the ANN indexes are compared against.
index_flat = create_index_flatl2()

# IVF Flat   Adjust: nlist
index_ivfflat_config = {"nlist": 10}
index_ivfflat = create_index_ivfflat(**index_ivfflat_config)

# IVF PQ   Adjust: nlist, number_subspaces, number_bits_per_centroid
index_ivfpq_config = {"nlist": 20, "number_subspaces": 8, "number_bits_per_centroid": 8}
index_ivfpq = create_index_ivfpq(**index_ivfpq_config)

# HNSW Flat   Adjust: M
index_hnsw_flat_config = {"M": 20}
index_hnsw_flat = create_index_hnsw_flat(**index_hnsw_flat_config)

# HNSW Scalar Quantizer   Adjust: M
index_hnsw_sq_config = {"M": 20, "scalar_quantizer": faiss.ScalarQuantizer.QT_8bit}
index_hnsw_sq = create_index_hnsw_sq(**index_hnsw_sq_config)

# Maintain the benchmark data in a list with one record per ANN index.
# "parameters" holds the config dict used to build the index (not the index
# itself), so the reported results show how each index was configured.
index_benchmark_results = [
    {
        "label": "IVF Flat",
        "parameters": index_ivfflat_config,
        "index": index_ivfflat,
    },
    {
        "label": "IVF PQ",
        "parameters": index_ivfpq_config,
        "index": index_ivfpq,
    },
    {
        "label": "HNSW Flat",
        "parameters": index_hnsw_flat_config,
        "index": index_hnsw_flat,
    },
    {
        "label": "HNSW SQ",
        "parameters": index_hnsw_sq_config,
        "index": index_hnsw_sq,
    },
]


## 6. Run benchmark for all indexes

In [None]:
# Number of nearest neighbours requested per query
k = 5

# Baseline: exact results from the flat index
exact_search_result = run_against_index(index_flat, movies_dataset_test_np)

# Run the same queries against each ANN index and record its average recall
for benchmark_info in index_benchmark_results:
    ann_search_result = run_against_index(benchmark_info['index'], movies_dataset_test_np)
    perfect_recalls, average_recall = calculate_recall(exact_search_result, ann_search_result)
    benchmark_info['recall'] = average_recall

index_benchmark_results

## 7. Plot the bar chart with recalls

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 5))

# Recall per index label, expressed as a whole-number percentage.
# round() is used instead of int(): a product such as 0.57 * 100 evaluates to
# 56.99999999999999 in floating point, which int() would truncate to 56.
metrics_data = {
    result['label']: round(result['recall'] * 100)
    for result in index_benchmark_results
}

print(metrics_data)

plt.bar(list(metrics_data.keys()), list(metrics_data.values()), color='lightblue', width=0.4)
plt.xlabel('FAISS index')
plt.ylabel('Recall')
plt.grid(True)
plt.show()