# Project: Movies recommendation
## Part-2 Movie recommendation index performance

#### Supported indexes
https://github.com/facebookresearch/faiss/wiki/Faiss-indexes

#### Note
* Running this notebook end to end may take a few minutes

#### Notebook hangs !!
It may happen on machines with low resource availability.

* Try to restart local notebook
        - jupyter notebook stop 
        - jupyter notebook start
* If that doesn't help, use *Google Colab*
  https://colab.research.google.com/
* Upload this notebook
* Run the cell below to install the required packages


In [1]:
## Needed for notebook on Google Colab
# !pip install faiss-gpu datasets

In [2]:
import faiss
from datasets import load_dataset
import pandas as pd
import numpy as np
import time

## 1. Load dataset acloudfan/embedded_movies_small

In [3]:
# Hugging Face dataset that ships pre-computed plot embeddings
movies_dataset_name = 'acloudfan/embedded_movies_small'
movies_dataset = load_dataset(movies_dataset_name)

# Raw splits are kept around so search hits can be cross referenced for movie details
movies_dataset_train = movies_dataset['train']
movies_dataset_test = movies_dataset['test']

# Embeddings need to be numpy arrays with dtype=float32
movies_dataset_train_np = np.array(movies_dataset_train['plot_embedding'], dtype=np.float32)
movies_dataset_test_np = np.array(movies_dataset_test['plot_embedding'], dtype=np.float32)

print("movies_dataset_train_np.shape = ", movies_dataset_train_np.shape)
print("movies_dataset_test_np.shape = ", movies_dataset_test_np.shape)

movies_dataset_train_np.shape =  (1017, 1536)
movies_dataset_test_np.shape =  (434, 1536)


In [4]:
# Embedding width = size of the second axis of the (rows, dims) test matrix
embedding_dimension = movies_dataset_test_np.shape[1]

embedding_dimension

1536

## 2. Utility method

In [5]:
def search_all_test_against_train_dataset(faiss_index, k=3, queries=None):
    """Time a single batched search and return the elapsed milliseconds.

    Args:
        faiss_index: Object exposing ``search(queries, k)``.
        k: Number of neighbours requested per query vector.
        queries: Query vectors handed to ``search``. When omitted, the
            notebook-level ``movies_dataset_test_np`` array is used.

    Returns:
        float: Elapsed time in milliseconds, rounded to a whole number.
    """
    if queries is None:
        queries = movies_dataset_test_np

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    faiss_index.search(queries, k)
    elapsed_seconds = time.perf_counter() - start_time

    return round(elapsed_seconds * 1000, 0)

## 3. Create and train indexes


### FlatL2

https://github.com/facebookresearch/faiss/blob/main/tutorial/python/1-Flat.py

In [6]:
# Create the index
def create_index_flatl2():
    """Build a ``faiss.IndexFlatL2`` populated with the training embeddings.

    Returns:
        The index after ``movies_dataset_train_np`` has been added to it.
    """
    index = faiss.IndexFlatL2(embedding_dimension)
    index.add(movies_dataset_train_np)
    return index

### IVFFlat

https://github.com/facebookresearch/faiss/blob/main/tutorial/python/2-IVFFlat.py

##### If returned search indexes = -1
Read the documentation : https://github.com/facebookresearch/faiss/wiki/FAQ#what-does-it-mean-when-a-search-returns--1-ids

In [7]:

# Changing this number (nlist) will change the performance & recall
# nlist_ivfflat = 200

def create_index_ivfflat(nlist=200):
    """Build, train and populate a ``faiss.IndexIVFFlat`` index.

    Args:
        nlist: List count forwarded to ``faiss.IndexIVFFlat``.

    Returns:
        The trained index containing the training embeddings.
    """
    # A flat L2 index serves as the quantizer handed to the IVF index
    coarse_quantizer = faiss.IndexFlatL2(embedding_dimension)
    index = faiss.IndexIVFFlat(coarse_quantizer, embedding_dimension, nlist)

    # Train first, then add the very same vectors
    train_vectors = movies_dataset_train_np
    index.train(train_vectors)
    index.add(train_vectors)

    # Quick sanity report: trained flag, stored vector count, input shape
    print(index.is_trained, '  ntotal=', index.ntotal,'    shape= ', train_vectors.shape)

    return index

### IVFPQ

https://github.com/facebookresearch/faiss/blob/main/tutorial/python/3-IVFPQ.py

In [8]:
def create_index_ivfpq(nlist=200, number_subspaces=8, number_bits_per_centroid=8):
    """Build, train and populate a ``faiss.IndexIVFPQ`` index.

    Args:
        nlist: List count forwarded to ``faiss.IndexIVFPQ``.
        number_subspaces: Forwarded to ``faiss.IndexIVFPQ`` as its fourth argument.
        number_bits_per_centroid: Forwarded to ``faiss.IndexIVFPQ`` as its fifth argument.

    Returns:
        The trained index containing the training embeddings.
    """
    # Same flat L2 quantizer as the IVFFlat builder uses
    coarse_quantizer = faiss.IndexFlatL2(embedding_dimension)
    index = faiss.IndexIVFPQ(
        coarse_quantizer,
        embedding_dimension,
        nlist,
        number_subspaces,
        number_bits_per_centroid,
    )

    # Train first, then add the very same vectors
    train_vectors = movies_dataset_train_np
    index.train(train_vectors)
    index.add(train_vectors)

    return index

### HNSW Flat

https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexHNSWFlat.html


In [9]:
def create_index_hnsw_flat(M = 32, efConstruction=8, efSearch=8):
    """Build and populate a ``faiss.IndexHNSWFlat`` index.

    Args:
        M: Second argument forwarded to ``faiss.IndexHNSWFlat``.
        efConstruction: Value assigned to ``index.hnsw.efConstruction``
            before any vectors are added.
        efSearch: Value assigned to ``index.hnsw.efSearch``.

    Returns:
        The index containing the training embeddings.
    """
    index_hnsw_flat = faiss.IndexHNSWFlat(embedding_dimension, M)

    # The HNSW tuning knobs live on the nested `hnsw` struct. Assigning them
    # directly on the index object does not reach the underlying graph
    # settings, so the requested values would be ignored.
    index_hnsw_flat.hnsw.efConstruction = efConstruction
    index_hnsw_flat.hnsw.efSearch = efSearch

    # train() is kept for parity with the other builders in this notebook
    index_hnsw_flat.train(movies_dataset_train_np)
    index_hnsw_flat.add(movies_dataset_train_np)

    return index_hnsw_flat

### HNSW Scalar

https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexHNSWSQ.html

In [10]:
def create_index_hnsw_sq(scalar_quantizer=faiss.ScalarQuantizer.QT_8bit, M=16, efConstruction=8, efSearch=8):
    """Build, train and populate a ``faiss.IndexHNSWSQ`` index.

    Args:
        scalar_quantizer: Quantizer type forwarded to ``faiss.IndexHNSWSQ``.
        M: Third argument forwarded to ``faiss.IndexHNSWSQ``.
        efConstruction: Value assigned to ``index.hnsw.efConstruction``
            before any vectors are added.
        efSearch: Value assigned to ``index.hnsw.efSearch``.

    Returns:
        The trained index containing the training embeddings.
    """
    index_hnsw_sq = faiss.IndexHNSWSQ(embedding_dimension, scalar_quantizer, M)

    # Configure THIS index (not an unrelated notebook-level variable), and do
    # it on the nested `hnsw` struct where the graph parameters are stored.
    index_hnsw_sq.hnsw.efConstruction = efConstruction
    index_hnsw_sq.hnsw.efSearch = efSearch

    # Train first, then add the very same vectors
    index_hnsw_sq.train(movies_dataset_train_np)
    index_hnsw_sq.add(movies_dataset_train_np)

    return index_hnsw_sq

## 4. Run tests

In [11]:
# Build one index of each type with the builder functions from the earlier cells
index_flatl2 = create_index_flatl2()

index_ivfflat = create_index_ivfflat(nlist=200)

index_ivfpq = create_index_ivfpq(
    nlist=100,
    number_subspaces=8,
    number_bits_per_centroid=8,
)

index_hnsw_flat = create_index_hnsw_flat(
    M=200,
    efConstruction=8,
    efSearch=8,
)

index_hnsw_sq = create_index_hnsw_sq(
    scalar_quantizer=faiss.ScalarQuantizer.QT_8bit,
    M=16,
    efConstruction=8,
    efSearch=8,
)

True   ntotal= 1017     shape=  (1017, 1536)


In [None]:
# Time the full test-set search once per index, in the same order the indexes were built
(
    index_flatl2_duration,
    index_ivfflat_duration,
    index_ivfpq_duration,
    index_hnsw_flat_duration,
    index_hnsw_sq_duration,
) = [
    search_all_test_against_train_dataset(candidate_index)
    for candidate_index in (
        index_flatl2,
        index_ivfflat,
        index_ivfpq,
        index_hnsw_flat,
        index_hnsw_sq,
    )
]

## 5. Plot chart

In [None]:
# Map each index label to its measured search duration (milliseconds)
metrics_data = dict(
    index_flatl2=index_flatl2_duration,
    index_ivfflat=index_ivfflat_duration,
    index_ivfpq=index_ivfpq_duration,
    index_hnsw_flat=index_hnsw_flat_duration,
    index_hnsw_sq=index_hnsw_sq_duration,
)

In [None]:
import matplotlib.pyplot as plt

# Use the explicit figure/axes interface so the figure handle is actually used
# and the chart does not depend on pyplot's implicit "current axes" state.
fig, ax = plt.subplots(figsize=(10, 5))

# Materialize the dict views as lists before handing them to matplotlib
index_labels = list(metrics_data.keys())
search_durations_ms = list(metrics_data.values())

ax.bar(index_labels, search_durations_ms, color='lightblue', width=0.4)
ax.set_xlabel('FAISS index')
ax.set_ylabel('Time in milliseconds')
# A title lets the chart stand on its own when the notebook is skimmed
ax.set_title('Search time for the full test set per FAISS index')
plt.show()