In [1]:
# Smoke test: the import raises ImportError if the faiss package is missing,
# so reaching the print means the module could be loaded.
import faiss
print("✅ FAISS is installed correctly!")

✅ FAISS is installed correctly!


In [2]:
import pandas as pd
from IPython.display import display

# Load the cleaned movie and rating tables from CSV files two directories above
# this notebook.
# NOTE(review): the recorded preview contains an `Unnamed: 0` column, which
# usually means the CSV was written together with its index — confirm whether
# it should be dropped or read with `index_col=0`.
movies = pd.read_csv('../../cleaned_remapped_movies.csv')
# `ratings` is loaded here but not referenced by the cells visible in this notebook.
ratings = pd.read_csv('../../cleaned_remapped_ratings.csv')
# Show the first five movies as a rich table to sanity-check the columns.
display(movies.head(5))

Unnamed: 0,movieId,title,genres,year,genre_list,combined_features
0,415.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,"['Adventure', 'Animation', 'Children', 'Comedy...",toy story (1995) Adventure|Animation|Children|...
1,191.0,jumanji (1995),Adventure|Children|Fantasy,1995,"['Adventure', 'Children', 'Fantasy']",jumanji (1995) Adventure|Children|Fantasy
2,941.0,grumpier old men (1995),Comedy|Romance,1995,"['Comedy', 'Romance']",grumpier old men (1995) Comedy|Romance
3,3313.0,waiting to exhale (1995),Comedy|Drama|Romance,1995,"['Comedy', 'Drama', 'Romance']",waiting to exhale (1995) Comedy|Drama|Romance
4,942.0,father of the bride part ii (1995),Comedy,1995,['Comedy'],father of the bride part ii (1995) Comedy


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer; English stop words are removed before weighting.
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vocabulary on, and vectorize, the `combined_features` text column
# (title plus genres in the preview above). The result has one row per movie
# and one column per vocabulary term; later cells densify it with `.toarray()`.
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Check the shape of the matrix: (number of movies, vocabulary size)
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (87382, 42527)


In [4]:
# faiss was already imported in the first cell; importing it again is harmless
# and keeps this cell runnable on its own.
import faiss
print(faiss.__version__)  # Check FAISS version

1.9.0


In [5]:
# import sys

# class StreamToLogger:
#     def __init__(self, stream):
#         self.stream = stream

#     def write(self, message):
#         if message.strip():
#             print(message)  # Print message explicitly

#     def flush(self):
#         pass

In [None]:
# time is used to measure the training duration, sys to flush stdout before training.
import time
import sys

# FAISS Training Code
# The string literal below is a bare expression used as a block comment; it
# outlines the steps this cell performs.
'''
Step 1: Find a Valid SUB_VECTOR_SIZE
Step 2: Initialize FAISS Index
Step 3: Train FAISS on a Random Subset
Step 4: Add Vectors in Batches
Step 5: Move FAISS Index to CPU & Save

'''

import faiss
import numpy as np
# NOTE(review): csr_matrix is not referenced anywhere in the visible cells.
from scipy.sparse import csr_matrix

# Number of TF-IDF rows densified and added to the index per loop iteration.
BATCH_SIZE = 5000
# Passed to IndexIVFPQ as the number of inverted lists (coarse clusters).
NUM_CLUSTERS = 1800
# Passed to IndexIVFPQ as the number of bits per product-quantizer code.
PQ_BITS = 8


# The thread count is hard-coded — assumes an 8-core machine, TODO confirm.
faiss.omp_set_num_threads(8)  # Force FAISS to use all 8 CPUs
print(f"✅ :::::::: FAISS is using {faiss.omp_get_max_threads()} threads.")

# Load dataset dimensions: the vector dimension is the number of TF-IDF columns
# (the vocabulary size of `tfidf_matrix`, which comes from an earlier cell).
VECTOR_DIM = tfidf_matrix.shape[1]

def find_valid_subvector_size(vector_dim):
    """Return the largest candidate size that evenly divides ``vector_dim``.

    The candidates are the sizes this notebook treats as FAISS-supported
    (64, 56, 48, 40, 32, 24, 16, 12, 8, 4, 3, 2, 1), tried from largest to
    smallest. The result is what the notebook passes to ``faiss.IndexIVFPQ``
    as its product-quantizer size argument.

    Args:
        vector_dim: Dimensionality of the vectors to be indexed.

    Returns:
        The largest candidate ``m`` with ``vector_dim % m == 0``, or ``None``
        when ``vector_dim`` is not positive (no meaningful split exists).
    """
    # List of FAISS-supported subvector sizes, largest first
    valid_sizes = (64, 56, 48, 40, 32, 24, 16, 12, 8, 4, 3, 2, 1)

    # `0 % m == 0` holds for every m (and e.g. `-64 % 64 == 0` too), so without
    # this guard a degenerate dimension would be reported as cleanly divisible
    # by 64 instead of triggering the caller's `None` check.
    if vector_dim <= 0:
        return None

    # Find the largest valid subvector size that evenly divides `vector_dim`
    return next((m for m in valid_sizes if vector_dim % m == 0), None)

# Pick the product-quantizer size for VECTOR_DIM; this value is handed to
# faiss.IndexIVFPQ further down in this cell.
SUB_VECTOR_SIZE = find_valid_subvector_size(VECTOR_DIM)

# Validate before logging so a failed lookup is never reported as "found".
if SUB_VECTOR_SIZE is None:
    raise ValueError(f"No valid `SUB_VECTOR_SIZE` found for VECTOR_DIM = {VECTOR_DIM}.")

# NOTE(review): the recorded run printed 1 here. IndexIVFPQ takes this argument
# as the number of sub-quantizers, so 1 means a single PQ code per vector —
# consider reducing the dimensionality to a friendlier size first; confirm.
print(f":::::: Found valid subvector size {SUB_VECTOR_SIZE} .......")


print(":::::: Starting FAISS training 1 .......")


# Initialize FAISS GPU resources (consumed later, when the trained index is
# cloned onto the GPU)
res = faiss.StandardGpuResources()

# Set 1GB GPU memory for FAISS operations
res.setTempMemory(1 * 1024 * 1024 * 1024)

print(":::::: Starting FAISS training 2 .......")
# Coarse quantizer that assigns vectors to one of the NUM_CLUSTERS inverted lists.
# NOTE(review): this quantizer ranks by inner product, while IndexIVFPQ below is
# built without a metric argument (FAISS defaults to METRIC_L2) — confirm which
# metric is intended and make the two agree.
quantizer = faiss.IndexFlatIP(VECTOR_DIM)
print(":::::: Starting FAISS training 3 .......")
# ✅ Initialize the FAISS Index on CPU. Only this one index is built; an earlier
# duplicate IndexIVFPQ on the same quantizer was never trained or used.
index_cpu = faiss.IndexIVFPQ(quantizer, VECTOR_DIM, NUM_CLUSTERS, SUB_VECTOR_SIZE, PQ_BITS)
print(":::::: Starting FAISS training 4 .......")

# Set verbose output for training. Verbosity lives on the index and on its
# clustering parameters; the global IVF stats object only holds search counters
# and has no verbosity switch.
index_cpu.verbose = True
index_cpu.cp.verbose = True
faiss.cvar.indexIVF_stats.reset()  # Reset FAISS internal stats

print(":::::: Starting FAISS training 5 .......")

# Train on a random subset of at most 60,000 rows. The seed makes the sample,
# and therefore the trained index, reproducible across kernel restarts.
# NOTE(review): with NUM_CLUSTERS = 1800 this is ~33 samples per centroid, below
# the 39 per centroid FAISS's k-means asks for by default — expect a warning.
TRAIN_SAMPLE_CAP = 60000
TRAIN_SEED = 42
num_train_samples = min(TRAIN_SAMPLE_CAP, tfidf_matrix.shape[0])
train_rng = np.random.default_rng(TRAIN_SEED)
random_indices = train_rng.choice(tfidf_matrix.shape[0], num_train_samples, replace=False)
# Cast to float32 while the matrix is still sparse, then densify: this avoids
# materializing a float64 dense array twice the size of the final one.
train_data = tfidf_matrix[random_indices].astype(np.float32).toarray()

sys.stdout.flush()  # Force pending Python output to appear before the long training call

print(":::::: Starting FAISS training 5 A .......")
start_time = time.time()
index_cpu.train(train_data)  # Train on CPU
end_time = time.time()
print("✅ :::::: FAISS training completed!")
print(f"✅ FAISS training completed in {end_time - start_time:.2f} seconds.")

# The string literal below is a bare expression used as a block comment.
# NOTE(review): the index is trained on the CPU before this point, so the FP16
# option configures the GPU copy rather than training — confirm the wording.
'''
# Use FAISS FP16 Precision to Reduce Memory Usage
# Reduces memory usage by 50%, enabling more efficient training.
'''
# Cloner options controlling how the CPU index is copied onto the GPU.
gpu_options = faiss.GpuMultipleClonerOptions()
gpu_options.useFloat16 = True  # Enable FP16

# Move FAISS index to GPU with allocated resources: clone the trained CPU index
# onto GPU device 0 using the `res` resources object created earlier in this cell.
# NOTE(review): GPU IVFPQ accepts only a limited set of product-quantizer
# layouts — confirm that VECTOR_DIM with SUB_VECTOR_SIZE is one of them.
gpu_index = faiss.index_cpu_to_gpu(res, 0, index_cpu, gpu_options)

print("✅ FAISS GPU memory allocated and index moved to GPU.")













# # Train on 80,000 samples (FAISS best practice)
# num_train_samples = min(80000, tfidf_matrix.shape[0])
# random_indices = np.random.choice(tfidf_matrix.shape[0], num_train_samples, replace=False)
# train_data = tfidf_matrix[random_indices].toarray().astype(np.float32)

print(":::::: Starting FAISS training 6 .......")

# No training happens on the GPU copy: the index is trained on the CPU
# (index_cpu.train) before it is cloned. The notebook previously marked
# `gpu_index.train(train_data)` as the memory-inefficient alternative.

print(":::::: Starting FAISS training 7 .......")

# Add vectors in large batches so only BATCH_SIZE rows are dense at any time
total_vectors = tfidf_matrix.shape[0]
for start in range(0, total_vectors, BATCH_SIZE):
    print(f"     :::::: Start ==> {start}")
    end = min(start + BATCH_SIZE, total_vectors)
    # Cast to float32 while still sparse, then densify, to avoid a float64
    # dense intermediate twice the size of the batch FAISS receives.
    batch_data = tfidf_matrix[start:end].astype(np.float32).toarray()
    gpu_index.add(batch_data)

# ✅ Save trained FAISS index: copy it back to the CPU, then serialize to disk
# NOTE(review): the file name says "1024" although NUM_CLUSTERS is 1800; the
# path is left unchanged so existing consumers keep working — confirm and rename.
INDEX_PATH = "../../faiss_gpu_index60k-1024.bin"
final_index = faiss.index_gpu_to_cpu(gpu_index)
faiss.write_index(final_index, INDEX_PATH)

# Report the path that was actually written (the old message named a different file).
print(f"✅ FAISS model saved as {INDEX_PATH}")

✅ :::::::: FAISS is using 8 threads.
:::::: Found valid subvector size 1 .......
:::::: Starting FAISS training 1 .......
:::::: Starting FAISS training 2 .......
:::::: Starting FAISS training 3 .......
:::::: Starting FAISS training 4 .......
:::::: Starting FAISS training 5 .......
:::::: Starting FAISS training 5 A .......
