In [3]:
from pymongo import MongoClient
import numpy as np

# Connect to the MongoDB server on localhost and select the collection to read.
client = MongoClient('mongodb://localhost:27017/')
db = client['VectorDBPython']
collection = db['CVs']

# Retrieve vectors from MongoDB.
# Only the two fields read below are requested (projection), so the server
# does not send the rest of each document.
# The two lists are filled in lockstep: text[i] is the excerpt of the document
# whose embedding is vectors[i].
vectors = []
text = []
for doc in collection.find({}, {'_id': 0, 'embedding': 1, 'text_excerpt': 1}):
    vectors.append(doc['embedding'])
    text.append(doc['text_excerpt'])

# Fail here with a clear message rather than handing an empty array to the
# code that consumes `vectors` afterwards.
if not vectors:
    raise ValueError("No documents found in collection 'CVs' of database 'VectorDBPython'")

# Stack the per-document embeddings into one float32 array.
vectors = np.array(vectors, dtype='float32')

In [4]:
import faiss

# Flatten each embedding to one row, giving a 2-D (n_vectors, dim) array.
# Unlike np.squeeze, this keeps the leading axis even when there is only one
# vector, so the axis=1 norm and vectors.shape[1] below remain valid.
vectors = vectors.reshape(len(vectors), -1)
print("Vectors shape:", vectors.shape)

# Normalize vectors for cosine similarity: on unit-length rows the inner
# product computed by the index equals the cosine of the angle between them.
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
print("Norms statistics - Min:", norms.min(), "Max:", norms.max(), "Mean:", norms.mean())

# A zero (or non-finite) norm would put NaN/inf rows into the index without
# any error, so stop and report which rows are affected.
bad_rows = np.flatnonzero(~(np.isfinite(norms) & (norms > 0)).ravel())
if bad_rows.size:
    raise ValueError(f"Cannot normalize embeddings with zero or non-finite norm at rows {bad_rows.tolist()}")

# Out-of-place division: `vectors` is rebound to a new array instead of
# being modified in place.
vectors = vectors / norms

# Recompute norms to confirm that every row now has unit length.
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
print("Recomputed norms - Min:", norms.min(), "Max:", norms.max(), "Mean:", norms.mean())

# Create a Faiss index for inner product (cosine similarity).
# Row i of the index corresponds to row i of `vectors`.
index = faiss.IndexFlatIP(vectors.shape[1])
index.add(vectors)

# Save the Faiss index to a file (path is relative to the working directory).
faiss.write_index(index, 'faiss_index.index')

Vectors shape: (11, 1024)
Norms statistics - Min: 7.9175515 Max: 16.900276 Mean: 9.136166
Recomputed norms - Min: 0.99999994 Max: 1.0000001 Mean: 1.0
