In [1]:
from transformers import BertModel, BertTokenizer
import torch

# Name of the pre-trained BERT checkpoint shared by the tokenizer and the encoder
MODEL_NAME = 'bert-large-uncased'

# Build the tokenizer and the encoder from that same checkpoint
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

# Function to get embeddings
def get_embedding(query):
    """Embed text with BERT using attention-mask-aware mean pooling.

    Args:
        query: A string, or a list of strings, to embed. It is handed
            unchanged to the module-level ``tokenizer``.

    Returns:
        torch.Tensor: One row per input sequence, holding the mean of the
        last-layer token embeddings taken only over the positions that the
        attention mask marks as real tokens.
    """
    # Tokenize the query; padding=True pads a batch up to its longest sequence
    inputs = tokenizer(query, return_tensors='pt', truncation=True, padding=True)

    # Forward pass through the model (inference only, no gradient tracking)
    with torch.no_grad():
        outputs = model(**inputs)

    # Per-token embeddings from the final layer: (batch, seq_len, hidden)
    last_hidden_state = outputs.last_hidden_state

    # A plain .mean(dim=1) would average padding positions into the result
    # whenever a batch contains sequences of different lengths. Weight each
    # position by the attention mask so only real tokens contribute; for a
    # single unpadded query the mask is all ones and this is the ordinary mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
    token_sum = (last_hidden_state * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row
    token_count = mask.sum(dim=1).clamp(min=1.0)
    pooled_output = token_sum / token_count

    return pooled_output

# Sample free-text search request describing the candidate to look for
query = (
    "Search for a senior professional with over 13 years of experience in "
    "business and system analysis roles, including leading AI and machine "
    "learning projects, with expertise in Microsoft Dynamics ERP and Oracle SQL. "
    "The individual should have a background in computer science from Egypt "
    "and have worked on large-scale projects in Riyadh."
)

# Embed the request and show the resulting tensor
query_vector = get_embedding(query)
print("Embedding vector:", query_vector)



Embedding vector: tensor([[-0.2824, -0.1611, -0.1934,  ..., -0.1406, -0.0357,  0.3038]])


In [2]:
from pymongo import MongoClient
import numpy as np

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['VectorDBPython']
collection = db['CVs']

# Retrieve vectors from MongoDB.
# Only 'embedding' and 'metadata' are read below, so project just those two
# fields instead of pulling every stored document in full.
# NOTE(review): the search cell looks results up as metadata[idx], where idx is
# a row number in the Faiss index. This query has no explicit sort, so that
# lookup is only correct if the index was built in this same document order
# -- TODO confirm against the code that created 'faiss_index.index'.
embedding_rows = []
metadata = []
for doc in collection.find({}, {'embedding': 1, 'metadata': 1}):
    embedding_rows.append(doc['embedding'])
    metadata.append(doc['metadata'])  # To keep track of which document each vector belongs to

# One float32 row per retrieved document
vectors = np.array(embedding_rows, dtype='float32')

In [3]:
import faiss
import numpy as np

# Load the Faiss index from file
index = faiss.read_index('faiss_index.index')

# Number of nearest neighbours to retrieve
TOP_K = 1

# View the query as a 2-D float32 NumPy array for Faiss. query_vector itself
# is left untouched (no in-place division), so re-running this cell reports
# the same "initial norm" every time and the value shown by the embedding
# cell stays accurate.
query_matrix = np.atleast_2d(np.asarray(query_vector, dtype='float32'))
if query_matrix.shape[1] != index.d:
    raise ValueError(
        f"Query dimension {query_matrix.shape[1]} does not match index dimension {index.d}"
    )

# Normalize query vector
norm_query_vector = np.linalg.norm(query_matrix)
print(f"Initial norm of the query vector: {norm_query_vector}")
if norm_query_vector == 0:
    raise ValueError("Cannot normalize a query vector whose norm is zero")
# Division allocates a new array, so the source tensor is never modified
normalized_query = np.ascontiguousarray(query_matrix / norm_query_vector, dtype='float32')

# Recalculate the norm to verify normalization
norm_query_vector_after = np.linalg.norm(normalized_query)
print(f"Norm of the query vector after normalization: {norm_query_vector_after}")

# Perform the search
distances, indices = index.search(normalized_query, k=TOP_K)

print("Indices:", indices)
print("Distances (inner product):", distances)

# Retrieve the metadata for the closest documents
for idx in indices[0]:
    idx = int(idx)
    if idx < 0:
        # Faiss reports -1 when it has fewer than k results; without this
        # guard metadata[-1] would silently show the last document instead.
        print("No further matches returned by the index.")
        break
    if idx >= len(metadata):
        raise IndexError(
            f"Index returned id {idx} but only {len(metadata)} metadata entries are loaded"
        )
    print("-------------------------------------------------------------------------")
    metadata_item = metadata[idx]
    for key, value in metadata_item.items():
        print(f"{key}: {value}")
    print("-------------------------------------------------------------------------")

Initial norm of the query vector: 9.040555953979492
Norm of the query vector after normalization: 1.0
Indices: [[6]]
Distances (inner product): [[0.8438606]]
-------------------------------------------------------------------------
file_path: E:\youssef\Testing\Python\Resumes\Santiago-Resume-Template-Professional.pdf
email: yosrinegm@gmail.com
created_at: 2024-08-17T00:26:54.431725
tags: ['tag1', 'tag2']
-------------------------------------------------------------------------
