In [None]:
# Installing necessary libraries
!pip3 install pymongo httpx typing pandas

In [None]:
# Loading necessary libraries
import json
import os
from typing import List

import httpx
import pandas as pd
import pymongo


In [None]:
# Connect to the Amazon DocumentDB (MongoDB compatibility) cluster and select the
# database and collection used throughout this notebook.
# Replace the <...> placeholders with your cluster's values before running this cell,
# and avoid saving real credentials in a shared copy of the notebook.

client = pymongo.MongoClient(
    "<connection string with port>",
    username="<username>",
    password="<password>",
    retryWrites=False,  # retryable writes are turned off for this connection
    tls=True,  # boolean flag rather than the string 'true'
    tlsCAFile="global-bundle.pem",  # CA bundle file; a relative path is resolved from the working directory
)
db = client.semanticdemo
collection = db.movies


In [None]:
# Load the example dataset from CSV into the DocumentDB collection.
# The example dataset includes just 50 entries and is adapted from https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata

# Read the CSV file into a DataFrame
csv_file = "/home/ec2-user/demomovies.csv"
data = pd.read_csv(csv_file)
# Convert the DataFrame to a list of dictionaries (one per row), leaving out
# cells that pandas read as missing so they are not stored as NaN-valued fields
# (a NaN "overview" would otherwise still match an `$exists` filter).
data_dict = [
    {column: value for column, value in row.items() if pd.notna(value)}
    for row in data.to_dict(orient="records")
]
# Insert the data into the DocumentDB collection; insert_many() rejects an empty list
if data_dict:
    collection.insert_many(data_dict)
    print("CSV data has been successfully uploaded to DocumentDB")
else:
    print(f"No rows found in {csv_file}; nothing was uploaded")

In [None]:
# Generating text embeddings and storing them alongside the existing data in Amazon DocumentDB

# Hugging Face access token and embedding model endpoint.
# The token is read from the HF_TOKEN environment variable when it is set, so the
# secret does not have to be saved in the notebook; otherwise the placeholder
# below is used and must be replaced with your own key.
hf_token = os.environ.get("HF_TOKEN", "<Put your key from huggingFace's website>")
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

#Define Generate Embedding Function
def generate_embedding(text: str) -> List[float]:
    """Request an embedding for ``text`` from the Hugging Face inference endpoint.

    Posts ``{"inputs": text}`` to ``embedding_url`` with ``hf_token`` sent as a
    bearer token, and returns the decoded JSON body of the response.

    Args:
        text: The text to embed.

    Returns:
        The parsed JSON response (annotated as a list of floats).

    Raises:
        ValueError: If the endpoint answers with a status code other than 200.
    """
    # The context manager closes the HTTP client even when the request itself
    # raises (for example on a timeout or a connection error).
    with httpx.Client() as client2:
        response = client2.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
        )
    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
    return response.json()

# Note: The model can take up to 20 seconds to start. If you get a model error, try again after 20 seconds.

# Fetch all documents that have an overview field
documents_to_update = list(collection.find({'overview': {"$exists": True}}))

# Define the batch size for processing
batch_size = 10  # You can adjust this based on your requirements

# Process documents in batches
for i in range(0, len(documents_to_update), batch_size):
    batch = documents_to_update[i:i + batch_size]

    # Generate embeddings for the current batch and keep them on the in-memory documents
    for doc in batch:
        doc['embedding_hf'] = generate_embedding(doc['overview'])

    # Store each embedding alongside the existing data as a new field.
    # $set writes only `embedding_hf`, so the stored document is not replaced
    # wholesale by the copy that was read at the start of this cell.
    bulk_operations = [
        pymongo.UpdateOne(
            {'_id': doc['_id']},
            {'$set': {'embedding_hf': doc['embedding_hf']}},
        )
        for doc in batch
    ]
    collection.bulk_write(bulk_operations)

print("Batch processing completed.")


In [None]:
# Build an IVFFlat vector index on the embedding field using the dotProduct distance metric

# NOTE(review): 384 presumably matches the length of the vectors returned by the
# embedding model used above - confirm before switching to a different model.
ivfflat_options = {
    "lists": 1,
    "similarity": "dotProduct",
    "dimensions": 384,
}
collection.create_index([("embedding_hf", "vector")], vectorOptions=ivfflat_options)

In [None]:
# Fields to include in (1) or leave out of (0) each search result
projection = dict(_id=0, title=1, overview=1)

#Defining semantic query function
def semantic_search(keyword, k=3):
    """Run a vector search for documents whose embeddings are close to ``keyword``.

    Embeds ``keyword`` with ``generate_embedding`` and runs a ``$search``
    ``vectorSearch`` stage against the ``embedding_hf`` field using the
    ``dotProduct`` similarity, then applies ``projection`` to the results.

    Args:
        keyword: Text to embed and search for.
        k: Value passed as the ``k`` option of the vector search. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        A list of the projected result documents.
    """
    query = {
        "vectorSearch": {
            "vector": generate_embedding(keyword),
            "path": "embedding_hf",
            "similarity": "dotProduct",
            "k": k,
        }
    }
    results = collection.aggregate([{'$search': query}, {"$project": projection}])
    return list(results)

#Defining keyword query function
def keyword_search(keyword):
    """Find documents whose ``overview`` field matches ``keyword``.

    ``keyword`` is handed to ``$regex`` unchanged, so it is interpreted as a
    regular-expression pattern. The matches are reduced to the fields listed in
    ``projection`` and returned as a list.
    """
    pipeline = [
        {"$match": {"overview": {"$regex": keyword}}},
        {"$project": projection},
    ]
    return list(collection.aggregate(pipeline))

In [None]:
#Semantic query example - search for movies related to the phrase "young magician"
semantic_search("young magician")

#The search results are semantically similar to the query. They do not contain the exact words "young magician", yet the search still manages to find movies like Harry Potter. Next, you can compare these results with a keyword search.

In [None]:
#Keyword query example 1 - search for movies whose overview contains the keyword "young magician"
keyword_search("young magician")

#No results are returned because the exact words "young magician" are not found in any overview description.

In [None]:
#Keyword query example 2 - search for movies whose overview contains the keyword "young wizard"
keyword_search("young wizard")

#One result is returned because the exact words "young wizard" are found in an overview description.