In [2]:
# Install required libraries
!pip3.8 install pymongo boto3 pandas

In [None]:
# Load the required libraries
import pymongo
import boto3
import json
import pandas as pd 

In [None]:
# Set up a connection to your Amazon DocumentDB (MongoDB compatibility) cluster and creating the database.
client = pymongo.MongoClient(
"<Amazon DocumentDB database cluster connection string>",
port=27017,
username="<username>",
password="<password>",
retryWrites=False,
tls='true',
tlsCAFile="/home/ec2-user/global-bundle.pem") #Check the path as per your destination
db = client.semanticdemo
collection = db.movies

In [4]:
# Set up a bedrock client. Ensure that you have IAM permissions setup for Bedrock.
client = boto3.client('bedrock-runtime')

In [5]:
# Loading the DocumentDB database from the example dataset in csv

# Read the CSV file into a DataFrame
csv_file = "/home/ec2-user/demomovies.csv" #path to sample database file
data = pd.read_csv(csv_file)
# Convert the DataFrame to a list of dictionaries (one per row)
data_dict = data.to_dict(orient="records")
# Insert the data into the MongoDB collection
collection.insert_many(data_dict)
print("CSV data has been successfully uploaded to DocumentDB")

CSV data has been successfully uploaded to DocumentDB


In [9]:
# Defining Bedrock model parameters
modelId = "amazon.titan-embed-text-v1"  # (Change this to try different embedding models)
accept = "application/json"
contentType = "application/json"

#Define Generate Embedding Function
def generate_embedding(text):
    body = json.dumps({"inputText": text})
    response = client.invoke_model(
        body=body, modelId=modelId, accept=accept, contentType=contentType
    )
    response_body = json.loads(response.get("body").read())
    embedding = response_body.get("embedding")
    return embedding

In [7]:
# Fetch all documents that have overview field
documents_to_update = list(collection.find({'overview': {"$exists": True}}))

# Define the batch size for processing
batch_size = 10  # You can adjust this based on your requirements

# Process documents in batches
for i in range(0, len(documents_to_update), batch_size):
    batch = documents_to_update[i:i + batch_size]

    # Generate embeddings for the current batch and store it alongside existing data as new field
    for doc in batch:
        doc['embedding_br'] = generate_embedding(doc['overview'])

    # Update the batch of documents
    bulk_operations = [pymongo.ReplaceOne({'_id': doc['_id']}, doc) for doc in batch]
    collection.bulk_write(bulk_operations)

print("Batch processing completed.")

Batch processing completed.


In [8]:
#Creating HNSW vector search index. You can set the parameters as per your performance and recall requirements.

collection.create_index ([("embedding_br","vector")], 
    vectorOptions= {
        "type": "hnsw", 
        "similarity": "euclidean",
        "dimensions": 1536,
        "m": 16,
        "efConstruction": 64},
    name="my_vss_index")

'my_index'

In [5]:
#Creating native text search index

collection.create_index ([("overview","text")],name="my_text_index")

'my_text_index'

In [107]:
#Defining different query functions

#Setting up of projection parameters
projection = {
"_id":0,
"title": 1, 
"overview": 1}

#Semantic search function
def search_semantic(keyword):
    query = {"vectorSearch" : {"vector" : generate_embedding(keyword), "path": "embedding_br", "similarity": "dotProduct", "k": 3}}
    results = collection.aggregate([{'$search': query},{"$project": projection}])
    return list(results)

#Text search function
def search_text(keyword):
    results = collection.aggregate([{"$match": {"$text": {"$search": keyword}}},{"$project": projection},{"$limit": 3}])
    return list(results)

#Hybrid query function
def search_hybrid(keyword):
    results1 = search_semantic(keyword)[:2]
    results2 = search_text(keyword)[:2]
    combined_results = results1 + results2
    combined_results_as_tuples = [tuple(d.items()) for d in combined_results]
    union_result = list(set(combined_results_as_tuples))
    return union_result[:3]

In [108]:
#Semantic search function powered by vector search for Amazon DocumentDB captures semantically similar results.
#Results includes Star Wars, even though, the overview description does not have word alien.

search_semantic("aliens")

[{'title': 'Rogue One: A Star Wars Story',
  'overview': 'A rogue band of resistance fighters unite for a mission to steal the Death Star plans and bring a new hope to the galaxy.'},
 {'title': 'Avatar',
  'overview': 'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'},
 {'title': 'Ice Age: Continental Drift',
  'overview': 'Manny, Diego, and Sid embark upon another adventure after their continent is set adrift. Using an iceberg as a ship, they encounter sea creatures and battle pirates as they explore a new world.'}]

In [109]:
#Text search effectively retrieves results where variations of word alien are included. 

search_text("aliens")

[{'title': 'Avatar',
  'overview': 'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'},
 {'title': 'Spider-Man 3',
  'overview': 'The seemingly invincible Spider-Man goes up against an all-new crop of villain â€“ including the shape-shifting Sandman. While Spider-Manâ€™s superpowers are altered by an alien organism, his alter ego, Peter Parker, deals with nemesis Eddie Brock and also gets caught up in a love triangle.'}]

In [111]:
#Hybrid search approach combines the strengths of both semantic and text search methods.
#The results includes both Star Wars and Spider-Man. In certain use-cases, combination of vector search and text search would be the best option. 

search_hybrid("aliens")

[(('title', 'Avatar'),
  ('overview',
   'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.')),
 (('title', 'Rogue One: A Star Wars Story'),
  ('overview',
   'A rogue band of resistance fighters unite for a mission to steal the Death Star plans and bring a new hope to the galaxy.')),
 (('title', 'Spider-Man 3'),
  ('overview',
   'The seemingly invincible Spider-Man goes up against an all-new crop of villain â€“ including the shape-shifting Sandman. While Spider-Manâ€™s superpowers are altered by an alien organism, his alter ego, Peter Parker, deals with nemesis Eddie Brock and also gets caught up in a love triangle.'))]