In [1]:
import os
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import pandas as pd
import warnings

os.chdir("C:/Users/abhmukherjee/Documents/GenAI/MongoDB/DocuSearch_AI/DocuSearch_AI")

warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [3]:
df = pd.read_json("hf://datasets/MongoDB/embedded_movies/sample_mflix.embedded_movies.json")
df = df.head(250)

In [4]:
# Connect to the existing ChromaDB client
chroma_client = chromadb.PersistentClient()

In [5]:
from openai import OpenAI

from src.config import Config

if os.getenv("OPENAI_API_KEY") is not None:
    client = OpenAI(
        api_key=Config.OPENAI_API_KEY,
    )
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

# Initialize the OpenAI embedding function
embedding_function = OpenAIEmbeddingFunction(
    api_key=Config.OPENAI_API_KEY, 
    model_name=Config.EMBEDDING_MODEL,
)

OPENAI_API_KEY is ready


In [6]:
movies_collection = chroma_client.create_collection(
    name='mongodb_movies',
    embedding_function=embedding_function,
)

In [7]:
ids = []
metadatas = []
docs = []
embeddings = []

for idx, row in df.iterrows():
    if isinstance(row['plot_embedding'], list):
        meta_dict = {}
        meta_dict['title'] = row['title']
        
        ids.append(str(idx))
        docs.append(row['fullplot'])

        embeddings.append(row['plot_embedding'])
        metadatas.append(meta_dict)



In [None]:
movies_collection.add(
    ids = ids,
    documents = docs,
    embeddings = embeddings,
    metadatas = metadatas,
)

In [11]:
# Query by text (will be embedded by the embedding function)
query_text = "science fiction movies with aliens"
results = movies_collection.query(
    query_texts=[query_text],
    n_results=3
)

# Display the results
print(f"Top 5 results for query: '{query_text}'")
for i, (id, document, metadata) in enumerate(zip(
    results['ids'][0], 
    results['documents'][0], 
    results['metadatas'][0]
)):
    print(f"\n--- Result {i+1} ---")
    print(f"Movie: {metadata['title']}")
    print(f"Similarity Score: {results['distances'][0][i] if 'distances' in results else 'N/A'}")
    print(f"Plot: {document[:150]}...")  # Show first 150 chars of plot

Top 5 results for query: 'science fiction movies with aliens'

--- Result 1 ---
Movie: Shinobi: Heart Under Blade
Similarity Score: 1.9054886324682696
Plot: After more than four hundred years of war between the Shinobi warriors of the Manjidani Koga and Tsubagakure Iga clans, the Lord Hattori Hanzou decree...

--- Result 2 ---
Movie: Dirty Harry
Similarity Score: 1.926175641696861
Plot: In the year 1971, San Francisco faces the terror of a maniac known as Scorpio- who snipes at innocent victims and demands ransom through notes left at...

--- Result 3 ---
Movie: Flash Gordon
Similarity Score: 1.9292127100526804
Plot: In this update of the 1930s comic strip, Flash Gordon is a football hero who is skyjacked aboard Dr. Hans Zarkov's rocketship along with beautiful Dal...


In [None]:
# # Optional: If you have the vector embedding for a query
# from openai import OpenAI
# client = OpenAI(api_key=Config.OPENAI_API_KEY)
# query_embedding = client.embeddings.create(
#     input=query_text,
#     model=Config.EMBEDDING_MODEL
# ).data[0].embedding

# vector_results = movies_collection.query(
#     query_embeddings=[query_embedding],
#     n_results=5
# ) 

# # Display the results
# print(f"Top 5 results for query: '{query_text}'")
# for i, (id, document, metadata) in enumerate(zip(
#     vector_results['ids'][0], 
#     vector_results['documents'][0], 
#     vector_results['metadatas'][0]
# )):
#     print(f"\n--- Result {i+1} ---")
#     print(f"Movie: {metadata['title']}")
#     print(f"Similarity Score: {vector_results['distances'][0][i] if 'distances' in vector_results else 'N/A'}")
#     print(f"Plot: {document[:150]}...")  # Show first 150 chars of plot

Top 5 results for query: 'science fiction movies with aliens'

--- Result 1 ---
Movie: Shinobi: Heart Under Blade
Similarity Score: 1.9054886324682696
Plot: After more than four hundred years of war between the Shinobi warriors of the Manjidani Koga and Tsubagakure Iga clans, the Lord Hattori Hanzou decree...

--- Result 2 ---
Movie: Dirty Harry
Similarity Score: 1.926175641696861
Plot: In the year 1971, San Francisco faces the terror of a maniac known as Scorpio- who snipes at innocent victims and demands ransom through notes left at...

--- Result 3 ---
Movie: Flash Gordon
Similarity Score: 1.9292127100526804
Plot: In this update of the 1930s comic strip, Flash Gordon is a football hero who is skyjacked aboard Dr. Hans Zarkov's rocketship along with beautiful Dal...

--- Result 4 ---
Movie: The Hunley
Similarity Score: 1.9294807242778638
Plot: CSS Hunley tells the incredible true story of the crew of the manually propelled submarine CSS Hunley, during the siege of Charleston of 1864.