In [3]:
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_TYPE = "openai"

In [5]:
import os
from pymongo import MongoClient

# Connect to local MongoDB Atlas
mongo_uri = os.getenv("MONGODB_URI")
if not mongo_uri:
    raise ValueError("MONGODB_URI environment variable is not set")

client = MongoClient(mongo_uri)
db = client.get_database("hyper")

print(f"Connected to database: {db.name}")


Connected to database: hyper


In [26]:
from pymongo import MongoClient


class AtlasClient ():


   def __init__ (self, altas_uri, dbname):
       self.mongodb_client = MongoClient(altas_uri)
       self.database = self.mongodb_client[dbname]


   ## A quick way to test if we can connect to Atlas instance
   def ping (self):
       self.mongodb_client.admin.command('ping')


   def get_collection (self, collection_name):
       collection = self.database[collection_name]
       return collection


   def find (self, collection_name, filter = {}, limit=10):
       collection = self.database[collection_name]
       items = list(collection.find(filter=filter, limit=limit))
       return items


   # https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/
   def vector_search(self, collection_name, index_name, attr_name, embedding_vector, limit=5):
       collection = self.database[collection_name]
       results = collection.aggregate([
           {
               '$vectorSearch': {
                   "index": index_name,
                   "path": attr_name,
                   "queryVector": embedding_vector,
                   "numCandidates": 50,
                   "limit": limit,
               }
           },
           ## We are extracting 'vectorSearchScore' here
           ## columns with 1 are included, columns with 0 are excluded
           {
               "$project": {
                   '_id' : 1,
                   'title' : 1,
                   'plot' : 1,
                   'year' : 1,
                   "search_score": { "$meta": "vectorSearchScore" }
           }
           }
           ])
       return list(results)


   def close_connection(self):
       self.mongodb_client.close()

In [27]:
mongo_uri = os.getenv("MONGODB_URI")
atlas_client = AtlasClient (mongo_uri, 'hyper')
atlas_client.ping()
print ('Connected to Atlas instance! We are good to go!')

Connected to Atlas instance! We are good to go!


In [None]:
import json

# Load the Persons.json file
with open('../../data/norman/Persons.json', 'r') as file:
    persons_data = json.load(file)

# Print each element of Persons.json
for person in persons_data:
    print(json.dumps(person, indent=2))
    print()  # Add a blank line between each person for better readability


In [15]:
from openai import OpenAI
import os

# Set up OpenAI API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_embedding(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-large"  # Using the latest text embedding model
    )
    return response.data[0].embedding

# Process each person and add embeddings
for person in persons_data:
    # Create a string representation of the person
    person_text = f"{person['given_name']} {person['family_name']}, age {person['age']}, {person['relationship_to_master_node']}"
    
    # Get the embedding
    embedding = get_embedding(person_text)
    
    # Add the embedding to the person's data
    person['vector'] = embedding

# Print the updated data to verify
for person in persons_data:
    print(json.dumps({k: v for k, v in person.items() if k != 'vector'}, indent=2))
    print(f"Vector (first 5 elements): {person['vector'][:5]}...")
    print()

# Optionally, save the updated data back to the file
with open('../../data/norman/Persons_with_embeddings.json', 'w') as file:
    json.dump(persons_data, file, indent=2)

print("Embeddings have been added to each person and saved to Persons_with_embeddings.json")


CreateEmbeddingResponse(data=[Embedding(embedding=[-0.003772404044866562, 0.025475597009062767, -0.0044333962723612785, 0.006650094874203205, -0.03558403253555298, 0.013205240480601788, 0.0037468408700078726, -0.015878424048423767, 0.005291590932756662, 0.018113382160663605, -0.04630598425865173, -0.0017063464038074017, -0.01647733524441719, 0.008516211062669754, 0.0014443231048062444, -0.00856733787804842, -0.06333840638399124, -0.014330022968351841, -0.043822698295116425, -0.039002202451229095, 0.028587007895112038, 0.0011311734560877085, 0.01945727877318859, 0.04753302037715912, 0.043442901223897934, 0.036927927285432816, 0.028733083978295326, 0.017207713797688484, 0.028324071317911148, 0.018916798755526543, 0.0016570457955822349, 0.013351315632462502, -0.006737740244716406, -0.01032024621963501, -0.004407833330333233, -0.012124280445277691, -0.018610039725899696, -0.002262346912175417, -0.004378618206828833, -0.016900954768061638, 0.004276365041732788, 0.0007933734450489283, 0.0141

In [17]:
user_input = "recordame las edades de mis hijas"

In [20]:
# Query amplification using OpenAI completions
amplified_query = client.chat.completions.create(
    model="gpt-4o-2024-08-06",  # Using the closest available model as of my knowledge cutoff
    messages=[
        {"role": "system", "content": "You are a helpful assistant that expands user queries to improve search results using cosine distance in a vactor database using text embeddings"},
        {"role": "user", "content": f"Expand this query for better search results, no pre-text, only the query: {user_input}"}
    ],
    max_tokens=100
).choices[0].message.content

print(f"Amplified query: {amplified_query}")

Amplified query: edades de mis hijas, ¿cuántos años tienen mis hijas?, edades de mis niños, ¿qué edad tienen mis hijas?, edad de mis hijas, edades de mis hijas actuales, edad de mis niñas, recordar edades de mis hijas, edades de mis hijos, edades de mis hijas actualizada


In [21]:
query_embedding = get_embedding(amplified_query)


CreateEmbeddingResponse(data=[Embedding(embedding=[0.0015695224283263087, 0.024507243186235428, -0.00872765015810728, -0.001483700587414205, 0.015477033331990242, 0.01842116005718708, -0.03255995362997055, -0.004974760580807924, 0.028789609670639038, 0.021621299907565117, 0.023925399407744408, -0.036725953221321106, -0.02093472331762314, 0.016221793368458748, 0.0002612840326037258, 0.021807489916682243, -0.010385903529822826, 0.012253621593117714, 0.028114670887589455, -0.03744743764400482, 0.0071683102287352085, 0.02057398110628128, -0.02653205767273903, -0.001912810024805367, 0.014010787941515446, 0.017443664371967316, -0.009699328802525997, -0.0045965625904500484, -0.007860703393816948, -0.024065041914582253, -0.01637307181954384, -0.00029310359968803823, 0.021272193640470505, -0.024995990097522736, -0.0633511021733284, -0.014034061692655087, -0.013964240439236164, -0.005786432418972254, -0.0005542057915590703, 0.0032845058012753725, -0.008814927190542221, 0.05027126520872116, -0.02

In [23]:
persons_collection = db["Persons"]


In [32]:
# Perform vector search using Atlas client
results = atlas_client.vector_search(
    collection_name="Persons",
    index_name="default",
    attr_name='vector',
    embedding_vector=query_embedding,
    limit=3
)

print("Top 3 similar results:")
context = []
for result in results:
    # Retrieve the full document from the database
    full_doc = persons_collection.find_one({"_id": result["_id"]})
    
    # Extract relevant information
    given_name = full_doc.get("given_name", "N/A")
    relationship = full_doc.get("relationship_to_master_node", "N/A")
    age = full_doc.get("age", "N/A")
    
    context.append(f"{given_name} ({relationship}): {age} years old")
    
    print(f"Given Name: {given_name}")
    print(f"Relationship: {relationship}")
    print(f"Age: {age}")
    print(f"Score: {result.get('score', 'N/A')}")

# Use the context to answer the user's question
context_str = "\n".join(context)
final_answer = client.chat.completions.create(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that answers questions based on the given context."},
        {"role": "user", "content": f"Context:\n{context_str}\n\nQuestion: {user_input}\n\nAnswer the question based on the context provided."}
    ],
    max_tokens=150
).choices[0].message.content

print(f"\nFinal Answer: {final_answer}")






Top 3 similar results:
Given Name: Eva
Relationship: daughter
Age: 8
Score: N/A
Given Name: Mia
Relationship: daughter
Age: 8
Score: N/A
Given Name: Julia
Relationship: sons_girlfriend
Age: 10
Score: N/A

Final Answer: Eva y Mia tienen 8 años cada una.
