git clone https://github.com/OperationalizingAI/Hackathon-2-22-24.git

Copy requirements.txt, AtlasClient.py, and OpenAIClient.py into your root directory

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth
!pip install -r requirements.txt

In [None]:
import os

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive

In [None]:
def load_secrets(secrets_name, project_id, version="latest"):
  """Read one secret from Google Cloud Secret Manager and return it as text.

  Authenticates the current Colab user via `google.colab.auth`, creates a
  Secret Manager client, and accesses the requested version of the secret.

  Args:
    secrets_name: ID of the secret to read (a single secret, despite the
      plural parameter name, which is kept for existing callers).
    project_id: ID of the Google Cloud project that owns the secret.
    version: Version of the secret to access. Defaults to "latest", which is
      what this function always used before the parameter existed.

  Returns:
    The secret payload decoded as a UTF-8 string.
  """
  # Credentials must be in place before the client below is used.
  auth.authenticate_user()
  client = secretmanager.SecretManagerServiceClient()
  # Fully qualified resource name of the secret version to read.
  resource_name = f"projects/{project_id}/secrets/{secrets_name}/versions/{version}"
  response = client.access_secret_version(request={"name": resource_name})
  return response.payload.data.decode('UTF-8')

In [None]:
# Google Cloud project whose Secret Manager holds the credentials read below.
project_id = "botchagalupep1"

# OpenAI key: kept in a variable and also exported through the environment.
openai_api_key = load_secrets(secrets_name="openai_api_key", project_id=project_id)
os.environ["OPENAI_API_KEY"] = openai_api_key

# MongoDB Atlas connection string.
# ("mdb_uri" was a commented-out alternative secret name at this spot.)
MONGODB_ATLAS_CLUSTER_URI = load_secrets(secrets_name="MDB_CLUSTER0_URI", project_id=project_id)

# LangSmith key. None of these values are printed: cell output is saved with
# the notebook and would expose them.
langsmith_api_key = load_secrets(secrets_name="langsmith_api_key", project_id=project_id)

In [None]:
# Names of the Atlas database, collection, and vector search index that the
# client and search cells of this notebook refer to.
DB_NAME = "sample_mflix"
COLLECTION_NAME = "embedded_movies"
INDEX_NAME = "idx_plot_embedding"

In [None]:
from AtlasClient import AtlasClient

# Build the project-local Atlas wrapper for the configured database. The
# message is printed as soon as the constructor returns; no separate
# connectivity check is performed in this cell.
atlas_client = AtlasClient(MONGODB_ATLAS_CLUSTER_URI, DB_NAME)
print("Connected to the Mongo Atlas database!")

Connected to the Mongo Atlas database!


In [None]:
from OpenAIClient import OpenAIClient

# Reset the name first so that, if the constructor raises, it holds None
# rather than a client left over from an earlier run of this cell.
openAI_client = None
openAI_client = OpenAIClient(api_key=openai_api_key)

print("OpenAI client initialized")

OpenAI client initialized


Add an Atlas Vector Search index to the `embedded_movies` collection (index name: `idx_plot_embedding`), using the following definition:
```
{
  "fields": [
    {
      "type": "vector",
      "path": "plot_embedding",
      "numDimensions": 1536,
      "similarity": "euclidean"
    }
  ]
}
```

In [None]:
# Sample natural-language searches; each one is embedded and cached further down.
queries = [
    "fatalistic sci-fi movies",
    "humans fighting aliens",
    "futuristic christmas movies",
    "sci-fi story with a friendly alien",
    "relationship drama between two good friends",
    "college graduates working in a big city discover new relationships",
    "household pets get lost but go on a long journey to find home",
]

In [None]:
# Maps each sample query to the embedding vector returned for it.
embeddings = {}

for query in queries:
    embedding = openAI_client.get_embedding(query, model='text-embedding-ada-002')

    # Report the vector length and a short preview instead of dumping the whole vector.
    summary = f"Embedding for query='{query}', embeddding_length={len(embedding)}, printing first few numbers... :\n"
    print(summary, embedding[:10])

    embeddings[query] = embedding

In [None]:
import json

# Persist the query -> embedding mapping so later runs can reuse it.
# json.dump streams straight into the file; the previous version stored the
# serialized text in a variable named `str`, which shadowed the builtin `str`
# for every cell executed afterwards.
with open("embeddings_openai.json", "w") as f:
    json.dump(embeddings, f)

print ("saved to : 'embeddings_openai.json'")

saved to : 'embeddings_openai.json'


In [None]:
import os
import json

# Query text -> embedding vector, filled from the cache file when it exists.
cached_embeddings = {}
cached_embedding_file = 'embeddings_openai.json'

# A missing cache file is not an error: the mapping simply stays empty.
if os.path.exists(cached_embedding_file):
    with open(cached_embedding_file, "r", encoding="utf-8") as f:
        # Parse directly from the file handle; the previous version read the
        # text into a variable named `str`, shadowing the builtin `str`.
        cached_embeddings = json.load(f)

print ("Loaded the following cached embeddings...")
for query in cached_embeddings:
    print (f'- {query}')

Loaded the following cached embeddings...
- fatalistic sci-fi movies
- humans fighting aliens
- futuristic christmas movies
- sci-fi story with a friendly alien
- relationship drama between two good friends
- college graduates working in a big city discover new relationships
- household pets get lost but go on a long journey to find home


In [None]:
import time

# Handy function
def do_vector_search(query: str, limit: int = 10) -> None:
    """Run a vector search over the movie collection and print the matches.

    The query is lower-cased and stripped, then turned into an embedding:
    taken from `cached_embeddings` when the normalized text is a key there,
    otherwise requested from `openAI_client`. The embedding is passed to
    `atlas_client.vector_search` against `COLLECTION_NAME` / `INDEX_NAME` on
    the `plot_embedding` attribute, and each returned movie is printed.

    Args:
        query: Free-text description of the movies to look for.
        limit: Maximum number of results requested from Atlas (default 10,
            the value this function previously hard-coded).

    Returns:
        None. Results and timings are written to standard output.
    """
    # The cached query embeddings in this notebook are generated with this
    # model, so the fallback request names it explicitly instead of relying on
    # whatever default OpenAIClient.get_embedding uses.
    # NOTE(review): presumably the stored plot_embedding vectors use the same
    # model (the index is defined with 1536 dimensions) — confirm.
    embedding_model = 'text-embedding-ada-002'

    query = query.lower().strip()
    print ('query: ', query)

    if query in cached_embeddings:
        print ("using cached embeddings")
        embedding = cached_embeddings[query]
    else:
        embed_start = time.perf_counter()
        embedding = openAI_client.get_embedding(query, model=embedding_model)
        embed_ms = (time.perf_counter() - embed_start) * 1000
        print (f"Getting embeddings from OpenAI took {embed_ms:,.0f} ms")

    search_start = time.perf_counter()
    movies = atlas_client.vector_search(
        collection_name=COLLECTION_NAME,
        index_name=INDEX_NAME,
        attr_name='plot_embedding',
        embedding_vector=embedding,
        limit=limit,
    )
    search_ms = (time.perf_counter() - search_start) * 1000

    print (f"Altas query returned {len (movies)} movies in {search_ms:,.0f} ms")
    print()

    # Results are numbered from 1 in the printed listing.
    for rank, movie in enumerate(movies, start=1):
        print(f'{rank}\nid: {movie["_id"]}\ntitle: {movie["title"]}' +
            f'\nsearch_score(meta):{movie["search_score"]}\nplot: {movie["plot"]}\n')

In [None]:
# Search using one of the sample queries from the `queries` list.
query = "humans fighting aliens"
do_vector_search(query=query)

In [None]:
# Search using another sample query from the `queries` list.
query = "fatalistic sci-fi movies"
do_vector_search(query=query)