In [1]:
import openai
from dotenv import dotenv_values
# Read key/value settings from the local .env file so the API key is not hard-coded in the notebook.
config = dotenv_values('.env')
# Subscripting raises KeyError if OPENAI_API_KEY is absent from the loaded settings.
openai.api_key = config['OPENAI_API_KEY']

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import tiktoken
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [3]:
# Path to the movie-plot CSV, relative to the directory the notebook runs from.
dataset_path = "./movie_plots.csv"
# Load the whole CSV; later cells read its "Origin/Ethnicity", "Release Year",
# "Plot", "Title" and "Genre" columns.
df = pd.read_csv(dataset_path)

In [4]:
# Keep only American-origin films, order them newest first by release year,
# and retain at most the first 5000 rows of that ordering.
movies = (
    df.loc[df["Origin/Ethnicity"].eq("American")]
    .sort_values(by="Release Year", ascending=False)
    .iloc[:5000]
)

In [5]:
# Pull the "Plot" column out as an array so it can be iterated for token counting and embedding.
movie_plots = movies["Plot"].values

In [6]:
# Tokenizer that tiktoken associates with the embedding model; used below to count tokens.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [7]:
# Total number of tokens across all selected plots (input to the cost estimate).
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)

In [8]:
# Presumably the text-embedding-ada-002 price in USD per 1,000 tokens when this
# notebook was written -- NOTE(review): confirm against current OpenAI pricing.
ADA_002_USD_PER_1K_TOKENS = .0004

# Estimated spend for embedding every selected plot. (A bare `total_tokens`
# expression used to sit above this line; it displayed nothing because only a
# cell's last expression is shown, so it has been dropped.)
cost = total_tokens * (ADA_002_USD_PER_1K_TOKENS / 1000)
print(f"Estimated cost ${cost:.2f}")

Estimated cost $1.45


In [9]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding vector for ``text`` from the OpenAI API.

    The call is wrapped in a tenacity retry: randomized exponential waits
    (configured with min=1, max=20) between attempts, giving up after 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model to query.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"`` list.
    """
    # Newlines can negatively affect performance, so flatten them to spaces.
    single_line_text = " ".join(text.split("\n"))

    response = openai.Embedding.create(input=single_line_text, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [10]:
# Establish a cache of embeddings to avoid recomputing them.
# The cache is a dict mapping (text, model) tuples -> embedding, saved as a pickle file.

# set path to embedding cache
embedding_cache_path = "movie_embeddings_demo.pkl"


def _save_embedding_cache(cache, path):
    """Pickle ``cache`` to ``path`` without leaving a half-written file behind.

    The data is dumped to a sibling ``<path>.tmp`` file first and then moved over
    the real file with ``os.replace``. If the dump is interrupted, the previous
    cache file is left untouched instead of being truncated.
    """
    temp_path = f"{path}.tmp"
    with open(temp_path, "wb") as embedding_cache_file:
        pickle.dump(cache, embedding_cache_file)
    os.replace(temp_path, path)


# load the cache if it exists, and save a copy to disk
# NOTE: unpickling can execute arbitrary code -- only load cache files that this
# notebook (or you) created.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
_save_embedding_cache(embedding_cache, embedding_cache_path)


# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embeddings_from_string(
        string,
        model="text-embedding-ada-002",
        embedding_cache=embedding_cache
):
    """Return the embedding of a given string, using a cache to avoid recomputing.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Dict keyed by ``(string, model)``. The default is bound
            once, at definition time, to the module-level cache loaded above, so
            every call that omits it shares (and mutates) that same dict.

    Returns:
        The cached embedding for ``(string, model)``. On a cache miss the
        embedding is fetched with ``get_embedding``, stored in the dict, and the
        whole dict is re-pickled to ``embedding_cache_path``.
    """
    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPEN AI FOR {string[:20]}")
        # Persist after every new embedding so an interrupted run keeps its progress.
        _save_embedding_cache(embedding_cache, embedding_cache_path)
    return embedding_cache[cache_key]


In [11]:
# Embed every plot; plots already present in the cache are returned without an API call.
plot_embeddings = [embeddings_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]

In [21]:
# One {"Title": ..., "Genre": ...} dict per movie, in the same row order as `movies`.
data = movies.loc[:, ["Title", "Genre"]].to_dict(orient="records")

In [12]:
import sys
# sys.path
# import os 
# os.getcwd()
# sys.path.append('/home/akant/Projects/udemy-OpenAI/Embeddings_movies/venv-Embeddings/bin/nomic')


In [13]:
# Debugging aid: display the interpreter's module search path
# (presumably left over from troubleshooting the nomic import).
sys.path

['/home/akant/Projects/udemy-OpenAI/Embeddings_movies',
 '/home/akant/anaconda3/lib/python39.zip',
 '/home/akant/anaconda3/lib/python3.9',
 '/home/akant/anaconda3/lib/python3.9/lib-dynload',
 '',
 '/home/akant/anaconda3/lib/python3.9/site-packages']

In [16]:
# pip install nomic


Collecting nomic
  Using cached nomic-1.1.6-py3-none-any.whl
Collecting rich
  Using cached rich-13.3.5-py3-none-any.whl (238 kB)
Collecting pyarrow
  Using cached pyarrow-12.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.0 MB)
Collecting jsonlines
  Using cached jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Collecting loguru
  Using cached loguru-0.7.0-py3-none-any.whl (59 kB)
Collecting pydantic
  Using cached pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Collecting wonderwords
  Using cached wonderwords-2.2.0-py3-none-any.whl (44 kB)
Collecting cohere
  Using cached cohere-4.4.1-py3-none-any.whl (32 kB)
Collecting backoff<3.0,>=2.0
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting pygments<3.0.0,>=2.13.0
  Using cached Pygments-2.15.1-py3-none-any.whl (1.1 MB)
Collecting markdown-it-py<3.0.0,>=2.2.0
  Using cached markdown_it_py-2.2.0-py3-none-any.whl (84 kB)
Collecting mdurl~=0.1
  Using cached mdurl-0.1.2-py3-none-an

In [17]:
from nomic import atlas

In [22]:
# Send the plot embeddings to Nomic Atlas to build a map; `data` carries the
# Title/Genre record passed alongside the embeddings.
# np.array converts the list of embedding lists into one array -- presumably
# (n_movies, embedding_dim), assuming every embedding has the same length.
project = atlas.map_embeddings(
    embeddings=np.array(plot_embeddings),
    data=data
)

[32m2023-05-13 09:58:28.694[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `finicky-leave` in organization `akantunc`[0m
[32m2023-05-13 09:58:30.024[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
4it [00:02,  1.38it/s]                       
[32m2023-05-13 09:58:33.099[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-05-13 09:58:33.100[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-05-13 09:58:34.055[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `finicky-leave` in project `finicky-leave`: https://atlas.nomic.ai/map/4fa3dfe6-77cf-42d2-9407-69f11ba0e50e/bc6a0779-3931-447f-83d7-6491797b54a6[0m
[32m2023-05-13 09:58:34.056[0m | [1mINFO    