## Requirements

In [None]:
!pip install pandas numpy tenacity

## Initialize the OpenAI client

In [None]:
from openai import OpenAI

# The client talks to an OpenAI-compatible endpoint on this machine (port 1234)
# instead of the hosted API.
# NOTE(review): the api_key looks like a placeholder for a local server, not a
# real secret - confirm before pointing base_url at anything remote.
client = OpenAI(api_key="lm-studio", base_url="http://localhost:1234/v1")

## Load CSV data

In [None]:
import pandas as pd

# Path is relative to the directory the notebook is run from.
dataset_path = "./movie_plots.csv"
# Read the whole CSV into a DataFrame that the following cells filter.
df = pd.read_csv(filepath_or_buffer=dataset_path)

## Filter to a sample of the 1,000 most recent American movies

In [16]:
# Keep only American productions, order them newest-first by release year,
# and take the first 1000 rows of that ordering as the working sample.
movie = (
    df.loc[df["Origin/Ethnicity"] == "American"]
    .sort_values(by="Release Year", ascending=False)
    .head(n=1000)
)

## Preview the sample, then define an embedding helper that Tenacity retries on errors such as rate limits

In [17]:
# Display the sampled frame; the rendered output elides the middle rows of a long frame.
movie

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
17271,2017,2:22,American,Paul Currie,"Paul Currie (director); Nathan Parker, Todd St...","drama, thriller",https://en.wikipedia.org/wiki/2:22_(2017_film),The movie starts with Dylan Branson (Michiel H...
17375,2017,Molly's Game,American,Aaron Sorkin,Aaron Sorkin (director/screenplay); Jessica Ch...,"biography, crime, drama",https://en.wikipedia.org/wiki/Molly%27s_Game,Molly Bloom is world-class mogul skier with Ol...
17343,2017,Lady Bird,American,Greta Gerwig,Greta Gerwig (director/screenplay); Saoirse Ro...,comedy-drama,https://en.wikipedia.org/wiki/Lady_Bird_(film),"Christine ""Lady Bird"" McPherson is a senior st..."
17325,2017,Professor Marston and the Wonder Women,American,Angela Robinson,Angela Robinson (director/screenplay); Luke Ev...,"biography, drama",https://en.wikipedia.org/wiki/Professor_Marsto...,The story is told in flashbacks set during a 1...
17352,2017,Mr. Roosevelt,American,Noël Wells,"Noël Wells (director/screenplay); Noël Wells, ...",comedy,https://en.wikipedia.org/wiki/Mr._Roosevelt,Emily Martin (Noël Wells) returns to her homet...
...,...,...,...,...,...,...,...,...
16203,2012,Black Rock,American,Katie Aselton,"Katie Aselton, Lake Bell, Kate Bosworth, Will ...",horror,https://en.wikipedia.org/wiki/Black_Rock_(2012...,Sarah (Kate Bosworth) invites her childhood fr...
16204,2012,Border Run,American,Gabriela Tagliavini,"Sharon Stone, Billy Zane, Manolo Cardona, Rose...",drama,https://en.wikipedia.org/wiki/Border_Run,"Stone portrays journalist Sofie Talbert, a har..."
16205,2012,"Bourne Legacy, TheThe Bourne Legacy",American,Tony Gilroy,"Jeremy Renner, Rachel Weisz, Edward Norton, Jo...","action, spy film",https://en.wikipedia.org/wiki/The_Bourne_Legac...,Six weeks after Jason Bourne's (Matt Damon) es...
16190,2012,Beasts of the Southern Wild,American,Benh Zeitlin,"Quvenzhané Wallis, Dwight Henry, Jonshel Alexa...","drama, fantasy",https://en.wikipedia.org/wiki/Beasts_of_the_So...,As a storm approaches a southern Louisiana bay...


In [None]:
from tenacity import retry, wait_random_exponential, stop_after_attempt

@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def get_embedding(text, model="nomic-ai/nomic-embed-text-v1.5-GGUF"):
    """Return the embedding vector for ``text`` from the endpoint behind ``client``.

    The call is wrapped by tenacity: a failing attempt is retried with a
    randomised exponential wait (bounded by ``min=1`` and ``max=20``), and the
    decorator gives up after 6 attempts.

    Args:
        text: The string to embed. Newlines are replaced by spaces first.
        model: Model identifier forwarded to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    # Newlines can negatively affect embedding quality, so flatten them to spaces.
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [None]:
# Smoke test: sends one real request through `client`, so the endpoint it points at must be reachable.
get_embedding("Your text here")

## Set up the embedding cache

In [None]:
import pickle

embedding_cache_path = "movie_embeddings_cache.pkl"


def _save_embedding_cache(cache, path):
    """Pickle ``cache`` to ``path`` without leaving a half-written file there.

    The data is first written to ``<path>.tmp`` and then moved over ``path``
    with ``os.replace``. If pickling fails or the cell is interrupted, the
    temp file is removed and whatever was already stored at ``path`` is left
    untouched.
    """
    import os  # local import keeps this cell independent of other cells' imports

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "wb") as cache_file:
            pickle.dump(cache, cache_file)
        os.replace(tmp_path, path)
    except BaseException:
        # BaseException so that KeyboardInterrupt is covered too: drop the
        # partial temp file, keep the previous cache file, and re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


# NOTE: unpickling can execute arbitrary code, so only load cache files that
# were produced by this notebook.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
# Written unconditionally so the cache file exists once this cell has run.
_save_embedding_cache(embedding_cache, embedding_cache_path)


def embedding_from_string(
        string,
        model="nomic-ai/nomic-embed-text-v1.5-GGUF",
        embedding_cache=embedding_cache
):
    """Return the embedding of ``string``, computing and caching it on a miss.

    Args:
        string: Text to embed.
        model: Model identifier; it is part of the cache key, so the same text
            embedded under two model names is stored twice.
        embedding_cache: Mapping from ``(string, model)`` to an embedding.
            Defaults to the module-level cache loaded above and is mutated
            in place.

    Returns:
        The cached embedding for ``(string, model)``.

    On a cache miss the embedding is fetched with ``get_embedding``, stored in
    ``embedding_cache``, and the whole cache is written to
    ``embedding_cache_path`` so progress survives a kernel restart.
    """
    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        _save_embedding_cache(embedding_cache, embedding_cache_path)
    return embedding_cache[cache_key]

In [None]:
# Embed every plot in the sample; plots already in the cache are not requested again.
# The default model of embedding_from_string is used on purpose: the cache is keyed
# by (text, model), so passing a differently spelled model name here would store
# entries that default-model lookups can never hit.
plot_embeddings = [embedding_from_string(plot) for plot in movie["Plot"].values]


In [None]:
# Summarise instead of dumping every vector into the notebook output:
# (number of embeddings, length of the first embedding).
len(plot_embeddings), (len(plot_embeddings[0]) if plot_embeddings else 0)

In [None]:
# One {"Title": ..., "Genre": ...} dict per sampled movie, in the same row order as `movie`.
data = movie.loc[:, ["Title", "Genre"]].to_dict(orient="records")

In [None]:
# Show only the first few records rather than the whole list.
data[:5]