## imports

In [42]:
import os, json
import openai
import ollama
from dotenv import dotenv_values, load_dotenv
import pandas as pd
import numpy as np

## create clients

In [43]:
# Load settings from a local .env file before any client is constructed.
load_dotenv()
# No credentials are passed explicitly here -- presumably the OpenAI client
# reads its API key from the environment prepared above (TODO confirm).
client = openai.OpenAI()
# Ollama client built with default settings; used below to request embeddings.
embed_client = ollama.Client()

In [44]:
# response = client.embeddings.create(
#     input="Your text string goes here",
#     model="text-embedding-ada-002"
# )

# print(response.data[0].embedding)

## test free embedding models

In [45]:
# Smoke-test the embedding endpoint on a single word. The commented-out
# `model=` lines are alternative models that were tried; only one is active.
brian = embed_client.embed(
    # model='nomic-embed-text:latest',
    model='mxbai-embed-large:latest',
    # model='llama3.2',
    input='brian'
)

# Peek at the first 10 components of the first (and only) returned vector.
brian.embeddings[0][:10]

[0.0035244038,
 -0.027790122,
 -0.007375986,
 0.0015292541,
 -0.0186261,
 0.024424428,
 0.010697802,
 -0.006578716,
 0.04246506,
 0.0021017815]

In [46]:
# Number of components in the embedding vector returned for the test input.
len(brian.embeddings[0])

1024

## create dataset

In [47]:
# Folder holding the datasets, and the file names read / written by this notebook.
DATA_PATH = 'data'
INPUT_FILE = 'all-movie-plots.csv'
OUTPUT_FILE = 'movie-plots.csv'

# Full relative paths, always joined with a forward slash.
INPUT_PATH = '/'.join((DATA_PATH, INPUT_FILE))
OUTPUT_PATH = '/'.join((DATA_PATH, OUTPUT_FILE))

In [48]:
# Read the complete plots dataset from INPUT_PATH and preview the first rows.
all_movie_plots = pd.read_csv(INPUT_PATH)
all_movie_plots.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\r\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading ""His Photographer"" and ""His Press Agent"" respectively, follow him into the shot; the photographer sets up his camera. ""Teddy"" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. ""Teddy"" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. ""Teddy"" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."


In [49]:
# Keep only the rows whose 'Origin/Ethnicity' is exactly 'American'.
is_american = all_movie_plots['Origin/Ethnicity'] == 'American'
american_movie_plots = all_movie_plots.loc[is_american]
american_movie_plots.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\r\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading ""His Photographer"" and ""His Press Agent"" respectively, follow him into the shot; the photographer sets up his camera. ""Teddy"" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. ""Teddy"" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. ""Teddy"" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."


In [50]:
# Restrict to films released in 1970 or later and list the newest first.
released_since_1970 = american_movie_plots['Release Year'] >= 1970
recent_american_movie_plots = (
    american_movie_plots
    .loc[released_since_1970]
    .sort_values(by='Release Year', ascending=False)
)
recent_american_movie_plots.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
17376,2017,Phantom Thread,American,Paul Thomas Anderson,"Paul Thomas Anderson (director/screenplay); Daniel Day-Lewis, Lesley Manville, Richard Graham, Vicky Krieps",drama,https://en.wikipedia.org/wiki/Phantom_Thread,"In 1954 London, renowned fashion designer Reynolds Woodcock creates dresses for members of high society. His charisma and genius are matched by his obsessive, controlling and sometimes abusive personality. Cyril, his sister, manages the day-to-day operations of his fashion house and has significant influence over his life. Reynolds is haunted by the death of their mother, and stitches hidden messages into the linings of the dresses he makes.\r\nAfter designing a new gown for a revered client, Reynolds visits a restaurant in the countryside and becomes interested in a waitress, Alma. He asks her on a date, and she accepts. Their relationship develops, and she moves in with him, becoming his assistant, muse and lover. Cyril initially distrusts Alma but comes to respect her willfulness and determination.\r\nAt first, Alma enjoys being a part of Reynolds' work, but he proves aloof and hard to please, and they bicker. When Alma tries to surprise him with a romantic dinner, Reynolds lash..."
17243,2017,"Everything, Everything",American,Stella Meghie,"Stella Meghie (director); J. Mills Goodloe (screenplay); Amandla Stenberg, Nick Robinson, Anika Noni Rose, Ana de la Reguera, Danube Hermosillo","romance, drama","https://en.wikipedia.org/wiki/Everything,_Everything_(film)","Eighteen-year-old Maddy suffers from SCID, an immuno-deficiency disease that prevents her from leaving her home and interacting with others. Her mother, Pauline Whittier, takes care of her with the help of her nurse Carla, who has taken care of Madeline for 15 years. Only Pauline, Carla and Carla's daughter, Rosa, are allowed in the home. Maddy yearns to see the world, particularly the ocean.\r\nOne day, a new family moves next door, and their son, who is Maddy's age, catches her eye. They share a look as Maddy watches through the window. Later that night, while Pauline and Maddy are watching a movie, the boy and his sister appear on their doorstep, offering a bundt cake. Pauline politely rejects it, and as she's about to close the door, the boy asks where her daughter is. Pauline lies and tells him Maddy is not home. It is also revealed that the father of the boy is violent and their relationship is strained.\r\nLater, the boy writes his number on his window for Maddy and soon the..."
17241,2017,Alien: Covenant,American,Ridley Scott,"Ridley Scott (director); Michael Green, Jack Paglen (screenplay); Michael Fassbender, Katherine Waterston, Demián Bichir, Billy Crudup, Danny McBride, Jussie Smollett, Amy Seimetz, Carmen Ejogo, Callie Hernandez, Alex England, James Franco","sci-fi, horror",https://en.wikipedia.org/wiki/Alien:_Covenant,"In a prologue, business magnate Peter Weyland speaks with his newly activated android, who chooses the name ""David"" after observing Michelangelo’s statue of David. Weyland tells David that one day they will search for mankind's creator together. David comments on his own unlimited lifespan compared to his creator's limited one, which unsettles Weyland.\r\nIn 2104, 11 years after the Prometheus expedition, the colonization ship Covenant is bound for remote planet Origae-6, with two thousand colonists in stasis and 1,140 human embryos aboard. The ship is monitored by Walter, a newer android model that physically resembles David. A stellar burst damages the ship, killing 47 colonists. Walter orders the ship's computer to wake the crew of 14, which includes several married couples. The ship's captain, Jake Branson, dies when his stasis pod malfunctions. While repairing the ship, the crew picks up a radio transmission of a human voice from a nearby (two weeks’ travel) planet, which appe..."
17240,2017,Paris Can Wait,American,Eleanor Coppola,"Eleanor Coppola (director/screenplay); Diane Lane, Alec Baldwin, Arnaud Viard","comedy, romance",https://en.wikipedia.org/wiki/Paris_Can_Wait,"Anne (Diane Lane) is in Cannes with her husband Michael (Alec Baldwin), a prominent movie producer. As the festival ends she learns that the vacation she and her husband were supposed to go on in Paris will be slightly delayed as they need to go to Budapest first. They plan to fly to Paris, but the pilot suggests Anne not fly due to an ear infection. Michael's producing partner Jacques (Arnaud Viard) offers to drive Anne to Paris himself.\r\nWhat is supposed to be a short car ride quickly devolves into a pleasant leisurely trip as Jacques, a French foodie, can't resist taking any opportunity he can to stop every hour or so to sample new food. He is also openly flirtatious with Anne but she begins to question his intentions when he repeatedly uses her credit card to foot the bill for the gourmet meals they are sampling. They visit a church where Anne grieves the baby she lost, and tells Jacques she wears her locket necklace in his honor. They share a romantic dinner together where J..."
17239,2017,The Wall,American,Doug Liman,"Doug Liman (director); Dwain Worrell (screenplay); Aaron Taylor-Johnson, John Cena","drama, thriller",https://en.wikipedia.org/wiki/The_Wall_(2017_film),"During the Iraq War, U.S. Army Staff Sergeant Shane Matthews (John Cena) is a sniper who is sent to investigate a pipeline construction site in the desert of the country, with his spotter, Sergeant Allen Isaac (Aaron Taylor-Johnson).\r\nThe pair patiently wait 22 hours on overwatch before determining that the site is clear. Matthews proceeds to investigate the site, but is shot by an Iraqi sniper. Isaac tries to rescue the dying Matthews, but he is also wounded in the right knee and has his radio damaged and his water bottle destroyed in the process.\r\nAlone, Isaac takes cover behind an unsteady wall and tends to his wounds. The sniper has a radio tuned into the American channel, and uses it to communicate with Isaac under the pretense of being a high ranking allied soldier at another site. The deception allows the sniper to get other useful information from Isaac. Throughout their various one-sided attempts at conversation, we learn that the sniper does not claim to the mythical ..."


In [51]:
# First 5000 rows of the year-sorted frame, then non-null counts per column.
recent_american_movies_sample = recent_american_movie_plots.head(5000)
recent_american_movies_sample.count()


Release Year        5000
Title               5000
Origin/Ethnicity    5000
Director            5000
Cast                4941
Genre               5000
Wiki Page           5000
Plot                5000
dtype: int64

In [52]:
# NOTE(review): this aliases the full post-1970 frame; the 5000-row
# `recent_american_movies_sample` built in the previous cell is not used
# here -- confirm which of the two was meant to be exported.
movie_plots = recent_american_movie_plots
# Persist the working set to OUTPUT_PATH (default to_csv settings).
movie_plots.to_csv(OUTPUT_PATH)

## create embedding function

### retry decorator

In [53]:
from random import randint
from ollama import EmbedResponse
import time

def retry(wait:float=2, max_retries:int=5):
    """Build a decorator that re-invokes a function until it stops raising.

    The decorated function is called once and, if it raises, called again up
    to ``max_retries`` more times, sleeping ``wait`` seconds between attempts.

    Args:
        wait: Seconds to sleep between attempts (fractions are allowed).
        max_retries: Number of additional attempts after the first failure.

    Returns:
        A decorator. The wrapped callable returns the function's result on
        the first successful attempt, or ``None`` once all attempts failed.

    Only ``Exception`` subclasses trigger a retry; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so a running cell can still be interrupted.
    """
    import functools  # local import keeps this definition self-contained

    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            attempts = 0
            while True:
                try:
                    return func(*args, **kwargs)
                except Exception as error:
                    print(f'function call failed, attempts: {attempts}/{max_retries}')
                    # Surface the cause instead of discarding it silently.
                    print(f'error: {error!r}')
                    if attempts >= max_retries:
                        print('giving up')
                        # Best-effort contract: callers get None, not an exception.
                        return None
                    attempts += 1
                    print(f'waiting {wait} seconds...')
                    time.sleep(wait)
        return wrapper
    return decorator

@retry(wait=.5)
def test_retry(attempts:int=0, max_retries=5):
    """Smoke test for ``retry``: raises on a random half of the calls.

    The ``attempts`` and ``max_retries`` parameters are accepted but unused.
    """
    coin = randint(0, 1)  # 0 -> succeed, 1 -> fail
    if coin:
        raise Exception

test_retry()


### get_embedding

In [54]:
EMBEDDING_MODEL = 'mxbai-embed-large:latest'

@retry(wait=2, max_retries=5)
def get_embedding(text:str, model:str=EMBEDDING_MODEL) -> list[float]:
    """Embed ``text`` with the Ollama client and return the vector.

    Args:
        text: Text to embed; newline characters are replaced by spaces first.
        model: Name of the embedding model to request.

    Returns:
        The first embedding of the response, i.e. the vector for ``text``
        (not the response object itself). Because of the ``retry`` wrapper
        the call yields ``None`` when every attempt failed.
    """
    text = text.replace('\n', ' ')
    embedding = embed_client.embed(input=text, model=model)
    # A single input was sent, so the response holds exactly one vector of interest.
    return embedding.embeddings[0]

get_embedding('this is a story about five friends')[:10]

[0.049320336,
 -0.006095786,
 0.005403328,
 0.05819033,
 0.016559888,
 -0.049261045,
 -0.022237664,
 0.0150083145,
 0.027820827,
 -0.016057922]

## cache management

In [55]:
from pathlib import Path
import pickle


CACHE_PATH = f'{DATA_PATH}/cache.pkl'


def reset_cache(cache_path:str=CACHE_PATH):
    """Delete the cache file at ``cache_path``; a missing file is not an error."""
    print(f'deleting {cache_path}')
    cache_file = Path(cache_path)
    cache_file.unlink(missing_ok=True)


def save_cache(cache:dict, cache_path:str=CACHE_PATH, reset:bool=False):
    """Pickle ``cache`` to ``cache_path``, overwriting any existing file.

    Args:
        cache: Dictionary to persist.
        cache_path: Destination file.
        reset: When true, delete the existing file via ``reset_cache`` first.
    """
    if reset:
        reset_cache(cache_path)

    print(f'saving cache to {cache_path}')
    with Path(cache_path).open('wb') as sink:
        pickle.dump(cache, sink)


def get_cache(cache_path:str=CACHE_PATH, reset:bool=False) -> dict:
    """Load the pickled cache from ``cache_path``, creating it when absent.

    Args:
        cache_path: File to read the cache from.
        reset: When true, delete the file first so an empty cache is returned.

    Returns:
        The unpickled cache, or a new empty dict (also written to
        ``cache_path``) when the file does not exist.
    """
    if reset:
        reset_cache(cache_path)
    try:
        print(f'loading {cache_path}...')
        # NOTE: unpickling can run arbitrary code -- only load cache files
        # that were written by this notebook.
        cache = pd.read_pickle(cache_path)
    except FileNotFoundError:
        print('failed to load')
        cache = {}
        # Create the file at the requested path; previously the default
        # path was always written, regardless of `cache_path`.
        save_cache(cache, cache_path)
    return cache



In [58]:
# reset=True makes get_cache delete any existing cache file first, so every
# run of this cell starts from an empty cache.
# NOTE(review): use reset=False to keep embeddings across runs -- confirm intent.
CACHE = get_cache(reset=True)

def embedding_from_text(text:str, model:str=EMBEDDING_MODEL, cache:dict=CACHE, **extras) -> dict:
    """Return the cache entry for ``(text, model)``, computing it on a miss.

    Args:
        text: Text to embed; together with ``model`` it forms the cache key.
        model: Embedding model name passed to ``get_embedding``.
        cache: Dictionary used as the cache. The default is the ``CACHE``
            object that existed when this function was defined.
        **extras: Extra fields stored next to the embedding (e.g. ``title``).

    Returns:
        A dict holding ``extras`` plus an ``'embedding'`` key. The embedding
        is ``None`` when ``get_embedding`` gave up; such an entry is returned
        but neither cached nor saved, so a later call tries again.
    """
    key = (text, model)
    cached = cache.get(key)
    if cached is not None:
        return cached

    entry = dict(extras)
    entry['embedding'] = get_embedding(text, model)
    if entry['embedding'] is None:
        # Do not persist a failed lookup; it would otherwise be served forever.
        return entry

    cache[key] = entry
    # save_cache is called without a path, so it writes to its default file.
    save_cache(cache)
    return entry


def embedding_from_title(title:str, movies:pd.DataFrame, model=EMBEDDING_MODEL, cache:dict=CACHE):
    """Look up a movie by title (case-insensitive) and embed its plot.

    Args:
        title: Title to search for in ``movies['Title']``.
        movies: Frame with at least ``'Title'`` and ``'Plot'`` columns.
        model: Embedding model name, forwarded to ``embedding_from_text``.
        cache: Cache dictionary, forwarded to ``embedding_from_text``.

    Returns:
        The cache entry produced by ``embedding_from_text`` for the first
        matching row (stored with the frame's spelling of the title), or
        ``None`` after printing a message when the lookup fails.
    """
    try:
        matches = movies.loc[movies['Title'].str.lower() == title.lower()]
        # .iloc[0] raises IndexError when no row matched the title.
        stored_title = matches['Title'].iloc[0]
        plot = matches['Plot'].iloc[0]
        # Forward model and cache; they were previously accepted but ignored.
        return embedding_from_text(plot, model=model, cache=cache, title=stored_title)
    except (IndexError, KeyError):
        print(f'movie {title} not found')
        return None



# Raise pandas' display limits (a width of 200 was tried earlier; 50 was the
# value noted for max_colwidth).
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

# Populate the cache with two titles, written in lower case on purpose.
embedding_from_title('phantom thread', movie_plots)
embedding_from_title('stargate', movie_plots)

# Show the title stored with every cached entry.
for entry in CACHE.values():
    print(entry['title'])



deleting data/cache.pkl
loading data/cache.pkl...
failed to load
saving cache to data/cache.pkl
saving cache to data/cache.pkl
saving cache to data/cache.pkl
Phantom Thread
Stargate
