# Ground Truth Dataset

Lets create a ground truth dataset. Lets create the first few examples by hand and then use the LLM to generate the rest.


In [1]:
# Reload
%load_ext autoreload
%autoreload 2



In [2]:
%pip install -qU langchain-openai langchain-core

Note: you may need to restart the kernel to use updated packages.


In [3]:
from utils.data_collection import load_df

df = load_df("../raw/top_50000.pkl")

Loading df


In [4]:
df.head()

Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
0,Inception,8.369,36269.0,Released,2010-07-15,825532800.0,148.0,160000000.0,tt1375666,en,...,"Leonardo DiCaprio, Tim Kelleher, Jack Gilroy, ...",Christopher Nolan,Wally Pfister,Christopher Nolan,"Thomas Tull, Yoshikuni Taki, Christopher Nolan...",Hans Zimmer,8.8,2593927.0,"[-0.007635640446096659, -0.004168331157416105,...",1.0
1,Interstellar,8.44,35193.0,Released,2014-11-05,701729200.0,169.0,165000000.0,tt0816692,en,...,"Russ Fega, Liam Dickinson, Jeff Hephner, Brook...",Christopher Nolan,Hoyte van Hoytema,"Jonathan Nolan, Christopher Nolan","Jake Myers, Jordan Goldberg, Lynda Obst, Chris...",Hans Zimmer,8.7,2159078.0,"[-0.0350022129714489, 0.023837948217988014, -0...",1.0
2,The Dark Knight,8.516,32550.0,Released,2008-07-16,1004558000.0,152.0,185000000.0,tt0468569,en,...,"Matthew O'Neill, Nestor Carbonell, Morgan Free...",Christopher Nolan,Wally Pfister,"David S. Goyer, Jonathan Nolan, Bob Kane, Chri...","Kevin De La Noy, Michael Uslan, Thomas Tull, C...","Hans Zimmer, James Newton Howard",9.0,2921100.0,"[0.002534319879487157, -0.03606430068612099, -...",1.0
3,Avatar,7.582,31265.0,Released,2009-12-15,2923706000.0,162.0,237000000.0,tt0499549,en,...,"Jason Whyte, Scott Lawrence, Brandon Jelkes, J...",James Cameron,Mauro Fiore,James Cameron,"Jon Landau, Colin Wilson, Laeta Kalogridis, Ja...",James Horner,7.9,1402405.0,"[-0.028900334611535072, -0.0029624924063682556...",1.0
4,Deadpool,7.621,30696.0,Released,2016-02-09,782837300.0,108.0,58000000.0,tt1431045,en,...,"Ryan Reynolds, Gina Carano, Kayla Adams, Randa...",Tim Miller,Ken Seng,"Paul Wernick, Fabian Nicieza, Rob Liefeld, Rhe...","Ryan Reynolds, Jonathon Komack Martin, Paul We...",Tom Holkenborg,8.0,1167674.0,"[-0.05629737302660942, 0.005468115210533142, -...",1.0


In [5]:
questions = [
    {"query": "What is the title of the movie with the highest rating?",
        "splits": ["relative"]},
    {"query": "What is the title of the movie with the lowest rating?",
        "splits": ["relative"]},
    {"query": "What are the top 3 movies with the highest ratings?",
        "splits": ["relative"]},
    {"query": "What are the top 5 movies with the highest ratings?",
        "splits": ["relative"]},
    {"query": "What are the top 10 movies with the highest rating?",
        "splits": ["relative"]},
    {"query": "What are the bottom 10 movies by rating?",
        "splits": ["relative"]},
    {"query": "What are the bottom 5 movies by rating?",
        "splits": ["relative"]},

    # Vague search
    # Content tag = Requires the agent to understand the content of the movie
    {"query": "What is that one movie about a rat helping a chef cook?",
        "splits": ["content"]},
    {"query": "What are some good movies about a woman who can talk to animals?",
        "splits": ["content"]},
    {"query": "What are some good christmas movies about the raindeer?",
        "splits": ["content"]},
    {"query": "What are some good military movies about the war in vietnam?",
        "splits": ["content"]},
    {"query": "What are some good conspiracy theory documentaries about the moon landing?",
        "splits": ["content"]},


    # Specifics
    {"query": "What is the movie with the highest rating that is not a comedy?",
        "splits": ["specifics"]},
    {"query": "What is the lowest rated movie that Will Farrell is in?",
        "splits": ["specifics"]},
    {"query": "What are all of the movies Tom Hanks was in?",
        "splits": ["specifics"]},
    {"query": "What is that movie where Edward Norton has multiple personalities?",
        "splits": ["specifics"]},
    {"query": "Which movie in Italian has the most ratings?",
        "splits": ["specifics"]},
    {"query": "What is the lowest rated movie that Will Farrell is in?",
        "splits": ["specifics"]},
    {"query": "What are all of the movies Tom Hanks was in?",
        "splits": ["specifics"]},
    {"query": "What is that movie where Edward Norton has multiple personalities?",
        "splits": ["specifics"]},
    {"query": "What has James Cameron directed?",
        "splits": ["specifics"]},
    {"query": "What was the most popular movie in 2000?",
        "splits": ["specifics"]},
    {"query": "What is the movie with the most ratings in the year 2000?",
        "splits": ["specifics"]},

    # Combination
    {"query": "Who is the director of the 3rd lowest revenue movie?",
        "splits": ["combination", "hard"]},
    {"query": "What is the most recent movie Horror movie that James Cameron directed?",
        "splits": ["combination", "hard"]},
    {"query": "How much revenue did Christopher Nolan make in his first 2 movies?",
        "splits": ["combination", "hard"]},
]

Thats a good start!

In [9]:
from utils.langgraph import create_agent

agent = create_agent()

Loading df


Lets create all of the traces

In [11]:
from langchain_core.messages import HumanMessage

for question in questions:
    print(question)
    messages = [HumanMessage(content=question["query"])]
    response = agent.invoke({"messages": messages})
    print(response["messages"][-1].content)


{'query': 'What is the title of the movie with the highest rating?', 'splits': ['relative']}
Got query embeddings
The movie with the highest rating is **Cinema Paradiso** (1988), which has a rating of **8.5**. 

For more information, you can visit the [IMDb page](https://www.imdb.com/title/tt0095765).
{'query': 'What is the title of the movie with the lowest rating?', 'splits': ['relative']}
Got query embeddings
The movie with the lowest rating is **"Dirty Movie"**, which has a rating of **3.1**. 

You can find more information about it [here](https://www.imdb.com/title/tt1107812).
{'query': 'What are the top 3 movies with the highest ratings?', 'splits': ['relative']}
Got query embeddings
Here are the top 3 movies with the highest ratings:

1. **Cinema Paradiso** (1988)
   - **Rating**: 8.5
   - **Overview**: A filmmaker recalls his childhood, when he fell in love with the movies at his village's theater and formed a deep friendship with the theater's projectionist.
   - [More Info](h

Okay we will have a bunch of traces generated from this. We will need to correct them. We can do this by adding them all to an annotation queue and going through them.

## Correcting them

okay now I need to correct the traces

In [12]:
df.head()


Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
0,Inception,8.369,36269.0,Released,2010-07-15,825532800.0,148.0,160000000.0,tt1375666,en,...,"Leonardo DiCaprio, Tim Kelleher, Jack Gilroy, ...",Christopher Nolan,Wally Pfister,Christopher Nolan,"Thomas Tull, Yoshikuni Taki, Christopher Nolan...",Hans Zimmer,8.8,2593927.0,"[-0.007635640446096659, -0.004168331157416105,...",1.0
1,Interstellar,8.44,35193.0,Released,2014-11-05,701729200.0,169.0,165000000.0,tt0816692,en,...,"Russ Fega, Liam Dickinson, Jeff Hephner, Brook...",Christopher Nolan,Hoyte van Hoytema,"Jonathan Nolan, Christopher Nolan","Jake Myers, Jordan Goldberg, Lynda Obst, Chris...",Hans Zimmer,8.7,2159078.0,"[-0.0350022129714489, 0.023837948217988014, -0...",1.0
2,The Dark Knight,8.516,32550.0,Released,2008-07-16,1004558000.0,152.0,185000000.0,tt0468569,en,...,"Matthew O'Neill, Nestor Carbonell, Morgan Free...",Christopher Nolan,Wally Pfister,"David S. Goyer, Jonathan Nolan, Bob Kane, Chri...","Kevin De La Noy, Michael Uslan, Thomas Tull, C...","Hans Zimmer, James Newton Howard",9.0,2921100.0,"[0.002534319879487157, -0.03606430068612099, -...",1.0
3,Avatar,7.582,31265.0,Released,2009-12-15,2923706000.0,162.0,237000000.0,tt0499549,en,...,"Jason Whyte, Scott Lawrence, Brandon Jelkes, J...",James Cameron,Mauro Fiore,James Cameron,"Jon Landau, Colin Wilson, Laeta Kalogridis, Ja...",James Horner,7.9,1402405.0,"[-0.028900334611535072, -0.0029624924063682556...",1.0
4,Deadpool,7.621,30696.0,Released,2016-02-09,782837300.0,108.0,58000000.0,tt1431045,en,...,"Ryan Reynolds, Gina Carano, Kayla Adams, Randa...",Tim Miller,Ken Seng,"Paul Wernick, Fabian Nicieza, Rob Liefeld, Rhe...","Ryan Reynolds, Jonathon Komack Martin, Paul We...",Tom Holkenborg,8.0,1167674.0,"[-0.05629737302660942, 0.005468115210533142, -...",1.0


In [13]:
from utils.data_collection import df_to_llm


In [19]:
# Get top 5 apps based on imdb_rating
top_5_apps = df.sort_values(by="imdb_rating", ascending=False)
df_to_llm(top_5_apps.head())


[{'title': '(re)kindle',
  'status': 'Released',
  'release_date': '2021-02-21',
  'revenue': 0.0,
  'runtime': 40.0,
  'budget': 0.0,
  'url': 'https://www.imdb.com/title/tt14946614',
  'original_language': 'pt',
  'original_title': '(re)começo',
  'overview': 'Ana and Helen, two divorced women, were close friends as teenagers. Today, amidst the corona virus pandemic and in quarantine, they get in touch after 20 years via internet. Through video conference calls, memories, sensations and emotions reflourishes.',
  'popularity': 4.241,
  'tagline': 'Two old friends get in touch through video calls amidst the Covid-19 pandemic.',
  'genres': 'Drama, Documentary, Romance',
  'production_companies': 'AMA',
  'production_countries': nan,
  'spoken_languages': 'Português',
  'cast': 'Joana Dória, Manuela Afonso',
  'director': 'Aron Matschulat Aguiar',
  'writers': 'Aron Matschulat Aguiar',
  'producers': 'Aron Matschulat Aguiar',
  'rating': 10.0,
  'votes': 6.0},
 {'title': 'Nirvana: Unpl

In [16]:
# Bottom 5 apps
bottom_5_apps = df.tail()
bottom_5_apps


Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
49995,Monky,5.5,21.0,Released,2017-12-22,0.0,90.0,0.0,tt6184194,sv,...,"Julius Jimenez Hugoson, Bianca Kronlöf, Tomas ...",Maria Blom,Andréas Lennartsson,"Jonathan Sjöberg, Anders Weidemann, Truls Ande...",Patrick Ryborn,Anders Nygårds,5.0,437.0,"[-0.003916103858500719, -0.026218455284833908,...",1.0
49996,Mounam Pesiyadhe,6.8,21.0,Released,2002-12-13,0.0,168.0,0.0,tt0386650,ta,...,"Laila, Suriya, Neha Pendse, Nandha, Vidharth, ...",Ameer,Ramji,"Ameer, Snehan, Vaali","Rajan Radhakrishnan, Ganesh Raghu, Karthik Rad...",Yuvan Shankar Raja,7.7,2444.0,"[-0.010540680959820747, 0.002165184821933508, ...",1.0
49997,Vlad the Impaler: The True Life of Dracula,7.7,21.0,Released,1979-01-08,0.0,134.0,0.0,tt0141966,ro,...,"Petre Simionescu, Mihai Pălădescu, George Cons...",Doru Năstase,,Mircea Mohor,Dumitru Fernoagă,,7.5,1313.0,"[-0.021357912570238113, -0.03555901348590851, ...",1.0
49998,Deadly Crossing,5.6,21.0,Released,2010-12-27,0.0,86.0,0.0,tt1701105,en,...,"Steven Seagal, Marina Eva, Gil Bellows, Kyle C...",Keoni Waxman,,Steven Seagal,,,5.4,278.0,"[-0.004094576928764582, -0.02722948044538498, ...",1.0
49999,Most Likely to Murder,5.4,21.0,Released,2019-12-14,0.0,80.0,0.0,tt10621180,en,...,"Madison McLaughlin, Ava Allan, Ashlee Füss, Pe...",Kaila York,Paul Salmons,"Blaine Chiappetta, Matt Marinovich","Nigel Thomas, Rick Benattar",Alexander Bornstein,5.2,351.0,"[-0.026393771171569824, 0.012630559504032135, ...",1.0


In [20]:
# Movie with the lowest rating

lowest_rating = df.sort_values(by="imdb_rating", ascending=True).head()
lowest_rating

Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
35040,Reis,1.784,37.0,Released,2017-03-02,0.0,108.0,8000000.0,tt5988370,tr,...,"Abidin Yerebakan, Reha Beyoğlu, Alper Türedi, ...",Hüdaverdi Yavuz,,,,,1.0,74143.0,"[-0.03620416671037674, 0.03401148319244385, -0...",1.0
41950,2025: The World Enslaved by a Virus,1.8,28.0,Released,2021-01-15,0.0,91.0,10000.0,tt13788842,en,...,"Lukas Speer, Joshua Wesely, John Vogt, Tabitha...","Joshua Wesely, Simon Wesely",Simon Wesely,"Joshua Wesely, Simon Wesely","Joshua Wesely, Simon Wesely",,1.0,2585.0,"[-0.03571880981326103, -0.0023378352634608746,...",1.0
49584,Rollergator,2.9,21.0,Released,1996-08-08,0.0,83.0,0.0,tt0207061,en,...,"Sandra Shuker, Erin O'Bryan, Donald G. Jackson...",Donald G. Jackson,,"Scott Shaw, Matt X. Lawrence, Donald G. Jackson","George Peirson, Scott Shaw, Donald G. Jackson",,1.2,821.0,"[-0.03807482123374939, -0.0151399876922369, -0...",1.0
30313,"Daniel, the Wizard",1.4,47.0,Released,2004-08-12,0.0,81.0,0.0,tt0421051,de,...,"Gaby Schmidberger, Adele Eden, Manolito Lommel...",Ulli Lommel,"Manuel Lommel, Max Nikoff",Ulli Lommel,"Pia Yvonne Woods, Peter Schamoni, Rudolf Walde...",,1.2,14913.0,"[0.01854429766535759, -0.01938939467072487, -0...",1.0
28393,Smolensk,2.7,52.0,Released,2016-09-09,0.0,115.0,3000000.0,tt6038600,pl,...,"Marta Alaborska, Halina Łabonarska, Piotr Bajo...",Antoni Krauze,Michal Pakulski,"Tomasz Łysiak, Antoni Krauze, Marcin Wolski, M...",Maciej Pawlicki,,1.2,40210.0,"[-0.029994618147611618, -0.018035979941487312,...",1.0


In [23]:
# Top 10 highest rating movies
top_10_highest_rating = df.sort_values(by="imdb_rating", ascending=False).head(10)
titles = top_10_highest_rating["title"].tolist()
" | ".join(titles)


'(re)kindle | Nirvana: Unplugged In New York | Phineas and Ferb: Last Day of Summer | The Godfather Trilogy: 1901-1980 | The Shawshank Redemption | Doctor Who: The Day of the Doctor | Twenty One Pilots: Livestream Experience | John Mayer: Where the Light Is (Live in Los Angeles) | Cem Yılmaz: CMYLMZ | The Godfather'

In [24]:
from utils.data_collection import MovieSearchTool

tool = MovieSearchTool()

Loading df


In [34]:
moon_movies = tool.semantic_search("Moon Conspiracy", k=100)

# Where "Documentary" is in the genres column
documentaries = moon_movies[moon_movies["genres"].str.contains("Documentary")]
documentaries.head(10)

Got query embeddings


Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm,dot_product,similarity
5943,Apollo 11,7.838,617.0,Released,2019-03-01,0.0,93.0,0.0,tt8760684,en,...,,,"Josh Braun, Evan Krauss, Tom Quinn, Courtney S...",Matt Morton,8.1,27512.0,"[-0.01491253450512886, 0.020741747692227364, -...",1.0,0.368156,0.368156
16007,For All Mankind,7.8,134.0,Released,1989-11-01,770132.0,80.0,0.0,tt0097372,en,...,,,"Betsy Broyles Breier, Al Reinert",,8.1,6676.0,"[-0.018750187009572983, 0.01347751822322607, -...",1.0,0.375394,0.375394
19162,In the Shadow of the Moon,7.485,100.0,Released,2007-01-19,0.0,109.0,2000000.0,tt0925248,en,...,,,"Richard Life, Davic McNab, Belinda Blacklock",,8.0,6954.0,"[0.001786063308827579, -0.002448124112561345, ...",1.0,0.397764,0.397764
22034,The Last Man on the Moon,6.5,80.0,Released,2016-02-26,0.0,95.0,0.0,tt3219604,en,...,,"Mark Craig, Eugene Cernan",,,7.4,3585.0,"[0.00435029249638319, -0.007642406038939953, -...",1.0,0.402559,0.402559
28295,Above Majestic,7.358,53.0,Released,2018-10-30,0.0,133.0,0.0,tt9143304,en,...,,,"Corey Goode, Jordan Sather",,5.9,1522.0,"[0.01591109298169613, 0.0012432755902409554, -...",1.0,0.323658,0.323658
33801,Mission Control: The Unsung Heroes of Apollo,7.205,39.0,Released,2017-03-14,0.0,101.0,0.0,tt5959952,en,...,Ian Salvage,,"Gareth Dodds, Keith Haviland",,7.5,1703.0,"[0.03263729810714722, -0.0053871991112828255, ...",1.0,0.400421,0.400421
34317,Dark Side of the Moon,7.013,38.0,Released,2002-10-16,0.0,52.0,0.0,tt0344160,fr,...,Stéphane Saporito,William Karel,"Emmanuelle Fage, Luc Martin-Gousset, Vincent D...",,7.6,1938.0,"[-0.0065108719281852245, -0.04185466095805168,...",1.0,0.475232,0.475232
36555,Apollo: Missions to the Moon,7.3,35.0,Released,2019-07-07,0.0,94.0,0.0,tt9782756,en,...,,Tom Jennings,"Chris Morcom, Abe Scheuermann",,7.5,686.0,"[-0.006804818753153086, -0.0030696596950292587...",1.0,0.397728,0.397728
42672,Aliens on the Moon: The Truth Exposed,3.9,27.0,Released,2014-07-20,0.0,86.0,0.0,tt3797808,en,...,,Robert Kiviat,,,3.5,1051.0,"[-0.016269655898213387, -0.033461686223745346,...",1.0,0.50107,0.50107
46540,2001: The Making of a Myth,6.5,23.0,Released,2001-01-13,0.0,43.0,0.0,tt1014669,en,...,,,"Jamie Doran, Piers Bizony",,6.9,693.0,"[0.013608579523861408, -0.011418469250202179, ...",1.0,0.327862,0.327862


In [35]:
titles = documentaries["title"].tolist()
" | ".join(titles)


'Apollo 11 | For All Mankind | In the Shadow of the Moon | The Last Man on the Moon | Above Majestic | Mission Control: The Unsung Heroes of Apollo | Dark Side of the Moon | Apollo: Missions to the Moon | Aliens on the Moon: The Truth Exposed | 2001: The Making of a Myth'

In [32]:
genres

['Action, Science Fiction, Adventure',
 'Adventure, Drama, Science Fiction',
 'Drama, Action, Crime, Thriller',
 'Action, Adventure, Fantasy, Science Fiction',
 'Action, Adventure, Comedy',
 'Science Fiction, Action, Adventure',
 'Adventure, Action, Science Fiction',
 'Drama',
 'Action, Science Fiction, Adventure',
 'Thriller, Crime',
 'Comedy, Drama, Romance',
 'Adventure, Fantasy',
 'Drama, Crime',
 'Action, Science Fiction, Adventure',
 'Drama, Western',
 'Action, Science Fiction',
 'Adventure, Science Fiction, Action',
 'Crime, Thriller, Drama',
 'Drama, Romance',
 'Adventure, Fantasy, Action',
 'Adventure, Fantasy, Action',
 'Drama, Thriller, Mystery',
 'Crime, Drama, Comedy',
 'Action, Adventure, Science Fiction',
 'Adventure, Action, Science Fiction',
 'Action, Crime, Drama, Thriller',
 'Action, Adventure, Science Fiction',
 'Drama, Thriller, War',
 'Action, Adventure, Science Fiction',
 'Action, Adventure, Science Fiction',
 'Action, Adventure, Fantasy',
 'Science Fiction, Adve

In [37]:
# In not Nan in genres
has_genre = df[df["genres"].notna()]

horror = has_genre[has_genre["genres"].str.contains("Horror")]

james_cameron_movies = horror[horror["director"] == "James Cameron"]
james_cameron_movies



Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
10583,Piranha II: The Spawning,4.382,259.0,Released,1982-08-14,0.0,84.0,145786.0,tt0082910,en,...,"Steve Marachuk, Hildy Magnasun, Ancile Gloudon...",James Cameron,Roberto D'Ettorre Piazzoli,"James Cameron, Ovidio G. Assonitis, Charles H....","Hisako Tsukuba, Ovidio G. Assonitis, Jeff Sche...",Stelvio Cipriani,3.8,10088.0,"[-0.009380608797073364, -0.00252433936111629, ...",1.0


In [40]:
# What is the lowest rated movie that Will Farrell is in?
has_cast = df[df["cast"].notna()]

will_farrell = has_cast[has_cast["cast"].str.contains("Will Farrell")]
will_farrell = will_farrell.sort_values(by="imdb_rating", ascending=True)
will_farrell.head(1)




Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
22244,Don't Hug Me I'm Scared 6,8.0,79.0,Released,2016-06-19,0.0,8.0,0.0,tt5832696,en,...,"Harry Denniston, Lauren Veevers, James Randell...","Becky Sloan, Joseph Pelling",Edward Tucker,"Becky Sloan, Baker Terry, Joseph Pelling","Becky Sloan, James Stevenson Bretton, Benjamin...",,8.4,2655.0,"[-0.031121233478188515, -0.010212965309619904,...",1.0


In [42]:
# What has James Cameron directed?

has_director = df[df["director"].notna()]
cameron = has_director[has_director["director"] == "James Cameron"]
cameron

titles = cameron["title"].tolist()
" | ".join(titles)





'Avatar | Titanic | The Terminator | Terminator 2: Judgment Day | Avatar: The Way of Water | Aliens | True Lies | The Abyss | Piranha II: The Spawning | Ghosts of the Abyss'

In [46]:
# Who is the director of the 3rd lowest revenue movie?

lowest_revenue = df.sort_values(by="revenue", ascending=True)
third_lowest_revenue = lowest_revenue.iloc[2]
print(third_lowest_revenue["title"])
print(third_lowest_revenue["director"])


Airplane Mode
David Dinetz, Dylan Trussell


In [49]:
# What was the most popular movie in 2000?
# Reformat date column from YYYY-MM-DD to datetime compatible with .dt
import pandas as pd
df["release_date"] = pd.to_datetime(df["release_date"])

movies_2000 = df[df["release_date"].dt.year == 2000]
most_popular_2000 = movies_2000.sort_values(by="imdb_votes", ascending=False).head(1)
most_popular_2000


Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
72,Gladiator,8.217,18218.0,Released,2000-05-04,465361176.0,155.0,103000000.0,tt0172495,en,...,"Connie Nielsen, Joaquin Phoenix, Spencer Treat...",Ridley Scott,John Mathieson,"David Franzoni, William Nicholson, John Logan","Branko Lustig, Ridley Scott, David Franzoni, D...","Hans Zimmer, Lisa Gerrard",8.5,1657014.0,"[0.00046017413842491806, 0.031162530183792114,...",1.0


In [56]:
# What are some good christmas movies about the raindeer?

reindeer_movies = tool.semantic_search("reindeer", k = 10)
reindeer_movies

titles = reindeer_movies["title"].tolist()
" | ".join(titles)


Got query embeddings


"The Flight Before Christmas | Annabelle's Wish | Elliot: The Littlest Reindeer | The White Reindeer | A Reindeer's Journey | Reindeerspotting: Escape from Santaland | Loup | Prancer Returns | Terror in the Midnight Sun | Robbie the Reindeer: Hooves of Fire"

In [58]:
# What are all of the movies Tom Hanks was in?


tom_hanks = has_cast[has_cast["cast"].str.contains("Tom Hanks")]
titles = tom_hanks["title"].tolist()
" | ".join(titles)




"Forrest Gump | Toy Story | The Green Mile | Saving Private Ryan | Catch Me If You Can | Toy Story 3 | Cars | Toy Story 2 | Cast Away | Toy Story 4 | The Da Vinci Code | The Simpsons Movie | The Terminal | Sully | Cloud Atlas | Captain Phillips | Bridge of Spies | Angels & Demons | The Polar Express | Inferno | Apollo 13 | The Post | The Circle | Philadelphia | Road to Perdition | Big | Finch | Elvis | Saving Mr. Banks | You've Got Mail | The Walk | Greyhound | A Man Called Otto | Borat Subsequent Moviefilm | Sleepless in Seattle | Extremely Loud & Incredibly Close | Asteroid City | News of the World | A Beautiful Day in the Neighborhood | The Ladykillers | Pinocchio | Charlie Wilson's War | A League of Their Own | Splash | Larry Crowne | Turner & Hooch | The 'Burbs | A Hologram for the King | The Money Pit | Toy Story That Time Forgot | Toy Story of Terror! | That Thing You Do! | Hawaiian Vacation | Bachelor Party | Small Fry | Joe Versus the Volcano | Partysaurus Rex | Dragnet | The 

In [59]:
# What are some good movies about a woman who can talk to animals?

animal_movies = tool.semantic_search("Woman talking to animals", k = 10)
animal_movies

titles = animal_movies["title"].tolist()
" | ".join(titles)




Got query embeddings


'Zookeeper | Doctor Dolittle | Dr. Dolittle: Million Dollar Mutts | Creature Comforts | Animal Behaviour | Marine Life Interviews | Sleeping Dogs Lie | A Talking Cat!?! | Zoology | Little Miss Dolittle'

In [60]:
# What are the bottom 5 movies by rating?

bottom_5 = df.sort_values(by="imdb_rating", ascending=True).head(5)
titles = bottom_5["title"].tolist()
" | ".join(titles)




'Reis | 2025: The World Enslaved by a Virus | Rollergator | Daniel, the Wizard | Smolensk'

In [61]:
# What are some good military movies about the war in vietnam?

vietnam_movies = tool.semantic_search("Vietnam War", k = 10)
vietnam_movies

titles = vietnam_movies["title"].tolist()
" | ".join(titles)




Got query embeddings


'Good Morning, Vietnam | Casualties of War | Danger Close: The Battle of Long Tan | The War | Uncommon Valor | Last Days in Vietnam | Hearts and Minds | Tunnel Rats | Diên Biên Phu | Dear America: Letters Home from Vietnam'

In [63]:
# Which movie in Italian has the most ratings?

has_original_language = df[df["original_language"].notna()]

italian_movies = has_original_language[has_original_language["original_language"] == "it"]

most_ratings = italian_movies.sort_values(by="imdb_votes", ascending=False).head(1)
most_ratings


Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
408,"The Good, the Bad and the Ugly",8.461,8494.0,Released,1966-12-22,38900000.0,161.0,1200000.0,tt0060196,it,...,"Claudio Scarchilli, Gianni Di Segni, José Terr...",Sergio Leone,Tonino Delli Colli,"Mickey Knox, Furio Scarpelli, Luciano Vincenzo...",Alberto Grimaldi,Ennio Morricone,8.8,823981.0,"[0.018123049288988113, -0.0037472927942872047,...",1.0


In [64]:
# What is the movie with the highest rating that is not a comedy?

has_genres = df[df["genres"].notna()]
not_comedy = has_genres[~has_genres["genres"].str.contains("Comedy")]
highest_rating = not_comedy.sort_values(by="imdb_rating", ascending=False).head(1)
highest_rating







Unnamed: 0,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,embedding,embedding_norm
25922,(re)kindle,7.4,61.0,Released,2021-02-21,0.0,40.0,0.0,tt14946614,pt,...,"Joana Dória, Manuela Afonso",Aron Matschulat Aguiar,,Aron Matschulat Aguiar,Aron Matschulat Aguiar,,10.0,6.0,"[-0.013671365566551685, -0.017643675208091736,...",1.0


In [65]:
# What are the bottom 10 movies by rating?

bottom_10 = df.sort_values(by="imdb_rating", ascending=True).head(10)
titles = bottom_10["title"].tolist()
" | ".join(titles)



# What is the most popular movie in 2000?



"Reis | 2025: The World Enslaved by a Virus | Rollergator | Daniel, the Wizard | Smolensk | Kidnapping, Caucasian Style | Extraction Point | Shark Exorcist | Saving Christmas | Potato Salad: Don't Ask!"

In [70]:
# How much revenue did Christopher Nolan make in his first 2 movies?

nolan = df[df["director"] == "Christopher Nolan"]

earliest_nolan = nolan.sort_values(by="release_date", ascending=True).head(2)

earliest_nolan

for index, row in earliest_nolan.iterrows():
    print(f"Title: {row['title']}")
    print(f"Revenue: {row['revenue']}")
    print("------")




Title: Doodlebug
Revenue: 0.0
------
Title: Following
Revenue: 48482.0
------
