# Ground Truth Dataset

Let's create a ground truth dataset. We'll write the first few examples by hand and then use the LLM to generate the rest.


In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# (e.g. the local `utils` package) before each cell runs.
%load_ext autoreload
%autoreload 2



In [None]:
# Install/upgrade the LangChain packages into the kernel's environment
# (-q: quiet, -U: upgrade to latest).
# NOTE(review): versions are unpinned, so results may drift between runs.
%pip install -qU langchain-openai langchain-core

In [None]:
from utils.data_collection import load_df

# Load the movie table used to compute ground-truth answers below.
# NOTE(review): the .pkl extension suggests a pickle file; only load it from a trusted source.
df = load_df("../raw/top_50000.pkl")

In [None]:
# Preview the first rows to see which columns are available.
df.head()

In [None]:
# Hand-written seed questions for the ground truth dataset.
# Each entry has:
#   "query"  - the natural-language question sent to the agent
#   "splits" - tags used to group questions by the skill they exercise
# Every query appears exactly once so that each one yields a single trace.
questions = [
    # Relative = ranking questions over the rating column
    {"query": "What is the title of the movie with the highest rating?",
        "splits": ["relative"]},
    {"query": "What is the title of the movie with the lowest rating?",
        "splits": ["relative"]},
    {"query": "What are the top 3 movies with the highest ratings?",
        "splits": ["relative"]},
    {"query": "What are the top 5 movies with the highest ratings?",
        "splits": ["relative"]},
    {"query": "What are the top 10 movies with the highest rating?",
        "splits": ["relative"]},
    {"query": "What are the bottom 10 movies by rating?",
        "splits": ["relative"]},
    {"query": "What are the bottom 5 movies by rating?",
        "splits": ["relative"]},

    # Vague search
    # Content tag = Requires the agent to understand the content of the movie
    {"query": "What is that one movie about a rat helping a chef cook?",
        "splits": ["content"]},
    {"query": "What are some good movies about a woman who can talk to animals?",
        "splits": ["content"]},
    {"query": "What are some good christmas movies about the raindeer?",
        "splits": ["content"]},
    {"query": "What are some good military movies about the war in vietnam?",
        "splits": ["content"]},
    {"query": "What are some good conspiracy theory documentaries about the moon landing?",
        "splits": ["content"]},

    # Specifics = filters on concrete attributes (cast, director, language, year)
    {"query": "What is the movie with the highest rating that is not a comedy?",
        "splits": ["specifics"]},
    {"query": "What is the lowest rated movie that Will Farrell is in?",
        "splits": ["specifics"]},
    {"query": "What are all of the movies Tom Hanks was in?",
        "splits": ["specifics"]},
    {"query": "What is that movie where Edward Norton has multiple personalities?",
        "splits": ["specifics"]},
    {"query": "Which movie in Italian has the most ratings?",
        "splits": ["specifics"]},
    {"query": "What has James Cameron directed?",
        "splits": ["specifics"]},
    {"query": "What was the most popular movie in 2000?",
        "splits": ["specifics"]},
    {"query": "What is the movie with the most ratings in the year 2000?",
        "splits": ["specifics"]},

    # Combination = several filters/sorts chained together
    {"query": "Who is the director of the 3rd lowest revenue movie?",
        "splits": ["combination", "hard"]},
    {"query": "What is the most recent movie Horror movie that James Cameron directed?",
        "splits": ["combination", "hard"]},
    {"query": "How much revenue did Christopher Nolan make in his first 2 movies?",
        "splits": ["combination", "hard"]},
]

That's a good start!

In [None]:
from utils.langgraph import create_agent

# Build the agent under evaluation; it is invoked below with a {"messages": [...]} payload.
agent = create_agent()

Let's create all of the traces.

In [None]:
from langchain_core.messages import HumanMessage

# Send each seed question to the agent as a single human message and echo
# the question dict followed by the agent's final reply.
for question in questions:
    print(question)
    messages = [HumanMessage(content=question["query"])]
    response = agent.invoke({"messages": messages})
    final_message = response["messages"][-1]
    print(final_message.content)


Okay, we will have a bunch of traces generated from this, and we will need to correct them. We can do this by adding them all to an annotation queue and going through them one by one.

## Correcting them

Okay, now I need to correct the traces.

In [None]:
# Look at the columns again before computing the reference answers.
df.head()


In [None]:
from utils.data_collection import df_to_llm


In [None]:
# Five highest-rated movies by imdb_rating, passed through df_to_llm.
# top_5_apps holds the whole frame sorted best-first; only its first five
# rows are handed to df_to_llm.
top_5_apps = df.sort_values("imdb_rating", ascending=False)
df_to_llm(top_5_apps.head(5))


In [None]:
# Five lowest-rated movies by imdb_rating (counterpart of the top-5 cell).
# Sort explicitly: df.tail() only returns the last rows in load order, which
# is not a ranking by rating. Rows with a missing rating sort last, so they
# never appear in this result.
bottom_5_apps = df.sort_values(by="imdb_rating", ascending=True).head()
bottom_5_apps


In [None]:
# What is the title of the movie with the lowest rating?
# Show the five lowest imdb_rating rows so ties at the bottom are visible.
lowest_rating = df.sort_values("imdb_rating").head(5)
lowest_rating

In [None]:
# What are the top 10 movies with the highest rating?
# Sort best-first, keep ten rows, and render the titles as one " | "-separated string.
top_10_highest_rating = df.sort_values("imdb_rating", ascending=False).iloc[:10]
titles = top_10_highest_rating["title"].to_list()
" | ".join(titles)


In [None]:
from utils.data_collection import MovieSearchTool

# Search helper; the cells below call tool.semantic_search(<text>, k=<n>) for free-text lookups.
tool = MovieSearchTool()

In [None]:
# What are some good conspiracy theory documentaries about the moon landing?
# Pull a wide candidate set from semantic search, then narrow it by genre.
moon_movies = tool.semantic_search("Moon Conspiracy", k=100)

# Keep results whose genres mention "Documentary". na=False counts a missing
# genres value as "no match"; without it the mask would contain NaN and
# boolean indexing would raise.
is_documentary = moon_movies["genres"].str.contains("Documentary", na=False)
documentaries = moon_movies[is_documentary]
documentaries.head(10)

In [None]:
# Render the documentary titles as a single " | "-separated string.
titles = documentaries["title"].to_list()
" | ".join(titles)


In [None]:
# Inspect how genres are encoded before filtering on them below.
# The bare name `genres` is not assigned in any cell above, so it raised
# NameError on a fresh kernel; read the column from df instead.
df["genres"].dropna().head()

In [None]:
# What is the most recent movie Horror movie that James Cameron directed?
# Drop rows without a genres value first so the substring test never sees NaN.
has_genre = df.loc[df["genres"].notna()]

is_horror = has_genre["genres"].str.contains("Horror")
horror = has_genre.loc[is_horror]

# Exact match on the director column.
james_cameron_movies = horror.loc[horror["director"].eq("James Cameron")]
james_cameron_movies



In [None]:
# What is the lowest rated movie that Will Farrell is in?
# The query misspells the surname. The actor's name is "Will Ferrell", so
# the cast column is matched against the correct spelling; the misspelled
# form would select no rows (assuming cast holds credited names — TODO confirm).
has_cast = df[df["cast"].notna()]

will_farrell = has_cast[has_cast["cast"].str.contains("Will Ferrell")]
# Lowest imdb_rating first, so the first row is the answer.
will_farrell = will_farrell.sort_values(by="imdb_rating", ascending=True)
will_farrell.head(1)




In [None]:
# What has James Cameron directed?
# Exact match on the director column after dropping rows with no director.
has_director = df[df["director"].notna()]
cameron = has_director[has_director["director"] == "James Cameron"]

# Only a cell's last expression is displayed, so the answer is rendered as
# one " | "-separated string of titles.
titles = cameron["title"].tolist()
" | ".join(titles)





In [None]:
# Who is the director of the 3rd lowest revenue movie?
# Order by revenue ascending; position 2 (zero-based) is the third-lowest row.
lowest_revenue = df.sort_values("revenue")
third_lowest_revenue = lowest_revenue.iloc[2]
for field in ("title", "director"):
    print(third_lowest_revenue[field])


In [None]:
# What was the most popular movie in 2000?
# release_date arrives as YYYY-MM-DD; parse it so the .dt accessor works.
# This overwrites the column on the shared df.
import pandas as pd
df["release_date"] = pd.to_datetime(df["release_date"])

# "Most popular" is measured by vote count (imdb_votes).
released_in_2000 = df["release_date"].dt.year.eq(2000)
movies_2000 = df.loc[released_in_2000]
most_popular_2000 = movies_2000.sort_values("imdb_votes", ascending=False).iloc[:1]
most_popular_2000


In [None]:
# What are some good christmas movies about the raindeer?
# Semantic search uses the correctly spelled term; keep the ten best matches.
reindeer_movies = tool.semantic_search("reindeer", k = 10)

# Only a cell's last expression is displayed, so the answer is rendered as
# one " | "-separated string of titles.
titles = reindeer_movies["title"].tolist()
" | ".join(titles)


In [None]:
# What are all of the movies Tom Hanks was in?
# Reuses has_cast (rows with a non-null cast) built in the Will Farrell cell.
appears_tom_hanks = has_cast["cast"].str.contains("Tom Hanks")
tom_hanks = has_cast.loc[appears_tom_hanks]
titles = tom_hanks["title"].to_list()
" | ".join(titles)




In [None]:
# What are some good movies about a woman who can talk to animals?
# Keep the ten closest semantic-search matches.
animal_movies = tool.semantic_search("Woman talking to animals", k = 10)

# Only a cell's last expression is displayed, so the answer is rendered as
# one " | "-separated string of titles.
titles = animal_movies["title"].tolist()
" | ".join(titles)




In [None]:
# What are the bottom 5 movies by rating?
# Lowest imdb_rating first, five rows, titles joined into one string.
bottom_5 = df.sort_values("imdb_rating").iloc[:5]
titles = bottom_5["title"].to_list()
" | ".join(titles)




In [None]:
# What are some good military movies about the war in vietnam?
# Keep the ten closest semantic-search matches.
vietnam_movies = tool.semantic_search("Vietnam War", k = 10)

# Only a cell's last expression is displayed, so the answer is rendered as
# one " | "-separated string of titles.
titles = vietnam_movies["title"].tolist()
" | ".join(titles)




In [None]:
# Which movie in Italian has the most ratings?
# Filter on the language code "it", then take the row with the most imdb_votes.
has_original_language = df.loc[df["original_language"].notna()]

is_italian = has_original_language["original_language"].eq("it")
italian_movies = has_original_language.loc[is_italian]

most_ratings = italian_movies.sort_values("imdb_votes", ascending=False).iloc[:1]
most_ratings


In [None]:
# What is the movie with the highest rating that is not a comedy?
# Rows without genres are excluded first, then any row whose genres mention
# "Comedy" is dropped.
has_genres = df.loc[df["genres"].notna()]
is_comedy = has_genres["genres"].str.contains("Comedy")
not_comedy = has_genres.loc[~is_comedy]
highest_rating = not_comedy.sort_values("imdb_rating", ascending=False).iloc[:1]
highest_rating







In [None]:
# What are the bottom 10 movies by rating?
# Lowest imdb_rating first, ten rows, titles joined into one string.
bottom_10 = df.sort_values("imdb_rating").iloc[:10]
titles = bottom_10["title"].to_list()
" | ".join(titles)



In [None]:
# How much revenue did Christopher Nolan make in his first 2 movies?
# Exact match on the director column, then the two earliest rows by release_date.
nolan = df[df["director"] == "Christopher Nolan"]

earliest_nolan = nolan.sort_values(by="release_date", ascending=True).head(2)

# Print title and revenue per film; the two revenues are summed by hand
# when writing the reference answer.
for index, row in earliest_nolan.iterrows():
    print(f"Title: {row['title']}")
    print(f"Revenue: {row['revenue']}")
    print("------")


