In [None]:
import os
from dotenv import load_dotenv
from langchain.chains import create_sql_query_chain
from langchain_openai import ChatOpenAI
from langchain_community.utilities import SQLDatabase
from langchain_openai import OpenAIEmbeddings
from pyprojroot import here
import ast
import pandas as pd
load_dotenv()

True

## Transforming the tabular data
The tabular data is transformed into something that can be embedded into a vector database.
Information about the cast is merged into the movies table, so each row to be embedded contain information about a movie and the cast.

In [4]:
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

In [None]:
sqldb_directory = here("data/db/imdb.db")
db = SQLDatabase.from_uri(f"sqlite:///{sqldb_directory}")

In [6]:
columns = ["tconst", "titleType", "primaryTitle", "originalTitle", "isAdult", "startYear", "endYear", "runtimeMinutes", "genres", "averageRating", "numVotes"]

top_movies_query = """SELECT b.*, r.averageRating, r.numVotes
FROM 'title.basics' b
JOIN 'title.ratings' r ON b.tconst = r.tconst
ORDER BY r.numVotes DESC
LIMIT 100000;
"""

result = db.run(top_movies_query)
parsed_result = ast.literal_eval(result)  

# Load the results into a pandas DataFrame
df_top_movies = pd.DataFrame(parsed_result, columns=columns)

In [7]:
df_top_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994,,142,Drama,9.3,3001848
1,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",9.0,2980743
2,tt1375666,movie,Inception,Inception,0,2010,,148,"Action,Adventure,Sci-Fi",8.8,2648271
3,tt0137523,movie,Fight Club,Fight Club,0,1999,,139,Drama,8.8,2427205
4,tt0944947,tvSeries,Game of Thrones,Game of Thrones,0,2011,2019.0,60,"Action,Adventure,Drama",9.2,2397453


In [8]:
cast_query = """SELECT p.tconst, p.nconst, p.category, p.job, p.characters
FROM "title.principals" p
WHERE p.tconst IN (
    SELECT tconst FROM "title.ratings" ORDER BY numVotes DESC LIMIT 100000
);
"""

result = db.run(cast_query)
parsed_result = ast.literal_eval(result)

df_cast = pd.DataFrame(parsed_result, columns=["tconst", "nconst", "category", "job", "characters"])

In [9]:
df_cast.head()

Unnamed: 0,tconst,nconst,category,job,characters
0,tt0000001,nm1588970,self,,"[""Self""]"
1,tt0000001,nm0005690,director,,
2,tt0000001,nm0005690,producer,producer,
3,tt0000001,nm0374658,cinematographer,director of photography,
4,tt0000003,nm0721526,director,,


In [10]:
names_query = """SELECT * 
FROM "name.basics" 
WHERE nconst IN (
    SELECT DISTINCT nconst 
    FROM "title.principals" 
    WHERE tconst IN (SELECT tconst FROM "title.ratings" ORDER BY numVotes DESC LIMIT 100000)
);
"""

result = db.run(names_query)
parsed_result = ast.literal_eval(result)

df_names = pd.DataFrame(parsed_result, columns=["nconst", "primaryName", "birthYear", "deathYear", "primaryProfession", "knownForTitles"])

In [11]:
df_names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987.0,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0031983"
1,nm0000002,Lauren Bacall,1924,2014.0,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982.0,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007.0,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [13]:
df_top_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994,,142,Drama,9.3,3001848
1,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",9.0,2980743
2,tt1375666,movie,Inception,Inception,0,2010,,148,"Action,Adventure,Sci-Fi",8.8,2648271
3,tt0137523,movie,Fight Club,Fight Club,0,1999,,139,Drama,8.8,2427205
4,tt0944947,tvSeries,Game of Thrones,Game of Thrones,0,2011,2019.0,60,"Action,Adventure,Drama",9.2,2397453


In [14]:
df_cast_names = df_cast.merge(df_names, on="nconst")

In [19]:
cast_names_grouped = df_cast_names.groupby("tconst").apply(lambda x: "\n".join([f'Name:{row["primaryName"]} {row["birthYear"]}, characters:{row["characters"]}, job:{row["category"]}' for _, row in x.iterrows()]))
cast_names_grouped.name = "cast_info"
df_cast_info = pd.DataFrame(cast_names_grouped.reset_index())

  cast_names_grouped = df_cast_names.groupby("tconst").apply(lambda x: "\n".join([f'Name:{row["primaryName"]} {row["birthYear"]}, characters:{row["characters"]}, job:{row["category"]}' for _, row in x.iterrows()]))


In [21]:
df_cast_info.head()

Unnamed: 0,tconst,cast_info
0,tt0000001,"Name:Carmencita 1868, characters:[""Self""], job..."
1,tt0000003,"Name:Émile Reynaud 1844, characters:None, job:..."
2,tt0000005,"Name:Charles Kayser 1878, characters:[""Blacksm..."
3,tt0000008,"Name:Fred Ott 1860, characters:[""Sneezing Man""..."
4,tt0000010,"Name:Louis Lumière 1864, characters:None, job:..."


In [22]:
df_movie_data = df_top_movies.merge(cast_names_grouped, on="tconst")

In [None]:
df_movie_data.to_csv(here("data/imdb_movies_100k.csv"), index=False)