In [160]:
# Import necessary libraries
import ast  # Used for safely evaluating string-formatted python literals
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set visualization style: apply seaborn's built-in 'whitegrid' theme
# so that plots drawn later in the notebook share one consistent look.
sns.set_style('whitegrid')

In [161]:
# --- 1. Data Ingestion ---

# Load the datasets from the raw data folder.
# The notebook is expected to live in the 'notebooks' folder, with the data in 'data/raw/'
# one level above it.
RAW_DATA_DIR = Path('../data/raw')

try:
    movies_df = pd.read_csv(RAW_DATA_DIR / 'tmdb_5000_movies.csv')
    credits_df = pd.read_csv(RAW_DATA_DIR / 'tmdb_5000_credits.csv')
except FileNotFoundError as err:
    # Fail fast with an actionable message. Falling back to empty DataFrames only
    # postpones the failure: the merge on 'id' in the next step raises a KeyError
    # that no longer points at the real cause (the missing files).
    raise FileNotFoundError(
        "Error: Make sure the data files are in a 'data/raw' folder, one level above "
        f"your 'notebooks' directory (looked in: {RAW_DATA_DIR.resolve()})."
    ) from err


In [162]:
# --- 2. Merging the Datasets ---

# The 'credits' dataframe has a 'movie_id' column which corresponds to the 'id' column
# in the 'movies' dataframe. Rename it so both frames share one consistent merge key.
# Reassigning (rather than inplace=True) keeps the step explicit about what it produces.
credits_df = credits_df.rename(columns={'movie_id': 'id'})

# Merge the two dataframes on the 'id' column.
# validate='one_to_one' makes pandas raise a MergeError if 'id' is duplicated on either
# side, instead of silently multiplying rows in the merged frame.
# Both inputs carry a 'title' column, which is why the result exposes 'title_x'
# (from movies) and 'title_y' (from credits).
df = pd.merge(movies_df, credits_df, on='id', validate='one_to_one')

In [163]:
# --- 3. Initial Reconnaissance ---

# Announce the preview, then show the top five rows of the merged frame
# (left as the cell's final expression so the notebook renders it as a table).
print("--- First 5 Rows of the Merged DataFrame ---")
df.head(5)

--- First 5 Rows of the Merged DataFrame ---


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [164]:
# Prints a summary of the DataFrame's structure: index range, column names,
# non-null counts and dtypes. Handy for spotting columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [165]:
# Generates summary statistics (count, mean, std, min, quartiles, max) for the
# numerical columns of the merged frame, before any cleaning is applied.
df.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [166]:
# Treat zeros in 'budget' and 'revenue' as missing values (presumably 0 is a
# placeholder for "unknown" in this dataset), then keep only the rows where both
# figures are present.
# NOTE(review): the describe() output after this step still shows a budget minimum
# of 1 and a revenue minimum of 5, so some implausibly small values survive this
# filter — confirm whether a minimum threshold is wanted.
df = (
    df.assign(
        budget=df['budget'].replace(0, np.nan),
        revenue=df['revenue'].replace(0, np.nan),
    )
    .dropna(subset=['budget', 'revenue'])
)

In [167]:
# Regenerates the numerical summary statistics now that rows with a missing
# 'budget' or 'revenue' have been dropped, to confirm the effect of the cleaning step.
df.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,3229.0,3229.0,3229.0,3229.0,3229.0,3229.0,3229.0
mean,40654440.0,44780.705791,29.033689,121243000.0,110.724373,6.309353,977.287395
std,44396740.0,74609.434723,36.16573,186302900.0,20.965694,0.873891,1414.309577
min,1.0,5.0,0.019984,5.0,41.0,0.0,0.0
25%,10500000.0,4958.0,10.446722,17000000.0,96.0,5.8,178.0
50%,25000000.0,11451.0,20.410354,55184720.0,107.0,6.3,471.0
75%,55000000.0,45272.0,37.335721,146292000.0,121.0,6.9,1148.0
max,380000000.0,417859.0,875.581305,2787965000.0,338.0,8.5,13752.0


In [168]:
from src.parsing_utils import safe_literal_eval, extract_names, get_director

In [169]:
# Columns whose cells are passed through safe_literal_eval.
# (In the raw preview these hold JSON-like strings such as '[{"id": 28, "name": "Action"}, ...]'.)
columns_to_parse = [
    'genres', 'keywords', 'production_companies', 'production_countries',
    'spoken_languages', 'cast', 'crew',
]

# Build all parsed columns in one pass and swap them into the frame together.
# NOTE(review): re-running this cell feeds already-parsed values back into
# safe_literal_eval — confirm the helper tolerates non-string input.
df = df.assign(**{
    column: df[column].apply(safe_literal_eval) for column in columns_to_parse
})

In [170]:
# Add 'genres_list' by running extract_names over each movie's parsed 'genres' value,
# then preview it next to the title to sanity-check the result.
df = df.assign(genres_list=df["genres"].apply(extract_names))
df.loc[:, ["title_x", "genres_list"]].head()

Unnamed: 0,title_x,genres_list
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]"
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]"
2,Spectre,"[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]"
4,John Carter,"[Action, Adventure, Science Fiction]"


In [171]:
# Create the new 'director' column by running get_director over each movie's 'crew' value
df = df.assign(director=df['crew'].apply(get_director))

# Preview the outcome alongside the titles
print("Success! Here are the directors for the first few movies:")
df.loc[:, ['title_x', 'director']].head()

Success! Here are the directors for the first few movies:


Unnamed: 0,title_x,director
0,Avatar,James Cameron
1,Pirates of the Caribbean: At World's End,Gore Verbinski
2,Spectre,Sam Mendes
3,The Dark Knight Rises,Christopher Nolan
4,John Carter,Andrew Stanton


In [172]:
# Convert 'release_date' to datetime objects
df['release_date'] = pd.to_datetime(df['release_date'])

# From the new datetime object, we can easily extract the year
df['release_year'] = df['release_date'].dt.year

# Let's check the data types and the new column.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line after the report.
print("Data types after conversion:")
df[['release_date', 'release_year']].info()
df[['title_x', 'release_date', 'release_year']].head()

Data types after conversion:
<class 'pandas.core.frame.DataFrame'>
Index: 3229 entries, 0 to 4798
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   release_date  3229 non-null   datetime64[ns]
 1   release_year  3229 non-null   int32         
dtypes: datetime64[ns](1), int32(1)
memory usage: 63.1 KB
None


Unnamed: 0,title_x,release_date,release_year
0,Avatar,2009-12-10,2009
1,Pirates of the Caribbean: At World's End,2007-05-19,2007
2,Spectre,2015-10-26,2015
3,The Dark Knight Rises,2012-07-16,2012
4,John Carter,2012-03-07,2012


In [173]:
# Derive the financial metrics in a single chained step:
#   profit = revenue - budget
#   roi    = profit / budget, scaled by 100 so it reads as a percentage
# The second lambda sees the 'profit' column created by the first one.
df = df.assign(
    profit=lambda frame: frame['revenue'] - frame['budget'],
    roi=lambda frame: frame['profit'] / frame['budget'] * 100,
)

# Preview the new financial columns next to their inputs
df.loc[:, ['title_x', 'budget', 'revenue', 'profit', 'roi']].head()

Unnamed: 0,title_x,budget,revenue,profit,roi
0,Avatar,237000000.0,2787965000.0,2550965000.0,1076.356577
1,Pirates of the Caribbean: At World's End,300000000.0,961000000.0,661000000.0,220.333333
2,Spectre,245000000.0,880674600.0,635674600.0,259.459024
3,The Dark Knight Rises,250000000.0,1084939000.0,834939100.0,333.97564
4,John Carter,260000000.0,284139100.0,24139100.0,9.284269


In [174]:
# Drop the original, messy, or now-redundant columns and rename 'title_x' for simplicity.
# Done as one chained reassignment rather than two in-place mutations.
columns_to_drop = [
    'genres', 'keywords', 'production_companies', 'production_countries',
    'spoken_languages', 'cast', 'crew', 'homepage', 'overview',
    'tagline', 'status', 'title_y',
]
df = df.drop(columns=columns_to_drop).rename(columns={'title_x': 'title'})

# Save the cleaned and processed DataFrame to a new CSV file.
# Create the target folder first: to_csv does not create missing directories and
# would otherwise fail on a fresh checkout that has no 'data/processed/' yet.
# NOTE(review): 'genres_list' appears to hold Python lists; CSV stores them as their
# string representation, so they will need re-parsing when this file is loaded.
processed_path = Path('../data/processed/tmdb_movies_processed.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)

print("Data processing complete! Your clean dataset has been saved to data/processed/")
df.info()

Data processing complete! Your clean dataset has been saved to data/processed/
<class 'pandas.core.frame.DataFrame'>
Index: 3229 entries, 0 to 4798
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             3229 non-null   float64       
 1   id                 3229 non-null   int64         
 2   original_language  3229 non-null   object        
 3   original_title     3229 non-null   object        
 4   popularity         3229 non-null   float64       
 5   release_date       3229 non-null   datetime64[ns]
 6   revenue            3229 non-null   float64       
 7   runtime            3229 non-null   float64       
 8   title              3229 non-null   object        
 9   vote_average       3229 non-null   float64       
 10  vote_count         3229 non-null   int64         
 11  genres_list        3229 non-null   object        
 12  director           3227 non-null   object   