In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Lift pandas' limits on displayed columns and on display width.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Load the cleaned data from its CSV file.
cleaned_csv_path = "tmdb_cleaned_data.csv"
df = pd.read_csv(cleaned_csv_path)

print("✓ Data loaded successfully!")
print("   Shape: {}".format(df.shape))

# Rich preview of the first five rows.
display(df.head(5))

#print(df.info())

✓ Data loaded successfully!
   Shape: (18, 20)


Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path,backdrop_path,origin_country
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,United States of America,26844,8.238,16.4257,181.0,After the devastating events of Avengers: Infi...,English|Japanese|Xhosa,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,['US']
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,United States of America|United Kingdom,32718,7.594,23.4542,162.0,"In the 22nd century, a paraplegic Marine is di...",English|Spanish,/gKY6q7SjCkAU6FqvqWybDYgUKIF.jpg,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,['US']
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd.|Bad Robot,United States of America,20021,7.255,10.9108,136.0,Thirty years after defeating the Galactic Empi...,English,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,/8BTsTfln4jlQrLXUBquXJ0ASQy9.jpg,['US']
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,United States of America,31061,8.2,32.9393,149.0,As the Avengers and their allies have continue...,English|Xhosa,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,['US']
4,597,Titanic,Nothing on earth could come between them.,1997-11-18,Drama|Romance,,en,200.0,2264.162353,Paramount Pictures|20th Century Fox|Lightstorm...,United States of America,26408,7.905,26.1517,194.0,101-year-old Rose DeWitt Bukater tells the sto...,English|French|German|Swedish|Italian|Russian,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,/tupgjqhWx5oieQrdyesO3aclUX9.jpg,['US']


In [45]:
# Profit (in millions USD) is revenue minus budget.
revenue_musd_col = df['revenue_musd']
budget_musd_col = df['budget_musd']
df['profit_musd'] = revenue_musd_col - budget_musd_col
print("Profit calculated")

# ROI (Return on Investment) = Revenue / Budget.
# Rows whose budget is not strictly positive get NaN, which avoids reporting
# the result of a division by zero.
df['ROI'] = (revenue_musd_col / budget_musd_col).where(budget_musd_col > 0)
print("ROI calculated")

print("\nSample of new columns:")
print(df[['title', 'budget_musd', 'revenue_musd', 'profit_musd', 'ROI']].head())

# Step 2: Create the UDF for ranking movies
print("\nStep 2: Creating ranking function (UDF)...")

def rank_movies(dataframe, 
                rank_by, 
                top_n=10, 
                ascending=False, 
                filter_condition=None,
                display_columns=None):
    """Return the top ``top_n`` rows of ``dataframe`` ranked by one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to rank. It is not modified.
    rank_by : str
        Column to sort by. Rows where this column is NaN are dropped first.
    top_n : int, default 10
        Maximum number of rows to return.
    ascending : bool, default False
        Sort direction; the default puts the largest values first.
    filter_condition : boolean mask, optional
        Applied as ``dataframe[filter_condition]`` before ranking.
    display_columns : str or iterable of str, optional
        Columns to keep in the result. Defaults to ``title`` and
        ``release_date``. ``rank_by`` is appended when it is missing, and
        names that are not columns of the frame are silently skipped.
        The object passed by the caller is never modified.

    Returns
    -------
    pandas.DataFrame
        The ranked rows, restricted to the requested columns, with a fresh
        0..n-1 index.
    """
    # Apply the optional row filter.
    if filter_condition is not None:
        candidates = dataframe[filter_condition]
    else:
        candidates = dataframe

    # Rows without a value in the ranking column cannot be ranked.
    candidates = candidates[candidates[rank_by].notna()]

    ranked = candidates.sort_values(by=rank_by, ascending=ascending).head(top_n)

    # Build a private column list: appending to the caller's list would leak
    # the extra column into later calls that reuse the same list, and would
    # fail outright for tuples or a single column name.
    if display_columns is None:
        columns = ['title', 'release_date']
    elif isinstance(display_columns, str):
        columns = [display_columns]
    else:
        columns = list(display_columns)

    # The ranking column is always shown; checking first avoids a duplicate
    # column when it is already listed (e.g. rank_by='title').
    if rank_by not in columns:
        columns.append(rank_by)

    # Return only existing columns
    existing_cols = [col for col in columns if col in ranked.columns]

    return ranked[existing_cols].reset_index(drop=True)

print("Ranking function created successfully!")

Profit calculated
ROI calculated

Sample of new columns:
                          title  budget_musd  revenue_musd  profit_musd  \
0             Avengers: Endgame        356.0   2799.439100  2443.439100   
1                        Avatar        237.0   2923.706026  2686.706026   
2  Star Wars: The Force Awakens        245.0   2068.223624  1823.223624   
3        Avengers: Infinity War        300.0   2052.415039  1752.415039   
4                       Titanic        200.0   2264.162353  2064.162353   

         ROI  
0   7.863593  
1  12.336312  
2   8.441729  
3   6.841383  
4  11.320812  

Step 2: Creating ranking function (UDF)...
Ranking function created successfully!


In [46]:
# Columns we want to see in results
display_cols = [
    'title', 'release_date', 'budget_musd', 'revenue_musd',
    'profit_musd', 'ROI', 'vote_average', 'vote_count', 'popularity',
]

# Row filters shared by several rankings below.
budget_at_least_10m = df['budget_musd'] >= 10
voted_at_least_10_times = df['vote_count'] >= 10

# Column layouts shared by paired "highest"/"lowest" rankings. Each layout
# already contains its ranking column.
profit_view = ['title', 'release_date', 'profit_musd', 'revenue_musd', 'budget_musd']
roi_view = ['title', 'release_date', 'ROI', 'revenue_musd', 'budget_musd']
rating_view = ['title', 'release_date', 'vote_average', 'vote_count']

# 1. HIGHEST REVENUE
print("TOP 10 HIGHEST REVENUE MOVIES")
highest_revenue = rank_movies(
    df, 'revenue_musd', top_n=10, ascending=False,
    display_columns=['title', 'release_date', 'revenue_musd', 'budget_musd'],
)
display(highest_revenue)

# 2. HIGHEST BUDGET
print("TOP 10 HIGHEST BUDGET MOVIES")
highest_budget = rank_movies(
    df, 'budget_musd', top_n=10, ascending=False,
    display_columns=['title', 'release_date', 'budget_musd', 'revenue_musd'],
)
display(highest_budget)

# 3. HIGHEST PROFIT
print("TOP 10 HIGHEST PROFIT MOVIES")
highest_profit = rank_movies(
    df, 'profit_musd', top_n=10, ascending=False,
    display_columns=profit_view,
)
display(highest_profit)

# 4. LOWEST PROFIT
print("TOP 10 LOWEST PROFIT MOVIES (Biggest Losses)")
lowest_profit = rank_movies(
    df, 'profit_musd', top_n=10, ascending=True,
    display_columns=profit_view,
)
display(lowest_profit)

# 5. HIGHEST ROI (Budget >= 10M)
print("TOP 10 HIGHEST ROI MOVIES (Budget ≥ $10M)")
highest_roi = rank_movies(
    df, 'ROI', top_n=10, ascending=False,
    filter_condition=budget_at_least_10m,
    display_columns=roi_view,
)
display(highest_roi)

# 6. LOWEST ROI (Budget >= 10M)
print("TOP 10 LOWEST ROI MOVIES (Budget ≥ $10M)")
lowest_roi = rank_movies(
    df, 'ROI', top_n=10, ascending=True,
    filter_condition=budget_at_least_10m,
    display_columns=roi_view,
)
display(lowest_roi)

# 7. MOST VOTED MOVIES
print("TOP 10 MOST VOTED MOVIES")
most_voted = rank_movies(
    df, 'vote_count', top_n=10, ascending=False,
    display_columns=['title', 'release_date', 'vote_count', 'vote_average'],
)
display(most_voted)

# 8. HIGHEST RATED MOVIES (vote_count >= 10)
print("TOP 10 HIGHEST RATED MOVIES (Vote Count ≥ 10)")
highest_rated = rank_movies(
    df, 'vote_average', top_n=10, ascending=False,
    filter_condition=voted_at_least_10_times,
    display_columns=rating_view,
)
display(highest_rated)

# 9. LOWEST RATED MOVIES (vote_count >= 10)
print("TOP 10 LOWEST RATED MOVIES (Vote Count ≥ 10)")
lowest_rated = rank_movies(
    df, 'vote_average', top_n=10, ascending=True,
    filter_condition=voted_at_least_10_times,
    display_columns=rating_view,
)
display(lowest_rated)

# 10. MOST POPULAR MOVIES
print("TOP 10 MOST POPULAR MOVIES")
most_popular = rank_movies(
    df, 'popularity', top_n=10, ascending=False,
    display_columns=['title', 'release_date', 'popularity', 'vote_average'],
)
display(most_popular)

TOP 10 HIGHEST REVENUE MOVIES


Unnamed: 0,title,release_date,revenue_musd,budget_musd
0,Avatar,2009-12-15,2923.706026,237.0
1,Avengers: Endgame,2019-04-24,2799.4391,356.0
2,Titanic,1997-11-18,2264.162353,200.0
3,Star Wars: The Force Awakens,2015-12-15,2068.223624,245.0
4,Avengers: Infinity War,2018-04-25,2052.415039,300.0
5,Jurassic World,2015-06-06,1671.537444,150.0
6,The Lion King,2019-07-12,1662.020819,260.0
7,The Avengers,2012-04-25,1518.815515,220.0
8,Furious 7,2015-04-01,1515.4,190.0
9,Frozen II,2019-11-20,1453.683476,150.0


TOP 10 HIGHEST BUDGET MOVIES


Unnamed: 0,title,release_date,budget_musd,revenue_musd
0,Avengers: Age of Ultron,2015-04-22,365.0,1405.403694
1,Avengers: Endgame,2019-04-24,356.0,2799.4391
2,Star Wars: The Last Jedi,2017-12-13,300.0,1332.69883
3,Avengers: Infinity War,2018-04-25,300.0,2052.415039
4,The Lion King,2019-07-12,260.0,1662.020819
5,Star Wars: The Force Awakens,2015-12-15,245.0,2068.223624
6,Avatar,2009-12-15,237.0,2923.706026
7,The Avengers,2012-04-25,220.0,1518.815515
8,Black Panther,2018-02-13,200.0,1349.926083
9,Titanic,1997-11-18,200.0,2264.162353


TOP 10 HIGHEST PROFIT MOVIES


Unnamed: 0,title,release_date,profit_musd,revenue_musd,budget_musd
0,Avatar,2009-12-15,2686.706026,2923.706026,237.0
1,Avengers: Endgame,2019-04-24,2443.4391,2799.4391,356.0
2,Titanic,1997-11-18,2064.162353,2264.162353,200.0
3,Star Wars: The Force Awakens,2015-12-15,1823.223624,2068.223624,245.0
4,Avengers: Infinity War,2018-04-25,1752.415039,2052.415039,300.0
5,Jurassic World,2015-06-06,1521.537444,1671.537444,150.0
6,The Lion King,2019-07-12,1402.020819,1662.020819,260.0
7,Furious 7,2015-04-01,1325.4,1515.4,190.0
8,Frozen II,2019-11-20,1303.683476,1453.683476,150.0
9,The Avengers,2012-04-25,1298.815515,1518.815515,220.0


TOP 10 LOWEST PROFIT MOVIES (Biggest Losses)


Unnamed: 0,title,release_date,profit_musd,revenue_musd,budget_musd
0,Star Wars: The Last Jedi,2017-12-13,1032.69883,1332.69883,300.0
1,Avengers: Age of Ultron,2015-04-22,1040.403694,1405.403694,365.0
2,Incredibles 2,2018-06-14,1043.225667,1243.225667,200.0
3,Beauty and the Beast,2017-03-16,1106.115964,1266.115964,160.0
4,Frozen,2013-11-20,1124.219009,1274.219009,150.0
5,Jurassic World: Fallen Kingdom,2018-06-06,1140.469037,1310.469037,170.0
6,Black Panther,2018-02-13,1149.926083,1349.926083,200.0
7,Harry Potter and the Deathly Hallows: Part 2,2011-07-12,1216.511219,1341.511219,125.0
8,The Avengers,2012-04-25,1298.815515,1518.815515,220.0
9,Frozen II,2019-11-20,1303.683476,1453.683476,150.0


TOP 10 HIGHEST ROI MOVIES (Budget ≥ $10M)


Unnamed: 0,title,release_date,ROI,revenue_musd,budget_musd
0,Avatar,2009-12-15,12.336312,2923.706026,237.0
1,Titanic,1997-11-18,11.320812,2264.162353,200.0
2,Jurassic World,2015-06-06,11.143583,1671.537444,150.0
3,Harry Potter and the Deathly Hallows: Part 2,2011-07-12,10.73209,1341.511219,125.0
4,Frozen II,2019-11-20,9.691223,1453.683476,150.0
5,Frozen,2013-11-20,8.494793,1274.219009,150.0
6,Star Wars: The Force Awakens,2015-12-15,8.441729,2068.223624,245.0
7,Furious 7,2015-04-01,7.975789,1515.4,190.0
8,Beauty and the Beast,2017-03-16,7.913225,1266.115964,160.0
9,Avengers: Endgame,2019-04-24,7.863593,2799.4391,356.0


TOP 10 LOWEST ROI MOVIES (Budget ≥ $10M)


Unnamed: 0,title,release_date,ROI,revenue_musd,budget_musd
0,Avengers: Age of Ultron,2015-04-22,3.850421,1405.403694,365.0
1,Star Wars: The Last Jedi,2017-12-13,4.442329,1332.69883,300.0
2,Incredibles 2,2018-06-14,6.216128,1243.225667,200.0
3,The Lion King,2019-07-12,6.392388,1662.020819,260.0
4,Black Panther,2018-02-13,6.74963,1349.926083,200.0
5,Avengers: Infinity War,2018-04-25,6.841383,2052.415039,300.0
6,The Avengers,2012-04-25,6.903707,1518.815515,220.0
7,Jurassic World: Fallen Kingdom,2018-06-06,7.708641,1310.469037,170.0
8,Avengers: Endgame,2019-04-24,7.863593,2799.4391,356.0
9,Beauty and the Beast,2017-03-16,7.913225,1266.115964,160.0


TOP 10 MOST VOTED MOVIES


Unnamed: 0,title,release_date,vote_count,vote_average
0,The Avengers,2012-04-25,33583,7.832
1,Avatar,2009-12-15,32718,7.594
2,Avengers: Infinity War,2018-04-25,31061,8.2
3,Avengers: Endgame,2019-04-24,26844,8.238
4,Titanic,1997-11-18,26408,7.905
5,Avengers: Age of Ultron,2015-04-22,23792,7.3
6,Black Panther,2018-02-13,22887,7.368
7,Harry Potter and the Deathly Hallows: Part 2,2011-07-12,21360,8.1
8,Jurassic World,2015-06-06,21065,6.699
9,Star Wars: The Force Awakens,2015-12-15,20021,7.255


TOP 10 HIGHEST RATED MOVIES (Vote Count ≥ 10)


Unnamed: 0,title,release_date,vote_average,vote_count
0,Avengers: Endgame,2019-04-24,8.238,26844
1,Avengers: Infinity War,2018-04-25,8.2,31061
2,Harry Potter and the Deathly Hallows: Part 2,2011-07-12,8.1,21360
3,Titanic,1997-11-18,7.905,26408
4,The Avengers,2012-04-25,7.832,33583
5,Avatar,2009-12-15,7.594,32718
6,Incredibles 2,2018-06-14,7.5,13312
7,Black Panther,2018-02-13,7.368,22887
8,Avengers: Age of Ultron,2015-04-22,7.3,23792
9,Star Wars: The Force Awakens,2015-12-15,7.255,20021


TOP 10 LOWEST RATED MOVIES (Vote Count ≥ 10)


Unnamed: 0,title,release_date,vote_average,vote_count
0,Jurassic World: Fallen Kingdom,2018-06-06,6.539,12358
1,Jurassic World,2015-06-06,6.699,21065
2,Star Wars: The Last Jedi,2017-12-13,6.766,15857
3,Beauty and the Beast,2017-03-16,6.969,15798
4,The Lion King,2019-07-12,7.1,10519
5,Furious 7,2015-04-01,7.224,10990
6,Frozen II,2019-11-20,7.243,10044
7,Frozen,2013-11-20,7.248,17125
8,Star Wars: The Force Awakens,2015-12-15,7.255,20021
9,Avengers: Age of Ultron,2015-04-22,7.3,23792


TOP 10 MOST POPULAR MOVIES


Unnamed: 0,title,release_date,popularity,vote_average
0,The Avengers,2012-04-25,47.9587,7.832
1,Avengers: Infinity War,2018-04-25,32.9393,8.2
2,Titanic,1997-11-18,26.1517,7.905
3,Avatar,2009-12-15,23.4542,7.594
4,Frozen,2013-11-20,21.9087,7.248
5,Incredibles 2,2018-06-14,19.495,7.5
6,Harry Potter and the Deathly Hallows: Part 2,2011-07-12,17.4473,8.1
7,Avengers: Endgame,2019-04-24,16.4257,8.238
8,Avengers: Age of Ultron,2015-04-22,15.5884,7.3
9,Black Panther,2018-02-13,12.8512,7.368


### Fetch Cast and Crew Data

In [47]:
print("This will add: 'cast', 'cast_size', 'director', 'crew_size' columns\n")

# Load environment variables
from dotenv import load_dotenv
import os
import requests
import time

# Populate the environment via python-dotenv, then read the TMDb key from it.
load_dotenv()
API_KEY = os.getenv("TMDB_API_KEY")
if not API_KEY:
    # Surface the misconfiguration once, up front, instead of only as one
    # failure line per movie in the fetch loop. The key itself is never printed.
    print("Warning: TMDB_API_KEY is not set; credits requests are expected to fail.")

def fetch_credits(movie_id, timeout=10):
    """Fetch the credits (cast and crew) payload for one movie from TMDb.

    Parameters
    ----------
    movie_id : int
        Movie identifier interpolated into the request URL.
    timeout : float, default 10
        Seconds to wait for the server. Without a timeout a stalled
        connection would block the calling loop indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON body when the response status is 200. ``None`` on
        any other status code, on a request error (including a timeout) or
        when the body is not valid JSON. Failures are printed, not raised.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits"
    params = {'api_key': API_KEY}

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching credits for movie_id {movie_id}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch credits for movie_id {movie_id}: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as e:
        # A 200 response whose body cannot be decoded as JSON.
        print(f"Error fetching credits for movie_id {movie_id}: {e}")
        return None

def extract_cast_info(credits_data):
    """Summarise the ``cast`` section of a credits payload.

    Returns a ``(cast_string, cast_size)`` pair. ``cast_string`` is the
    ``|``-joined names of the cast entries that have a ``name`` key (``None``
    when there are none), and ``cast_size`` counts every cast entry, named
    or not. Missing or empty input yields ``(None, 0)``.
    """
    nothing = (None, 0)

    if not credits_data:
        return nothing
    if 'cast' not in credits_data:
        return nothing

    members = credits_data['cast']
    if len(members) == 0:
        return nothing

    # Collect names in payload order, skipping entries without one.
    names = []
    for member in members:
        if 'name' in member:
            names.append(member['name'])

    joined = '|'.join(names) if names else None
    return joined, len(members)

def extract_director_info(credits_data):
    """Extract director name(s) and crew size from a credits payload.

    Returns a ``(director_string, crew_size)`` pair. ``director_string`` is
    the ``|``-joined names of crew entries whose ``job`` is ``'Director'``
    (``None`` when there are none), and ``crew_size`` counts every crew
    entry. Missing or empty input yields ``(None, 0)``.
    """
    if not credits_data or 'crew' not in credits_data:
        return None, 0

    crew_list = credits_data['crew']
    crew_size = len(crew_list)

    # Director entries with a missing or empty name are skipped rather than
    # raising, in line with how extract_cast_info treats unnamed cast entries.
    directors = [person['name'] for person in crew_list
                 if person.get('job') == 'Director' and person.get('name')]

    director_string = '|'.join(directors) if directors else None

    return director_string, crew_size

# Fetch credits for all movies
print("Fetching cast and crew data from TMDb API...")
print(f"Total movies to fetch: {len(df)}\n")

# One entry per movie, collected in the same order as the rows of df.
cast_data, cast_sizes = [], []
director_data, crew_sizes = [], []

for raw_id, movie_title in zip(df['id'], df['title']):
    movie_id = int(raw_id)
    print(f"Fetching credits for: {movie_title} (ID: {movie_id})...")

    # A failed fetch returns None, which both extractors turn into (None, 0).
    credits = fetch_credits(movie_id)
    cast_string, cast_size = extract_cast_info(credits)
    director_string, crew_size = extract_director_info(credits)

    cast_data.append(cast_string)
    cast_sizes.append(cast_size)
    director_data.append(director_string)
    crew_sizes.append(crew_size)

    time.sleep(0.25)  # Respect API rate limits

# Add new columns to dataframe (insertion order fixes the CSV column order).
new_columns = {
    'cast': cast_data,
    'cast_size': cast_sizes,
    'director': director_data,
    'crew_size': crew_sizes,
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values

print("\nCast and crew data fetched successfully!")
print("\nSample of new columns:")
print(df[['title', 'cast', 'director', 'cast_size', 'crew_size']].head())

# Save updated dataframe
df.to_csv("tmdb_cleaned_data_with_credits.csv", index=False)
print("   ✓ Saved to 'tmdb_cleaned_data_with_credits.csv'")

This will add: 'cast', 'cast_size', 'director', 'crew_size' columns

Fetching cast and crew data from TMDb API...
Total movies to fetch: 18

Fetching credits for: Avengers: Endgame (ID: 299534)...
Fetching credits for: Avatar (ID: 19995)...
Fetching credits for: Star Wars: The Force Awakens (ID: 140607)...
Fetching credits for: Avengers: Infinity War (ID: 299536)...
Fetching credits for: Titanic (ID: 597)...
Fetching credits for: Jurassic World (ID: 135397)...
Fetching credits for: The Lion King (ID: 420818)...
Fetching credits for: The Avengers (ID: 24428)...
Fetching credits for: Furious 7 (ID: 168259)...
Fetching credits for: Avengers: Age of Ultron (ID: 99861)...
Fetching credits for: Black Panther (ID: 284054)...
Fetching credits for: Harry Potter and the Deathly Hallows: Part 2 (ID: 12445)...
Fetching credits for: Star Wars: The Last Jedi (ID: 181808)...
Fetching credits for: Frozen II (ID: 330457)...
Fetching credits for: Jurassic World: Fallen Kingdom (ID: 351286)...
Fetching c

In [48]:
#df.head()

In [49]:
'''print("INVESTIGATING: Actors and Directors in Our Dataset")

# Check all unique actors 
print("\nChecking cast data...")
print(f"Movies with cast data: {df['cast'].notna().sum()} / {len(df)}")

# Let's see some sample cast members
print("\nSample of cast from first few movies:")
for idx in range(min(5, len(df))):
    title = df.iloc[idx]['title']
    cast = df.iloc[idx]['cast']
    if pd.notna(cast):
        cast_list = cast.split('|')[:5]  # Show first 5 actors
        print(f"   - {title}: {', '.join(cast_list)}")

# Check all unique directors
print("\nChecking director data...")
print(f"Movies with director data: {df['director'].notna().sum()} / {len(df)}")

print("\nAll directors in dataset:")
all_directors = df['director'].dropna().unique()
for director in sorted(all_directors):
    movies_count = (df['director'] == director).sum()
    print(f"   - {director} ({movies_count} movie(s))")

# Let's check if we have any famous actors
print("\nSearching for some famous actors...")
famous_actors = ['Bruce Willis', 'Uma Thurman', 'Leonardo DiCaprio', 'Tom Cruise', 
                 'Robert Downey Jr.', 'Chris Evans', 'Scarlett Johansson', 
                 'Samuel L. Jackson', 'Chris Hemsworth', 'Chris Pratt']

for actor in famous_actors:
    has_actor_movies = df[df['cast'].apply(lambda x: has_actor(x, actor) if pd.notna(x) else False)]
    if len(has_actor_movies) > 0:
        print(f"{actor}: {len(has_actor_movies)} movie(s)")
        for _, movie in has_actor_movies.iterrows():
            print(f"- {movie['title']}")

# Check what genres we have
print("\nChecking available genres...")
all_genres = set()
for genres in df['genres'].dropna():
    genre_list = genres.split('|')
    all_genres.update(genre_list)

print(f"Available genres: {sorted(all_genres)}")

# Check genre combinations
print("\nMovies with 'Science Fiction' AND 'Action':")
sci_fi_action = df[df['genres'].apply(lambda x: has_genres(x, ['Science Fiction', 'Action']) if pd.notna(x) else False)]
if len(sci_fi_action) > 0:
    display(sci_fi_action[['title', 'genres', 'vote_average']])
else:
    print("   No movies with both Science Fiction and Action genres.")'''

'print("INVESTIGATING: Actors and Directors in Our Dataset")\n\n# Check all unique actors \nprint("\nChecking cast data...")\nprint(f"Movies with cast data: {df[\'cast\'].notna().sum()} / {len(df)}")\n\n# Let\'s see some sample cast members\nprint("\nSample of cast from first few movies:")\nfor idx in range(min(5, len(df))):\n    title = df.iloc[idx][\'title\']\n    cast = df.iloc[idx][\'cast\']\n    if pd.notna(cast):\n        cast_list = cast.split(\'|\')[:5]  # Show first 5 actors\n        print(f"   - {title}: {\', \'.join(cast_list)}")\n\n# Check all unique directors\nprint("\nChecking director data...")\nprint(f"Movies with director data: {df[\'director\'].notna().sum()} / {len(df)}")\n\nprint("\nAll directors in dataset:")\nall_directors = df[\'director\'].dropna().unique()\nfor director in sorted(all_directors):\n    movies_count = (df[\'director\'] == director).sum()\n    print(f"   - {director} ({movies_count} movie(s))")\n\n# Let\'s check if we have any famous actors\nprint(

### Advanced Movie Filtering

In [50]:
# Helper function to check if a person is in cast
def has_actor(cast_string, actor_name):
    """Return True when ``actor_name`` occurs in ``cast_string``.

    The check is a case-insensitive substring match. A missing cast value
    (``None`` or NaN) never matches.
    """
    if cast_string is None or pd.isna(cast_string):
        return False
    wanted = actor_name.lower()
    return wanted in cast_string.lower()

# Helper function to check if a person is the director
def has_director(director_string, director_name):
    """Return True when ``director_name`` occurs in ``director_string``.

    The check is a case-insensitive substring match. A missing director
    value (``None`` or NaN) never matches.
    """
    missing = director_string is None or pd.isna(director_string)
    if missing:
        return False
    needle = director_name.lower()
    haystack = director_string.lower()
    return needle in haystack

# Helper function to check if movie has specific genres
def has_genres(genre_string, required_genres):
    """Return True when every entry of ``required_genres`` occurs in ``genre_string``.

    Each genre is checked with a case-insensitive substring match. A missing
    genre value (``None`` or NaN) never matches, and an empty
    ``required_genres`` is satisfied by any present string.
    """
    if genre_string is None or pd.isna(genre_string):
        return False

    available = genre_string.lower()
    for wanted in required_genres:
        if wanted.lower() not in available:
            return False
    return True

# SEARCH 1: Best-rated Sci-Fi Action movies starring Bruce Willis
print("Best-rated Sci-Fi Action movies starring Bruce Willis")
# Filter criteria:
# 1. Has Bruce Willis in cast
# 2. Has both "Science Fiction" and "Action" in genres
# 3. Sort by vote_average (highest to lowest)

# Each mask is computed once and reused by the fallback diagnostics below.
stars_bruce_willis = df['cast'].apply(lambda x: has_actor(x, 'Bruce Willis'))
is_scifi_action = df['genres'].apply(lambda x: has_genres(x, ['Science Fiction', 'Action']))

search1_results = df[stars_bruce_willis & is_scifi_action].copy()

# Sort by rating (highest to lowest)
search1_results = search1_results.sort_values('vote_average', ascending=False)

print(f"\nFound {len(search1_results)} movies matching criteria:")
if len(search1_results) > 0:
    display(search1_results[['title', 'genres', 'vote_average', 'vote_count', 
                              'release_date', 'director']].reset_index(drop=True))
else:
    print("No movies found matching this criteria.")

    # Diagnostic: show whether the actor appears in the dataset at all.
    bruce_willis_movies = df[stars_bruce_willis]
    if len(bruce_willis_movies) > 0:
        display(bruce_willis_movies[['title', 'genres', 'vote_average']].reset_index(drop=True))
    else:
        print("No Bruce Willis movies in dataset.")

# SEARCH 2: Movies starring Uma Thurman, directed by Quentin Tarantino
# The heading was missing, so this search's output could not be told apart
# from Search 1's.
print("\nMovies starring Uma Thurman, directed by Quentin Tarantino")
# Filter criteria:
# 1. Has Uma Thurman in cast
# 2. Directed by Quentin Tarantino
# 3. Sort by runtime (shortest to longest)

stars_uma_thurman = df['cast'].apply(lambda x: has_actor(x, 'Uma Thurman'))
directed_by_tarantino = df['director'].apply(lambda x: has_director(x, 'Quentin Tarantino'))

search2_results = df[stars_uma_thurman & directed_by_tarantino].copy()

# Sort by runtime (shortest to longest)
search2_results = search2_results.sort_values('runtime', ascending=True)

print(f"\nFound {len(search2_results)} movies matching criteria:")
if len(search2_results) > 0:
    display(search2_results[['title', 'director', 'runtime', 'vote_average', 
                              'release_date', 'genres']].reset_index(drop=True))
else:
    print("No movies found matching this criteria.")

    # Diagnostics: report each criterion on its own.
    uma_movies = df[stars_uma_thurman]
    if len(uma_movies) > 0:
        display(uma_movies[['title', 'director', 'runtime']].reset_index(drop=True))
    else:
        print("No Uma Thurman movies in dataset.")

    tarantino_movies = df[directed_by_tarantino]
    if len(tarantino_movies) > 0:
        display(tarantino_movies[['title', 'director', 'runtime']].reset_index(drop=True))
    else:
        print("No Quentin Tarantino movies in dataset.")

Best-rated Sci-Fi Action movies starring Bruce Willis

Found 0 movies matching criteria:
No movies found matching this criteria.
No Bruce Willis movies in dataset.

Found 0 movies matching criteria:
No movies found matching this criteria.
No Uma Thurman movies in dataset.
No Quentin Tarantino movies in dataset.


### Franchise vs Standalone

In [51]:
# Separate franchise and standalone movies
print("\nSeparating Franchise and Standalone movies")

# A movie counts as a franchise movie when belongs_to_collection is not null;
# every other movie is standalone.
in_collection = df['belongs_to_collection'].notna()
franchise_movies = df[in_collection].copy()
standalone_movies = df[~in_collection].copy()

print(f" Franchise movies: {len(franchise_movies)}")
print(f" Standalone movies: {len(standalone_movies)}")
print(f" Total movies: {len(df)}")


def _summarize_group(movies):
    """Return the comparison metrics for one group, in the order of the 'Metric' labels."""
    return [
        len(movies),
        movies['revenue_musd'].mean(),
        movies['ROI'].median(),
        movies['budget_musd'].mean(),
        movies['popularity'].mean(),
        movies['vote_average'].mean(),
    ]


# Create comparison dataframe
comparison_data = {
    'Metric': [
        'Number of Movies',
        'Mean Revenue (M USD)',
        'Median ROI',
        'Mean Budget (M USD)',
        'Mean Popularity',
        'Mean Rating'
    ],
    'Franchise Movies': _summarize_group(franchise_movies),
    'Standalone Movies': _summarize_group(standalone_movies),
}

comparison_df = pd.DataFrame(comparison_data)

# Calculate difference/ratio
comparison_df['Difference'] = comparison_df['Franchise Movies'] - comparison_df['Standalone Movies']
comparison_df['Franchise/Standalone Ratio'] = comparison_df['Franchise Movies'] / comparison_df['Standalone Movies']

display(comparison_df)

# Detailed analysis. Values are looked up by metric label rather than by row
# position, so reordering or extending the metric list cannot silently make a
# section report the wrong number.
metric_rows = comparison_df.set_index('Metric')


def _metric_values(metric):
    """Return (franchise value, standalone value, franchise/standalone ratio) for one metric."""
    row = metric_rows.loc[metric]
    return (row['Franchise Movies'],
            row['Standalone Movies'],
            row['Franchise/Standalone Ratio'])


def _print_verdict(franchise_value, standalone_value, franchise_msg, standalone_msg):
    """Print which group leads, or say so when the comparison is undefined."""
    # An empty group has NaN statistics; NaN compares False against
    # everything, which would otherwise be reported as a standalone "win".
    if pd.isna(franchise_value) or pd.isna(standalone_value):
        print("   Not enough data to compare.")
    elif franchise_value > standalone_value:
        print(franchise_msg)
    else:
        print(standalone_msg)


franchise_revenue, standalone_revenue, revenue_ratio = _metric_values('Mean Revenue (M USD)')
print("\nREVENUE COMPARISON:")
print(f"   Franchise movies - Mean Revenue: ${franchise_revenue:.2f}M")
print(f"   Standalone movies - Mean Revenue: ${standalone_revenue:.2f}M")
_print_verdict(
    franchise_revenue, standalone_revenue,
    f"    Franchise movies generate {revenue_ratio:.2f}x more revenue on average!",
    "    Standalone movies generate more revenue on average!",
)

franchise_roi, standalone_roi, roi_ratio = _metric_values('Median ROI')
print("\n ROI COMPARISON:")
print(f"   Franchise movies - Median ROI: {franchise_roi:.2f}x")
print(f"   Standalone movies - Median ROI: {standalone_roi:.2f}x")
_print_verdict(
    franchise_roi, standalone_roi,
    f"   Franchise movies have {roi_ratio:.2f}x better ROI!",
    "   Standalone movies have better ROI!",
)

franchise_budget, standalone_budget, budget_ratio = _metric_values('Mean Budget (M USD)')
print("\nBUDGET COMPARISON:")
print(f"   Franchise movies - Mean Budget: ${franchise_budget:.2f}M")
print(f"   Standalone movies - Mean Budget: ${standalone_budget:.2f}M")
_print_verdict(
    franchise_budget, standalone_budget,
    f"   Franchise movies have {budget_ratio:.2f}x higher budgets on average!",
    "   Standalone movies have higher budgets on average!",
)

franchise_popularity, standalone_popularity, _ = _metric_values('Mean Popularity')
print("\n POPULARITY COMPARISON:")
print(f"   Franchise movies - Mean Popularity: {franchise_popularity:.2f}")
print(f"   Standalone movies - Mean Popularity: {standalone_popularity:.2f}")

franchise_rating, standalone_rating, _ = _metric_values('Mean Rating')
print("\n RATING COMPARISON:")
print(f"   Franchise movies - Mean Rating: {franchise_rating:.2f}/10")
print(f"   Standalone movies - Mean Rating: {standalone_rating:.2f}/10")

# Show sample franchise and standalone movies
print("\n Franchise Movies")
display(franchise_movies[['title', 'belongs_to_collection', 'revenue_musd', 
                          'budget_musd', 'vote_average']].head())
print("\n Standalone Movies")
display(standalone_movies[['title', 'revenue_musd', 'budget_musd', 
                           'vote_average']].head())


Separating Franchise and Standalone movies
 Franchise movies: 16
 Standalone movies: 2
 Total movies: 18


Unnamed: 0,Metric,Franchise Movies,Standalone Movies,Difference,Franchise/Standalone Ratio
0,Number of Movies,16.0,2.0,14.0,8.0
1,Mean Revenue (M USD),1682.668411,1765.139159,-82.470747,0.953278
2,Median ROI,7.786117,9.617018,-1.830901,0.809619
3,Mean Budget (M USD),226.125,180.0,46.125,1.25625
4,Mean Popularity,17.978088,18.37275,-0.394662,0.978519
5,Mean Rating,7.387875,7.437,-0.049125,0.993395



1️REVENUE COMPARISON:
   Franchise movies - Mean Revenue: $1682.67M
   Standalone movies - Mean Revenue: $1765.14M
    Standalone movies generate more revenue on average!

 ROI COMPARISON:
   Franchise movies - Median ROI: 7.79x
   Standalone movies - Median ROI: 9.62x
   Standalone movies have better ROI!

BUDGET COMPARISON:
   Franchise movies - Mean Budget: $226.12M
   Standalone movies - Mean Budget: $180.00M
   Franchise movies have 1.26x higher budgets on average!

 POPULARITY COMPARISON:
   Franchise movies - Mean Popularity: 17.98
   Standalone movies - Mean Popularity: 18.37

 RATING COMPARISON:
   Franchise movies - Mean Rating: 7.39/10
   Standalone movies - Mean Rating: 7.44/10

 Franchise Movies


Unnamed: 0,title,belongs_to_collection,revenue_musd,budget_musd,vote_average
0,Avengers: Endgame,The Avengers Collection,2799.4391,356.0,8.238
1,Avatar,Avatar Collection,2923.706026,237.0,7.594
2,Star Wars: The Force Awakens,Star Wars Collection,2068.223624,245.0,7.255
3,Avengers: Infinity War,The Avengers Collection,2052.415039,300.0,8.2
5,Jurassic World,Jurassic Park Collection,1671.537444,150.0,6.699



 Standalone Movies


Unnamed: 0,title,revenue_musd,budget_musd,vote_average
4,Titanic,2264.162353,200.0,7.905
16,Beauty and the Beast,1266.115964,160.0,6.969


### Most Successful Franchises and Directors

In [52]:
def _show_top5(table: pd.DataFrame, sort_by: str, columns: list, heading: str) -> pd.DataFrame:
    """Print `heading`, display the five rows of `table` with the largest
    `sort_by` values (restricted to `columns`), and return that slice so it
    can be kept under its own name.
    """
    print(heading)
    top_rows = table.nlargest(5, sort_by)[columns]
    display(top_rows)
    return top_rows


# PART A: TOP FRANCHISES
# Keep only movies that belong to a collection (franchise movies)
franchise_df = df[df['belongs_to_collection'].notna()].copy()

if len(franchise_df) > 0:
    # One row per franchise: movie count, total/mean budget, total/mean
    # revenue and mean rating, rounded to two decimals.
    franchise_status = franchise_df.groupby('belongs_to_collection').agg({
        'id': 'count',
        'budget_musd': ['sum', 'mean'],
        'revenue_musd': ['sum', 'mean'],
        'vote_average': 'mean'
    }).round(2)

    # agg() yields two-level column labels; replace them with flat, readable
    # names listed in the same order as the aggregation spec above.
    franchise_status.columns = [
        'Number of Movies',
        'Total Budget (M USD)',
        'Mean Budget (M USD)',
        'Total Revenue (M USD)',
        'Mean Revenue (M USD)',
        'Mean Rating'
    ]

    # Rank by total revenue and turn the group key into a regular column
    franchise_status = (
        franchise_status
        .sort_values('Total Revenue (M USD)', ascending=False)
        .reset_index()
        .rename(columns={'belongs_to_collection': 'Franchise'})
    )

    display(franchise_status)

    top_by_count = _show_top5(
        franchise_status, 'Number of Movies',
        ['Franchise', 'Number of Movies'],
        "\nTop 5 Franchises by Number of Movies:")

    top_by_budget = _show_top5(
        franchise_status, 'Total Budget (M USD)',
        ['Franchise', 'Total Budget (M USD)', 'Number of Movies'],
        "\nTop 5 Franchises by Total Budget:")

    top_by_mean_budget = _show_top5(
        franchise_status, 'Mean Budget (M USD)',
        ['Franchise', 'Mean Budget (M USD)', 'Number of Movies'],
        "\nTop 5 Franchises by Mean Budget per Movie:")

    top_by_revenue = _show_top5(
        franchise_status, 'Total Revenue (M USD)',
        ['Franchise', 'Total Revenue (M USD)', 'Mean Revenue (M USD)', 'Number of Movies'],
        "\nTop 5 Franchises by Total Revenue:")

    top_by_mean_revenue = _show_top5(
        franchise_status, 'Mean Revenue (M USD)',
        ['Franchise', 'Mean Revenue (M USD)', 'Number of Movies'],
        "\nTop 5 Franchises by Mean Revenue per Movie:")

    top_by_rating = _show_top5(
        franchise_status, 'Mean Rating',
        ['Franchise', 'Mean Rating', 'Number of Movies'],
        "\nTop 5 Franchises by Mean Rating:")
else:
    print("  No franchise movies found in dataset.")



# PART B: TOP DIRECTORS
print("PART B: MOST SUCCESSFUL DIRECTORS")
director_df = df[df['director'].notna()].copy()

if len(director_df) > 0:
    # A movie can list several directors separated by '|'. Give every
    # director their own row so a co-directed movie is credited in full to
    # each of them. split + explode does this in one vectorised step and
    # keeps the original column dtypes (a single-character pattern is
    # treated as a literal by str.split, not as a regex).
    director_expanded_df = director_df.assign(
        director=director_df['director'].str.split('|')
    ).explode('director')
    director_expanded_df['director'] = director_expanded_df['director'].str.strip()

    # One row per director: movie count, total/mean revenue and mean rating
    director_status = director_expanded_df.groupby('director').agg({
        'id': 'count',  # Number of movies
        'revenue_musd': ['sum', 'mean'],  # Total & Mean Revenue
        'vote_average': 'mean'  # Mean Rating
    }).round(2)

    # Flatten the two-level agg() columns (same order as the spec above)
    director_status.columns = [
        'Number of Movies',
        'Total Revenue (M USD)',
        'Mean Revenue (M USD)',
        'Mean Rating'
    ]

    # Sort by total revenue (descending) and expose the director as a column
    director_status = (
        director_status
        .sort_values('Total Revenue (M USD)', ascending=False)
        .reset_index()
    )
    display(director_status)

    top_directors_by_count = _show_top5(
        director_status, 'Number of Movies',
        ['director', 'Number of Movies'],
        "\nTop 5 Directors by Number of Movies:")

    top_directors_by_revenue = _show_top5(
        director_status, 'Total Revenue (M USD)',
        ['director', 'Total Revenue (M USD)', 'Number of Movies'],
        "\nTop 5 Directors by Total Revenue:")

    top_directors_by_rating = _show_top5(
        director_status, 'Mean Rating',
        ['director', 'Mean Rating', 'Number of Movies'],
        "\nTop 5 Directors by Mean Rating:")

    top_directors_by_mean_revenue = _show_top5(
        director_status, 'Mean Revenue (M USD)',
        ['director', 'Mean Revenue (M USD)', 'Number of Movies'],
        "\nTop 5 Directors by Mean Revenue per Movie:")
else:
    print(" No director data found in dataset.")

Unnamed: 0,Franchise,Number of Movies,Total Budget (M USD),Mean Budget (M USD),Total Revenue (M USD),Mean Revenue (M USD),Mean Rating
0,The Avengers Collection,4,1241.0,310.25,7776.07,1944.02,7.89
1,Star Wars Collection,2,545.0,272.5,3400.92,1700.46,7.01
2,Jurassic Park Collection,2,320.0,160.0,2982.01,1491.0,6.62
3,Avatar Collection,1,237.0,237.0,2923.71,2923.71,7.59
4,Frozen Collection,2,300.0,150.0,2727.9,1363.95,7.25
5,The Lion King (Reboot) Collection,1,260.0,260.0,1662.02,1662.02,7.1
6,The Fast and the Furious Collection,1,190.0,190.0,1515.4,1515.4,7.22
7,Black Panther Collection,1,200.0,200.0,1349.93,1349.93,7.37
8,Harry Potter Collection,1,125.0,125.0,1341.51,1341.51,8.1
9,The Incredibles Collection,1,200.0,200.0,1243.23,1243.23,7.5



Top 5 Franchises by Number of Movies:


Unnamed: 0,Franchise,Number of Movies
0,The Avengers Collection,4
1,Star Wars Collection,2
2,Jurassic Park Collection,2
4,Frozen Collection,2
3,Avatar Collection,1



Top 5 Franchises by Total Budget:


Unnamed: 0,Franchise,Total Budget (M USD),Number of Movies
0,The Avengers Collection,1241.0,4
1,Star Wars Collection,545.0,2
2,Jurassic Park Collection,320.0,2
4,Frozen Collection,300.0,2
5,The Lion King (Reboot) Collection,260.0,1



Top 5 Franchises by Mean Budget per Movie:


Unnamed: 0,Franchise,Mean Budget (M USD),Number of Movies
0,The Avengers Collection,310.25,4
1,Star Wars Collection,272.5,2
5,The Lion King (Reboot) Collection,260.0,1
3,Avatar Collection,237.0,1
7,Black Panther Collection,200.0,1



Top 5 Franchises by Total Revenue:


Unnamed: 0,Franchise,Total Revenue (M USD),Mean Revenue (M USD),Number of Movies
0,The Avengers Collection,7776.07,1944.02,4
1,Star Wars Collection,3400.92,1700.46,2
2,Jurassic Park Collection,2982.01,1491.0,2
3,Avatar Collection,2923.71,2923.71,1
4,Frozen Collection,2727.9,1363.95,2



Top 5 Franchises by Mean Revenue per Movie:


Unnamed: 0,Franchise,Mean Revenue (M USD),Number of Movies
3,Avatar Collection,2923.71,1
0,The Avengers Collection,1944.02,4
1,Star Wars Collection,1700.46,2
5,The Lion King (Reboot) Collection,1662.02,1
6,The Fast and the Furious Collection,1515.4,1



Top 5 Franchises by Mean Rating:


Unnamed: 0,Franchise,Mean Rating,Number of Movies
8,Harry Potter Collection,8.1,1
0,The Avengers Collection,7.89,4
3,Avatar Collection,7.59,1
9,The Incredibles Collection,7.5,1
7,Black Panther Collection,7.37,1


PART B: MOST SUCCESSFUL DIRECTORS


Unnamed: 0,director,Number of Movies,Total Revenue (M USD),Mean Revenue (M USD),Mean Rating
0,James Cameron,2,5187.87,2593.93,7.75
1,Anthony Russo,2,4851.85,2425.93,8.22
2,Joe Russo,2,4851.85,2425.93,8.22
3,Joss Whedon,2,2924.22,1462.11,7.57
4,Jennifer Lee,2,2727.9,1363.95,7.25
5,Chris Buck,2,2727.9,1363.95,7.25
6,J.J. Abrams,1,2068.22,2068.22,7.26
7,Colin Trevorrow,1,1671.54,1671.54,6.7
8,Jon Favreau,1,1662.02,1662.02,7.1
9,James Wan,1,1515.4,1515.4,7.22



Top 5 Directors by Number of Movies:


Unnamed: 0,director,Number of Movies
0,James Cameron,2
1,Anthony Russo,2
2,Joe Russo,2
3,Joss Whedon,2
4,Jennifer Lee,2



Top 5 Directors by Total Revenue:


Unnamed: 0,director,Total Revenue (M USD),Number of Movies
0,James Cameron,5187.87,2
1,Anthony Russo,4851.85,2
2,Joe Russo,4851.85,2
3,Joss Whedon,2924.22,2
4,Jennifer Lee,2727.9,2



Top 5 Directors by Mean Rating:


Unnamed: 0,director,Mean Rating,Number of Movies
1,Anthony Russo,8.22,2
2,Joe Russo,8.22,2
11,David Yates,8.1,1
0,James Cameron,7.75,2
3,Joss Whedon,7.57,2



Top 5 Directors by Mean Revenue per Movie:


Unnamed: 0,director,Mean Revenue (M USD),Number of Movies
0,James Cameron,2593.93,2
1,Anthony Russo,2425.93,2
2,Joe Russo,2425.93,2
6,J.J. Abrams,2068.22,1
7,Colin Trevorrow,1671.54,1


In [53]:
# Preview the first five rows of df. Besides the columns loaded from the CSV,
# the output also shows the derived columns present at this point:
# profit_musd, ROI, cast, cast_size, director and crew_size.
df.head()

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path,backdrop_path,origin_country,profit_musd,ROI,cast,cast_size,director,crew_size
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,United States of America,26844,8.238,16.4257,181.0,After the devastating events of Avengers: Infi...,English|Japanese|Xhosa,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,['US'],2443.4391,7.863593,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,105,Anthony Russo|Joe Russo,603
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,United States of America|United Kingdom,32718,7.594,23.4542,162.0,"In the 22nd century, a paraplegic Marine is di...",English|Spanish,/gKY6q7SjCkAU6FqvqWybDYgUKIF.jpg,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,['US'],2686.706026,12.336312,Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,65,James Cameron,990
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd.|Bad Robot,United States of America,20021,7.255,10.9108,136.0,Thirty years after defeating the Galactic Empi...,English,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,/8BTsTfln4jlQrLXUBquXJ0ASQy9.jpg,['US'],1823.223624,8.441729,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,183,J.J. Abrams,261
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,United States of America,31061,8.2,32.9393,149.0,As the Avengers and their allies have continue...,English|Xhosa,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,['US'],1752.415039,6.841383,Robert Downey Jr.|Chris Evans|Chris Hemsworth|...,69,Joe Russo|Anthony Russo,730
4,597,Titanic,Nothing on earth could come between them.,1997-11-18,Drama|Romance,,en,200.0,2264.162353,Paramount Pictures|20th Century Fox|Lightstorm...,United States of America,26408,7.905,26.1517,194.0,101-year-old Rose DeWitt Bukater tells the sto...,English|French|German|Swedish|Italian|Russian,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,/tupgjqhWx5oieQrdyesO3aclUX9.jpg,['US'],2064.162353,11.320812,Leonardo DiCaprio|Kate Winslet|Billy Zane|Kath...,117,James Cameron,262
