In [1]:
import pandas as pd
import numpy as np
import ast


In [17]:
# Read the raw movie export from disk.
load_data = pd.read_csv('../data/raw/moviesData_08-04-2025_10-07-51.csv')

# Working DataFrame built from the loaded table; later cells rebind `df`.
df = pd.DataFrame(data=load_data)

In [18]:
# Columns removed here are not used by any later cell in this notebook.
drop_cols = [
    'adult',
    'imdb_id',
    'original_title',
    'video',
    'homepage',
]
df = df.drop(labels=drop_cols, axis='columns')

In [19]:
# Extracting nested columns in the data
def _names_from_records(raw, key='name'):
    """Flatten a stringified list of dicts into one '|'-joined string.

    Parameters
    ----------
    raw : str or missing value
        Cell content holding a Python-literal list of dicts.
    key : str, default 'name'
        Dict key whose values are joined.

    Returns
    -------
    str or float
        The joined values in their original order, or NaN when ``raw`` is
        missing.
    """
    if pd.isnull(raw):
        return np.nan
    return "|".join(record[key] for record in ast.literal_eval(raw))


def _collection_name(raw):
    """Return the 'name' entry of a stringified dict, or NaN when missing."""
    if pd.isnull(raw):
        return np.nan
    return ast.literal_eval(raw)['name']


def _parse_credits(raw):
    """Parse a stringified credits dict; a missing value becomes an empty dict."""
    return ast.literal_eval(raw) if pd.notnull(raw) else {}


df['genres'] = df['genres'].apply(_names_from_records)

df['belongs_to_collection'] = df['belongs_to_collection'].apply(_collection_name)

df['production_countries'] = df['production_countries'].apply(_names_from_records)

df['production_companies'] = df['production_companies'].apply(_names_from_records)

# Languages are labelled by their 'english_name' field rather than 'name'.
df['spoken_languages'] = df['spoken_languages'].apply(_names_from_records, args=('english_name',))

# Intermediate column of parsed credits; the cast/crew columns are derived
# from it further down.
df['credits_parsed'] = df['credits'].apply(_parse_credits)
# Function to get cast
def get_full_cast(credit):
    """Return the cast members' names joined with '|'.

    Entries of ``credit['cast']`` without a truthy 'name' are skipped.
    Returns None when no name is left (including when 'cast' is absent).
    """
    billed = []
    for person in credit.get('cast', []):
        actor = person.get('name')
        if actor:
            billed.append(actor)
    if not billed:
        return None
    return '|'.join(billed)

# Function to get director
def get_director(credit):
    """Return the name of the first crew entry whose job is 'Director'.

    Only the first match is used, so additional directors listed later in
    ``credit['crew']`` are ignored. Returns None when there is no such entry
    (or when the matching entry has no 'name').
    """
    director_names = (
        person.get('name')
        for person in credit.get('crew', [])
        if person.get('job') == 'Director'
    )
    return next(director_names, None)

# Function to get cast size
def get_cast_size(credit):
    """Return how many entries ``credit['cast']`` holds (0 when the key is absent)."""
    return len(credit.get('cast', ()))

# Function to get crew size
def get_crew_size(credit):
    """Return how many entries ``credit['crew']`` holds (0 when the key is absent)."""
    return len(credit.get('crew', ()))

# Derive the cast, director and size columns from the parsed credits, then
# discard the intermediate credits_parsed column.
df = df.assign(
    cast=lambda frame: frame['credits_parsed'].apply(get_full_cast),
    director=lambda frame: frame['credits_parsed'].apply(get_director),
    cast_size=lambda frame: frame['credits_parsed'].apply(get_cast_size),
    crew_size=lambda frame: frame['credits_parsed'].apply(get_crew_size),
).drop(columns=['credits_parsed'])


In [21]:
def _sort_pipe_joined(value):
    """Sort the '|'-separated parts of a string alphabetically.

    Missing values are returned unchanged. The extraction cell stores NaN for
    missing entries, and NaN is a float with no ``split`` method, so an
    unguarded ``value.split('|')`` would raise AttributeError.
    """
    if pd.isnull(value):
        return value
    return '|'.join(sorted(value.split('|')))


# Normalise the ordering so the same set of values always yields the same
# string, whatever order the source listed them in.
df['genres'] = df['genres'].apply(_sort_pipe_joined)
df['production_countries'] = df['production_countries'].apply(_sort_pipe_joined)
df['spoken_languages'] = df['spoken_languages'].apply(_sort_pipe_joined)

In [22]:
# Print the frequency table of each flattened categorical column in turn.
for _column in (
    'genres',
    'belongs_to_collection',
    'production_countries',
    'production_companies',
    'spoken_languages',
):
    print(df[_column].value_counts())

genres
Action|Adventure|Science Fiction             7
Action|Adventure|Science Fiction|Thriller    2
Action|Adventure|Fantasy|Science Fiction     1
Drama|Romance                                1
Adventure|Animation|Drama|Family             1
Action|Crime|Thriller                        1
Adventure|Fantasy                            1
Adventure|Animation|Comedy|Family|Fantasy    1
Adventure|Animation|Family|Fantasy           1
Family|Fantasy|Romance                       1
Action|Adventure|Animation|Family            1
Name: count, dtype: int64
belongs_to_collection
The Avengers Collection                4
Star Wars Collection                   2
Frozen Collection                      2
Jurassic Park Collection               2
Avatar Collection                      1
The Lion King (Reboot) Collection      1
The Fast and the Furious Collection    1
Black Panther Collection               1
Harry Potter Collection                1
The Incredibles Collection             1
Name: count, dtype

In [24]:
# Coerce budget, popularity and id to numbers and release_date to datetimes.
# With errors='coerce', values that cannot be parsed become NaN / NaT.
df = df.assign(
    budget=lambda frame: pd.to_numeric(frame['budget'], errors='coerce'),
    popularity=lambda frame: pd.to_numeric(frame['popularity'], errors='coerce'),
    id=lambda frame: pd.to_numeric(frame['id'], errors='coerce'),
    release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'),
)

In [163]:
# Turn zeros in budget, revenue and runtime into NaN.
df[['budget', 'revenue', 'runtime']] = (
    df[['budget', 'revenue', 'runtime']].replace({0: np.nan})
)

In [26]:
# Rescale budget and revenue by 1e6 into the *_musd columns
# (presumably million USD -- confirm the currency against the data source).
df['budget_musd'] = df['budget'].div(1e6)
df['revenue_musd'] = df['revenue'].div(1e6)


In [27]:
# Show the first two rows of df as this cell's output.
df.head(2)

Unnamed: 0,backdrop_path,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,...,title,vote_average,vote_count,credits,cast,director,cast_size,crew_size,budget_musd,revenue_musd
0,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,The Avengers Collection,356000000,Action|Adventure|Science Fiction,299534,['US'],en,After the devastating events of Avengers: Infi...,22.6136,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,...,Avengers: Endgame,8.238,26200,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,Joe Russo,105,593,356.0,2799.4391
1,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,Avatar Collection,237000000,Action|Adventure|Fantasy|Science Fiction,19995,['US'],en,"In the 22nd century, a paraplegic Marine is di...",33.5372,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,...,Avatar,7.587,32104,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,James Cameron,65,986,237.0,2923.706026


In [None]:
# Rows whose vote_count is exactly zero.
zero_votes = df.loc[df['vote_count'].eq(0)]

# Print title, vote_count and vote_average for those rows.
print(zero_votes[['title', 'vote_count', 'vote_average']])

# Set vote_average to NaN wherever vote_count is zero.
df.loc[df['vote_count'].eq(0), 'vote_average'] = np.nan


Empty DataFrame
Columns: [title, vote_count, vote_average]
Index: []


In [30]:
# Strings that stand in for "no text"; replaced with NaN in both free-text
# columns in one pass.
placeholders = ['No Data', '', 'nan']
df[['overview', 'tagline']] = df[['overview', 'tagline']].replace(placeholders, np.nan)


In [31]:
# Remove fully duplicated rows, then rows lacking an id or a title.
df = df.drop_duplicates().dropna(subset=['id', 'title'])


In [32]:
# Keep only rows that have at least 10 non-missing values.
df = df.dropna(thresh=10)


In [33]:
# Keep rows whose status is 'Released'; the column is constant afterwards,
# so it is dropped in the same step.
df = df.loc[df['status'].eq('Released')].drop(columns=['status'])


In [34]:
# Columns kept for the analysis, in their final display order.
final_cols = [
    'id',
    'title',
    'tagline',
    'release_date',
    'genres',
    'belongs_to_collection',
    'original_language',
    'budget_musd',
    'revenue_musd',
    'production_companies',
    'production_countries',
    'vote_count',
    'vote_average',
    'popularity',
    'runtime',
    'overview',
    'spoken_languages',
    'poster_path',
    'cast',
    'cast_size',
    'director',
    'crew_size',
]
df = df.loc[:, final_cols]


In [36]:
# Show the first two rows of df as this cell's output.
df.head(2)

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,director,crew_size
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Action|Adventure|Science Fiction,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,8.238,22.6136,181,After the devastating events of Avengers: Infi...,English|Japanese|Xhosa,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,105,Joe Russo,593
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,...,7.587,33.5372,162,"In the 22nd century, a paraplegic Marine is di...",English|Spanish,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,65,James Cameron,986


In [39]:
# Earlier cells removed rows, so renumber the index from 0; drop=True discards
# the old index instead of keeping it as a column.
df = df.reset_index(drop=True)


## Step 3: KPI Implementation & Analysis 

In [40]:
# 1. Create Helper Columns
# profit is revenue minus budget; roi is the revenue-to-budget ratio.
# Both are computed from the *_musd columns.
df['profit'] = df['revenue_musd'].sub(df['budget_musd'])
df['roi'] = df['revenue_musd'].div(df['budget_musd'])

In [None]:
# 2. Define a Ranking Function
def rank_movies(df, sort_by, ascending=False, min_budget_musd=None, min_votes=None, top_n=5):
    """Return the first ``top_n`` movies ranked by one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'title' and ``sort_by``; 'budget_musd' / 'vote_count'
        are needed only when the matching threshold is given.
    sort_by : str
        Column to rank by.
    ascending : bool, default False
        Sort direction; False puts the largest values first.
    min_budget_musd : number or None, default None
        When given, keep only rows with ``budget_musd >= min_budget_musd``.
    min_votes : number or None, default None
        When given, keep only rows with ``vote_count >= min_votes``.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The 'title' and ``sort_by`` columns of the top rows. The input frame
        is not modified.
    """
    ranked = df
    # Test against None rather than truthiness: a threshold of 0 is a real
    # filter (it excludes rows whose value is missing) and must not be skipped.
    if min_budget_musd is not None:
        ranked = ranked[ranked['budget_musd'] >= min_budget_musd]
    if min_votes is not None:
        ranked = ranked[ranked['vote_count'] >= min_votes]
    # Boolean indexing and sort_values both return new frames, so no explicit
    # copy of the input is needed.
    ranked = ranked.sort_values(by=sort_by, ascending=ascending)
    return ranked[['title', sort_by]].head(top_n)

In [42]:
# 3. KPI Calculations
# Each entry pairs the heading to print with the rank_movies arguments that
# produce the table shown under it.
_kpi_reports = [
    # Highest Revenue
    ("\nTop 5 Highest Revenue Movies:", dict(sort_by='revenue_musd', ascending=False)),
    # Highest Budget
    ("\nTop 5 Highest Budget Movies:", dict(sort_by='budget_musd', ascending=False)),
    # Highest Profit
    ("\nTop 5 Highest Profit Movies:", dict(sort_by='profit', ascending=False)),
    # Lowest Profit
    ("\nTop 5 Lowest Profit Movies:", dict(sort_by='profit', ascending=True)),
    # Highest ROI (only budget ≥ 10M)
    ("\nTop 5 Highest ROI Movies (budget ≥ 10M):",
     dict(sort_by='roi', ascending=False, min_budget_musd=10)),
    # Lowest ROI (only budget ≥ 10M)
    ("\nTop 5 Lowest ROI Movies (budget ≥ 10M):",
     dict(sort_by='roi', ascending=True, min_budget_musd=10)),
    # Most Voted Movies
    ("\nTop 5 Most Voted Movies:", dict(sort_by='vote_count', ascending=False)),
    # Highest Rated Movies (only movies ≥ 10 votes)
    ("\nTop 5 Highest Rated Movies (votes ≥ 10):",
     dict(sort_by='vote_average', ascending=False, min_votes=10)),
    # Lowest Rated Movies (only movies ≥ 10 votes)
    ("\nTop 5 Lowest Rated Movies (votes ≥ 10):",
     dict(sort_by='vote_average', ascending=True, min_votes=10)),
    # Most Popular Movies
    ("\nTop 5 Most Popular Movies:", dict(sort_by='popularity', ascending=False)),
]

for _heading, _ranking_args in _kpi_reports:
    print(_heading)
    print(rank_movies(df, **_ranking_args))



Top 5 Highest Revenue Movies:
                          title  revenue_musd
1                        Avatar   2923.706026
0             Avengers: Endgame   2799.439100
4                       Titanic   2264.162353
2  Star Wars: The Force Awakens   2068.223624
3        Avengers: Infinity War   2052.415039

Top 5 Highest Budget Movies:
                          title  budget_musd
9       Avengers: Age of Ultron        365.0
0             Avengers: Endgame        356.0
3        Avengers: Infinity War        300.0
6                 The Lion King        260.0
2  Star Wars: The Force Awakens        245.0

Top 5 Highest Profit Movies:
                          title       profit
1                        Avatar  2686.706026
0             Avengers: Endgame  2443.439100
4                       Titanic  2064.162353
2  Star Wars: The Force Awakens  1823.223624
3        Avengers: Infinity War  1752.415039

Top 5 Lowest Profit Movies:
                       title       profit
9    Avengers: Age of 