In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' theme once, up front, so the figures drawn in the
# cells below share the same look.
sns.set_theme(style='whitegrid')

In [None]:
# Load the data. The untouched frame keeps its own name so the cleaning step
# below can be re-run without re-reading the file.
movies_raw = pd.read_csv("movie_metadata.csv")

# A row is kept only when at least this fraction of its columns is non-null.
MIN_NON_NULL_FRACTION = 0.7

# `thresh` is an absolute count of non-null cells, so the fraction must be
# rounded UP: truncating (e.g. int(28 * 0.7) == 19) would admit rows that are
# only ~68% populated. The inner round() absorbs float noise, so a product
# like 7.000000000000001 is not bumped to 8 by the ceiling.
min_non_null = int(np.ceil(round(movies_raw.shape[1] * MIN_NON_NULL_FRACTION, 9)))

# Exact duplicate rows are removed first, then sparsely populated rows.
df = movies_raw.drop_duplicates().dropna(thresh=min_non_null)

In [None]:
def remove_outliers_iqr(df, column, k=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive), where
    Q1 and Q3 are the 25th and 75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column the fences are computed from.
    k : float, default 1.5
        Fence multiplier. The default reproduces the usual 1.5 * IQR rule;
        larger values keep more of the tails, ``0`` keeps only ``[Q1, Q3]``.

    Returns
    -------
    pandas.DataFrame
        The rows within the fences, with their original index labels.
        Rows whose ``column`` value is NaN are dropped, because NaN never
        compares as in-range.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = k * (q3 - q1)
    lower, upper = q1 - spread, q3 + spread
    in_range = (values >= lower) & (values <= upper)
    return df[in_range]

In [None]:
## Research Question 1: Does IMDb Score Impact Gross Income?
# Keep only films with both values present and a positive gross, then trim
# extreme grosses with the IQR helper before plotting and summarising.
q1_df = df[['imdb_score', 'gross']].dropna()
q1_df = q1_df[q1_df['gross'] > 0]
# .copy() detaches the filtered result from its parent frame, so adding the
# 'score_bin' column further down does not raise SettingWithCopyWarning.
q1_df = remove_outliers_iqr(q1_df, 'gross').copy()

# Scatter plot
plt.figure(figsize=(8, 5))
sns.scatterplot(data=q1_df, x='imdb_score', y='gross')
plt.title("IMDb Score vs Gross Income")
plt.xlabel("IMDb Score")
plt.ylabel("Gross Income")
plt.show()

# Histogram
q1_df['imdb_score'].hist(bins=20)
plt.title("Distribution of IMDb Scores")
plt.xlabel("IMDb Score")
plt.ylabel("Count")
plt.show()

# Basic summary
print(q1_df.describe())

# Grouping by IMDb score bins for trend checking.
# pd.cut builds right-closed intervals: (0, 5], (5, 6], ... (9, 10].
q1_df['score_bin'] = pd.cut(q1_df['imdb_score'], bins=[0, 5, 6, 7, 8, 9, 10])
# observed=False is spelled out so empty score bins still appear in the table
# and pandas does not warn about the changing default for categorical groupers.
grouped_q1 = q1_df.groupby('score_bin', observed=False)['gross'].describe()
display(grouped_q1)

# Correlation (prep for regression)
print("Correlation:", q1_df['imdb_score'].corr(q1_df['gross']))

In [None]:
## Research Question 2: Is There a Trend Between Genre and Budget?
# Keep only films with both values present and a positive budget.
q2_df = df[['genres', 'budget']].dropna()
q2_df = q2_df[q2_df['budget'] > 0]
# The text before the first '|' in `genres` is used as the main genre.
# .assign returns a new frame, so no column is written into a filtered slice
# (which would raise SettingWithCopyWarning), and the vectorised .str
# accessor replaces the per-row lambda.
q2_df = q2_df.assign(main_genre=q2_df['genres'].str.split('|').str[0])
q2_df = remove_outliers_iqr(q2_df, 'budget')

# Boxplot of budget by main genre
plt.figure(figsize=(12,6))
sns.boxplot(data=q2_df, x='main_genre', y='budget')
plt.xticks(rotation=45)
plt.title("Budget by Genre")
plt.xlabel("Main Genre")
plt.ylabel("Budget")
plt.show()

# Histogram
q2_df['budget'].hist(bins=30)
plt.title("Distribution of Movie Budgets")
plt.xlabel("Budget")
plt.ylabel("Count")
plt.show()

# Summary per genre
genre_summary = q2_df.groupby('main_genre')['budget'].describe()
display(genre_summary)

# Number of observations per group
print(q2_df['main_genre'].value_counts())

In [None]:
## Research Question 3: Does the Director’s Name Influence the Number of Votes?
# Rows need both a director and a positive vote count; extreme vote counts are
# then trimmed with the IQR helper.
q3_df = df.loc[:, ['director_name', 'num_voted_users']].dropna()
has_votes = q3_df['num_voted_users'] > 0
q3_df = remove_outliers_iqr(q3_df[has_votes], 'num_voted_users')

# Mean votes per director, highest first; only the first ten are charted.
mean_votes = q3_df.groupby('director_name')['num_voted_users'].mean()
top_directors = mean_votes.sort_values(ascending=False).head(10)
top_directors.plot(kind='bar')
plt.xticks(rotation=45)
plt.ylabel("Avg Voted Users")
plt.title("Top 10 Directors by Avg Number of Votes")
plt.show()

# How vote counts are distributed across the retained films.
q3_df['num_voted_users'].hist(bins=30)
plt.xlabel("Votes")
plt.ylabel("Count")
plt.title("Distribution of Number of Voted Users")
plt.show()

# Overall descriptive statistics for the filtered frame.
print(q3_df.describe())

# Per-director descriptive statistics, ordered by mean votes (descending).
director_stats = q3_df.groupby('director_name')['num_voted_users'].describe()
director_summary = director_stats.sort_values(by='mean', ascending=False)
display(director_summary.head(10))