In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

# Apply seaborn's "whitegrid" theme to the figures drawn in the cells below.
sns.set_theme(style='whitegrid')

In [None]:
# Significance level: the inference cells below compare each p-value against it.
alpha = 0.05
# Matching confidence level, passed to the confidence-interval calls below.
confidence = 1 - alpha

In [None]:
# Read the movie metadata, drop exact duplicate rows, then keep only rows that
# have non-null values in at least 70% of the columns.
df = (
    pd.read_csv("movie_metadata.csv")
    .drop_duplicates()
    .pipe(lambda frame: frame.dropna(thresh=int(frame.shape[1] * 0.7)))
)

In [None]:
def remove_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of ``column``.
    Rows whose ``column`` value is NaN are dropped, because NaN is never inside
    the fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie before it is
        treated as an outlier.

    Returns
    -------
    pandas.DataFrame
        The retained rows as an independent copy, so callers can add or
        modify columns without triggering ``SettingWithCopyWarning``.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - multiplier * iqr
    upper = q3 + multiplier * iqr
    # `between` is inclusive on both ends, matching `>= lower` and `<= upper`.
    return df[df[column].between(lower, upper)].copy()

Research Question 1: Does IMDb Score Impact Gross Income?

In [None]:
# Build the IMDb-score / gross-income frame: complete rows only, positive gross,
# then trim gross outliers with the IQR rule.
q1_df = (
    df[['imdb_score', 'gross']]
    .dropna()
    .loc[lambda frame: frame['gross'] > 0]
    .pipe(remove_outliers_iqr, 'gross')
)

Q1 Exploratory Data Analysis (EDA)

In [None]:
def q1_visualize():
    """Plot and summarise the IMDb-score / gross relationship held in ``q1_df``.

    Shows a scatter plot of gross against IMDb score and a histogram of IMDb
    scores, prints ``describe()`` for the frame, and displays descriptive
    statistics of gross per IMDb-score bin.

    The function only reads the notebook-level ``q1_df``. The score bins are
    kept in a local Series instead of being written back as a new column, so
    running the EDA cell does not change the data that later cells see.
    """
    # Scatter plot
    _, scatter_ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=q1_df, x='imdb_score', y='gross', ax=scatter_ax)
    scatter_ax.set_title("IMDb Score vs Gross Income")
    plt.show()

    # Histogram
    hist_ax = q1_df['imdb_score'].hist(bins=20)
    hist_ax.set_title("Distribution of IMDb Scores")
    hist_ax.set_xlabel("IMDb Score")
    hist_ax.set_ylabel("Count")
    plt.show()

    # Basic summary
    print(q1_df.describe())

    # Grouping by IMDb score bins for trend checking. The bins are right-closed
    # intervals: (0, 5], (5, 6], ..., (9, 10].
    score_bin = pd.cut(q1_df['imdb_score'], bins=[0, 5, 6, 7, 8, 9, 10]).rename('score_bin')
    # observed=False keeps bins that contain no movies in the summary table.
    grouped_q1 = q1_df['gross'].groupby(score_bin, observed=False).describe()
    display(grouped_q1)

q1_visualize()

Q1 Statistical Inference

In [None]:
def q1_inference():
    """Test the IMDb-score / gross association in ``q1_df`` and print the results.

    Prints Pearson's r with r squared, its two-sided p-value and a confidence
    interval at the notebook-level ``confidence``, followed by a one-way ANOVA
    of gross across the distinct IMDb score values. Significance is judged
    against the notebook-level ``alpha``.
    """
    scores, gross_values = q1_df['imdb_score'], q1_df['gross']

    # Pearson correlation and coefficient of determination
    result = st.pearsonr(scores, gross_values, alternative='two-sided')
    r, pearsons_pvalue = result.statistic, result.pvalue
    r2 = r**2
    pearsons_is_significant = pearsons_pvalue < alpha

    print(f"Correlation (r): {r}, r2: {r2}, pvalue: {pearsons_pvalue}, significant?: {pearsons_is_significant}")

    # Confidence interval for r
    confidence_interval = result.confidence_interval(confidence)
    print(f"Confidence Interval [{confidence_interval.low}, {confidence_interval.high}]")

    # One-way ANOVA: one sample of gross values per distinct IMDb score
    gross_by_score = [sample.values for _, sample in q1_df.groupby("imdb_score")["gross"]]
    anova = st.f_oneway(*gross_by_score)
    F, anova_pvalue = anova.statistic, anova.pvalue
    anova_is_significant = anova_pvalue < alpha
    print(f"ANOVA: F={F}, p={anova_pvalue}, significant?: {anova_is_significant}")

q1_inference()

Research Question 2: Is There a Trend Between Genre and Budget?

In [None]:
# Build the genre / budget frame: complete rows only, positive budget, then trim
# budget outliers with the IQR rule.
# `main_genre` is the first entry of the pipe-separated `genres` string. It is
# added with `assign`, which returns a new frame, rather than by item assignment
# on a filtered slice, which can raise SettingWithCopyWarning.
q2_df = (
    df[['genres', 'budget']]
    .dropna()
    .loc[lambda frame: frame['budget'] > 0]
    .assign(main_genre=lambda frame: frame['genres'].str.split('|').str[0])
    .pipe(remove_outliers_iqr, 'budget')
)

Q2 Exploratory Data Analysis (EDA)

In [None]:
def q2_visualize():
    """Plot and summarise movie budgets per main genre using ``q2_df``.

    Shows a box plot of budget for each main genre and a histogram of all
    budgets, displays ``describe()`` of budget per genre, and prints how many
    rows each genre has.
    """
    # Budget spread for each main genre
    _, box_ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=q2_df, x='main_genre', y='budget', ax=box_ax)
    plt.xticks(rotation=45)
    box_ax.set_title("Budget by Genre")
    plt.show()

    # Overall budget distribution
    hist_ax = q2_df['budget'].hist(bins=30)
    hist_ax.set_title("Distribution of Movie Budgets")
    hist_ax.set_xlabel("Budget")
    hist_ax.set_ylabel("Count")
    plt.show()

    # Descriptive statistics of budget within each genre
    display(q2_df.groupby('main_genre')['budget'].describe())

    # Group sizes
    genre_counts = q2_df['main_genre'].value_counts()
    print(genre_counts)

q2_visualize()

Q2 Statistical Inference

In [None]:
def q2_inference():
    """Test whether budget differs between main genres in ``q2_df`` and print the results.

    ``main_genre`` is a nominal variable, so its association with budget is
    reported as the correlation ratio (eta) and eta squared, the share of the
    budget variance that lies between genres. Pearson's r on integer category
    codes is not used: the codes follow the alphabetical order of the labels,
    so r would change with that arbitrary ordering and says nothing about the
    genres themselves. For the same reason no Pearson confidence interval is
    printed.

    Significance comes from a one-way ANOVA of budget across genres, judged
    against the notebook-level ``alpha``.
    """
    budget = q2_df['budget']
    by_genre = budget.groupby(q2_df['main_genre'])

    # Effect size: eta squared = SS_between / SS_total
    grand_mean = budget.mean()
    ss_between = (by_genre.size() * (by_genre.mean() - grand_mean) ** 2).sum()
    ss_total = ((budget - grand_mean) ** 2).sum()
    # With no variance at all in budget the ratio is undefined.
    eta2 = ss_between / ss_total if ss_total > 0 else float('nan')
    eta = eta2 ** 0.5
    print(f"Correlation ratio (eta): {eta}, eta2: {eta2}")

    # ANOVA
    grouped_budgets = [sample.values for _, sample in by_genre]
    anova = st.f_oneway(*grouped_budgets)
    F = anova.statistic
    anova_pvalue = anova.pvalue
    anova_is_significant = anova_pvalue < alpha
    print(f"ANOVA: F={F}, p={anova_pvalue}, significant?: {anova_is_significant}")

q2_inference()

Research Question 3: Does the Director’s Name Influence the Number of Votes?

In [None]:
# Build the director / vote-count frame: complete rows only, positive vote
# counts, then trim vote-count outliers with the IQR rule.
q3_df = (
    df[['director_name', 'num_voted_users']]
    .dropna()
    .loc[lambda frame: frame['num_voted_users'] > 0]
    .pipe(remove_outliers_iqr, 'num_voted_users')
)

Q3 Exploratory Data Analysis (EDA)

In [None]:
def q3_visualize():
    """Plot and summarise vote counts per director using ``q3_df``.

    Shows a bar chart of the ten directors with the highest mean vote count and
    a histogram of all vote counts, prints ``describe()`` for the frame, and
    displays the per-director statistics of the ten directors with the highest
    mean.
    """
    votes_by_director = q3_df.groupby('director_name')['num_voted_users']

    # Ten directors with the highest mean number of votes
    top_directors = votes_by_director.mean().sort_values(ascending=False).head(10)
    bar_ax = top_directors.plot(kind='bar')
    bar_ax.set_title("Top 10 Directors by Avg Number of Votes")
    bar_ax.set_ylabel("Avg Voted Users")
    plt.xticks(rotation=45)
    plt.show()

    # Overall distribution of vote counts
    hist_ax = q3_df['num_voted_users'].hist(bins=30)
    hist_ax.set_title("Distribution of Number of Voted Users")
    hist_ax.set_xlabel("Votes")
    hist_ax.set_ylabel("Count")
    plt.show()

    # Summary of the whole frame
    print(q3_df.describe())

    # Per-director statistics, highest mean first
    director_summary = votes_by_director.describe().sort_values(by='mean', ascending=False)
    display(director_summary.head(10))

q3_visualize()

Q3 Statistical Inference

In [None]:
def q3_inference():
    """Test whether vote counts differ between directors in ``q3_df`` and print the results.

    ``director_name`` is a nominal variable, so its association with the number
    of votes is reported as the correlation ratio (eta) and eta squared, the
    share of the vote-count variance that lies between directors. Pearson's r
    on integer category codes is not used: the codes follow the alphabetical
    order of the names, so r would change with that arbitrary ordering. For the
    same reason no Pearson confidence interval is printed.

    Significance comes from a one-way ANOVA of vote counts across directors,
    judged against the notebook-level ``alpha``.
    """
    num_voted_users = q3_df['num_voted_users']
    by_director = num_voted_users.groupby(q3_df['director_name'])

    # Effect size: eta squared = SS_between / SS_total
    grand_mean = num_voted_users.mean()
    ss_between = (by_director.size() * (by_director.mean() - grand_mean) ** 2).sum()
    ss_total = ((num_voted_users - grand_mean) ** 2).sum()
    # With no variance at all in the vote counts the ratio is undefined.
    eta2 = ss_between / ss_total if ss_total > 0 else float('nan')
    eta = eta2 ** 0.5
    print(f"Correlation ratio (eta): {eta}, eta2: {eta2}")

    # ANOVA: one sample of vote counts per director. A director with a single
    # film adds nothing to the within-group variance.
    grouped_votes = [sample.values for _, sample in by_director]
    anova = st.f_oneway(*grouped_votes)
    F = anova.statistic
    anova_pvalue = anova.pvalue
    anova_is_significant = anova_pvalue < alpha
    print(f"ANOVA: F={F}, p={anova_pvalue}, significant?: {anova_is_significant}")

q3_inference()