In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# dataset and result paths used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [53]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# Load the raw Rotten Tomatoes export from the mounted Drive.
file_path = '/content/drive/MyDrive/DSA2025Spring/rotten_tomatoes_movies.csv'
df = pd.read_csv(file_path)
print("Size of dataset (unfiltered):", len(df))

#Clean and organize necessary columns of the dataset according to the needs of the project
columns_to_keep = ['title', 'audienceScore', 'tomatoMeter', 'releaseDateTheaters', 'releaseDateStreaming', 'genre', 'boxOffice']

# Steps, in order:
#   1. keep only the columns the analysis uses
#   2. keep the first row per title
#   3. replace missing scores with the column mean (computed before the box-office filter)
#   4. drop rows that have no box-office value
# NOTE(review): mean-imputed scores pull the later Pearson correlations toward zero,
# and de-duplicating on title alone also drops distinct films that share a name - confirm both are intended.
df = (
  df[columns_to_keep]
  .drop_duplicates(subset='title')
  .assign(
    tomatoMeter=lambda frame: frame['tomatoMeter'].fillna(frame['tomatoMeter'].mean()),
    audienceScore=lambda frame: frame['audienceScore'].fillna(frame['audienceScore'].mean()),
  )
  .dropna(subset=['boxOffice'])
)

print("Size of dataset after removing rows without box office value:", len(df))

#Convert rows of box office column into float values
def boxOfficeConvert(boxOffice):
  """Convert a box-office value such as '$1.2M' into a float dollar amount.

  Args:
    boxOffice: Raw cell value. May be a string with an optional '$' prefix,
      optional thousands separators and an optional K/M/B suffix, a number
      that has already been converted, or a missing value (NaN/None).

  Returns:
    The amount as a float, or np.nan when the input is missing.

  Raises:
    ValueError: If the text left after stripping the symbols is not a number.
  """
  if pd.isna(boxOffice):
    return np.nan
  # Numbers pass straight through so re-running the conversion on an
  # already-converted column does not fail on a missing .replace().
  if isinstance(boxOffice, (int, float, np.integer, np.floating)):
    return float(boxOffice)

  multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}
  text = str(boxOffice).strip().replace('$', '').replace(',', '').upper()
  if text and text[-1] in multipliers:
    return float(text[:-1]) * multipliers[text[-1]]
  return float(text)
# Turn every box-office entry into a numeric dollar amount.
df['boxOffice'] = df['boxOffice'].map(boxOfficeConvert)

print("Size of dataset before removing outliers (in rows):", len(df))
#Outlier removal with the interquartile-range rule: keep only rows whose box office
#lies between Q1 - 1.5*IQR and Q3 + 1.5*IQR (both ends inclusive)
quantile1, quantile3 = (df['boxOffice'].quantile(q) for q in (0.25, 0.75))
iqr = quantile3 - quantile1

lowerBound, upperBound = quantile1 - (1.5 * iqr), quantile3 + (1.5 * iqr)

df = df[df['boxOffice'].between(lowerBound, upperBound)]

print("Size of dataset after removing outliers (in rows):", len(df))

Size of dataset (unfiltered): 143258
Size of dataset after removing rows without box office value: 12528
Size of dataset before removing outliers (in rows): 12528
Size of dataset after removing outliers (in rows): 10626


In [43]:
# All figures and tables from this cell are written here; create the folder
# first so savefig/to_csv do not fail on a fresh Drive.
RESULTS_DIR = Path('/content/drive/MyDrive/DSA2025Spring/ScoreVsBoxOfficeResults')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

#Heatmap to compare correlation:
correlation = df[['audienceScore','tomatoMeter','boxOffice']].corr()
fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap for Box Office - Audience & Critic Scores ')
fig.savefig(RESULTS_DIR / 'heatmap_correlation_boxoffice.png')
plt.close(fig)

#Regression plots
def saveScoreRegplot(data, scoreColumn, title, fileName, pointColor):
  """Save a scatter plot with a fitted regression line of box office against one score.

  Args:
    data: DataFrame holding `scoreColumn` and 'boxOffice'.
    scoreColumn: Name of the score column plotted on the x axis.
    title: Figure title.
    fileName: File name of the image written inside RESULTS_DIR.
    pointColor: Colour used for the scatter points.
  """
  fig, ax = plt.subplots(figsize=(16,14))
  sns.regplot(x=scoreColumn, y='boxOffice', data=data, ax=ax,
              line_kws={'color': 'red'}, scatter_kws={'color': pointColor, 'alpha': 0.5})
  ax.set_title(title)
  fig.savefig(RESULTS_DIR / fileName)
  # Close explicitly: the figure is only needed on disk, not inline.
  plt.close(fig)

saveScoreRegplot(df, 'audienceScore', 'Audience Score vs Box Office Revenue', 'plot_audience_boxoffice.png', '#71b3ee')
saveScoreRegplot(df, 'tomatoMeter', 'Critic Score vs Box Office Revenue', 'plot_critic_boxoffice.png', '#5d9978')

correlation.to_csv(RESULTS_DIR / 'correlation_results.csv')

#Hypothesis Testing
def pvalueTest(pValue, alpha=0.05):
  """Print whether a p-value rejects the null hypothesis at significance level `alpha`.

  Args:
    pValue: The p-value returned by a statistical test.
    alpha: Significance threshold; the null hypothesis is rejected when
      pValue is strictly below it. Defaults to 0.05.

  Returns:
    None. The verdict is printed.
  """
  import math

  # A NaN p-value compares False against any threshold, which would otherwise
  # be reported as "not significant"; report it as undefined instead.
  if math.isnan(pValue):
    print("P value is undefined (NaN). The test could not be evaluated, so no conclusion can be drawn.")
    return

  if pValue < alpha:
    print(f"P value is less than {alpha}. The null hypothesis is rejected. There is a significant relationship between the values.")
  else:
    print(f"P value is more than or equal to {alpha}. The null hypothesis cannot be rejected. There is no significant relationship between the values.")

#Pearson Correlation to compare the correlation between critic score & box office revenue to the one between audience score & box office revenue
# Each call returns the correlation coefficient and its p-value; the four
# results keep their own names so they stay available after this cell.
rCritic, pValueCritic = stats.pearsonr(df['tomatoMeter'], df['boxOffice'])
rAudience, pValueAudience = stats.pearsonr(df['audienceScore'], df['boxOffice'])

# Report both results in the same format: coefficient, p-value, verdict.
for scoreLabel, rValue, pValue in (
    ("Critic", rCritic, pValueCritic),
    ("Audience", rAudience, pValueAudience),
):
  print(f"Correlation between {scoreLabel} Score and Box Office Revenue:", rValue)
  print("P-value:", pValue)
  pvalueTest(pValue)

Correlation between Critic Score and Box Office Revenue: -0.24415258620083363
P-value: 5.145165271420907e-144
P value is less than 0.05. The null hypothesis is rejected. There is a significant relationship between the values.
Correlation between Audience Score and Box Office Revenue: -0.0901669691353097
P-value: 1.248400337443627e-20
P value is less than 0.05. The null hypothesis is rejected. There is a significant relationship between the values.
