In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Rotten Tomatoes movie table from the mounted Drive folder.
file_path = '/content/drive/MyDrive/DSA2025Spring/rotten_tomatoes_movies.csv'
df = pd.read_csv(file_path)
df.head()

# Narrow the table to the columns this project uses, keep one row per title,
# and discard rows that are missing either score.
columns_to_keep = [
    'title',
    'audienceScore',
    'tomatoMeter',
    'releaseDateTheaters',
    'releaseDateStreaming',
    'genre',
    'boxOffice',
]
df = (
    df[columns_to_keep]
    .drop_duplicates(subset='title')
    .dropna(subset=['audienceScore', 'tomatoMeter'])
)

def boxOfficeConvert(boxOffice):
    """Convert one raw box-office entry into a float amount in dollars.

    Accepted inputs:
      * missing values (NaN / None) and blank strings -> ``np.nan``
      * values that are already numeric -> returned as ``float``
      * strings such as ``'$37'``, ``'$1,234'``, ``'$111.3K'``, ``'$2.4M'``
        or ``'$1.2B'``; a trailing K / M / B (any case) scales the number
        by a thousand / million / billion

    Raises:
        ValueError: if a non-blank string cannot be parsed as a number.
    """
    if pd.isna(boxOffice):
        return np.nan
    # Numeric cells have no ``.replace`` method, so pass them straight through
    # instead of crashing.
    if not isinstance(boxOffice, str):
        return float(boxOffice)

    # Drop the currency sign, thousands separators and surrounding whitespace.
    text = boxOffice.replace('$', '').replace(',', '').strip()
    if not text:
        return np.nan

    multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}
    suffix = text[-1].upper()
    if suffix in multipliers:
        # Only a trailing letter counts as a magnitude suffix.
        return float(text[:-1]) * multipliers[suffix]
    return float(text)
# Replace each raw box-office entry with the float returned by
# boxOfficeConvert (missing entries stay NaN).
df['boxOffice'] = df['boxOffice'].apply(boxOfficeConvert)

#Clean the box office data of misleading or missing values:
#drop movies with revenue below $1,000, since movies earning that little had too little audience or critic recognition to supply meaningful data
def boxOfficeClean(boxOffice):
    """Blank out revenues under $1,000.

    Returns ``np.nan`` when ``boxOffice`` is below 1000; any other value
    (including one that is already NaN) is handed back unchanged.
    """
    return np.nan if boxOffice < 1000 else boxOffice
# Turn revenues below the 1000 cutoff into NaN via boxOfficeClean ...
df['boxOffice'] = df['boxOffice'].apply(boxOfficeClean)
# ... then drop every row whose box office is NaN (missing from the start or
# blanked by the line above).
df = df.dropna(subset=['boxOffice'])

#Heatmap to compare correlation:
import os

# Every figure and table from this analysis is written to one results folder.
# Create it up front: savefig and to_csv fail when the target directory
# does not exist yet.
results_dir = '/content/drive/MyDrive/DSA2025Spring/ScoreVsBoxOfficeResults'
os.makedirs(results_dir, exist_ok=True)

# Pairwise correlation matrix of the two scores and the revenue.
correlation = df[['audienceScore','tomatoMeter','boxOffice']].corr()
plt.figure(figsize=(6,5))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap for Box Office - Audience & Critic Scores ')
plt.savefig(os.path.join(results_dir, 'heatmap_correlation_boxoffice.png'))
plt.close()

#Scatterplot
# Start each plot on an explicit fresh figure rather than relying on one being
# created implicitly after the previous plt.close().
plt.figure()
sns.regplot(x='audienceScore', y='boxOffice', data=df)
plt.title('Audience Score vs Box Office Revenue')
plt.savefig(os.path.join(results_dir, 'plot_audience_boxoffice.png'))
plt.close()

plt.figure()
sns.regplot(x='tomatoMeter', y='boxOffice', data=df)
plt.title('Critic Score vs Box Office Revenue')
plt.savefig(os.path.join(results_dir, 'plot_critic_boxoffice.png'))
plt.close()

# Keep the numeric correlation matrix alongside the figures.
correlation.to_csv(os.path.join(results_dir, 'correlation_results.csv'))

#Hypothesis Testing
def pvalueTest(pValue, alpha=0.05):
    """Print whether ``pValue`` is significant at the ``alpha`` level.

    Args:
        pValue: p-value of the test being interpreted.
        alpha: significance threshold; defaults to 0.05.

    A NaN p-value (for example, a correlation test on constant input can
    yield NaN) is reported as undefined rather than being treated as
    "not significant", because every comparison with NaN is False.
    """
    import math

    if math.isnan(pValue):
        print("P value is undefined (NaN). No conclusion can be drawn about the relationship between the values.")
    elif pValue < alpha:
        print(f"P value is less than {alpha}. The null hypothesis is rejected. There is a significant relationship between the values.")
    else:
        print(f"P value is more than or equal to {alpha}. The null hypothesis cannot be rejected. There is no significant relationship between the values.")

#Pearson correlation: measure how strongly critic scores and audience scores each relate to box office revenue, so the two can be compared
from scipy import stats

# Critic score (tomatoMeter) against revenue.
critic_result = stats.pearsonr(df['tomatoMeter'], df['boxOffice'])
rCritic, pValueCritic = critic_result
print("Correlation between Critic Score and Box Office Revenue:", rCritic)
print("P-value:", pValueCritic)
pvalueTest(pValueCritic)

# Audience score against revenue.
audience_result = stats.pearsonr(df['audienceScore'], df['boxOffice'])
rAudience, pValueAudience = audience_result
print("Correlation between Audience Score and Box Office Revenue:", rAudience)
print("P-value:", pValueAudience)
pvalueTest(pValueAudience)