In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# '/content/drive/MyDrive/...' paths used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Rotten Tomatoes movie table from the mounted Drive folder.
file_path = '/content/drive/MyDrive/DSA2025Spring/rotten_tomatoes_movies.csv'
columns_to_keep = ['title', 'audienceScore', 'tomatoMeter', 'releaseDateTheaters', 'releaseDateStreaming', 'genre', 'boxOffice']

# Keep only the columns listed above, one row per title, and only rows that
# have both an audience score and a tomato-meter score.
# (The previous mid-cell `df.head()` was evaluated and discarded, so it was dropped.)
df = (
    pd.read_csv(file_path)[columns_to_keep]
    .drop_duplicates(subset='title')
    .dropna(subset=['audienceScore', 'tomatoMeter'])
)

!pip install pytrends
import time
from scipy.stats import f_oneway
from pytrends.request import TrendReq
# Google Trends client, created with TrendReq's default arguments.
pytrends = TrendReq()
# NOTE(review): an earlier note here said proxies are used because of Google's
# rate limit, but TrendReq() above is called without any proxy configuration.
# Timeframe string passed to build_payload for every genre query.
timeFrame = '2024-01-01 2025-01-01'

# Build the list of distinct genre labels found in the Rotten Tomatoes dataset.
# Rows without a genre are dropped; the remaining comma-separated genre strings
# are split into one label per entry and stripped of surrounding whitespace.
df = df.dropna(subset=['genre'])
genreTokens = df['genre'].str.split(',').explode().str.strip()

# Keep only real string labels (anything else, e.g. NaN, is discarded).
cleanedGenreList = [label for label in genreTokens.unique() if type(label) is str]
genreList = cleanedGenreList
print("List of Genres:" + str(genreList))

def cleanGenreName(genre):
  """Normalise a genre label into a plain search phrase.

  Lower-cases the label, removes '+' and '&', expands 'sci-fi' to
  'science fiction', removes any remaining '-', and finally collapses runs
  of whitespace and trims the ends. Without the final step, removing '&'
  from e.g. 'mystery & thriller' left a double space in the search term.

  Parameters:
    genre: genre label as a string.

  Returns:
    The normalised genre string.
  """
  genre = genre.lower()
  genre = genre.replace('+', '')
  genre = genre.replace('&', '')
  # Expand before stripping hyphens, otherwise 'sci-fi' would become 'scifi'.
  genre = genre.replace('sci-fi', 'science fiction')
  genre = genre.replace('-', '')
  # split() with no argument drops leading/trailing and repeated whitespace.
  return ' '.join(genre.split())

def pyTrendsGenre(genre, errorWait=60):
  """Fetch Google Trends interest-over-time data for one genre.

  The genre is normalised with cleanGenreName and queried as
  "<genre> movies" over the module-level timeFrame.

  Parameters:
    genre: raw genre label.
    errorWait: seconds to pause after a failed request before returning
      (defaults to 60, the value that was previously hard-coded).

  Returns:
    The DataFrame from pytrends.interest_over_time() with its first column
    renamed to 'value' and a 'genre' column holding the cleaned genre name,
    or None when the request raised or returned no rows.
  """
  genre = cleanGenreName(genre)
  try:
    pytrends.build_payload([genre + " movies"], timeframe=timeFrame)
    interestForGenre = pytrends.interest_over_time()
    if interestForGenre.empty: #Clean up row of movie with no trend data on Pytrends to improve accuracy
      return None
    interestForGenre['genre'] = genre
    interestForGenre = interestForGenre.rename(columns={interestForGenre.columns[0]: 'value'})
    return interestForGenre
  except Exception as e:
    # Report what went wrong *before* pausing, so the cause is visible
    # immediately; the caller treats None as "retry later".
    print("Error fetching data for " + str(genre) + ": " + repr(e))
    time.sleep(errorWait)
    return None

# Per-genre trend frames collected from Google Trends.
trendData = []

#a list for genres that Google Trends has failed to return requests for (429 error) -> retry requesting for these genres after list is done iterating
# (pyTrendsGenre also returns None for genres with no trend data, so those land here too.)
failedGenres = []

retry = 5
waitTime = 40 #between retries
requestPause = 60 #seconds to wait after each successful request

# First pass over every genre.
for genre in genreList:
  data = pyTrendsGenre(genre)
  if data is not None:
    trendData.append(data)
    print(data)
    time.sleep(requestPause)
  else:
    failedGenres.append(genre)

# Retry passes: each round re-requests only the genres that are still failing.
for i in range(retry):
  genresFailedOnRetry = []
  for genre in failedGenres:
    data = pyTrendsGenre(genre)
    if data is not None:
      trendData.append(data)
      print(data)
      # Pace successful retries like the first pass; firing them back-to-back
      # invites the same rate limiting the retry is meant to recover from.
      time.sleep(requestPause)
    else:
      genresFailedOnRetry.append(genre)
  failedGenres = genresFailedOnRetry
  if not failedGenres:
    break
  # No point waiting after the final round - nothing is requested afterwards.
  if i < retry - 1:
    time.sleep(waitTime)

if failedGenres:
  print("Final failed genres after all retries:", failedGenres)

if not trendData:
  # The plotting and ANOVA code that follows needs allData and trend; fail here
  # with a clear message instead of a NameError further down.
  raise RuntimeError("No Google Trends data was retrieved; cannot build the seasonal trend tables.")

def monthToSeason(month):
  """Map a month number to a season code.

  Returns 1 for winter (12, 1, 2), 2 for spring (3-5), 3 for summer (6-8),
  4 for autumn (9-11), and None for any other value.
  """
  if month in [12, 1, 2]: #winter = 1
    return 1
  elif month in [3, 4, 5]: #spring = 2
    return 2
  elif month in [6, 7, 8]: #summer = 3
    return 3
  elif month in [9, 10, 11]: #autumn = 4
    return 4

# Stack the per-genre frames and turn the index into a regular column.
# reset_index is expected to yield a 'date' column, which the next line reads.
allData = pd.concat(trendData)
allData = allData.reset_index()
allData['date'] = pd.to_datetime(allData['date'])
allData['month'] = (allData['date'].dt.month)
allData['season'] = allData['month'].apply(monthToSeason)
allData.to_csv('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/seasonalData.csv', index=False)

# NOTE(review): size() counts the number of rows per genre/season, not the
# interest 'value' itself - confirm this is the quantity the later plots
# (labelled 'Searches') are meant to show.
trend = allData.groupby(['genre', 'season']).size().reset_index(name='count')
print(trend)
trend.to_csv('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/seasonalTrend.csv', index=False)

# Grouped bar chart of the per-season 'count' from `trend`, one bar colour per genre.
barFig, barAx = plt.subplots(figsize=(10, 6))
sns.barplot(x='season', y='count', hue='genre', data=trend, ax=barAx)
barAx.set_title('Genre Trend Over Seasons')
barAx.set_xlabel('Season')
barAx.set_ylabel('Searches')
# Bars sit at positions 0..3 for season codes 1..4; label them by name.
barAx.set_xticks([0, 1, 2, 3])
barAx.set_xticklabels(['Winter', 'Spring', 'Summer', 'Autumn'])
# Legend goes outside the axes on the right so it does not cover the bars.
barAx.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
barFig.tight_layout()
barFig.savefig('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/seasonalTrendBarPlot.png')
plt.show()

# Mean interest value per genre and calendar month.
monthlyMeans = allData.groupby(['genre', 'month'])['value'].mean()
monthlyInterest = monthlyMeans.reset_index()
monthlyInterest.to_csv('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/monthlyInterest.csv', index=False)

# For each genre, pick the row of monthlyInterest with the highest mean value.
peakRowLabels = monthlyInterest.groupby('genre')['value'].idxmax()
peakMonth = monthlyInterest.loc[peakRowLabels].rename(
    columns={'month': 'peak_month', 'value': 'peak_value'}
)
peakMonth.to_csv('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/monthlyPeak.csv', index=False)

# ANOVA Test for each genre
def pvalueTest(pVal):
    """Turn a p-value into a one-sentence conclusion using a 0.05 threshold.

    Values strictly below 0.05 are reported as significant; everything else
    (including exactly 0.05) is reported as not significant.
    """
    isSignificant = pVal < 0.05
    return (
        "There is a statistically significant difference between seasons."
        if isSignificant
        else "No statistically significant difference between seasons."
    )

# ANOVA Test by season
# For every genre, run a one-way ANOVA comparing the interest values observed
# in each season.
results = []

#data for genres
for genre in trend['genre'].unique():
    # Use the raw rows from allData: the aggregated `trend` table has no
    # 'value' column. (Previously this loop read `genreData` and `counts`
    # before either was defined, which raises NameError on a fresh kernel.)
    genreData = allData[allData['genre'] == genre]

    seasonGroups = []

    #data for seasons
    for season in genreData['season'].unique():
      seasonValues = genreData[genreData['season'] == season]['value']
      if len(seasonValues) > 0:
        seasonGroups.append(seasonValues)

    # A comparison needs at least two seasons with data.
    if len(seasonGroups) > 1:
      fStat, pValAnova = f_oneway(*seasonGroups)
      result = [genre, fStat, pValAnova, pvalueTest(pValAnova)]
      results.append(result)

anova_results = pd.DataFrame(results, columns=['Genre', 'F-Statistic', 'P-Value', 'Conclusion'])
anova_results.to_csv('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/ANOVA.csv', index=False)

# One line per genre: mean interest value per date across the requested timeframe.
lineFig, lineAx = plt.subplots(figsize=(14, 8))
for genre in allData['genre'].unique():
  genreData = allData.loc[allData['genre'] == genre]
  if genreData.empty:
    continue
  genreGrouped = genreData.groupby('date')['value'].mean()
  lineAx.plot(genreGrouped.index, genreGrouped.values, label=genre)
lineAx.set_title('Interest Over Time for Movie Genres (2024)')
lineAx.set_xlabel('Date')
lineAx.set_ylabel('Interest')
# Legend is anchored outside the right edge, vertically centred.
lineAx.legend(loc='center left', bbox_to_anchor=(1, 0.5))
lineFig.tight_layout()
lineFig.savefig('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/genreTrendLinePlot.png')
plt.show()

# Season x genre grid of the 'count' column from `trend`, drawn as an annotated heatmap.
heatMapPivot = trend.pivot(index='season', columns='genre', values='count')
heatFig, heatAx = plt.subplots(figsize=(10, 6))
sns.heatmap(heatMapPivot, annot=True, cmap='coolwarm', fmt='.0f', ax=heatAx)
heatAx.set_title('Heatmap: Genre by Season')
heatAx.set_xlabel('Genre')
heatAx.set_ylabel('Season')
heatFig.tight_layout()
heatFig.savefig('/content/drive/MyDrive/DSA2025Spring/SeasonalTrendResults/heatmapSeasonal.png')
plt.show()