In [1]:
import os
import re
import time
from datetime import datetime

In [2]:
import numpy as np
import pandas as pd

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# change dir
# All later cells use paths relative to the project folder ("data/...").
# Set the OSCARS_PROJECT_DIR environment variable to run this notebook from another
# checkout; when it is unset the original hard-coded location is used.
project_dir = os.environ.get(
    "OSCARS_PROJECT_DIR",
    "/Users/ben-tanen/Desktop/Projects/bt-website/code/projects/oscars-best-picture/",
)
os.chdir(project_dir)

In [5]:
# init selenium
# Selenium 4 takes the chromedriver location through a Service object; passing the
# path as the first positional argument is the deprecated (and later removed) form.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

  driver = webdriver.Chrome(ChromeDriverManager().install())


In [6]:
# How many Metacritic result pages to walk per release year, i.e. how deep we must go
# to reach the lowest scoring nominee of that year.
# Page counts assume 24 films per page on the Metacritic site.
_pages_by_year = (
    (2000, 6), (2001, 6), (2002, 4), (2003, 4),
    (2004, 6), (2005, 7), (2006, 6), (2007, 2),
    (2008, 9), (2009, 11), (2010, 4), (2011, 18),
    (2012, 10), (2013, 5), (2014, 7), (2015, 5),
    (2016, 10), (2017, 7), (2018, 21), (2019, 15),
    (2020, 7), (2021, 19), (2022, 13), (2023, 5),
)

# one {"year", "max_page"} record per year, in chronological order
yr_depth = [{"year": year, "max_page": pages} for year, pages in _pages_by_year]

In [7]:
# get all films from all necessary pages
def _parse_film_card(card):
    """Pull the title, release date and Metascore out of one result card.

    `card` is an element matched by the 'c-finderProductCard_info' XPath used below;
    the three lookups here are relative to it. The release date text is parsed with
    the "%b %d, %Y" format and the score text is converted with int(), so a card
    whose text does not fit either raises ValueError.
    """
    title_el = card.find_element(By.XPATH, ".//div[@class='c-finderProductCard_title']")
    date_el = card.find_element(By.XPATH, ".//div[@class='c-finderProductCard_meta']/span")
    score_el = card.find_element(By.XPATH, ".//div[contains(@title, 'Metascore ')]/span")
    return {
        "title": title_el.get_attribute("data-title"),
        "release_date": datetime.strptime(date_el.text, "%b %d, %Y"),
        "metacritic_score": int(score_el.text),
    }


# make sure the output folder exists up front, so the save below cannot fail
# on a missing directory after every page has already been scraped
os.makedirs("data/by-year", exist_ok = True)

# NOTE: only the last entry of yr_depth is scraped here; the per-year CSVs for the
# other years are read from disk when the years are concatenated, so they must already exist.
for yr in yr_depth[-1:]:
    yr_films = []
    for p in range(1, yr["max_page"] + 1):
        driver.get(f"https://www.metacritic.com/browse/movie/all/all/{yr['year']}/metascore/?page={p}")
        films = driver.find_elements(By.XPATH, "//div[contains(@class, 'c-finderProductCard_info')]")
        print(f"{yr['year']} p.{p} of {yr['max_page']} - {len(films)} films captured")
        yr_films += [_parse_film_card(f) for f in films]
    # save partials (in case of crash)
    pd.DataFrame(yr_films).to_csv(f"data/by-year/metacritic-topmovies-{yr['year']}.csv", index = False)

2023 p.1 of 5 - 24 films captured
2023 p.2 of 5 - 24 films captured
2023 p.3 of 5 - 24 films captured
2023 p.4 of 5 - 24 films captured
2023 p.5 of 5 - 24 films captured


In [75]:
# concat all years together
all_films_df = pd.concat([pd.read_csv(f"data/by-year/metacritic-topmovies-{yr['year']}.csv") for yr in yr_depth])

# Drop re-release listings. The match is a literal substring (regex = False), and
# na = False treats a title that read_csv parsed as missing as "not a re-release";
# without it the mask would contain NaN and the ~ negation would raise.
is_rerelease = all_films_df["title"].str.contains("re-release", regex = False, na = False)

# filter and renumber first so the column assignment below lands on a fresh frame
all_films_df = all_films_df[~is_rerelease].reset_index(drop = True)
all_films_df["release_date"] = pd.to_datetime(all_films_df["release_date"], format = "%Y-%m-%d")
all_films_df

Unnamed: 0,title,release_date,metacritic_score
0,Yi Yi,2000-10-06,94
1,"Crouching Tiger, Hidden Dragon",2000-12-08,94
2,Beau Travail,2000-03-31,91
3,Almost Famous,2000-09-13,90
4,Chicken Run,2000-06-21,88
...,...,...,...
4931,Cassandro,2023-09-15,76
4932,Nostalgia,2023-01-27,76
4933,American Symphony,2023-11-24,76
4934,How to Blow Up a Pipeline,2023-04-07,76


In [76]:
# by default a film's Oscar year ("year") is simply the calendar year of its release
all_films_df["release_year"] = all_films_df["release_date"].dt.year
all_films_df["year"] = all_films_df["release_year"]

# manual overrides for films whose Oscar year differs from their release year;
# each entry is matched on title AND release year before "year" is rewritten
oscar_year_mods = [
    {"title": "The Father", "release_year": 2021, "oscar_year": 2020},
    {"title": "Judas and the Black Messiah", "release_year": 2021, "oscar_year": 2020},
]
for mod in oscar_year_mods:
    same_title = all_films_df["title"].eq(mod["title"])
    same_release_year = all_films_df["release_year"].eq(mod["release_year"])
    all_films_df.loc[same_title & same_release_year, "year"] = mod["oscar_year"]

In [77]:
# sanity check: show the films whose assigned Oscar year is not their release year
all_films_df[all_films_df["release_year"] != all_films_df["year"]]

Unnamed: 0,title,release_date,metacritic_score,release_year,year
4063,The Father,2021-02-26,88,2021,2020
4083,Judas and the Black Messiah,2021-02-12,85,2021,2020


In [78]:
# add ranking based on year + score
# Rank within each Oscar year by score (1 = highest score). The rows are sorted by score
# before counting instead of trusting the existing row order: a film whose "year" was
# reassigned still sits among the rows of its release year, so a plain cumcount would
# number it last in its new year no matter what it scored.
# mergesort is stable, so films with equal scores keep their current relative order, and
# assigning the result back aligns on the index, leaving the frame's row order untouched.
films_by_score = all_films_df.sort_values("metacritic_score", ascending = False, kind = "mergesort")
all_films_df["metacritic_rank"] = films_by_score.groupby("year").cumcount() + 1

# add tie rankings
# position of each film among the films sharing its year and score, in row order
all_films_df["tie_rank"] = all_films_df.groupby(["year", "metacritic_score"]).cumcount() + 1

In [79]:
# load the nominee list from disk and preview its first rows
noms_path = "data/oscar-noms.csv"
noms_df = pd.read_csv(noms_path)
noms_df.head()

Unnamed: 0,title,year,winner
0,"Crouching Tiger, Hidden Dragon",2000,0
1,Traffic,2000,0
2,Erin Brockovich,2000,0
3,Gladiator,2000,1
4,Chocolat,2000,0


In [80]:
# attach the nominee/winner flag to the film table, matching on title + Oscar year;
# the outer join keeps rows from both sides, so a nominee with no Metacritic match
# still appears (with empty Metacritic columns)
all_films_plus_df = pd.merge(all_films_df, noms_df, how = "outer", on = ["title", "year"])
all_films_plus_df.head()

Unnamed: 0,title,release_date,metacritic_score,release_year,year,metacritic_rank,tie_rank,winner
0,Yi Yi,2000-10-06,94,2000,2000,1,1,
1,"Crouching Tiger, Hidden Dragon",2000-12-08,94,2000,2000,2,2,0.0
2,Beau Travail,2000-03-31,91,2000,2000,3,1,
3,Almost Famous,2000-09-13,90,2000,2000,4,1,
4,Chicken Run,2000-06-21,88,2000,2000,5,1,


In [81]:
# rows with no metacritic_rank are nominees that found no Metacritic match in the merge;
# display them so they can be fixed by hand
has_no_rank = all_films_plus_df["metacritic_rank"].isna()
all_films_plus_df.loc[has_no_rank]

Unnamed: 0,title,release_date,metacritic_score,release_year,year,metacritic_rank,tie_rank,winner


In [82]:
# add clean oscar fields
# "winner" is only populated for rows that matched the nominee list: non-null means
# nominated, and a value of 1 means the film won
all_films_plus_df["oscars_nom"] = np.where(~all_films_plus_df["winner"].isna(), 1, 0)
all_films_plus_df["oscars_win"] = np.where(all_films_plus_df["winner"] == 1, 1, 0)

# Per-year reference ranks, repeated on every row of that year:
#   min_oscar_nom_rank - rank of the lowest-placed nominee (the numerically largest rank)
#   oscar_win_rank     - rank of the winner
# Ranks of films that are not nominees / winners are blanked out first, then the per-year
# max (which skips the blanks) is broadcast with transform. This gives the same values as
# filling within groups, without the deprecated GroupBy.fillna.
year_key = all_films_plus_df["year"]
nominee_ranks = all_films_plus_df["metacritic_rank"].where(all_films_plus_df["oscars_nom"] == 1)
winner_ranks = all_films_plus_df["metacritic_rank"].where(all_films_plus_df["oscars_win"] == 1)
all_films_plus_df["min_oscar_nom_rank"] = nominee_ranks.groupby(year_key).transform("max")
all_films_plus_df["oscar_win_rank"] = winner_ranks.groupby(year_key).transform("max")

all_films_plus_df.head()

Unnamed: 0,title,release_date,metacritic_score,release_year,year,metacritic_rank,tie_rank,winner,oscars_nom,oscars_win,min_oscar_nom_rank,oscar_win_rank
0,Yi Yi,2000-10-06,94,2000,2000,1,1,,0,0,126.0,110.0
1,"Crouching Tiger, Hidden Dragon",2000-12-08,94,2000,2000,2,2,0.0,1,0,126.0,110.0
2,Beau Travail,2000-03-31,91,2000,2000,3,1,,0,0,126.0,110.0
3,Almost Famous,2000-09-13,90,2000,2000,4,1,,0,0,126.0,110.0
4,Chicken Run,2000-06-21,88,2000,2000,5,1,,0,0,126.0,110.0


In [84]:
# write the final table: film identifiers, score and rank, Oscar flags,
# the per-year reference ranks, and the tie position
export_cols = [
    "title", "year", "metacritic_score", "metacritic_rank",
    "oscars_nom", "oscars_win",
    "min_oscar_nom_rank", "oscar_win_rank", "tie_rank",
]
all_films_plus_df[export_cols].to_csv("data/metacritic-topmovies-byyear-2000to2023.csv", index = False)