In [2]:
import pathlib, os, subprocess, sys
REPO = "/content/DSA210-Term-Project"

if not pathlib.Path(REPO).exists():
    !git clone -q https://github.com/batuhanbaydr/DSA210-Term-Project.git {REPO}

%cd {REPO}
!pip -q install pandas seaborn matplotlib pyarrow

/content/DSA210-Term-Project


In [3]:
import pandas as pd
from scipy.stats import chi2_contingency

# Processed dataset, addressed relative to the repository root.
FILE = "data/processed/oscars_plus_bechdel_pol.parquet"

# Load the full table once; every test below reads from this frame.
df = pd.read_parquet(path=FILE)

#### Test 1 – Bechdel pass vs Winner  
*H₀:* Bechdel outcome and winning are independent.  
*H₁:* Bechdel outcome and winning are **not** independent.  
*Note:* films with no Bechdel rating are counted as not passing.

In [4]:
# Test 1: cross-tabulate winner status against the Bechdel outcome.
# A missing Bechdel rating is counted as "did not pass" (False).
# The column is cast to pandas' nullable "boolean" dtype *before* filling:
# calling .fillna(False) directly on an object-dtype column (which this one
# appears to be) emits the deprecated silent-downcasting FutureWarning.
tbl1 = pd.crosstab(
    df["winner"],
    df["bechdel_pass"].astype("boolean").fillna(False).astype(bool),
)
# For a 2x2 table chi2_contingency applies Yates' continuity correction by default.
chi2, p, _, _ = chi2_contingency(tbl1)
print("Contingency table:\n", tbl1)
print(f"χ² = {chi2:.2f} | p = {p:.4g}")


  tbl1 = pd.crosstab(df["winner"], df["bechdel_pass"].fillna(False))


Contingency table:
 bechdel_pass  False  True 
winner                    
False          6438   1944
True           1879    595
χ² = 0.74 | p = 0.3907


#### Test 2 – Political theme vs Winner  
*H₀:* Political theme and winning are independent.  
*H₁:* Political theme and winning are **not** independent.

In [5]:
# Test 2: contingency table of winner status (rows) by political theme (columns).
tbl2 = pd.crosstab(index=df["winner"], columns=df["political_theme"])

# Only the statistic and p-value are reported; dof and expected counts are unused.
chi2, p = chi2_contingency(tbl2)[:2]

print("Contingency table:\n", tbl2)
print(f"χ² = {chi2:.2f} | p = {p:.4g}")

Contingency table:
 political_theme  False  True 
winner                       
False             7945    437
True              2297    177
χ² = 13.12 | p = 0.0002916


#### Test 3 – Racial composition shift (Nominees)  
*H₀:* The race distribution of nominees is the same for ceremonies before 2000 and from 2000 onward.  
*H₁:* Race distribution differs between periods.

In [6]:
# Normalise the raw race labels into a simplified column:
# missing -> "Unknown", surrounding whitespace trimmed, and any label outside
# the tracked categories collapsed into "Other".
keep = ["White","Black","Asian","Hispanic","Unknown"]
race_clean = df["Race"].fillna("Unknown").str.strip()
df["Race_s"] = race_clean.where(race_clean.isin(keep), "Other")

# Split nominees by ceremony year: strictly before 2000 vs. 2000 and later.
is_pre_2000 = df["year_ceremony"] < 2000
is_2000_on = df["year_ceremony"] >= 2000
pre_nom = df.loc[is_pre_2000, "Race_s"]
post_nom = df.loc[is_2000_on, "Race_s"]

# Build one period label per row, in the same order as the concatenated values.
labels = ["pre 2000"] * len(pre_nom) + ["2000+"] * len(post_nom)
race_by_period = pd.concat([pre_nom, post_nom], ignore_index=True)
tbl3 = pd.crosstab(labels, race_by_period)

# Only the statistic and p-value are reported.
chi2, p = chi2_contingency(tbl3)[:2]

print("Contingency table (Nominees):\n", tbl3)
print(f"χ² = {chi2:.2f} | p = {p:.4g}")

Contingency table (Nominees):
 Race_s    Asian  Black  Hispanic  White
row_0                                  
2000+       130    107         9   2375
pre 2000    108     86        17   8024
χ² = 235.16 | p = 1.059e-50


#### Test 4 – Racial composition shift (Winners)  
*H₀:* The race distribution of winners is the same for ceremonies before 2000 and from 2000 onward.  
*H₁:* Race distribution differs between periods.


In [7]:
# Test 4: same period comparison as Test 3, restricted to winning rows.
# Relies on the simplified df["Race_s"] column created in the Test 3 cell.
is_winner = df["winner"]
pre_w = df.loc[is_winner & (df["year_ceremony"] < 2000), "Race_s"]
post_w = df.loc[is_winner & (df["year_ceremony"] >= 2000), "Race_s"]

# Build one period label per row, in the same order as the concatenated values.
labels = ["pre 2000"] * len(pre_w) + ["2000+"] * len(post_w)
winners_by_period = pd.concat([pre_w, post_w], ignore_index=True)
tbl4 = pd.crosstab(labels, winners_by_period)

# Only the statistic and p-value are reported.
chi2, p = chi2_contingency(tbl4)[:2]

print("Contingency table (Winners):\n", tbl4)
print(f"χ² = {chi2:.2f} | p = {p:.4g}")

Contingency table (Winners):
 Race_s    Asian  Black  Hispanic  White
row_0                                  
2000+        31     36         6    515
pre 2000     21     17         3   1845
χ² = 107.99 | p = 2.971e-23
