# Chapter 1: Correlation, Association, and the Yule-Simpson Paradox

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.formula.api as smf

# viz
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


%load_ext autoreload
%autoreload 1

%load_ext watermark
%watermark --iversions

matplotlib       : 3.8.0
numpy            : 1.23.5
statsmodels      : 0.13.5
pandas           : 2.1.1
scipy            : 1.11.3
matplotlib_inline: 0.1.6



## Unadjusted and Adjusted Regression
Lalonde Observational Data

In [2]:
# read CPS data
dat = pd.read_table("cps1re74.csv", delimiter=" ")
dat["u74"] = np.where(dat["re74"] == 0, 1, 0)
dat["u75"] = np.where(dat["re75"] == 0, 1, 0)

Unadjusted regression

In [3]:
(
    smf.ols(f"re78 ~ treat", data=dat)
    .fit()  # vcov = "HC2" for robust SE
    .summary()
    .tables[1]
    .data[2][:5]
)
# %%

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)


['treat', '-8506.4954', '  712.766', '  -11.934', ' 0.000']

Adjusted Regression

In [4]:
rhs = list(set(dat.columns) - {"re78", "treat"})
(
    smf.ols(f're78 ~ treat + {"+".join(rhs)}', data=dat)
    .fit(vcov="HC1")
    .summary()
    .tables[1]
    .data[2][:5]
)

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)


['treat', ' 1067.5461', '  554.060', '    1.927', ' 0.054']

## Fisher's exact test for contingency tables
Bertrand and Mullainathan (2004) experiment 

In [5]:
resume = pd.read_csv("resume.csv")
print(xtab := pd.crosstab(resume["race"], resume["call"]))

call      0    1
race            
black  2278  157
white  2200  235


In [6]:
sp.stats.fisher_exact(xtab)

SignificanceResult(statistic=1.5498841922408801, pvalue=4.758747107909523e-05)

## simpson's paradox 
UCB admissions data

In [7]:
from rdatasets import data

ucb = data("UCBAdmissions")
ucb.head()

Unnamed: 0,Admit,Gender,Dept,Freq
0,Admitted,Male,A,512
1,Rejected,Male,A,313
2,Admitted,Female,A,89
3,Rejected,Female,A,19
4,Admitted,Male,B,353


In [8]:
(two_by_two := ucb.groupby(["Gender", "Admit"])["Freq"].sum().unstack())

Admit,Admitted,Rejected
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,557,1278
Male,1198,1493


In [9]:
vals = two_by_two.values

In [10]:
def risk_difference(tb2):
    vals = tb2.values
    denom = vals.sum(axis=1)
    p0 = vals[0, :] / denom[0]
    p1 = vals[1, :] / denom[1]
    return {"p.diff": (p1 - p0)[0], "pv": sp.stats.chi2_contingency(vals).pvalue}

In [11]:
risk_difference(two_by_two)

{'p.diff': 0.14164542824654186, 'pv': 1.0557968087828395e-21}

In [12]:
ucb.head()

Unnamed: 0,Admit,Gender,Dept,Freq
0,Admitted,Male,A,512
1,Rejected,Male,A,313
2,Admitted,Female,A,89
3,Rejected,Female,A,19
4,Admitted,Male,B,353


In [13]:
for d in list(ucb.Dept.unique()):
    twoby2_d = pd.pivot_table(
        ucb.loc[ucb.Dept == d, ["Gender", "Admit", "Freq"]],
        index="Gender",
        columns="Admit",
        values="Freq",
    )
    print(d, risk_difference(twoby2_d))

A {'p.diff': -0.20346801346801346, 'pv': 5.205468345876081e-05}
B {'p.diff': -0.04964285714285721, 'pv': 0.7705040532055736}
C {'p.diff': 0.028589959787261643, 'pv': 0.4261752614199229}
D {'p.diff': -0.01839808153477218, 'pv': 0.6378282691267924}
E {'p.diff': 0.0383011603586321, 'pv': 0.3686980945973032}
F {'p.diff': -0.011399998427586433, 'pv': 0.6403816651785297}
