In [1]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

In [22]:
df = pd.DataFrame(pd.read_csv('merged.csv'))
df = df[df['Population'] > 0]
df['Count'] = df['Count'].fillna(0)
df.head()

Unnamed: 0,Race,Age,Gender,Location,Year,Count,Population
0,Asian,15–19,F,F4,2010,5.0,134.0
1,Asian,15–19,F,F4,2011,5.0,120.0
2,Asian,15–19,F,F4,2012,2.0,117.0
3,Asian,15–19,F,F4,2013,3.0,108.0
4,Asian,15–19,F,F4,2014,3.0,118.0


## First, 

In [115]:
poisson_res = smf.glm(formula='Count ~ C(Race) + C(Age) + C(Gender) + C(Location) + C(Year) -1', data=df, family=sm.families.Poisson(), 
              exposure=df.Population).fit()

In [116]:
df['mu'] = poisson_res.mu

In [117]:
# https://dius.com.au/2017/08/03/using-statsmodels-glms-to-model-beverage-consumption/

def response(row):
    "Calculate response observation for Cameron-Trivedi dispersion test"
    y = row['Count']
    m = row['mu']
    return ((y - m)**2 - y) / m

response_var = df.apply(response, axis=1)
df['response_var'] = response_var

ols_res = smf.ols('response_var ~ mu - 1', df).fit()
alpha = ols_res.params[0]

print('Overdispersion phi = ', 1.0/alpha)

Overdispersion phi =  13.481795676247309


In [118]:
res = smf.glm(formula='Count ~ C(Race) + C(Age) + C(Gender) + C(Location) + C(Year) -1', data=df, family=sm.families.NegativeBinomial(alpha=alpha), 
              exposure=df.Population).fit()
res.summary()

0,1,2,3
Dep. Variable:,Count,No. Observations:,13200.0
Model:,GLM,Df Residuals:,13132.0
Model Family:,NegativeBinomial,Df Model:,67.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-73478.0
Date:,"Wed, 10 Mar 2021",Deviance:,36140.0
Time:,17:23:37,Pearson chi2:,35100.0
No. Iterations:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Race)[Asian],-2.3671,0.024,-97.578,0.000,-2.415,-2.320
C(Race)[Black],-1.5080,0.023,-64.911,0.000,-1.553,-1.462
C(Race)[Hispanic],-2.0636,0.023,-88.330,0.000,-2.109,-2.018
C(Race)[White],-1.9028,0.023,-82.247,0.000,-1.948,-1.857
C(Age)[T.20–29],0.8646,0.009,98.357,0.000,0.847,0.882
C(Age)[T.30–39],0.6507,0.009,73.693,0.000,0.633,0.668
C(Age)[T.40–49],0.4085,0.009,45.760,0.000,0.391,0.426
C(Age)[T.50+],-0.3994,0.009,-43.935,0.000,-0.417,-0.382
C(Gender)[T.M],0.7844,0.005,143.209,0.000,0.774,0.795


In [119]:
for race in ['Asian', 'White', 'Black', 'Hispanic']:
    print('{} Coef = '.format(race) , res.params['C(Race)[{}]'.format(race)] - res.params['C(Race)[White]'])

Asian Coef =  -0.4643313097454096
White Coef =  0.0
Black Coef =  0.3948316581197282
Hispanic Coef =  -0.16076662747919035


In [120]:
for gender in ['M']:
    print('{} Coef = '.format(gender) , res.params['C(Gender)[T.{}]'.format(gender)])

M Coef =  0.7844337750162934


In [121]:
for age in ['20–29', '30–39', '40–49', '50+']:
    print('{} Coef = '.format(age) ,  res.params['C(Age)[T.{}]'.format(age)])

20–29 Coef =  0.864643384333091
30–39 Coef =  0.6506809553323125
40–49 Coef =  0.40847987929140556
50+ Coef =  -0.39943695173083493


In [106]:
res_df = pd.DataFrame({'Category': ['Asian', 'Black', 'Hispanic', 'M', '20-29', '30-39', '40-49', '50+'], 
                       'Coef': [-0.464, 0.395, -0.160, 0.784 ,0.865, 0.651, 0.408, -0.40],
                       'Stderr': [.024, .023, .023, .005, .009, .009, .009, .009]})

In [108]:
res_df['Rate'] = res_df['Coef'].apply(np.exp)

In [109]:
res_df

Unnamed: 0,Category,Coef,Stderr,Rate
0,Asian,-0.464,0.024,0.628764
1,Black,0.395,0.023,1.484384
2,Hispanic,-0.16,0.023,0.852144
3,M,0.784,0.005,2.190216
4,20-29,0.865,0.009,2.375006
5,30-39,0.651,0.009,1.917457
6,40-49,0.408,0.009,1.503807
7,50+,-0.4,0.009,0.67032


## Q3C
C. Give three distinct potential reasons for the racial disparity in stop rate as measured in part B.

1. Racial bias in policing -- law enforcement officers may be more likely to stop drivers who are Black compared to other races.
2. Uneven distribution of policing -- officers may be more present in urban regions, for example, or other regions with particular racial distributions.
3. re: 'Sundown Town' -- if you are Black in a predominantly White town, you might be particularly likely to be stopped, and if you are Black in a predominantly Black town, you might still be likely to be stopped (for reason number 2). 

In [122]:
df = pd.DataFrame(pd.read_csv('merged.csv'))
df = df[df['Population'] > 0]
df['Count'] = df['Count'].fillna(0)

In [125]:
df.to_csv('q3.csv')