In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import pandas as pd
import numpy as np

In [3]:
import w8_estimation as est 
import w8_LinearModel as lm
import w8_probit_ante as probit
import w8_logit_ante as logit

## Read data and preliminary data selection

In [40]:
data = pd.read_csv('ppcs_cc.csv')

data['intercept'] = 1

assert data.notnull().all().all()

print(f'All years are the same: \t{(data['year'] == 2011).all()}') # all variables are for the same year -> drop year as feature

print(f'Dummy trap in officer race: \t{data['omajother'].sum() + data['omajwhite'].sum() + data['omajhisp'].sum() + data['omajblack'].sum() == len(data)}')  # one-hot encoded -> drop one category to avoid multicollinearity

print(f'Dummy trap in civilian race: \t{data['sother'].sum() + data['swhite'].sum() + data['shisp'].sum() + data['sblack'].sum() == len(data)}')  # one-hot encoded -> drop one category to avoid multicollinearity

print(f'osplit is 0 for all obs: \t{data['osplit'].sum() == 0}')  # no variation -> drop feature

All years are the same: 	True
Dummy trap in officer race: 	True
Dummy trap in civilian race: 	True
osplit is 0 for all obs: 	True


In [41]:
data.sum()

sblack                     420
shisp                      386
swhite                    2808
sother                     185
smale                     2012
sage                    155797
sempl                     2642
sincome                   8224
spop                      5177
daytime                   2532
inctype_lin               7440
omajblack                  231
omajhisp                    91
omajwhite                 3433
omajother                   44
osplit                       0
sbehavior                  247
year                   7639789
anyuseofforce_coded         19
intercept                 3799
dtype: int64

## Data selection

In [None]:
x_labels = ['sblack', # dummy
 'shisp', # dummy
 'swhite', # dummy
 #'sother',
 'smale', # dummy
 'sage', # continuous -> squares does make sense
 'sempl', # dummy
 'sincome', # categorical with numerical interpretation -> squares does make sense
 'spop', # categorical with numerical interpretation -> squares does make sense
 'daytime', # categorical, non-numerical interpretation -> squares doesnt make sense
 'inctype_lin', # categorical
 'omajblack', # dummy
 'omajhisp', # dummy
 'omajwhite', # dummy
 #'omajother',
 #'osplit',
 #'year'
 'sbehavior', # dummy
 #'intercept'
 ]

y_label = 'anyuseofforce_coded'

scales = ['sage'] #, 'spop', 'sincome', 'spopsq', 'sincomesq', 'sagesq']
squares = [] # ['sage', 'spop', 'sincome']

for var in squares:
    sq_label = var + 'sq'
    data[sq_label] = data[var]**2
    x_labels += [sq_label]

for var in scales:
    #data[var] = (data[var] - data[var].mean())/data[var].std()
    data[var] = data[var]/10

In [43]:
data[x_labels][data['anyuseofforce_coded'] == 1]

Unnamed: 0,sblack,shisp,swhite,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajblack,omajhisp,omajwhite,sbehavior,intercept
6,1,0,0,1,2.8,0,1,4,1,1,0,0,1,1,1
70,0,1,0,1,3.0,1,1,4,0,2,0,0,1,1,1
354,1,0,0,1,4.0,0,3,2,1,2,0,0,1,0,1
419,0,0,1,1,2.3,1,1,2,1,2,0,0,1,1,1
1011,0,0,1,1,2.0,1,3,1,1,2,0,0,1,1,1
1091,0,0,1,1,3.4,0,1,1,0,2,0,0,1,1,1
1834,0,1,0,1,2.2,0,3,3,0,2,0,0,1,0,1
1958,0,1,0,1,4.0,1,1,1,1,1,0,0,1,0,1
2156,0,1,0,0,2.6,0,2,4,1,1,0,1,0,1,1
2558,1,0,0,1,3.2,1,2,1,1,1,0,0,1,0,1


In [39]:
data['sblackhisp'] = data['omajblack'] + data['omajhisp']
x_labels += ['sblackhisp']

In [44]:
y = data[y_label]
x = data[x_labels]

In [45]:
y = y.values
x = x.values

In [46]:
assert np.linalg.matrix_rank(x.T @ x) == x.shape[1]  # check for multicollinearity

## Linear Probability Model (LPM)

In [47]:
ols_results =  lm.estimate(y, x, robust_se=True)
ols_tab = lm.print_table((y_label, x_labels), ols_results, title='LPM results')
ols_tab

LPM results
Dependent variable: anyuseofforce_coded

R2 = 0.033
sigma2 = nan


Unnamed: 0,b_hat,se,t
sblack,0.0024,0.0068,0.35
shisp,0.0092,0.008,1.1562
swhite,0.0003,0.0058,0.0456
smale,0.0042,0.0022,1.8944
sage,-0.0016,0.0006,-2.5851
sempl,-0.0056,0.0031,-1.7903
sincome,0.001,0.0014,0.6975
spop,0.0041,0.0021,1.9532
daytime,-0.0017,0.0028,-0.6052
inctype_lin,-0.0281,0.0149,-1.8793


## Probit

In [48]:
theta0 = probit.starting_values(y, x)
print(theta0)

[ 0.00599305  0.02301672  0.00066487  0.01054008 -0.00393337 -0.0139809
  0.00237788  0.01027833 -0.00428795 -0.07021874  0.00247763  0.01259336
  0.01603886  0.08837279  0.12981134]


In [49]:
ll = probit.loglikelihood(theta0, y, x)
ll

array([-0.71185754, -0.73049985, -0.71787324, ..., -0.68089747,
       -0.70839214, -0.68308383])

In [50]:
probit_results = est.estimate(probit.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.022523
         Iterations: 136
         Function evaluations: 2304
         Gradient evaluations: 144


LinAlgError: Singular matrix

In [33]:
probit_tab = est.print_table(x_labels, probit_results, title=f'Logit, y = {y_label}')
probit_tab

Optimizer succeeded after 138 iter. (2130 func. evals.). Final criterion:  0.02308.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
sblack,-0.1024,0.6125,-0.1672
shisp,0.0879,0.598,0.1469
swhite,-0.3092,0.5417,-0.5708
smale,0.4611,0.3681,1.2529
sage,-1.5434,1.4585,-1.0582
sempl,-0.418,0.2552,-1.6376
sincome,0.0223,0.1648,0.1354
spop,0.1678,0.1649,1.0177
daytime,-0.1375,0.2938,-0.4679
inctype_lin,-0.9034,0.3268,-2.7641


## Logit

In [34]:
theta0 = logit.starting_values(y, x)
theta0

array([ 0.02667516,  0.05406723,  0.01740258,  0.01939621, -0.05216599,
       -0.02101786,  0.00660356,  0.01991159, -0.00700116, -0.06534744,
        0.08430279,  0.10236938,  0.10810062,  0.14509425])

In [35]:
logit_results = est.estimate(logit.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.023123
         Iterations: 186
         Function evaluations: 2850
         Gradient evaluations: 190


In [36]:
logit_tab = est.print_table(x_labels, logit_results, title=f'Logit, y = {y_label}')
logit_tab

Optimizer succeeded after 186 iter. (2850 func. evals.). Final criterion:  0.02312.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
sblack,-0.0472,1.4387,-0.0328
shisp,0.4674,1.3702,0.3411
swhite,-0.3915,1.2886,-0.3038
smale,0.9495,0.7998,1.1873
sage,-4.0428,3.2884,-1.2294
sempl,-0.9595,0.5669,-1.6925
sincome,0.0194,0.3595,0.0539
spop,0.4506,0.351,1.284
daytime,-0.3948,0.6698,-0.5894
inctype_lin,-1.8666,0.7296,-2.5584


## Partial effects

In [20]:
average_joe = x.mean(axis=0)

In [21]:
b_lpm = ols_tab.b_hat.values
b_probit = probit_tab.theta.values
b_logit = logit_tab.theta.values

In [22]:
df_pe_av = pd.DataFrame(columns=['variable', 'PE_LPM', 'PE_Probit', 'PE_Logit'])

for var in x_labels:

    idx = x_labels.index(var)
    
    # LPM
    x1 = average_joe.copy()
    x0 = average_joe.copy()

    x1[idx] = 1
    x0[idx] = 0

    pred1 = lm.predict(b_lpm, x1)
    pred0 = lm.predict(b_lpm, x0)

    pe_lpm = pred1 - pred0
    
    # Probit
    pred1 = probit.predict(b_probit, x1)
    pred0 = probit.predict(b_probit, x0)
    pe_probit = pred1 - pred0
    
    # Logit
    pred1 = logit.predict(b_logit, x1)
    pred0 = logit.predict(b_logit, x0)
    pe_logit = pred1 - pred0
    
    df_pe_av = pd.concat([df_pe_av, pd.DataFrame({'variable': [var], 'PE_LPM': [pe_lpm], 'PE_Probit': [pe_probit], 'PE_Logit': [pe_logit]})], ignore_index=True)

  df_pe_av = pd.concat([df_pe_av, pd.DataFrame({'variable': [var], 'PE_LPM': [pe_lpm], 'PE_Probit': [pe_probit], 'PE_Logit': [pe_logit]})], ignore_index=True)


In [23]:
from scipy.stats import norm

norm.cdf(average_joe2 @ b_probit) - norm.cdf(average_joe @ b_probit)
logit.G(average_joe2 @ b_probit) - logit.G(average_joe @ b_probit)

NameError: name 'average_joe2' is not defined

In [57]:
df_pe_av

Unnamed: 0,variable,PE_LPM,PE_Probit,PE_Logit
0,sblack,0.0067,-0.000284,-0.00011
1,shisp,0.0137,0.000342,0.000834
2,swhite,0.0044,-0.001184,-0.000548
3,smale,0.0048,0.001436,0.001312
4,sage,-0.0013,-0.00228,-0.002322
5,sempl,-0.0053,-0.001734,-0.001602
6,sincome,0.0017,6.4e-05,2.3e-05
7,spop,0.005,0.000324,0.000416
8,daytime,-0.0017,-0.00043,-0.00055
9,inctype_lin,-0.0165,-0.078247,-0.043599
