In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm

import sys
sys.path.append("py")

In [3]:
import w8_estimation as est 
import w8_LinearModel as lm
import w8_probit_ante as probit
import w8_logit_ante as logit

## Read data and preliminary data selection

In [4]:
data = pd.read_csv('ppcs_cc.csv')

data['intercept'] = 1

assert data.notnull().all().all()

print(f'All years are the same: \t{(data['year'] == 2011).all()}') # all variables are for the same year -> drop year as feature

print(f'Dummy trap in officer race: \t{data['omajother'].sum() + data['omajwhite'].sum() + data['omajhisp'].sum() + data['omajblack'].sum() == len(data)}')  # one-hot encoded -> drop one category to avoid multicollinearity

print(f'Dummy trap in civilian race: \t{data['sother'].sum() + data['swhite'].sum() + data['shisp'].sum() + data['sblack'].sum() == len(data)}')  # one-hot encoded -> drop one category to avoid multicollinearity

print(f'osplit is 0 for all obs: \t{data['osplit'].sum() == 0}')  # no variation -> drop feature

All years are the same: 	True
Dummy trap in officer race: 	True
Dummy trap in civilian race: 	True
osplit is 0 for all obs: 	True


In [5]:
data.sum()

sblack                     420
shisp                      386
swhite                    2808
sother                     185
smale                     2012
sage                    155797
sempl                     2642
sincome                   8224
spop                      5177
daytime                   2532
inctype_lin               7440
omajblack                  231
omajhisp                    91
omajwhite                 3433
omajother                   44
osplit                       0
sbehavior                  247
year                   7639789
anyuseofforce_coded         19
intercept                 3799
dtype: int64

## Data selection

In [6]:
x_labels = ['sblack', # dummy
 'shisp', # dummy
 'swhite', # dummy
 #'sother',
 'smale', # dummy
 'sage', # continuous -> squares does make sense
 'sempl', # dummy
 'sincome', # categorical with numerical interpretation -> squares does make sense
 'spop', # categorical with numerical interpretation -> squares does make sense
 'daytime', # categorical, non-numerical interpretation -> squares doesnt make sense
 'inctype_lin', # categorical
 #'omajblack', # dummy
 #'omajhisp', # dummy
 'omajwhite', # dummy
 #'omajother',
 #'osplit',
 #'year'
 'sbehavior', # dummy
 #'intercept'
 ]

y_label = 'anyuseofforce_coded'

scales = ['sage'] #, 'spop', 'sincome', 'spopsq', 'sincomesq', 'sagesq']
squares = [] # ['sage', 'spop', 'sincome']

for var in squares:
    sq_label = var + 'sq'
    data[sq_label] = data[var]**2
    x_labels += [sq_label]

for var in scales:
    #data[var] = (data[var] - data[var].mean())/data[var].std()
    data[var] = data[var]/10

In [7]:
data[x_labels][data['anyuseofforce_coded'] == 1]

Unnamed: 0,sblack,shisp,swhite,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajwhite,sbehavior
6,1,0,0,1,2.8,0,1,4,1,1,1,1
70,0,1,0,1,3.0,1,1,4,0,2,1,1
354,1,0,0,1,4.0,0,3,2,1,2,1,0
419,0,0,1,1,2.3,1,1,2,1,2,1,1
1011,0,0,1,1,2.0,1,3,1,1,2,1,1
1091,0,0,1,1,3.4,0,1,1,0,2,1,1
1834,0,1,0,1,2.2,0,3,3,0,2,1,0
1958,0,1,0,1,4.0,1,1,1,1,1,1,0
2156,0,1,0,0,2.6,0,2,4,1,1,0,1
2558,1,0,0,1,3.2,1,2,1,1,1,1,0


In [8]:
data['omajblackhisp'] = data['omajblack'] + data['omajhisp']
x_labels += ['omajblackhisp']

In [9]:
y = data[y_label]
x = data[x_labels]

In [10]:
y = y.values
x = x.values

In [11]:
assert np.linalg.matrix_rank(x.T @ x) == x.shape[1]  # check for multicollinearity

## Linear Probability Model (LPM)

In [12]:
ols_results =  lm.estimate(y, x, robust_se=True)
ols_tab = lm.print_table((y_label, x_labels), ols_results, title='LPM results')
ols_tab

LPM results
Dependent variable: anyuseofforce_coded

R2 = 0.031
sigma2 = nan


Unnamed: 0,b_hat,se,t
sblack,0.0067,0.0076,0.8799
shisp,0.0137,0.0088,1.5629
swhite,0.0044,0.006,0.7273
smale,0.0048,0.0022,2.1556
sage,-0.0013,0.0006,-2.0825
sempl,-0.0053,0.0031,-1.6737
sincome,0.0017,0.0014,1.1998
spop,0.005,0.0022,2.2966
daytime,-0.0017,0.0028,-0.6008
inctype_lin,-0.0165,0.0088,-1.8606


## Probit

In [13]:
theta0 = probit.starting_values(y, x)
print(theta0)

[ 0.01663346  0.0343133   0.01093297  0.01209689 -0.00323519 -0.01314457
  0.00417925  0.01247935 -0.00428019 -0.04114283  0.0677402   0.09092308
  0.05604228]


In [14]:
ll = probit.loglikelihood(theta0, y, x)
ll

array([-0.70983251, -0.73620633, -0.72182501, ..., -0.68060296,
       -0.71655806, -0.68415684])

In [15]:
probit_results = est.estimate(probit.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.023126
         Iterations: 94
         Function evaluations: 1428
         Gradient evaluations: 102


In [16]:
probit_tab = est.print_table(x_labels, probit_results, title=f'Logit, y = {y_label}')
probit_tab

Optimizer succeeded after 94 iter. (1428 func. evals.). Final criterion:  0.02313.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
sblack,-0.1078,0.6129,-0.1759
shisp,0.1004,0.5975,0.168
swhite,-0.3052,0.5413,-0.5637
smale,0.463,0.369,1.2547
sage,-0.1547,0.1458,-1.0606
sempl,-0.4223,0.2558,-1.6513
sincome,0.0241,0.1645,0.1467
spop,0.1715,0.1654,1.0366
daytime,-0.133,0.2952,-0.4505
inctype_lin,-0.9137,0.3262,-2.8012


## Logit

In [17]:
theta0 = logit.starting_values(y, x)
theta0

array([ 0.02661353,  0.05490128,  0.01749275,  0.01935503, -0.00517631,
       -0.02103131,  0.0066868 ,  0.01996697, -0.00684831, -0.06582853,
        0.10838432,  0.14547692,  0.08966765])

In [18]:
logit_results = est.estimate(logit.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.023197
         Iterations: 130
         Function evaluations: 1848
         Gradient evaluations: 132


In [19]:
logit_tab = est.print_table(x_labels, logit_results, title=f'Logit, y = {y_label}')
logit_tab

Optimizer succeeded after 130 iter. (1848 func. evals.). Final criterion:   0.0232.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
sblack,-0.0859,1.4436,-0.0595
shisp,0.5083,1.3714,0.3706
swhite,-0.3755,1.2893,-0.2912
smale,0.9782,0.8015,1.2204
sage,-0.4061,0.3295,-1.2323
sempl,-0.9633,0.5677,-1.6969
sincome,0.0179,0.3586,0.0499
spop,0.4629,0.3508,1.3194
daytime,-0.3859,0.6727,-0.5736
inctype_lin,-1.8959,0.7248,-2.6158


## Partial effects of the average and delta method

In [54]:
average_joe = x.mean(axis=0)

In [55]:
b_lpm = ols_tab.b_hat.values
b_probit = probit_tab.theta.values
b_logit = logit_tab.theta.values

cov_lpm = ols_results.get('cov')
cov_probit = probit_results.get('cov')
cov_logit = logit_results.get('cov')

In [56]:
x_labels_dummy = ['sblack',
 'shisp',
 'swhite',
 'smale',
 #'sage',
 'sempl',
 #'sincome',
 #'spop',
 #'daytime',
 #'inctype_lin',
 'omajwhite',
 'sbehavior',
 'omajblackhisp']

In [77]:
df_pe_av = pd.DataFrame(columns=['variable', 'PE_LPM', 'PE_Probit', 'PE_Logit'])
df_pe_av_se = pd.DataFrame(columns=['variable', 'PE_LPM', 'PE_Probit', 'PE_Logit'])

for var in x_labels:

    if var == 'daytime': continue

    idx = x_labels.index(var)
    
    x1 = average_joe.copy()
    x0 = average_joe.copy()

    # categorical variables
    if var not in x_labels_dummy:

        # special case for age as it is scaled
        if var == 'sage': 
            x0[idx] = np.floor(average_joe[idx]*10)/10 # mitigate earlier scaling
            x1[idx] = x0[idx] + 0.1

        else:
            x0[idx] = np.floor(average_joe[idx])
            x1[idx] = x0[idx] + 1

    # dummy variables
    else:
        x1[idx] = 1
        x0[idx] = 0
    
    # Probit
    pred1 = probit.predict(b_probit, x1)
    pred0 = probit.predict(b_probit, x0)
    pe_probit = pred1 - pred0

    # Probit: se
    g_probit = norm.pdf(x1 * b_probit)*x1 - norm.pdf(x0 * b_probit)*x0
    avar_probit = g_probit @ cov_probit @ g_probit.T

    # Logit
    pred1 = logit.predict(b_logit, x1)
    pred0 = logit.predict(b_logit, x0)
    pe_logit = pred1 - pred0
    
    # Logit: se
    g = pred1*(1-pred1)*x1 - pred0*(1 - pred0)*x0
    avar_logit = g @ cov_logit @ g.T
    
    # Save
    df_pe_av = pd.concat([df_pe_av, pd.DataFrame({'variable': [var], 'PE_LPM': [ols_tab['b_hat'][var]], 'PE_Probit': [pe_probit], 'PE_Logit': [pe_logit]})], ignore_index=True)
    df_pe_av_se = pd.concat([df_pe_av_se, pd.DataFrame({'variable': [var], 'PE_LPM': [np.sqrt(cov_lpm[idx, idx])], 'PE_Probit': [np.sqrt(avar_probit)], 'PE_Logit': [np.sqrt(avar_logit)]})], ignore_index=True)

  df_pe_av = pd.concat([df_pe_av, pd.DataFrame({'variable': [var], 'PE_LPM': [ols_tab['b_hat'][var]], 'PE_Probit': [pe_probit], 'PE_Logit': [pe_logit]})], ignore_index=True)
  df_pe_av_se = pd.concat([df_pe_av_se, pd.DataFrame({'variable': [var], 'PE_LPM': [np.sqrt(cov_lpm[idx, idx])], 'PE_Probit': [np.sqrt(avar_probit)], 'PE_Logit': [np.sqrt(avar_logit)]})], ignore_index=True)


In [78]:
new_df = df_pe_av.select_dtypes(include='number') / df_pe_av_se.select_dtypes(include='number')
new_df

Unnamed: 0,PE_LPM,PE_Probit,PE_Logit
0,0.886087,-0.00117,-0.062069
1,1.560005,0.001442,0.287366
2,0.731715,-0.005742,-0.271737
3,2.138341,0.010859,1.352279
4,-2.09204,-0.016272,-1.15806
5,-1.68709,-0.018584,-1.272985
6,1.220094,0.001135,0.049875
7,2.300395,0.009409,1.18115
8,-1.865439,-0.310216,-1.281949
9,2.221866,-0.013272,-0.320558


In [79]:
df_pe_av[['PE_LPM', 'PE_Probit', 'PE_Logit']].values / df_pe_av_se[['PE_LPM', 'PE_Probit', 'PE_Logit']].values

array([[ 8.86086943e-01, -1.16979607e-03, -6.20691302e-02],
       [ 1.56000476e+00,  1.44212305e-03,  2.87365907e-01],
       [ 7.31715352e-01, -5.74221191e-03, -2.71736839e-01],
       [ 2.13834053e+00,  1.08589121e-02,  1.35227877e+00],
       [-2.09204043e+00, -1.62723356e-02, -1.15806048e+00],
       [-1.68709031e+00, -1.85840240e-02, -1.27298487e+00],
       [ 1.22009408e+00,  1.13543128e-03,  4.98748310e-02],
       [ 2.30039530e+00,  9.40850746e-03,  1.18114991e+00],
       [-1.86543945e+00, -3.10216261e-01, -1.28194917e+00],
       [ 2.22186628e+00, -1.32724706e-02, -3.20558400e-01],
       [ 2.96902097e+00,  2.00821312e-01,  1.44078886e+00],
       [ 1.79209120e+00, -4.81708340e-03, -1.31219085e+00]])