In [None]:
# Livia's code

In [164]:
# Attach human-readable labels to the data columns
labels = dict(
    # civilian race
    sblack='Black',
    shisp='Hispanic',
    swhite='White',
    sother='Other race',
    # civilian characteristics
    sage='Age',
    sempl='Employed last week or not',
    smale='Male',
    spop="Population size of civilian's address",
    sincome='Income (categorical)',
    sbehavior='Behavior of civilian in encounter',
    # racial composition of the officer unit
    omajwhite='Officer unit majorly white',
    omajother='Officer unit majorly other',
    omajblack='Officer unit majorly black',
    omajhisp='Officer unit majorly hispanic',
    osplit='Officer unit split race',
    # encounter and outcome
    daytime='Time of encounter',
    year='Year',
    inctype_lin='Incident type',
    anyuseofforce_coded='Any use of force by officer',
)

# Renamed version of the data; the cell displays the resulting column names
data_labeled = data.rename(columns=labels)
data_labeled.columns

Index(['Black', 'Hispanic', 'White', 'Other race', 'Male', 'Age',
       'Employed last week or not', 'Income (categorical)',
       'Population size of civilian's address', 'Time of encounter',
       'Incident type', 'Officer unit majorly black',
       'Officer unit majorly hispanic', 'Officer unit majorly white',
       'Officer unit majorly other', 'Officer unit split race',
       'Behavior of civilian in encounter', 'Year',
       'Any use of force by officer'],
      dtype='object')

In [165]:
# Table of summary statistics (means) by civilian race, with Welch t-test
# p-values comparing the Black and Hispanic subsamples against the White one.
from scipy.stats import ttest_ind  # imported once, not on every loop iteration

# a. make columns for each race and full sample
full_sample = data
white_only = data[data['swhite'] == 1]
black_only = data[data['sblack'] == 1]
hispanic_only = data[data['shisp'] == 1]
other_only = data[data['sother'] == 1]
groups = [full_sample, white_only, black_only, hispanic_only, other_only]
group_names = ['Full sample', 'White only', 'Black only', 'Hispanic only', 'Other only']
variables = ['sage', 'sempl', 'smale', 'sincome', 'spop', 'sbehavior', 'omajwhite',
             'omajblack', 'omajhisp', 'omajother', 'osplit', 'daytime',
             'inctype_lin', 'anyuseofforce_coded']
variables_labeled = [labels[var] for var in variables]

# b. empty table: one row per labelled variable, one column per group,
#    plus two columns for the t-test p-values
summary_stats = pd.DataFrame(columns=group_names, index=variables_labeled)
summary_stats['p-value (Black vs White)'] = np.nan
summary_stats['p-value (Hispanic vs White)'] = np.nan
summary_stats.index.name = 'Variable'

# c. group means. The rows are labelled while the means are indexed by raw
#    variable name, so the values are assigned by position (same order as
#    `variables`). Assigning whole columns also makes them numeric instead of
#    object dtype, so the number formatting in to_latex applies to them.
for name, group in zip(group_names, groups):
    summary_stats[name] = group[variables].mean().round(2).to_numpy()

# d. Welch t-tests (equal_var=False) of Black vs White and Hispanic vs White,
#    one p-value per variable; missing values are dropped (nan_policy='omit')
for race, subsample in [('Black', black_only), ('Hispanic', hispanic_only)]:
    t_stat, p_val = ttest_ind(subsample[variables], white_only[variables],
                              equal_var=False, nan_policy='omit')
    summary_stats[f'p-value ({race} vs White)'] = p_val.round(2)

# e. export with a decimal comma and two decimals
summary_stats.to_latex('summary_stats.tex', index=True, decimal=',', float_format="%.2f")

# Clara's code

This notebook uses the Police Public Contact Survey (PPCS) dataset: `ppcs_cc.csv`.

In [387]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import w8_probit as probit
import w8_logit as logit
import w8_estimation as est
from scipy.stats import norm
from scipy.stats import t

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load the data 

In [434]:
# Read the PPCS extract into memory
dat = pd.read_csv('ppcs_cc.csv')

# Share of encounters with and without any use of force
force_shares = dat['anyuseofforce_coded'].value_counts(normalize=True)
print("\nDistribution of 'anyuseofforce_coded':")
print(force_shares)

# Frequency table for each categorical regressor
categorical_vars = [
    "sblack", "shisp", "swhite", "sother", "smale",
    "omajblack", "omajhisp", "omajwhite", "omajother", "osplit",
    "inctype_lin", "sbehavior",
]

for var in categorical_vars:
    counts = dat[var].value_counts()
    print(f"\nValue Counts for {var}:")
    print(counts)


Distribution of 'anyuseofforce_coded':
anyuseofforce_coded
0    0.994999
1    0.005001
Name: proportion, dtype: float64

Value Counts for sblack:
sblack
0    3379
1     420
Name: count, dtype: int64

Value Counts for shisp:
shisp
0    3413
1     386
Name: count, dtype: int64

Value Counts for swhite:
swhite
1    2808
0     991
Name: count, dtype: int64

Value Counts for sother:
sother
0    3614
1     185
Name: count, dtype: int64

Value Counts for smale:
smale
1    2012
0    1787
Name: count, dtype: int64

Value Counts for omajblack:
omajblack
0    3568
1     231
Name: count, dtype: int64

Value Counts for omajhisp:
omajhisp
0    3708
1      91
Name: count, dtype: int64

Value Counts for omajwhite:
omajwhite
1    3433
0     366
Name: count, dtype: int64

Value Counts for omajother:
omajother
0    3755
1      44
Name: count, dtype: int64

Value Counts for osplit:
osplit
0    3799
Name: count, dtype: int64

Value Counts for inctype_lin:
inctype_lin
2    3641
1     158
Name: count, dtype

Table with summary statistics

In [435]:
# Race indicator columns that define the subsamples
group_vars = ["swhite", "sblack", "shisp", "sother"]

# Every column of the data gets a row in the table
all_vars = dat.columns

# Column header -> subsample (full sample first, then one per race indicator)
subsamples = {"Full Sample": dat}
for group in group_vars:
    subsamples[group.capitalize()] = dat.loc[dat[group] == 1]

# Mean of every variable within each subsample
mean_rows = pd.DataFrame({
    "Variable": list(all_vars),
    **{name: sub[all_vars].mean().values for name, sub in subsamples.items()},
})

# Subsample sizes, appended as the last row of the table
num_obs_row = pd.DataFrame({
    "Variable": ["Number of Observations"],
    **{name: [sub.shape[0]] for name, sub in subsamples.items()},
})

# Stack means and sizes, index by variable name, then show and save
summary_table = (
    pd.concat([mean_rows, num_obs_row], ignore_index=True)
    .set_index("Variable")
)
print(summary_table)

summary_table.to_csv('grouped_summary_statistics_with_observations.csv')


                        Full Sample       Swhite       Sblack        Shisp  \
Variable                                                                     
sblack                     0.110555     0.000000     1.000000     0.000000   
shisp                      0.101606     0.000000     0.000000     1.000000   
swhite                     0.739142     1.000000     0.000000     0.000000   
sother                     0.048697     0.000000     0.000000     0.000000   
smale                      0.529613     0.521368     0.519048     0.585492   
sage                      41.010003    42.147792    39.183333    36.225389   
sempl                      0.695446     0.699786     0.688095     0.676166   
sincome                    2.164780     2.218305     1.935714     1.997409   
spop                       1.362727     1.271011     1.657143     1.652850   
daytime                    0.666491     0.680912     0.621429     0.642487   
inctype_lin                1.958410     1.957621     1.966667   

In [436]:
# Declare labels: outcome and regressors of the active specification.
# The commented-out lists are alternative regressor sets.
y_lab = 'anyuseofforce_coded'
#x_lab = ['const', 'sblack', 'shisp', 'sother']
x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'sagesq']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'sagesq', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'sagesq', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother', 'sbehavior']

# Derived regressors: age divided by 10, and its square.
# Guarded so that re-running this cell does not divide age by 10 a second
# time: 'sagesq' only exists once the rescaling has been applied.
# NOTE(review): assumes the raw file has no 'sagesq' column of its own -- confirm.
if 'sagesq' not in dat.columns:
    dat['sage'] = dat['sage'] / 10
    dat['sagesq'] = dat.sage * dat.sage

# create extra variables: constant term
N = dat.shape[0]
dat['const'] = np.ones((N,))

# Rebuild the dataset with only the outcome and the active regressors
dat = dat[[y_lab] + x_lab].copy()

# Check for missing data
assert dat.notnull().all(axis=1).all(), 'Missing values detected. Clean your data!'

dat.tail(5)

Unnamed: 0,anyuseofforce_coded,const,sblack,shisp,sother,smale,sempl,sincome,spop,sage,sagesq
3794,0,1.0,0,0,0,0,1,3,1,7.2,51.84
3795,0,1.0,0,0,0,0,0,2,1,7.1,50.41
3796,0,1.0,0,0,0,0,0,1,1,7.6,57.76
3797,0,1.0,0,0,0,0,0,3,4,7.9,62.41
3798,0,1.0,0,0,0,0,0,2,1,7.5,56.25


In [442]:
# Descriptive statistics for the respondents with anyuseofforce_coded == 1
test = dat[dat['anyuseofforce_coded']==1]

# All columns of the subsample enter the table
all_vars = test.columns

# Column means within the subsample (the "Full Sample" header refers to this
# subsample, not to the whole data set)
overall_means = test[all_vars].mean()

# One row per variable, indexed by variable name
summary_table = pd.DataFrame(
    {"Variable": list(all_vars), "Full Sample": overall_means.values}
).set_index("Variable")
print(summary_table)

# Write the table as LaTeX.
# NOTE(review): the content is LaTeX but the target file is named '.csv' --
# confirm whether to_csv or a '.tex' file name was intended.
summary_table.to_latex('test.csv')


                     Full Sample
Variable                        
anyuseofforce_coded     1.000000
const                   1.000000
sblack                  0.157895
shisp                   0.315789
sother                  0.052632
smale                   0.789474
sempl                   0.473684
sincome                 2.052632
spop                    1.947368
sage                    3.078947
sagesq                 10.666842


In [437]:
# Outcome vector and regressor matrix as plain NumPy arrays
y = dat[y_lab].to_numpy()
x = dat[x_lab].to_numpy()

# Number of regressors (columns of x)
K = x.shape[1]

print(K)
print(x.shape)

10
(3799, 10)


In [426]:
# Number of encounters in which any use of force was recorded
count_violent_1 = dat['anyuseofforce_coded'].eq(1).sum()
print(f"Number of 1s in 'anyuseofforce_coded': {count_violent_1}")

Number of 1s in 'anyuseofforce_coded': 19


## Estimate using Probit

In [427]:
# Starting values for the optimizer, provided by the probit module
theta0 = probit.starting_values(y, x)

# Estimate the probit model: est.estimate optimizes the criterion probit.q from theta0.
# cov_type='Sandwich' selects the covariance option used for the standard errors
# (presumably a robust sandwich estimator -- confirm in w8_estimation).
probit_results = est.estimate(probit.q, theta0, y, x, cov_type='Sandwich')

Optimization terminated successfully.
         Current function value: 0.027097
         Iterations: 83
         Function evaluations: 1111
         Gradient evaluations: 101


In [428]:
# Print the optimizer summary and keep the coefficient table (theta, se, t) for later cells
probit_tab = est.print_table(x_lab, probit_results, title=f'Probit, y = {y_lab}')
probit_tab

Optimizer succeeded after 83 iter. (1111 func. evals.). Final criterion:   0.0271.
Probit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-3.3339,0.6515,-5.1175
sblack,0.0973,0.2576,0.3776
shisp,0.3859,0.2099,1.8382
sother,0.0834,0.3793,0.2198
smale,0.523,0.1992,2.6258
sempl,-0.53,0.1965,-2.6976
sincome,0.033,0.1042,0.3162
spop,0.2102,0.0727,2.8935
sage,0.3334,0.3735,0.8925
sagesq,-0.0653,0.0505,-1.2953


## Estimate using Logit

In [429]:
# Starting values for the optimizer, provided by the logit module
theta0 = logit.starting_values(y, x)

# Estimate the logit model: est.estimate optimizes the criterion logit.q from theta0.
# cov_type='Sandwich' selects the covariance option used for the standard errors
# (presumably a robust sandwich estimator -- confirm in w8_estimation).
logit_results = est.estimate(logit.q, theta0, y, x, cov_type='Sandwich')

Optimization terminated successfully.
         Current function value: 0.027131
         Iterations: 118
         Function evaluations: 1430
         Gradient evaluations: 130


In [430]:
# Print the optimizer summary and keep the coefficient table (theta, se, t) for later cells
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab}')
logit_tab

Optimizer succeeded after 118 iter. (1430 func. evals.). Final criterion:  0.02713.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-7.7897,1.8673,-4.1716
sblack,0.337,0.6804,0.4953
shisp,1.0101,0.5431,1.8599
sother,0.14,1.0511,0.1332
smale,1.338,0.5842,2.2904
sempl,-1.4035,0.5359,-2.6188
sincome,0.0774,0.2945,0.2628
spop,0.5491,0.1832,2.9973
sage,1.1974,1.1223,1.0669
sagesq,-0.2163,0.1621,-1.3343


## Average partial effects

### Probit

In [431]:
# Average partial effects of the race dummies, using the probit estimates.
# The display names are kept in `ape_labels` (not `labels`) so the column-label
# dict `labels` used by the summary-statistics cells is not overwritten by a list.
race_vars = ['sblack', 'shisp', 'sother']
indices = [x_lab.index(var) for var in race_vars]  # positions of the dummies in x
ape_labels = ['sblack', 'shispanic', 'sother']
res = probit.properties(x, probit_results['theta'], print_out=True, se=True,
                        indices=indices, labels=ape_labels)

In [432]:
# Display the object returned by probit.properties (the table of average partial effects)
res

Unnamed: 0,Estimate
sblack,0.001
shispanic,0.007
sother,0.001


### Logit

In [433]:
# Average partial effects of the race dummies, using the logit estimates.
# The display names are kept in `ape_labels` (not `labels`) so the column-label
# dict `labels` used by the summary-statistics cells is not overwritten by a list.
race_vars = ['sblack', 'shisp', 'sother']
indices = [x_lab.index(var) for var in race_vars]  # positions of the dummies in x
ape_labels = ['sblack', 'shispanic', 'sother']
logit.properties(x, logit_results['theta'], print_out=True, se=True,
                 indices=indices, labels=ape_labels)

Unnamed: 0,Estimate
sblack,0.001
shispanic,0.006
sother,0.001


In [None]:
# NOTE(review): `data` is not assigned in any cell of this section (the cells around it use `dat`),
# and a bare name displays the whole object -- confirm this cell is still needed.
data

## Partial Effects

#### Defining different fixed vectors

In [356]:
# Sample means of the continuous regressors, printed with two decimals
for col in ['sage', 'sagesq', 'sincome', 'spop']:
    print(f"{np.mean(dat[col]):.2f}")

4.10
19.42
2.16
1.36


In [180]:
# Original vector: profile of a stereotypical white young man, written as
# regressor -> value so every number sits next to its name.
# NOTE(review): the order of x_lab must match the order of the coefficient
# vector that x_me is multiplied with (x_me @ theta) -- confirm it is the
# order the model was estimated with.
profile = {
    'const': 1, 'sblack': 0, 'shisp': 0, 'sother': 0,
    'smale': 1, 'sage': 2.5, 'sempl': 0, 'sincome': 1,
    'spop': 4,
    'daytime': 5,  # original author's note: daytime = 5 means the incident happened at night
    'inctype_lin': 1, 'omajblack': 0,
    'omajhisp': 0, 'omajother': 0, 'sbehavior': 1,
    'sagesq': 6.25,  # 2.5 squared
}

x_lab = list(profile)
x_me = np.array(list(profile.values())).reshape(1, -1)

pd.DataFrame(x_me, columns=x_lab, index=['x_me'])


Unnamed: 0,const,sblack,shisp,sother,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajblack,omajhisp,omajother,sbehavior,sagesq
x_me,1.0,0.0,0.0,0.0,1.0,4.1,0.0,2.16,1.36,0.0,1.0,0.0,0.0,0.0,0.0,19.42


In [181]:
### BEHAVIOR = 1 ###
# Let us make a vector of the values we want to investigate
#x_me= np.array([1, 0, 0, 0, 1, 4.1, 0, 2.16, 1.36,0,1,0,0,0,1,19.42]).reshape(1, -1)
#pd.DataFrame(x_me, columns=x_lab, index=['x_behavior'])


#### Switching race from white to black, hispanic and other

In [281]:
# Profile of a stereotypical white old woman, written as regressor -> value
# so every number sits next to its name.
# NOTE(review): the order of x_lab must match the order of the coefficient
# vector that x_me is multiplied with (x_me @ theta) -- confirm it is the
# order the model was estimated with.
profile = {
    'const': 1, 'sblack': 0, 'shisp': 0, 'sother': 0,
    'smale': 0, 'sage': 4.5, 'sempl': 1, 'sincome': 3,
    'spop': 2, 'daytime': 2, 'inctype_lin': 1, 'omajblack': 0,
    'omajhisp': 0, 'omajother': 0, 'sbehavior': 1,
    'sagesq': 20.25,  # 4.5 squared
}

x_lab = list(profile)
x_me = np.array(list(profile.values())).reshape(1, -1)

pd.DataFrame(x_me, columns=x_lab, index=['x_me'])


Unnamed: 0,const,sblack,shisp,sother,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajblack,omajhisp,omajother,sbehavior,sagesq
x_me,1.0,0.0,0.0,0.0,0.0,4.5,1.0,3.0,2.0,2.0,1.0,0.0,0.0,0.0,1.0,20.25


In [334]:
"""# the average characteristics for age, agesq, income, spop
print(f"{np.mean(dat['sage']):.2f}")
print(f"{np.mean(dat['sagesq']):.2f}")
print(f"{np.mean(dat['sincome']):.2f}")
print(f"{np.mean(dat['spop']):.2f}")
print(f"{np.mean(dat['daytime']):.2f}")

x_lab = ['const', 'sblack', 'shisp', 'sother', 
         'smale', 'sage', 'sempl', 'sincome',
         'spop', 'daytime', 'inctype_lin', 'omajblack',
         'omajhisp', 'omajother', 'sbehavior','sagesq']

x_me = np.array([1, 0, 0, 0,
                 0, 4.1, 0, 2.16,
                 1.36, 5, 1, 0, 
                 0, 0, 1, 19.42]).reshape(1, -1)

pd.DataFrame(x_me, columns=x_lab, index=['x_me'])"""

'# the average characteristics for age, agesq, income, spop\nprint(f"{np.mean(dat[\'sage\']):.2f}")\nprint(f"{np.mean(dat[\'sagesq\']):.2f}")\nprint(f"{np.mean(dat[\'sincome\']):.2f}")\nprint(f"{np.mean(dat[\'spop\']):.2f}")\nprint(f"{np.mean(dat[\'daytime\']):.2f}")\n\nx_lab = [\'const\', \'sblack\', \'shisp\', \'sother\', \n         \'smale\', \'sage\', \'sempl\', \'sincome\',\n         \'spop\', \'daytime\', \'inctype_lin\', \'omajblack\',\n         \'omajhisp\', \'omajother\', \'sbehavior\',\'sagesq\']\n\nx_me = np.array([1, 0, 0, 0,\n                 0, 4.1, 0, 2.16,\n                 1.36, 5, 1, 0, \n                 0, 0, 1, 19.42]).reshape(1, -1)\n\npd.DataFrame(x_me, columns=x_lab, index=[\'x_me\'])'

In [184]:
# Probit coefficient vector, taken from the 'theta' column of the estimates table
b_pr = probit_tab.theta.values
# Effect of the switch: probit.G at the index of x_me2 minus probit.G at the index of x_me
# (probit.G is presumably the model's response probability -- confirm in w8_probit).
# NOTE(review): x_me2 is not defined in any visible cell; a captured output further down shows it
# as the young-man profile with shisp = 1 -- confirm where it is created.
me_race_pr = probit.G(x_me2@b_pr) - probit.G(x_me@b_pr) 

In [320]:
# Delta-method gradient of the probit effect with respect to the coefficients:
# standard normal density at each linear index times the corresponding profile,
# counterfactual profile (x_me2) minus baseline profile (x_me).
index_base = x_me @ b_pr
index_alt = x_me2 @ b_pr

gx0 = norm.pdf(index_base)
gx2 = norm.pdf(index_alt)

grad_d_pr = gx2 * x_me2 - gx0 * x_me

Unnamed: 0,const,sblack,shisp,sother,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajblack,omajhisp,omajother,sbehavior,sagesq
x_me2,1.0,0.0,1.0,0.0,1.0,2.5,0.0,1.0,4.0,5.0,1.0,0.0,0.0,0.0,1.0,6.25


In [186]:
def get_se(grad, cov):
    """Delta-method standard errors.

    Parameters
    ----------
    grad : 2-D array, one row of gradients per statistic
    cov : 2-D array, covariance matrix of the parameter estimates

    Returns
    -------
    1-D array with the square roots of the diagonal of grad @ cov @ grad.T,
    i.e. one standard error per row of `grad`.
    """
    variances = np.diagonal(grad @ cov @ grad.T)
    return np.sqrt(variances)

# Standard error of the probit effect: its gradient combined with the covariance stored in the probit results
se_d_pr = get_se(grad_d_pr, probit_results['cov'])

In [321]:
# Collect the probit effect and its standard error; the scalar effect is
# broadcast against the standard-error array when the frame is built.
me_dict = dict([
    ('Marginal Effect', me_race_pr[0]),
    ('s.e.', se_d_pr),
])

# Add the t-ratio (effect / s.e.) and name the index
tab = (
    pd.DataFrame(me_dict)
    .assign(t=lambda frame: frame['Marginal Effect'] / frame['s.e.'])
    .rename_axis('Var')
)
tab.round(6)

Unnamed: 0_level_0,Marginal Effect,s.e.,t
Var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.0,0.0


### Logit

In [323]:
# Logit coefficient vector, taken from the 'theta' column of the estimates table
b_lg = logit_tab.theta.values
# Effect of the switch: logit.G at the index of x_me2 minus logit.G at the index of x_me
# (logit.G is presumably the model's response probability -- confirm in w8_logit).
# NOTE(review): x_me2 is not defined in any visible cell -- confirm where it is created.
me_race_lg = logit.G(x_me2@b_lg) - logit.G(x_me@b_lg)

In [324]:
# exp(-index) for the baseline (x_me) and counterfactual (x_me2) profiles
exp_x0_b = np.exp(-(x_me @ b_lg))
exp_x2_b = np.exp(-(x_me2 @ b_lg))

# Delta-method gradient of the logit effect: profile * exp(-z) / (1 + exp(-z))^2,
# counterfactual term minus baseline term
term_alt = (x_me2 * exp_x2_b) / (1 + exp_x2_b)**2
term_base = (x_me * exp_x0_b) / (1 + exp_x0_b)**2
grad_d_lg = term_alt - term_base

# Standard error from the gradient and the covariance stored in the logit results
se_d_lg = get_se(grad_d_lg, logit_results['cov'])

Unnamed: 0_level_0,Marginal Effect,s.e.,t
Var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.003566,0.013393,0.266277


In [325]:
# Two-sided p-values for the marginal effects under the standard normal.
# norm.sf(|t|) is the upper-tail probability; it avoids the cancellation that
# 1 - norm.cdf(|t|) suffers from when |t| is large.
# (`norm` comes from the notebook's import cell; it is not re-imported here.)
p_values = 2 * norm.sf(np.abs(tab['t']))
tab['p-value'] = p_values

In [326]:
# Export the current `tab` as LaTeX (in file order this is the probit table incl. p-value)
tab.to_latex('me_probit_youngman_hisp.tex')

In [190]:
# Collect the logit effect and its standard error; the scalar effect is
# broadcast against the standard-error array when the frame is built.
me_dict = dict([
    ('Marginal Effect', me_race_lg[0]),
    ('s.e.', se_d_lg),
])

# Add the t-ratio (effect / s.e.) and name the index
tab = (
    pd.DataFrame(me_dict)
    .assign(t=lambda frame: frame['Marginal Effect'] / frame['s.e.'])
    .rename_axis('Var')
)
tab.round(6)

Unnamed: 0_level_0,Marginal Effect,s.e.,t
Var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.0,0.0


In [328]:
# Two-sided p-values for the marginal effects under the standard normal.
# norm.sf(|t|) is the upper-tail probability; it avoids the cancellation that
# 1 - norm.cdf(|t|) suffers from when |t| is large.
p_values = 2 * norm.sf(np.abs(tab['t']))
tab['p-value'] = p_values

In [329]:
# Export the current `tab` as LaTeX (in file order this is the logit table incl. p-value)
tab.to_latex('me_logit_youngman_hisp.tex')

Unnamed: 0_level_0,Marginal Effect,s.e.,t
Var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.001049,0.004339,0.241665


In [338]:
# Combine the per-scenario marginal-effect tables into one LaTeX table with
# one panel per input file.

# Input tables, in panel order. The last entry previously repeated
# 'me_logit_old_woman.tex', so Panel H showed the wrong table.
# NOTE(review): presumably all of these were written with DataFrame.to_latex,
# like the two exports visible in this notebook -- confirm they all exist.
files = [
    'me_probit_youngman.tex',
    'me_probit_youngman_hisp.tex',
    'me_logit_youngman.tex',
    'me_logit_youngman_hisp.tex',
    'me_probit_old_woman.tex',
    'me_probit_old_woman_hisp.tex',
    'me_logit_old_woman.tex',
    'me_logit_old_woman_hisp.tex',
]

panels = ['Panel A: Probit, Young Man',
          'Panel B: Probit, Young Man Hispanic',
          'Panel C: Logit, young man',
          'Panel D: Logit, young man, hispanic',
          'Panel E: Probit, Old Woman',
          'Panel F: Probit, Old Woman, hispanic',
          'Panel G: Logit, Old Woman',
          'Panel H: Logit, Old Woman, hispanic']

assert len(files) == len(panels), 'Every panel needs exactly one input file'


def _tabular_rows(tex):
    r"""Return the lines of a LaTeX table without its tabular wrapper.

    Blank lines, the \begin{tabular} / \end{tabular} lines and the outer
    \toprule / \bottomrule are dropped, so the remaining header and data rows
    can be placed inside another tabular environment.
    """
    wrapper = (r'\begin{tabular}', r'\end{tabular}', r'\toprule', r'\bottomrule')
    stripped = [line.strip() for line in tex.splitlines()]
    return [row for row in stripped if row and not row.startswith(wrapper)]


# Read every input before opening the output, so a missing file does not
# leave a half-written combined table behind.
panel_rows = []
for fname in files:
    with open(fname) as infile:
        panel_rows.append(_tabular_rows(infile.read()))

# Number of columns of the widest row; the outer column spec and the panel
# titles must span this many columns (falls back to the original 4).
n_cols = max((row.count('&') + 1 for rows in panel_rows for row in rows), default=4)

with open('combined_table.tex', 'w') as outfile:
    outfile.write(r'\begin{table}[ht!]' + '\n')
    outfile.write(r'\centering' + '\n')
    outfile.write(r'\caption{Marginal Effects from Probit and Logit Models}' + '\n')
    outfile.write(r'\begin{tabular}{l' + 'c' * (n_cols - 1) + '}' + '\n')
    outfile.write(r'\toprule' + '\n')

    for position, (panel_name, rows) in enumerate(zip(panels, panel_rows)):
        # Rule between panels; the first panel already sits under \toprule
        if position > 0:
            outfile.write(r'\midrule' + '\n')
        outfile.write(r'\multicolumn{' + str(n_cols) + r'}{c}{\textbf{' + panel_name + r'}} \\' + '\n')
        outfile.write(r'\midrule' + '\n')
        # Only the rows are inserted: a complete nested tabular here would
        # leave an unterminated row in front of the next \midrule.
        outfile.write('\n'.join(rows) + '\n')

    outfile.write(r'\bottomrule' + '\n')
    outfile.write(r'\end{tabular}' + '\n')
    outfile.write(r'\end{table}' + '\n')

In [None]:
# bootstrap apes