In [1]:
import pandas as pd
import statsmodels.api as sm
import pymc as pm
import arviz as az
import numpy as np
from scipy.stats import chi2_contingency



In [2]:
# Load the NOPD use-of-force records, recode the effectiveness flag to 1/0,
# and keep only the rows where that flag is known. Any value other than the
# exact strings 'yes' / 'no' (including missing values) maps to NaN and is
# therefore removed by the dropna step.
effective_codes = {'yes': 1, 'no': 0}

df = (
    pd.read_csv("../data/findings/new_orleans_pd_uof_2016_2022.csv")
    .assign(
        use_of_force_effective=lambda frame: frame['use_of_force_effective'].map(effective_codes)
    )
    .dropna(subset=['use_of_force_effective'])
)

# Preview the first ten cleaned rows.
df.head(10)

Unnamed: 0,tracking_id,originating_bureau,division_level,division,unit,working_status,shift_time,investigation_status,disposition,service_type,...,citizen_injured,citizen_influencing_factors,citizen_distance_from_officer,citizen_age,citizen_build,citizen_height,citizen_arrested,citizen_arrest_charges,agency_y,citizen_uid
0,1fb5b1fcbf30fa9feb0a4fd2e2ae7b00,field operations bureau,recruits,a platoon,patrol,paid detail,between 7am-3pm,Completed,use of force justified,call for service,...,no,,,,,,,,new-orleans-pd,ccaa67bd6140cd3e7774c99295c8b5d9
1,888fc1d84fd0bb20c52627a603025e24,field operations bureau,2nd district,c platoon,,,,Completed,use of force authorized,arresting,...,yes,alchohol and unknown drugs,0 feet to 1 feet,29.0,large,> 6'3'',yes,,new-orleans-pd,354b7df12d7748042d9127690dfc73df
2,888fc1d84fd0bb20c52627a603025e24,field operations bureau,2nd district,c platoon,,,,Completed,use of force authorized,arresting,...,yes,alchohol and unknown drugs,0 feet to 1 feet,29.0,large,> 6'3'',yes,,new-orleans-pd,354b7df12d7748042d9127690dfc73df
3,fc500256497f7b93fb89605cacec4eb8,field operations bureau,1st district,3rd platoon,,,,Completed,use of force authorized,call for service,...,no,alchohol and unknown drugs,7 feet to 10 feet,29.0,medium,5'10'' to 6'0'',no,,new-orleans-pd,934d6ce00d0375d9f08caebe6c2e67d9
4,731920c88a49cc3ec37ed7f995f28586,investigations and support bureau,criminal investigations,homicide,squad e,,,Completed,use of force authorized,call for service,...,no,unknown,0 feet to 1 feet,19.0,medium,5'7'' to 5'9'',yes,,new-orleans-pd,ac04a4772e6a64b901112801421cd1c1
5,215d5ce2caf07ce5f8b2aa2abe372507,field operations bureau,8th district,c platoon,patrol,,,Completed,use of force authorized,arresting,...,no,unknown,0 feet to 1 feet,22.0,small,5'4'' to 5'6'',yes,illegal carrying of a weapon,new-orleans-pd,7f8031a29da0cd375373d6b2c1e0d6b1
6,215d5ce2caf07ce5f8b2aa2abe372507,field operations bureau,8th district,c platoon,patrol,,,Completed,use of force authorized,arresting,...,no,unknown,0 feet to 1 feet,22.0,small,5'4'' to 5'6'',yes,illegal carrying of a weapon,new-orleans-pd,7f8031a29da0cd375373d6b2c1e0d6b1
7,2853e3521ca8e59ab045f8f15b322880,field operations bureau,5th district,narcotics,narcotics,regular working,,Completed,use of force authorized,other,...,no,alchohol,7 feet to 10 feet,27.0,medium,5'10'' to 6'0'',yes,,new-orleans-pd,96c1c0621f0e7a2b111309b65a3363ef
8,2853e3521ca8e59ab045f8f15b322880,field operations bureau,5th district,narcotics,narcotics,regular working,,Completed,use of force authorized,other,...,no,alchohol,7 feet to 10 feet,27.0,medium,5'10'' to 6'0'',yes,,new-orleans-pd,96c1c0621f0e7a2b111309b65a3363ef
9,e3974551818eb8df3fa57e08974890bc,field operations bureau,2nd district,b platoon,patrol,,,Completed,use of force authorized,call for service,...,no,unknown,4 feet to 6 feet,15.0,small,5'0'' to 5'3'',no,,new-orleans-pd,c49495a72d50cdd677853a229e4b5efe


In [3]:
# Months counted as "summer" in this analysis (June through September).
SUMMER_MONTHS = [6, 7, 8, 9]

# Binary indicator: 1 when the incident month is in SUMMER_MONTHS, else 0.
# A missing month is not in the list, so it is coded 0 as well.
df['is_summer'] = df['uof_occur_month'].isin(SUMMER_MONTHS).astype(int)

# Logistic regression of effectiveness on the summer indicator plus an intercept.
X = sm.add_constant(df['is_summer'])
y = df['use_of_force_effective']

logit_model = sm.Logit(y, X)
result = logit_model.fit()

print(result.summary())

# How to read the summary (numbers quoted are from the recorded run below;
# re-check them against the printed table whenever the data changes).
#
# const (Intercept): the log-odds of the 'use of force' being effective when
# 'is_summer' is 0, i.e., outside the summer months. In the recorded run the
# estimate is 1.9896; exp(1.9896) is approximately 7.31. That is the baseline
# *odds* (not an odds ratio): about 7.3 to 1 that the use of force is effective
# when it is not summer.
#
# is_summer: the change in the log-odds of the 'use of force' being effective
# when an incident occurs in summer rather than outside it. Read the estimate
# from the is_summer row of the table; exp(coef) is the odds ratio of summer
# versus non-summer, and a negative coefficient means lower odds in summer.
#
# Significance: the model has a single predictor, so the likelihood-ratio test
# reported as "LLR p-value" (0.01589 in the recorded run) tests the is_summer
# effect. It is below the common 0.05 level, which suggests the summer effect
# is statistically significant. The Wald test for the same effect is the P>|z|
# entry in the is_summer row.
#
# Caveat: the pseudo R-squared is about 0.001, so although the effect is
# statistically detectable with 7794 observations, is_summer explains very
# little of the variation in effectiveness.

Optimization terminated successfully.
         Current function value: 0.379568
         Iterations 6
                             Logit Regression Results                             
Dep. Variable:     use_of_force_effective   No. Observations:                 7794
Model:                              Logit   Df Residuals:                     7792
Method:                               MLE   Df Model:                            1
Date:                    Thu, 13 Jul 2023   Pseudo R-squ.:               0.0009819
Time:                            11:24:05   Log-Likelihood:                -2958.4
converged:                           True   LL-Null:                       -2961.3
Covariance Type:                nonrobust   LLR p-value:                   0.01589
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.9896      0.042     46.986      0.000       1.907       2.0

In [4]:
# Cross-tabulate the outcome (rows) against the summer indicator (columns).
contingency_table = pd.crosstab(df['use_of_force_effective'], df['is_summer'])

# Chi-square test of independence. Note that chi2_contingency applies Yates'
# continuity correction by default when the table has one degree of freedom
# (a 2x2 table, as expected here since both variables are coded 0/1).
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("chi2:", chi2, "p-value:", p)

# Numbers quoted are from the recorded run below.
# The chi-square statistic is approximately 5.72.
# The p-value is approximately 0.017, which is less than the common significance level of 0.05,
# so we reject the null hypothesis that use_of_force_effective and is_summer are independent.
# This suggests that there is a statistically significant association between these two variables.
# The test says nothing about the direction or size of the effect; see the regression
# coefficients for that.

chi2: 5.724474331328892 p-value: 0.01673004424860636


In [5]:
# Fixed seed so the posterior draws (and the summary table) are reproducible
# under Restart & Run All.
RANDOM_SEED = 42

with pm.Model() as model:

    # Normal(0, 1) priors on the intercept and the summer effect (log-odds scale).
    beta_0 = pm.Normal('beta_0', mu=0, sigma=1)
    beta_1 = pm.Normal('beta_1', mu=0, sigma=1)

    # Expected value of outcome (use logistic link function).
    # NOTE: this rebinds the name `p`, which the chi-square cell uses for its p-value.
    p = pm.math.invlogit(beta_0 + beta_1 * df['is_summer'])

    # Likelihood
    y_obs = pm.Bernoulli('y_obs', p=p, observed=df['use_of_force_effective'])

    # 2000 posterior draws per chain after 1000 tuning steps, chains run sequentially.
    trace = pm.sample(2000, tune=1000, cores=1, random_seed=RANDOM_SEED)

az.summary(trace)

# Numbers quoted below are from the recorded summary table; that run was made
# without a fixed seed, so a re-run may differ slightly (MCMC error; mcse_mean
# is 0.001-0.002 in the recorded run).
#
# beta_0 (Intercept): The posterior mean is 1.986, which is the log odds of the 'use of force' being effective
# when the 'is_summer' variable is 0 (i.e., when the incident is not in the summer).
# The hdi_3% and hdi_97% values (1.902 and 2.062) bound the 94% Highest Density Interval (HDI), a credible interval:
# given the model and the data, the parameter lies in this range with 94% posterior probability.
# (This is a Bayesian credible interval, not a frequentist confidence interval.)
#
# beta_1 (Coefficient for is_summer): The posterior mean is -0.167.
# This is the change in the log odds of the 'use of force' being effective when 'is_summer' goes from 0 to 1.
# So, when an incident occurs in summer, the log odds of the 'use of force' being effective decrease by about 0.167.
# The 94% HDI (-0.303 to -0.034) lies entirely below zero, consistent with a negative summer effect.
#
# Diagnostics: r_hat is 1.0 and the bulk/tail effective sample sizes are above 2000 for both
# parameters, so there is no sign of convergence problems in the recorded run.

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [beta_0, beta_1]


Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 22 seconds.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,1.986,0.043,1.902,2.062,0.001,0.001,2014.0,2318.0,1.0
beta_1,-0.167,0.071,-0.303,-0.034,0.002,0.001,2119.0,2543.0,1.0
