In [1]:
import pandas as pd
import statsmodels.api as sm
import pymc as pm
import arviz as az
import numpy as np
from scipy.stats import chi2_contingency



In [2]:
df = pd.read_csv("../data/findings/new_orleans_pd_uof_2016_2022.csv")

df.head(10)

Unnamed: 0,tracking_id,originating_bureau,division_level,division,unit,working_status,shift_time,investigation_status,disposition,service_type,...,citizen_injured,citizen_influencing_factors,citizen_distance_from_officer,citizen_age,citizen_build,citizen_height,citizen_arrested,citizen_arrest_charges,agency_y,citizen_uid
0,1fb5b1fcbf30fa9feb0a4fd2e2ae7b00,field operations bureau,recruits,a platoon,patrol,paid detail,between 7am-3pm,Completed,use of force justified,call for service,...,no,,,,,,,,new-orleans-pd,ccaa67bd6140cd3e7774c99295c8b5d9
1,888fc1d84fd0bb20c52627a603025e24,field operations bureau,2nd district,c platoon,,,,Completed,use of force authorized,arresting,...,yes,alchohol and unknown drugs,0 feet to 1 feet,29.0,large,> 6'3'',yes,,new-orleans-pd,354b7df12d7748042d9127690dfc73df
2,888fc1d84fd0bb20c52627a603025e24,field operations bureau,2nd district,c platoon,,,,Completed,use of force authorized,arresting,...,yes,alchohol and unknown drugs,0 feet to 1 feet,29.0,large,> 6'3'',yes,,new-orleans-pd,354b7df12d7748042d9127690dfc73df
3,fc500256497f7b93fb89605cacec4eb8,field operations bureau,1st district,3rd platoon,,,,Completed,use of force authorized,call for service,...,no,alchohol and unknown drugs,7 feet to 10 feet,29.0,medium,5'10'' to 6'0'',no,,new-orleans-pd,934d6ce00d0375d9f08caebe6c2e67d9
4,731920c88a49cc3ec37ed7f995f28586,investigations and support bureau,criminal investigations,homicide,squad e,,,Completed,use of force authorized,call for service,...,no,unknown,0 feet to 1 feet,19.0,medium,5'7'' to 5'9'',yes,,new-orleans-pd,ac04a4772e6a64b901112801421cd1c1
5,215d5ce2caf07ce5f8b2aa2abe372507,field operations bureau,8th district,c platoon,patrol,,,Completed,use of force authorized,arresting,...,no,unknown,0 feet to 1 feet,22.0,small,5'4'' to 5'6'',yes,illegal carrying of a weapon,new-orleans-pd,7f8031a29da0cd375373d6b2c1e0d6b1
6,215d5ce2caf07ce5f8b2aa2abe372507,field operations bureau,8th district,c platoon,patrol,,,Completed,use of force authorized,arresting,...,no,unknown,0 feet to 1 feet,22.0,small,5'4'' to 5'6'',yes,illegal carrying of a weapon,new-orleans-pd,7f8031a29da0cd375373d6b2c1e0d6b1
7,2853e3521ca8e59ab045f8f15b322880,field operations bureau,5th district,narcotics,narcotics,regular working,,Completed,use of force authorized,other,...,no,alchohol,7 feet to 10 feet,27.0,medium,5'10'' to 6'0'',yes,,new-orleans-pd,96c1c0621f0e7a2b111309b65a3363ef
8,2853e3521ca8e59ab045f8f15b322880,field operations bureau,5th district,narcotics,narcotics,regular working,,Completed,use of force authorized,other,...,no,alchohol,7 feet to 10 feet,27.0,medium,5'10'' to 6'0'',yes,,new-orleans-pd,96c1c0621f0e7a2b111309b65a3363ef
9,e3974551818eb8df3fa57e08974890bc,field operations bureau,2nd district,b platoon,patrol,,,Completed,use of force authorized,call for service,...,no,unknown,4 feet to 6 feet,15.0,small,5'0'' to 5'3'',no,,new-orleans-pd,c49495a72d50cdd677853a229e4b5efe


In [3]:
# preprocess data 

df = df.dropna(subset=['use_of_force_effective', 'citizen_influencing_factors'])

df['use_of_force_effective'] = df['use_of_force_effective'].map({'yes': 1, 'no': 0})

df['mentally_unstable'] = df['citizen_influencing_factors'].apply(lambda x: 1 if 'mentally unstable' in x else 0)
df = df.dropna(subset=['use_of_force_effective'])

In [4]:
X = sm.add_constant(df['mentally_unstable'])
y = df['use_of_force_effective']

logit_model = sm.Logit(y, X)
result = logit_model.fit()

print(result.summary())

# The coefficient for mentally_unstable is -0.2096. 
# This indicates that being mentally unstable is associated with a decrease in the log-odds of the use of force being effective
# holding all other variables constant.

# The p-value for mentally_unstable is 0.044, which is less than the common significance level of 0.05. 
# This suggests that the effect of being mentally unstable on the effectiveness of the use of force is statistically significant.

Optimization terminated successfully.
         Current function value: 0.381155
         Iterations 6
                             Logit Regression Results                             
Dep. Variable:     use_of_force_effective   No. Observations:                 7441
Model:                              Logit   Df Residuals:                     7439
Method:                               MLE   Df Model:                            1
Date:                    Thu, 13 Jul 2023   Pseudo R-squ.:               0.0006912
Time:                            11:09:28   Log-Likelihood:                -2836.2
converged:                           True   LL-Null:                       -2838.1
Covariance Type:                nonrobust   LLR p-value:                   0.04762
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 1.9494      0.037     52.323      0.000 

In [5]:
contingency_table = pd.crosstab(df['use_of_force_effective'], df['mentally_unstable'])

chi2, p, dof, expected = chi2_contingency(contingency_table)

chi2, p

# The chi-square statistic is approximately 3.86.
# The p-value is approximately 0.049, which is less than the common significance level of 0.05.
# The p-value is less than 0.05, so we reject the null hypothesis that use_of_force_effective
# and mentally_unstable are independent. 
# This suggests that there is a statistically significant relationship between these two variables.

(3.864392672215673, 0.04932104085460102)

In [6]:
with pm.Model() as model:
    # Priors for unknown model parameters
    beta_0 = pm.Normal('beta_0', mu=0, sigma=1)
    beta_1 = pm.Normal('beta_1', mu=0, sigma=1)

    # Expected value of outcome (use logistic link function)
    p = pm.math.invlogit(beta_0 + beta_1 * df['mentally_unstable'])

    # Likelihood
    y_obs = pm.Bernoulli('y_obs', p=p, observed=df['use_of_force_effective'])

    trace = pm.sample(2000, tune=1000, cores=1)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [beta_0, beta_1]


Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 18 seconds.


In [7]:
az.summary(trace)

# beta_0 is called the intercept, and it represents the predicted outcome when all your predictor variables are zero. 
# In this case, it represents the "baseline" scenario where mentally_unstable is 0, meaning the citizen is not mentally unstable.

# The mean value of beta_0 is 1.946. In the context of logistic regression, this value is in terms of log-odds. 
# Convert it to odds by taking the exponential, exp(1.947), which gives us about 7. 
# The odds are a ratio, and in this case, it means that the odds of the use of force being effective 
# are 7 to 1 when the citizen is not mentally unstable.
# The 3% and 97% Highest Density Interval (HDI) is a range of values that likely contain the true value of beta_0. 
# In other words, we are 94% confident that the true value of beta_0 (if we had all possible data) 
# would fall between 1.880 and 2.020.

# beta_1 (Coefficient for mentally_unstable): The mean value is -0.202. 
# This is the change in the log odds of the 'use of force' being effective 
# for each unit increase in the 'mentally unstable' variable. 
# So, if a citizen is mentally unstable, the log odds of the 'use of force' being effective decrease by 0.204. 
# The 3% and 97% HDI values (-0.396 and -0.003) provide a credible interval for this parameter.

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,1.947,0.037,1.875,2.015,0.001,0.0,3518.0,2953.0,1.0
beta_1,-0.202,0.102,-0.387,-0.005,0.002,0.001,2962.0,2785.0,1.0


In [8]:
# preprocess data

df['was_hospitalized'] = (df['citizen_hospitalized'] == 'yes').astype(int)

df['was_mentally_unstable'] = (df['citizen_influencing_factors'] == 'mentally unstable').astype(int)

df = df.dropna(subset=['was_hospitalized', 'was_mentally_unstable'])

In [9]:
y = df['was_mentally_unstable']

X = df['was_hospitalized']
X = sm.add_constant(X)  

model = sm.Logit(y, X)
result = model.fit()

result.summary()

# The coefficient for 'was_hospitalized' is approximately 1.7368. 
# This value is the log odds; exponentiating this value gives us the odds ratio of 1.7368
# which means the odds of being mentally unstable are about 5.68 times higher for citizens 
# who were hospitalized compared to those who were not, assuming all other factors are equal.
# The p-value for 'was_hospitalized' is very close to 0, 
# which indicates that the relationship between hospitalization and mental instability is statistically significant.

Optimization terminated successfully.
         Current function value: 0.317632
         Iterations 7


0,1,2,3
Dep. Variable:,was_mentally_unstable,No. Observations:,7441.0
Model:,Logit,Df Residuals:,7439.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 13 Jul 2023",Pseudo R-squ.:,0.09681
Time:,11:10:00,Log-Likelihood:,-2363.5
converged:,True,LL-Null:,-2616.8
Covariance Type:,nonrobust,LLR p-value:,3.317e-112

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.0518,0.072,-42.384,0.000,-3.193,-2.911
was_hospitalized,1.7514,0.085,20.657,0.000,1.585,1.918


In [10]:
contingency_table = pd.crosstab(df['was_hospitalized'], df['was_mentally_unstable'])

chi2, p, dof, expected = chi2_contingency(contingency_table)

chi2, p

# The p-value for 'was_hospitalized' is very close to 0, 
# which indicates that the relationship between hospitalization and mental instability is statistically significant.

(508.3402788423857, 1.4565439986401333e-112)

In [11]:
with pm.Model() as model:
    # Priors for unknown model parameters
    beta_0 = pm.Normal('beta_0', mu=0, sigma=1)
    beta_1 = pm.Normal('beta_1', mu=0, sigma=1)

    # Expected value of outcome (use logistic link function)
    p = pm.math.invlogit(beta_0 + beta_1 * df['was_mentally_unstable'])

    # Likelihood
    y_obs = pm.Bernoulli('y_obs', p=p, observed=df['was_hospitalized'])

    trace = pm.sample(2000, tune=1000, cores=1)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [beta_0, beta_1]


Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 18 seconds.


In [12]:
az.summary(trace)

# The mean value of beta_1 is 1.741, which represents the log-odds ratio between the two groups (mentally unstable vs 
# not mentally unstable) in terms of the likelihood of hospitalization.
# The odds ratio is approximately exp(1.741) = 5.7. 
# This indicates that the odds of hospitalization are about 5.7 times higher for individuals who are mentally 
# unstable compared to those who are not, according to this model.

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,-0.604,0.026,-0.653,-0.556,0.0,0.0,3429.0,2636.0,1.0
beta_1,1.741,0.084,1.588,1.906,0.002,0.001,2998.0,2716.0,1.0
