In [1]:
import pandas as pd
import statsmodels.api as sm
import pymc as pm
import arviz as az
import numpy as np
from scipy.stats import chi2_contingency



In [2]:
# Load the use-of-force records (New Orleans PD, 2016-2022 per the file name)
# and preview the first ten rows.
uof_csv_path = "../data/findings/new_orleans_pd_uof_2016_2022.csv"
df = pd.read_csv(uof_csv_path)

df.head(10)

Unnamed: 0,tracking_id,originating_bureau,division_level,division,unit,working_status,shift_time,investigation_status,disposition,service_type,...,citizen_injured,citizen_influencing_factors,citizen_distance_from_officer,citizen_age,citizen_build,citizen_height,citizen_arrested,citizen_arrest_charges,agency_y,citizen_uid
0,1fb5b1fcbf30fa9feb0a4fd2e2ae7b00,field operations bureau,recruits,a platoon,patrol,paid detail,between 7am-3pm,Completed,use of force justified,call for service,...,no,,,,,,,,new-orleans-pd,ccaa67bd6140cd3e7774c99295c8b5d9
1,888fc1d84fd0bb20c52627a603025e24,field operations bureau,2nd district,c platoon,,,,Completed,use of force authorized,arresting,...,yes,alchohol and unknown drugs,0 feet to 1 feet,29.0,large,> 6'3'',yes,,new-orleans-pd,354b7df12d7748042d9127690dfc73df
2,888fc1d84fd0bb20c52627a603025e24,field operations bureau,2nd district,c platoon,,,,Completed,use of force authorized,arresting,...,yes,alchohol and unknown drugs,0 feet to 1 feet,29.0,large,> 6'3'',yes,,new-orleans-pd,354b7df12d7748042d9127690dfc73df
3,fc500256497f7b93fb89605cacec4eb8,field operations bureau,1st district,3rd platoon,,,,Completed,use of force authorized,call for service,...,no,alchohol and unknown drugs,7 feet to 10 feet,29.0,medium,5'10'' to 6'0'',no,,new-orleans-pd,934d6ce00d0375d9f08caebe6c2e67d9
4,731920c88a49cc3ec37ed7f995f28586,investigations and support bureau,criminal investigations,homicide,squad e,,,Completed,use of force authorized,call for service,...,no,unknown,0 feet to 1 feet,19.0,medium,5'7'' to 5'9'',yes,,new-orleans-pd,ac04a4772e6a64b901112801421cd1c1
5,215d5ce2caf07ce5f8b2aa2abe372507,field operations bureau,8th district,c platoon,patrol,,,Completed,use of force authorized,arresting,...,no,unknown,0 feet to 1 feet,22.0,small,5'4'' to 5'6'',yes,illegal carrying of a weapon,new-orleans-pd,7f8031a29da0cd375373d6b2c1e0d6b1
6,215d5ce2caf07ce5f8b2aa2abe372507,field operations bureau,8th district,c platoon,patrol,,,Completed,use of force authorized,arresting,...,no,unknown,0 feet to 1 feet,22.0,small,5'4'' to 5'6'',yes,illegal carrying of a weapon,new-orleans-pd,7f8031a29da0cd375373d6b2c1e0d6b1
7,2853e3521ca8e59ab045f8f15b322880,field operations bureau,5th district,narcotics,narcotics,regular working,,Completed,use of force authorized,other,...,no,alchohol,7 feet to 10 feet,27.0,medium,5'10'' to 6'0'',yes,,new-orleans-pd,96c1c0621f0e7a2b111309b65a3363ef
8,2853e3521ca8e59ab045f8f15b322880,field operations bureau,5th district,narcotics,narcotics,regular working,,Completed,use of force authorized,other,...,no,alchohol,7 feet to 10 feet,27.0,medium,5'10'' to 6'0'',yes,,new-orleans-pd,96c1c0621f0e7a2b111309b65a3363ef
9,e3974551818eb8df3fa57e08974890bc,field operations bureau,2nd district,b platoon,patrol,,,Completed,use of force authorized,call for service,...,no,unknown,4 feet to 6 feet,15.0,small,5'0'' to 5'3'',no,,new-orleans-pd,c49495a72d50cdd677853a229e4b5efe


In [3]:
# Preprocess for the race analysis: keep only rows where both race and
# hospitalization are recorded, recode the outcome to 0/1, and one-hot encode race.
# NOTE: this cell consumes the `citizen_race` column, so re-running it without
# first reloading `df` raises a KeyError in dropna.

df = (
    df.dropna(subset=['citizen_race', 'citizen_hospitalized'])
    # 'yes' -> 1, any other recorded value -> 0 (missing values were dropped above).
    .assign(citizen_hospitalized=lambda frame: frame['citizen_hospitalized'].eq('yes').astype(int))
)

# dtype=int keeps the indicator columns numeric. pandas >= 2.0 defaults to bool,
# and a bool indicator next to the float constant added by sm.add_constant yields
# an object-typed design matrix that statsmodels' Logit refuses to fit.
df = pd.get_dummies(df, columns=['citizen_race'], dtype=int)

# Race levels analysed below; each must match a `citizen_race_<level>` dummy column.
races = ['asian / pacific islander', 'black', 'hispanic', 'indian', 'white']

In [4]:
# One single-predictor logistic regression per race: hospitalization (0/1)
# regressed on an intercept plus that race's indicator column. Only the printed
# summary of each fit is kept, keyed by race.
logit_results = {}

for race in races:
    indicator = df[f'citizen_race_{race}']
    design = sm.add_constant(indicator)  # intercept + indicator
    fitted = sm.Logit(df['citizen_hospitalized'], design).fit(disp=0)  # disp=0: no optimizer output
    logit_results[race] = fitted.summary()

logit_results

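# Asian / Pacific Islander: The coefficient is approximately 0.167. The model has no other covariates, so this 
# suggests that, compared with all other races combined, being Asian / Pacific Islander is associated with an 
# increase in the log-odds of hospitalization. However, the LLR p-value of about 0.62 means this difference 
# is not statistically significant.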

# Black: The coefficient is approximately -0.401. The model has no other covariates, so this suggests that, 
# compared with all other races combined, being Black is associated with a decrease in the log-odds of hospitalization.

# Hispanic: The coefficient is approximately -0.429. The model has no other covariates, so this suggests that, 
# compared with all other races combined, being Hispanic is associated with a decrease in the log-odds of hospitalization.

# Indian: The coefficient is approximately 1.618. The model has no other covariates, so this suggests that, 
# compared with all other races combined, being Indian is associated with an increase in the log-odds of hospitalization.

# White: The coefficient is approximately 0.501. The model has no other covariates, so this suggests that, 
# compared with all other races combined, being White is associated with an increase in the log-odds of hospitalization.

{'asian / pacific islander': <class 'statsmodels.iolib.summary.Summary'>
 """
                             Logit Regression Results                            
 Dep. Variable:     citizen_hospitalized   No. Observations:                 7790
 Model:                            Logit   Df Residuals:                     7788
 Method:                             MLE   Df Model:                            1
 Date:                  Thu, 13 Jul 2023   Pseudo R-squ.:               2.348e-05
 Time:                          11:19:26   Log-Likelihood:                -5254.2
 converged:                         True   LL-Null:                       -5254.3
 Covariance Type:              nonrobust   LLR p-value:                    0.6194
                                             coef    std err          z      P>|z|      [0.025      0.975]
 ---------------------------------------------------------------------------------------------------------
 const                                    -0.3908   

In [5]:
# Chi-square test of independence between hospitalization and each race
# indicator, one contingency table per race. Only the test statistic and the
# p-value are kept; the degrees of freedom and expected counts are discarded.
chi2_results = {}

for race in races:
    table = pd.crosstab(df['citizen_hospitalized'], df[f'citizen_race_{race}'])
    statistic, p_value, _dof, _expected = chi2_contingency(table)
    chi2_results[race] = {'Chi-square Statistic': statistic, 'p-value': p_value}

chi2_results

# Asian / Pacific Islander: The p-value is approximately 0.74, which is greater than 0.05. Therefore, we do not 
# reject the null hypothesis that hospitalization is independent of being Asian / Pacific Islander.

# Black: The p-value is approximately 3.3e-10 (much less than 0.05). Therefore, we reject the null hypothesis 
# and conclude that there is a significant association between hospitalization and being Black.

# Hispanic: The p-value is approximately 0.017, which is less than 0.05. Therefore, we reject the null hypothesis 
# and conclude that there is a significant association between hospitalization and being Hispanic.

# Indian: The p-value is approximately 0.0009, which is less than 0.05. Therefore, we reject the null hypothesis 
# and conclude that there is a significant association between hospitalization and being Indian.

# White: The p-value is approximately 4.9e-13 (much less than 0.05). Therefore, we reject the null hypothesis 
# and conclude that there is a significant association between hospitalization and being White.

{'asian / pacific islander': {'Chi-square Statistic': 0.10817223442255777,
  'p-value': 0.7422346935631141},
 'black': {'Chi-square Statistic': 39.481020090751386,
  'p-value': 3.3126350731360034e-10},
 'hispanic': {'Chi-square Statistic': 5.697850306740474,
  'p-value': 0.01698570437785355},
 'indian': {'Chi-square Statistic': 10.989234987653955,
  'p-value': 0.0009164263076338237},
 'white': {'Chi-square Statistic': 52.232560903096605,
  'p-value': 4.930183145635031e-13}}

In [6]:
# Preprocess for the sex analysis: drop rows with no recorded sex, then one-hot
# encode the column into `citizen_sex_<level>` indicators.
# NOTE: this cell consumes the `citizen_sex` column, so re-running it without
# first rebuilding `df` raises a KeyError in dropna.
df = df.dropna(subset=['citizen_sex'])

# dtype=int keeps the indicator columns numeric. pandas >= 2.0 defaults to bool,
# and a bool indicator next to the float constant added by sm.add_constant yields
# an object-typed design matrix that statsmodels' Logit refuses to fit.
df = pd.get_dummies(df, columns=['citizen_sex'], dtype=int)

# Sex levels analysed below; each must match a `citizen_sex_<level>` dummy column.
genders = ['female', 'male']

In [7]:
# One single-predictor logistic regression per sex level: hospitalization (0/1)
# regressed on an intercept plus that level's indicator column. Only the printed
# summary of each fit is kept, keyed by level.
logit_results_gender = {}

for gender in genders:
    indicator = df[f'citizen_sex_{gender}']
    design = sm.add_constant(indicator)  # intercept + indicator
    fitted = sm.Logit(df['citizen_hospitalized'], design).fit(disp=0)  # disp=0: no optimizer output
    logit_results_gender[gender] = fitted.summary()

logit_results_gender

# For 'female', the coefficient is 0.3671. Sex is the only predictor in the model, so this means that being female 
# (versus not female) is associated with an increase in the log-odds of hospitalization of about 0.37 units. 
# Conversely, for 'male', the coefficient is -0.3671, meaning being male is associated with a decrease in the 
# log-odds of hospitalization of about 0.37 units.

{'female': <class 'statsmodels.iolib.summary.Summary'>
 """
                             Logit Regression Results                            
 Dep. Variable:     citizen_hospitalized   No. Observations:                 7779
 Model:                            Logit   Df Residuals:                     7777
 Method:                             MLE   Df Model:                            1
 Date:                  Thu, 13 Jul 2023   Pseudo R-squ.:                0.002824
 Time:                          11:19:27   Log-Likelihood:                -5231.1
 converged:                         True   LL-Null:                       -5245.9
 Covariance Type:              nonrobust   LLR p-value:                 5.235e-08
                          coef    std err          z      P>|z|      [0.025      0.975]
 --------------------------------------------------------------------------------------
 const                 -0.4410      0.025    -17.686      0.000      -0.490      -0.392
 citizen_sex_female 

In [8]:
# Refit the per-sex models, this time keeping each fitted result object next to
# its summary, then exponentiate the coefficients to obtain odds ratios.
# Relies on `logit_results_gender` already existing; its entries are overwritten
# here with {"result", "summary"} dicts.
for gender in genders:
    outcome = df['citizen_hospitalized']
    design = sm.add_constant(df[f'citizen_sex_{gender}'])  # intercept + indicator
    fitted = sm.Logit(outcome, design).fit(disp=0)  # disp=0 suppresses convergence messages
    logit_results_gender[gender] = {"result": fitted, "summary": fitted.summary()}

# exp(coefficient) turns each log-odds estimate into an odds ratio.
odds_ratios = {
    gender: np.exp(outputs["result"].params)
    for gender, outputs in logit_results_gender.items()
}

odds_ratios

# The odds of hospitalization for females are 1.44 times the odds of hospitalization for non-females. 
# This is an unadjusted comparison (the model contains no other covariates). This is about a 44% increase 
# in the odds of hospitalization.

# The odds of hospitalization for males are 0.69 times the odds of hospitalization for non-males. 
# This is an unadjusted comparison (the model contains no other covariates). This is about a 31% decrease 
# in the odds of hospitalization.

{'female': const                 0.643379
 citizen_sex_female    1.443482
 dtype: float64,
 'male': const               0.928705
 citizen_sex_male    0.692769
 dtype: float64}