In [23]:
import pandas as pd
from lib import standardize_item_numbers
from scipy.stats import chi2_contingency
import statsmodels.api as sm
import numpy as np
import statsmodels.stats.multitest as smm

In [24]:
def clean():
    """Load police reports and keep only those with an RTCC footage request.

    Reads the electronic police report CSV and the real-time crime center
    (RTCC) CSV, standardizes ``item_number`` in both with
    ``standardize_item_numbers``, and inner-joins them on ``item_number``.

    Returns:
        pandas.DataFrame: the merged frame, with an added ``rtcc_requested``
        column set to the string ``"1"`` on every row.
    """
    # police report data
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk. The recorded cell output shows a warning
    # raised on this read_csv call (presumably DtypeWarning for mixed-type
    # columns -- confirm).
    dfa = pd.read_csv(
        "../data/police_reports/electronic_police_report_2018_2022.csv",
        low_memory=False,
    )
    dfa = dfa.pipe(standardize_item_numbers, ["item_number"])

    # This table only contains an item number. When merged with the report table, a table is produced where
    # each row represents a police report where surveillance footage was requested
    dfb = pd.read_csv("../data/real_time_crime_center/rtcc.csv")
    dfb["rtcc_requested"] = "1"
    dfb = dfb.pipe(standardize_item_numbers, ["item_number"])

    # Inner join (the pd.merge default, spelled out): reports without an RTCC
    # request are dropped.
    # NOTE(review): if rtcc.csv can repeat an item number, matching report rows
    # are duplicated by this merge -- confirm whether that is intended.
    df = pd.merge(dfa, dfb, on="item_number", how="inner")
    return df

In [25]:
# Build the working frame: police reports merged with RTCC requests by clean().
df = clean()

  dfa = pd.read_csv("../data/police_reports/electronic_police_report_2018_2022.csv")


In [26]:
def clean_offender_race(df):
    """Return a copy of ``df`` with ``offender_race`` lower-cased and stripped.

    Args:
        df: DataFrame with a string ``offender_race`` column.

    Returns:
        A new DataFrame; the input frame is left unmodified so the function
        is safe to use in a ``.pipe`` chain or on a filtered slice.
        Missing values stay missing.
    """
    # assign() builds a new frame instead of writing through df.loc, so the
    # caller's object is not mutated as a side effect.
    return df.assign(offender_race=df["offender_race"].str.lower().str.strip())

In [27]:
# Normalize the offender_race text (lower-case, whitespace stripped).
df = df.pipe(clean_offender_race)

In [28]:
# Keep only rows that have a charge description: a missing value is treated
# the same as an empty string and the row is discarded.
has_charge_description = df.charge_description.fillna("").ne("")
df = df[has_charge_description]

In [29]:
# Impute race: missing or empty offender_race values are replaced by "black".
# NOTE(review): "black" is hard-coded as the presumed mode of the column --
# confirm against the data if the inputs change.
race_without_nulls = df.offender_race.fillna("")
df.loc[:, "offender_race"] = race_without_nulls.str.replace(r"^$", "black", regex=True)

In [30]:
# Find the 10 most frequent charge descriptions.
# Rows are counted per (charge_description, rtcc_requested) pair and the ten
# largest counts are kept.
charge_request_counts = df.groupby("charge_description").rtcc_requested.value_counts()
top_10_charge_descs_sr = charge_request_counts.sort_values(ascending=False).head(10)

# Move rtcc_requested from the index into the columns so charge_description
# becomes an ordinary column after reset_index().
top_10_charge_descs_df = top_10_charge_descs_sr.to_frame().unstack().reset_index()

top_10_charge_desc_list = list(top_10_charge_descs_df["charge_description"])

In [31]:
# Restrict the analysis to rows whose charge is in top_10_charge_desc_list.
df = df[df.charge_description.isin(top_10_charge_desc_list)]

In [32]:
# Wide table of report counts: one row per offender_race, one column per
# charge_description (offender_race is kept as a regular column).
counts = (
    df.groupby("offender_race")
    .charge_description.value_counts()
    .to_frame()
    .reset_index()
    .pivot(index="offender_race", columns="charge_description", values="count")
    .reset_index()
)

In [33]:

# Chi-square test of independence between offender race (black vs. white) and
# charge description, using the per-race counts in `counts`.

# Charge columns compared between the two groups; defined once so both rows
# are guaranteed to use the same columns in the same order.
charge_columns = ['THEFT OF A MOTOR VEHICLE', 'THEFT', 'ARMED ROBBERY',
                  'USE OF FIREARM IN ROBBERY', 'AGG. BATTERY', 'SIMPLE BURGLARY',
                  'SIMPLE ROBBERY', 'AGGRAVATED ASSAULT WITH A FIREARM',
                  'SIMPLE CRIMINAL DAMAGE TO PROPERTY', 'AGG. CRIMINAL DAMAGE']

black_counts = counts.loc[counts['offender_race'] == 'black', charge_columns].values
white_counts = counts.loc[counts['offender_race'] == 'white', charge_columns].values

# Each selection above is a 2-D array (rows x charges). Passing them as a
# Python list would hand chi2_contingency a 3-D array, so stack them into one
# 2-D contingency table (one row per race) instead.
contingency_table = np.vstack([black_counts, white_counts])

chi2, pval, dof, expected = chi2_contingency(contingency_table)
print("p-value:", pval)

p-value: 1.8328740039475772e-11


In [34]:
# Long-format counts: one row per (offender_race, charge_description) pair
# with its number of reports in the "count" column.
counts_sm = (
    df.groupby("offender_race")
    .charge_description.value_counts()
    .to_frame()
    .reset_index()
)

# Poisson regression of the per-pair report count on offender race.
# NOTE(review): the recorded summary reports a deviance far larger than the
# residual degrees of freedom, which may indicate overdispersion -- confirm
# that a Poisson family is appropriate.
model = sm.GLM.from_formula(
    "count ~ offender_race",
    family=sm.families.Poisson(),
    data=counts_sm,
).fit()

print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  count   No. Observations:                   39
Model:                            GLM   Df Residuals:                       33
Model Family:                 Poisson   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1007.7
Date:                Tue, 11 Apr 2023   Deviance:                       1794.4
Time:                        18:08:16   Pearson chi2:                 1.67e+03
No. Iterations:                     9   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             