In [363]:
import pandas as pd
from lib import standardize_item_numbers
from scipy.stats import chi2_contingency
import statsmodels.api as sm
import numpy as np
import statsmodels.stats.multitest as smm

In [364]:
def clean():
    # police report data
    dfa = pd.read_csv("../data/police_reports/electronic_police_report_2018_2022.csv")
    dfa = dfa.pipe(standardize_item_numbers, ["item_number"])

    # This table only contains an item number. When merged with the report table, a table is produced where
    # each row represents a police report where surveillance footage was requested  
    dfb = pd.read_csv("../data/real_time_crime_center/rtcc.csv")
    dfb["rtcc_requested"] = 1
    dfb = dfb.pipe(standardize_item_numbers, ["item_number"])

    df = pd.merge(dfa, dfb, on="item_number", how="outer")
    df.loc[:, "rtcc_requested"] = df.rtcc_requested.fillna(0)
    return df

In [365]:
df = clean()
df.shape

  dfa = pd.read_csv("../data/police_reports/electronic_police_report_2018_2022.csv")


(558642, 27)

In [366]:
df.loc[:, "charge_description"] = df.charge_description.fillna("Unknown").str.title()

df.loc[:, "offender_race"] = df.offender_race.fillna("Unknown").str.title()

In [367]:
# impute race with the mode
df.loc[:, "offender_race"] = df.offender_race.fillna("").str.replace(r"^$", "black", regex=True)
df.shape

(558642, 27)

In [368]:
df = df.drop_duplicates(subset=["item_number", "charge_description", "offenderid", "victim_number"])
df.shape

(513811, 27)

In [369]:
df.offender_race.value_counts(normalize=True)

Unknown                                      0.605666
Black                                        0.335923
White                                        0.052017
Hispanic                                     0.005008
Asian                                        0.001156
Amer. Ind.                                   0.000181
Native Hawaiian Or Other Pacific Islander    0.000049
Name: offender_race, dtype: float64

In [370]:
# filter for top 10 most common charge descriptions/crimes
# top_10_charge_descs_sr = (pd.Series(df.groupby("charge_description")
#                                                 .rtcc_requested.value_counts()
#                                                 .sort_values(ascending=False).head(10))
# )
# top_10_charge_descs_df = (pd.DataFrame(top_10_charge_descs_sr).unstack()
#                                                               .reset_index()
# )

# top_10_charge_desc_list = [x for x in top_10_charge_descs_df["charge_description"]]

In [371]:
# df = df[df.charge_description.isin(top_10_charge_desc_list)]

In [372]:
counts = pd.DataFrame(df.groupby("offender_race").charge_description.value_counts())
counts = counts.rename(columns={counts.columns[0]: "count"})
counts = counts.reset_index()
counts =  counts.pivot(index='offender_race', columns='charge_description', values='count')
counts = counts.reset_index()

In [373]:

black_counts = counts.loc[counts['offender_race'] == 'black', ['THEFT OF A MOTOR VEHICLE', 'THEFT', 'ARMED ROBBERY',
                                                       'USE OF FIREARM IN ROBBERY', 'AGG. BATTERY', 'SIMPLE BURGLARY',
                                                       'SIMPLE ROBBERY', 'AGGRAVATED ASSAULT WITH A FIREARM',
                                                       'SIMPLE CRIMINAL DAMAGE TO PROPERTY', 'AGG. CRIMINAL DAMAGE']].values

white_counts = counts.loc[counts['offender_race'] == 'white', ['THEFT OF A MOTOR VEHICLE', 'THEFT', 'ARMED ROBBERY',
                                                       'USE OF FIREARM IN ROBBERY', 'AGG. BATTERY', 'SIMPLE BURGLARY',
                                                       'SIMPLE ROBBERY', 'AGGRAVATED ASSAULT WITH A FIREARM',
                                                       'SIMPLE CRIMINAL DAMAGE TO PROPERTY', 'AGG. CRIMINAL DAMAGE']].values

chi2, pval, dof, expected = chi2_contingency([black_counts, white_counts])
print("p-value:", pval)

KeyError: "None of [Index(['THEFT OF A MOTOR VEHICLE', 'THEFT', 'ARMED ROBBERY',\n       'USE OF FIREARM IN ROBBERY', 'AGG. BATTERY', 'SIMPLE BURGLARY',\n       'SIMPLE ROBBERY', 'AGGRAVATED ASSAULT WITH A FIREARM',\n       'SIMPLE CRIMINAL DAMAGE TO PROPERTY', 'AGG. CRIMINAL DAMAGE'],\n      dtype='object', name='charge_description')] are in the [columns]"

In [None]:
counts_sm = pd.DataFrame(df.groupby("offender_race").charge_description.value_counts())
counts_sm.charge_description.unique()
counts_sm = counts_sm.rename(columns={counts_sm.columns[0]: "counts"})
counts_sm = counts_sm.reset_index()
counts_sm

model = sm.GLM.from_formula("counts ~ offender_race", family=sm.families.Poisson(), data=counts_sm).fit()

print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 counts   No. Observations:                  526
Model:                            GLM   Df Residuals:                      519
Model Family:                 Poisson   Df Model:                            6
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -17019.
Date:                Fri, 28 Apr 2023   Deviance:                       32360.
Time:                        20:34:25   Pearson chi2:                 7.10e+04
No. Iterations:                     7   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                                                                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------