In [None]:
import pandas as pd
import numpy as np
import pymc as pm
from scipy.stats import binom
import arviz as az
from fancyimpute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Load the electronic police reports and the real-time crime center (RTCC) log.
pr = pd.read_csv("../data/police_reports/electronic_police_report_2018_2022.csv")
rtcc = pd.read_csv("../data/real_time_crime_center/rtcc.csv")
# Every row in the RTCC file is flagged 1; unmatched police reports are set to 0 below.
rtcc["rtcc_requested"] = 1

# Boolean flag: the offender race recorded in the police report is exactly 'BLACK'.
pr['race_black'] = (pr['offender_race'] == 'BLACK')

# Left-join on item_number so every police report row is kept. All RTCC columns
# come along, and an item_number that appears several times in the RTCC file
# repeats the matching report rows.
rc = pd.merge(pr, rtcc, on="item_number", how="left")

# Reports with no RTCC match have NaN here; treat them as "not requested".
rc['rtcc_requested'] = rc['rtcc_requested'].fillna(0)

# Store the flag and the offender id as strings without the trailing ".0" that
# float formatting adds. The columns are replaced outright (not written through
# `.loc[:, col]`) because the dtype changes from float to string, and the
# pattern is anchored so only a trailing ".0" is stripped.
rc["rtcc_requested"] = rc["rtcc_requested"].astype(str).str.replace(r"\.0$", "", regex=True)
rc["offenderid"] = rc["offenderid"].astype(str).str.replace(r"\.0$", "", regex=True)

# Drop rows with a missing/empty charge description or item number.
# `.copy()` makes `rc` an independent frame so the column assignment below
# does not write into a slice of the merged frame.
rc = rc[rc["charge_description"].fillna("") != ""]
rc = rc[rc["item_number"].fillna("") != ""].copy()

# Treat empty and 'UNKNOWN' offender races as missing (NaN).
rc['offender_race'] = rc['offender_race'].replace(['', 'UNKNOWN'], np.nan)

# Impute missing 'offender_race' values from 'location'.
#
# The imputer has to see the rows whose race is missing (encoded as NaN) in the
# same matrix as the rows whose race is known; fitting it only on complete rows
# imputes nothing. Results are written back by index label so each imputed
# value lands on the row it was predicted for.
subset_df = rc[['location', 'offender_race']].copy()

# Split on whether the race is recorded.
race_is_known = subset_df['offender_race'].notna()
known = subset_df[race_is_known]
unknown = subset_df[~race_is_known]

# One encoder per column so each keeps its own label <-> code mapping.
# `le` is the race encoder used for the inverse transform below.
location_le = LabelEncoder()
le = LabelEncoder()
imputer = IterativeImputer()

# Only run when there is something to learn from and something to fill.
if not known.empty and not unknown.empty:
    encoded = pd.DataFrame(index=subset_df.index)
    # Cast to str so a missing location becomes its own category rather than
    # mixing floats (NaN) with strings inside the encoder.
    encoded['location'] = location_le.fit_transform(subset_df['location'].astype(str))
    # Race codes for known rows; NaN marks the rows the imputer must fill.
    encoded['offender_race'] = np.nan
    encoded.loc[race_is_known, 'offender_race'] = le.fit_transform(known['offender_race'])

    imputed_values = imputer.fit_transform(encoded)

    # Rebuild a frame that carries the ORIGINAL index so later alignment is correct.
    imputed_df = pd.DataFrame(
        imputed_values, columns=['location', 'offender_race'], index=encoded.index
    )
    imputed_df['location'] = imputed_df['location'].astype(int)

    # Round the continuous prediction to the nearest code and clip it into the
    # encoder's valid range; otherwise inverse_transform raises on unseen labels.
    race_codes = (
        imputed_df['offender_race']
        .round()
        .clip(lower=0, upper=len(le.classes_) - 1)
        .astype(int)
    )
    imputed_df['offender_race'] = le.inverse_transform(race_codes.to_numpy())

    # Fill only the rows whose race was missing, matched by index label.
    rc.loc[unknown.index, 'offender_race'] = imputed_df.loc[unknown.index, 'offender_race']

# NOTE(review): 'race_black' was derived from the raw police-report race before
# this step, so it does not reflect the imputed values — confirm whether it
# should be recomputed from rc['offender_race'] here.

# Keep only rows whose RTCC flag is exactly "0" or "1" (still stored as strings
# at this point) and stack them with the non-requested rows first.
df_0 = rc[rc['rtcc_requested'] == "0"]
df_1 = rc[rc['rtcc_requested'] == '1']

rc = pd.concat([df_0, df_1])
print(rc.shape)

# One row per (incident, offender). The flag column is REPLACED with an integer
# column via `assign`; writing through `.loc[:, col]` would set values into the
# existing object-dtype column and leave it as object, which then propagates
# into the aggregated `rtcc` counts.
rc = (
    rc.drop_duplicates(subset=["item_number", "offenderid"])
    .assign(rtcc_requested=lambda frame: frame["rtcc_requested"].astype(int))
)

# Per (race_black, charge) cell: n = number of rows, rtcc = number with an RTCC request.
rc_grouped = (
    rc.groupby(['race_black', 'charge_description'])
    .agg(n=('item_number', 'count'), rtcc=('rtcc_requested', 'sum'))
    .reset_index()
)
# Combined "<race_black>_<charge>" label for each cell.
rc_grouped['race_charge'] = pd.Categorical(
    rc_grouped['race_black'].astype(str) + '_' + rc_grouped['charge_description']
)

In [None]:
# Binomial-logit model of RTCC requests per (race_black, charge) cell:
#   rtcc ~ Binomial(n, invlogit(lp))
with pm.Model() as model:
    # Normal priors on the coefficients, parameterised by precision (tau = 1/5).
    b = pm.Normal('b', mu=0, tau=1/5)
    b_race = pm.Normal('b_race', mu=0, tau=1/5)

    # Prior on the standard deviation shared by the per-charge intercepts.
    sd_intercept = pm.Uniform('sd_intercept', lower=0, upper=10)

    # Number of rows in each cell (the Binomial trial count).
    n = pm.Data('n', rc_grouped['n'].values)

    # 0/1 race indicator for each cell.
    race_black = pm.Data('race_black', rc_grouped['race_black'].astype(int).values, mutable=True)

    # Integer-code the charge descriptions; categories are in sorted order and
    # define the position of each charge inside the vector-valued parameters.
    charge_descr = rc_grouped['charge_description'].astype('category')
    charge_descr_cats = charge_descr.cat.categories.values
    charge_descr_ints = charge_descr.cat.codes.values

    # Per-charge intercepts with a common (learned) standard deviation.
    intercept = pm.Normal('intercept', mu=0, sigma=sd_intercept, shape=len(charge_descr_cats))

    # Per-charge effects with a fixed-precision prior.
    charge_effect = pm.Normal('charge_effect', mu=0, tau=1/5, shape=len(charge_descr_cats))

    # Linear predictor on the logit scale.
    # NOTE(review): `b` and `b_race` both multiply `race_black`, and `intercept`
    # and `charge_effect` are both indexed by charge, so only the sums
    # (b + b_race) and (intercept + charge_effect) are informed by the data.
    # Confirm whether `b` was meant to be a global intercept.
    lp = intercept[charge_descr_ints] + b * race_black + b_race * race_black + charge_effect[charge_descr_ints]

    # Likelihood: RTCC request counts out of n rows per cell.
    p = pm.math.invlogit(lp)
    rtcc_est = pm.Deterministic('rtcc_est', p * n)
    # Cast to int so an object-dtype column cannot reach the likelihood.
    pm.Binomial('rtcc', n=n, p=p, observed=rc_grouped['rtcc'].astype(int).values)

    # The former `sigma` prior was removed: it was not connected to the
    # likelihood or to any other variable, so it only added an unused
    # dimension to the sampler.

# Draw posterior samples; the fixed seed makes reruns reproducible.
with model:
    trace = pm.sample(2000, tune=2000, random_seed=42)

In [None]:
# Names of every variable stored in the posterior group of the trace.
var_names = [name for name in trace.posterior.data_vars]
print(var_names)

In [None]:
# Posterior summary table restricted to the race coefficients and the per-charge effects.
summary_vars = ['b', 'b_race', 'charge_effect']
az.summary(trace.posterior, var_names=summary_vars)

In [None]:
## TODO: add code to filter out intercepts and see the effect
# Summary of `b` with a 95% HDI, rounded to two decimals.
b_summary = pd.DataFrame(
    az.summary(
        trace.posterior["b"],
        var_names=['b'],
        hdi_prob=0.95,
        round_to=2,
    )
)
# Column-wise mean over the rows of the summary table.
b_means = b_summary.mean()
print(b_means)

In [None]:
## TODO: add code to filter out charges
# Summary of every `charge_effect` entry with a 95% HDI, rounded to two decimals.
charge_summary = pd.DataFrame(
    az.summary(
        trace.posterior["charge_effect"],
        var_names=['charge_effect'],
        hdi_prob=0.95,
        round_to=2,
    )
)
# Column-wise mean over all charge rows of the summary table
# (this averages every summary column, including the HDI bounds and diagnostics).
charge_means = charge_summary.mean()
print(charge_means)

In [None]:
## TODO: add code to filter out charges
# Summary of `b_race` with a 95% HDI, rounded to two decimals.
race_summary = pd.DataFrame(
    az.summary(
        trace.posterior["b_race"],
        var_names=['b_race'],
        hdi_prob=0.95,
        round_to=2,
    )
)
# Column-wise mean over the rows of the summary table.
race_means = race_summary.mean()
print(race_means)

In [None]:
# Per-charge posterior summary of `charge_effect`, labelled by charge description.
coef_summary = az.summary(trace.posterior["charge_effect"])

# Label the rows in the same order the model indexed them: the sorted category
# order of rc_grouped['charge_description']. Using rc[...].unique() would give
# order of first appearance and attach the wrong charge name to each coefficient.
charge_labels = rc_grouped["charge_description"].astype("category").cat.categories
coef_summary.index = charge_labels

# Keep the posterior mean and the HDI bounds.
# NOTE(review): the columns are renamed q05/q95 but hold the 3% / 97% HDI
# bounds, not the 5% / 95% quantiles — confirm the intended interval.
coef_summary = coef_summary[["mean", "hdi_3%", "hdi_97%"]]
coef_summary.columns = ["mean", "q05", "q95"]


## TODO: take the median instead of the mean?
# Ten charges with the largest posterior-mean effect.
coef_summary = coef_summary.sort_values("mean", ascending=False)
print(coef_summary.head(10))

# Ten charges with the smallest posterior-mean effect.
coef_summary = coef_summary.sort_values("mean")
print(coef_summary.head(10))