In [1]:
!pip install pymc
!pip install --upgrade setuptools
!pip install arviz
!pip install fancyimpute
!pip install pandas --upgrade

import pandas as pd
import numpy as np
import pymc as pm
from scipy.stats import binom
import arviz as az
from fancyimpute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# read in the police reports and real-time crime center data
pr = pd.read_csv("../data/police_reports/electronic_police_report_2018_2022.csv")
rtcc = pd.read_csv("../data/real_time_crime_center/rtcc.csv")
rtcc["rtcc_requested"] = 1 

# create a new column in the police reports data indicating whether the offender is Black
pr['race_black'] = (pr['offender_race'] == 'BLACK')

# select only the necessary columns from the police reports data and join with the distinct item numbers from the rtcc data
rc = pd.merge(pr, rtcc, on="item_number", how="left")

rc['rtcc_requested'] = rc['rtcc_requested'].fillna(0)

rc.loc[:, "rtcc_requested"] = rc.rtcc_requested.astype(str).str.replace(r"\.0", "", regex=True)
rc.loc[:, "offenderid"] = rc.offenderid.astype(str).str.replace(r"\.0", "", regex=True)

rc = rc[~((rc.charge_description.fillna("") == ""))]
rc = rc[~((rc.item_number.fillna("") == ""))]

# Replace empty and 'unknown' strings with NaN
rc['offender_race'] = rc['offender_race'].replace('', np.nan).fillna(value=np.nan, inplace=False)
rc['offender_race'] = rc['offender_race'].replace('UNKNOWN', np.nan)

# Create a new dataframe with only the 'location' and 'offender_race' columns
subset_df = rc[['location', 'offender_race']]

# Split the data into known and unknown values based on the missing values in the 'offender_race' column
known = subset_df[subset_df['offender_race'].notna()]
unknown = subset_df[subset_df['offender_race'].isna()]

# Encode categorical data using LabelEncoder
le = LabelEncoder()
known['location'] = le.fit_transform(known['location'])
known['offender_race'] = le.fit_transform(known['offender_race'])

# Impute missing values using IterativeImputer
imputer = IterativeImputer()
imputed_values = imputer.fit_transform(known)

# Convert imputed values back to dataframe
imputed_df = pd.DataFrame(imputed_values, columns=['location', 'offender_race'])
imputed_df['location'] = imputed_df['location'].astype(int)
imputed_df['offender_race'] = imputed_df['offender_race'].round().astype(int)

# Convert the numerical data back to categorical data using LabelEncoder's inverse_transform method
imputed_df['offender_race'] = le.inverse_transform(imputed_df['offender_race'])

# Replace the missing values in the original 'offender_race' column with the imputed values
rc.loc[unknown.index, 'offender_race'] = imputed_df['offender_race']

df_0 = rc[rc['rtcc_requested'] == "0"]
df_1 = rc[rc['rtcc_requested'] == '1']

rc = pd.concat([df_0, df_1])
print(rc.shape)

rc = rc.drop_duplicates(subset=["item_number", "offenderid"])
rc.loc[:, "rtcc_requested"] = rc.rtcc_requested.astype(int)

rc_grouped = rc.groupby(['race_black', 'charge_description']).agg(n=('item_number', 'count'), rtcc=('rtcc_requested', 'sum')).reset_index()
rc_grouped['race_charge'] = pd.Categorical(rc_grouped['race_black'].astype(str) + '_' + rc_grouped['charge_description'])

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: setuptools in /home/ayyubi/.local/lib/python3.7/site-packages (67.7.2)
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: pandas in /opt/anaconda3/lib/python3.7/site-packages (1.3.5)


  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(260715, 28)


In [2]:
with pm.Model() as model:
    # set the prior for the regression coefficients
    b = pm.Normal('b', mu=0, tau=1/5)
    b_race = pm.Normal('b_race', mu=0, tau=1/5)
    
    # set the prior for the standard deviation of the intercept
    sd_intercept = pm.Uniform('sd_intercept', lower=0, upper=10)
    
    # define n as a data input
    n = pm.Data('n', rc_grouped['n'].values)
    
    # define race_black as a predictor
    race_black = pm.Data('race_black', rc_grouped['race_black'].astype(int).values, mutable=True)

    
    # define charge_description as a categorical variable
    charge_descr = rc_grouped['charge_description'].astype('category')
    charge_descr_cats = charge_descr.cat.categories.values
    charge_descr_ints = charge_descr.cat.codes.values
    
    # model the intercept as a normal distribution with a group-level standard deviation
    intercept = pm.Normal('intercept', mu=0, sigma=sd_intercept, shape=len(charge_descr_cats))
    
    # model the effects of the charge descriptions
    charge_effect = pm.Normal('charge_effect', mu=0, tau=1/5, shape=len(charge_descr_cats))
    
    # calculate the linear predictor
    lp = intercept[charge_descr_ints] + b * race_black + b_race * race_black + charge_effect[charge_descr_ints]
    
    # model the counts of RTCC requests as a binomial distribution
    p = pm.math.invlogit(lp)
    rtcc_est = pm.Deterministic('rtcc_est', p * n)
    pm.Binomial('rtcc', n=n, p=p, observed=rc_grouped['rtcc'].values)
    
    # set the prior for the standard deviation of the error term
    sigma = pm.Uniform('sigma', lower=0, upper=100)

# perform MCMC sampling
with model:
    trace = pm.sample(2000, tune=2000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [b, b_race, sd_intercept, intercept, charge_effect, sigma]


Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 1473 seconds.


In [3]:
var_names = list(trace.posterior.data_vars.keys())
print(var_names)

['b', 'b_race', 'intercept', 'charge_effect', 'sd_intercept', 'sigma', 'rtcc_est']


In [4]:
az.summary(trace.posterior, var_names=['b', 'b_race', 'charge_effect'])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
b,0.202,1.587,-2.774,3.116,0.015,0.018,11634.0,5937.0,1.0
b_race,0.203,1.587,-2.720,3.187,0.015,0.018,11630.0,6009.0,1.0
charge_effect[0],-1.065,2.124,-5.089,2.806,0.018,0.025,13913.0,5043.0,1.0
charge_effect[1],-0.979,2.096,-5.133,2.772,0.017,0.022,16223.0,6747.0,1.0
charge_effect[2],-0.551,2.050,-4.365,3.337,0.018,0.022,12660.0,6157.0,1.0
...,...,...,...,...,...,...,...,...,...
charge_effect[536],-0.846,2.067,-4.682,3.130,0.018,0.021,13009.0,6588.0,1.0
charge_effect[537],-1.104,2.039,-5.108,2.653,0.017,0.021,14987.0,6485.0,1.0
charge_effect[538],-1.100,2.082,-5.138,2.596,0.020,0.021,10847.0,6268.0,1.0
charge_effect[539],-0.921,2.103,-4.833,3.031,0.018,0.021,14357.0,6432.0,1.0


In [5]:
## add code to filter out intercepts and see the effect 
b_summary = az.summary(trace.posterior["b"], hdi_prob=0.95, round_to=2, var_names=['b'])
b_summary = pd.DataFrame(b_summary)
b_means = b_summary.mean()
print(b_means)

mean             0.20
sd               1.59
hdi_2.5%        -3.06
hdi_97.5%        3.12
mcse_mean        0.01
mcse_sd          0.02
ess_bulk     11634.01
ess_tail      5937.17
r_hat            1.00
dtype: float64


In [6]:
## add code to filter out charges
charge_summary = az.summary(trace.posterior["charge_effect"], hdi_prob=0.95, round_to=2, var_names=['charge_effect'])
charge_summary = pd.DataFrame(charge_summary)
charge_means = charge_summary.mean()
print(charge_means)

mean            -0.709409
sd               2.097856
hdi_2.5%        -4.819150
hdi_97.5%        3.380610
mcse_mean        0.019926
mcse_sd          0.021331
ess_bulk     14908.820721
ess_tail      6087.656802
r_hat            1.000000
dtype: float64


In [7]:
## add code to filter out charges
race_summary = az.summary(trace.posterior["b_race"], hdi_prob=0.95, round_to=2, var_names=['b_race'])
race_summary = pd.DataFrame(race_summary)
race_means = race_summary.mean()
print(race_means)

mean             0.20
sd               1.59
hdi_2.5%        -2.71
hdi_97.5%        3.47
mcse_mean        0.01
mcse_sd          0.02
ess_bulk     11629.50
ess_tail      6009.24
r_hat            1.00
dtype: float64


In [8]:
coef_summary = az.summary(trace.posterior["charge_effect"])
coef_summary.index = rc["charge_description"].unique()

coef_summary = coef_summary[["mean", "hdi_3%", "hdi_97%"]]
coef_summary.columns = ["mean", "q05", "q95"]


## take the median?
coef_summary = coef_summary.sort_values("mean", ascending=False)
print(coef_summary.head(10))

coef_summary = coef_summary.sort_values("mean")
print(coef_summary.head(10))

                                                  mean    q05    q95
DRIVER'S LICENSE REQUIRED                        0.677 -3.255  4.609
VEHICULAR TRESPASS                               0.676 -3.299  4.761
ALTERING/REMOVING VIN NUMBER                     0.659 -3.367  4.675
POSSESSION OF A BOMB                             0.628 -3.205  4.840
FOOD/BEVERAGE ON PUBLIC TRANSIT                  0.627 -3.322  4.481
ATTEMPT - IMPERSONATION                          0.339 -3.639  4.234
UNAUTHORIZED ENTRY OF A CRITICAL INFRASTRUCTURE  0.196 -3.596  4.239
ILLEGAL CARRYING OF WEAPON                       0.068 -4.005  3.836
PRINCIPAL TO ATTEMPTED ARMED ROBBERY             0.034 -3.753  3.918
TERRORIZING                                      0.022 -3.840  3.894
                                                     mean    q05    q95
PRINCIPAL TO FIRST DEGREE RAPE                     -1.508 -5.611  2.320
OBSTRUCTING SCHOOL FACILITY STAFF                  -1.488 -5.267  2.273
CONSPIRACY TO POSSESSION 