#### This notebook is based in large part on Brina Seidel's "Responsible Data Science Lab 8: LIME" notebook, available at https://github.com/DataResponsibly/courses/blob/master/documents/spring20/RDS_Lab8_2020.ipynb

In [1]:
from __future__ import print_function

%matplotlib inline

import sklearn.model_selection
import sklearn.metrics
import sklearn.datasets
import sklearn.ensemble
import sklearn.preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import lime
import lime.lime_tabular
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
import sys
sys.path.append("../")
import numpy as np
import warnings
from lime import submodular_pick

# import xgboost
# from xgboost import plot_importance
# hmmm we didnt use this package to do our gradient boosted trees
# but would it maybe still be useful to analysis?

import json
from collections import OrderedDict

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

import pandas as pd

In [9]:
total_df = pd.read_pickle('total_df.pckl.gz', compression = 'gzip')

In [23]:
# Get a list of feature names (excluding the outcome variable)
feature_names = list(total_df.columns)
feature_names.remove('MHI')

In [25]:
len(feature_names)

5603

In [26]:
labels = total_df.loc[:,'MHI']

In [28]:
labels.sum()

2212

In [29]:
total_df.dtypes

offense_category_aggravated assault police officer              uint8
offense_category_aggravated assault police officer firearm      uint8
offense_category_aggravated battery                             uint8
offense_category_aggravated battery police officer              uint8
offense_category_aggravated battery police officer firearm      uint8
                                                               ...   
season_summer                                                   uint8
season_winter                                                   uint8
incident_length                                                 int64
latitude                                                      float64
longitude                                                     float64
Length: 5604, dtype: object

In [85]:
def is_binary(series):
    '''
    Function adapted from StackOverflow user lucas
    https://stackoverflow.com/questions/32982034/which-columns-are-binary-in-a-pandas-dataframe
    '''
    return sorted(series.unique()) == [0, 1]

In [75]:
# Dictionary for aggregation, as defined in Data_Prep notebook
agg_dict_short = {'offense_category': 'median', 'event': 'median', 
                  'age_at_incident': 'median', 'gender': 'median', 
                  'race': 'median', 'law_enforcement_agency': 'median', 
                  'unit': 'median', 'incident_city': 'median', 
                  'updated_offense_category': 'median', 'age_over_100': 'median', 
                  'age_unknown': 'median', 'charge_count': 'max', 
                  '402': 'max', 'charge_offense_title': 'sum', 
                  'chapter': 'sum', 'act': 'sum', 
                  'section': 'sum', 'class': 'sum', 'aoic': 'sum'}

# List of categorical columns used for OHE, as defined in Data_Prep notebook
cat_cols = ['offense_category', 'charge_offense_title', 'chapter', 'act', 
            'section', 'class', 'aoic', 'event', 'gender', 'race', 
            'law_enforcement_agency', 'unit', 'incident_city', 'updated_offense_category']

In [76]:
decode_cols = []
for catcol in cat_cols:
    if agg_dict_short[catcol] == 'median':
        decode_cols.append(catcol)

In [77]:
decode_cols

['offense_category',
 'event',
 'gender',
 'race',
 'law_enforcement_agency',
 'unit',
 'incident_city',
 'updated_offense_category']

In [79]:
# check that all columns aggregated by 'median' are binary
# check that all columns aggregated elsewise are not binary
bin_errors = []
nonbin_errors = []

for dumcol in total_df.columns:
    dumb=False
    for col in agg_dict_short:
        if col+'_' in dumcol:
            dumb=True
            if agg_dict_short[col]=='median':
                if not is_binary(total_df[dumcol]):
                    print(dumcol, ' is NOT binary.')
                    nonbin_errors.append(dumcol)
                else:
                    print('\t'+dumcol+' is BINARY')
            else:
                if is_binary(total_df[dumcol]):
                    print(dumcol, ' is BINARY')
                    bin_errors.append(dumcol)
                else:
                    print('\t'+dumcol+' is NOT binary')
    if dumb==False:
        if is_binary(total_df[dumcol]):
            print(dumcol, ' is BINARY')
            bin_errors.append(dumcol)
        else:
            print('\t'+dumcol+' is NOT binary')

	offense_category_aggravated assault police officer is BINARY
	offense_category_aggravated assault police officer firearm is BINARY
	offense_category_aggravated battery is BINARY
	offense_category_aggravated battery police officer is BINARY
	offense_category_aggravated battery police officer firearm is BINARY
	offense_category_aggravated battery with a firearm is BINARY
	offense_category_aggravated discharge firearm is BINARY
	offense_category_aggravated dui is BINARY
	offense_category_aggravated fleeing and eluding is BINARY
	offense_category_aggravated identity theft is BINARY
	offense_category_aggravated robbery is BINARY
	offense_category_aggravated robbery bb gun is BINARY
	offense_category_armed robbery is BINARY
	offense_category_armed violence is BINARY
	offense_category_arson is BINARY
	offense_category_arson and attempt arson is BINARY
	offense_category_attempt armed robbery is BINARY
	offense_category_attempt arson is BINARY
	offense_category_attempt homicide is BINARY
	offe

	updated_offense_category_perjury is BINARY
	updated_offense_category_perjury is BINARY
	updated_offense_category_police shooting is BINARY
	updated_offense_category_police shooting is BINARY
	updated_offense_category_possession of burglary tools is BINARY
	updated_offense_category_possession of burglary tools is BINARY
	updated_offense_category_possession of contraband in penal institution is BINARY
	updated_offense_category_possession of contraband in penal institution is BINARY
	updated_offense_category_possession of explosives is BINARY
	updated_offense_category_possession of explosives is BINARY
	updated_offense_category_possession of stolen motor vehicle is BINARY
	updated_offense_category_possession of stolen motor vehicle is BINARY
	updated_offense_category_prostitution is BINARY
	updated_offense_category_prostitution is BINARY
	updated_offense_category_reckless discharge of firearm is BINARY
	updated_offense_category_reckless discharge of firearm is BINARY
	updated_offense_cat

	law_enforcement_agency_isp doo state financial crime is BINARY
	law_enforcement_agency_isp doo strategic investigations is BINARY
	law_enforcement_agency_isp doo zone 1, des plaines is BINARY
	law_enforcement_agency_isp doo zone 1, downers grove is BINARY
	law_enforcement_agency_isp doo zone 1, elgin is BINARY
	law_enforcement_agency_isp doo zone 3, lockport is BINARY
	law_enforcement_agency_isp dupage meg is BINARY
	law_enforcement_agency_isp gaming board, chicago is BINARY
	law_enforcement_agency_isp gaming board, des plaines is BINARY
	law_enforcement_agency_isp inv dchgo, des plaines is BINARY
	law_enforcement_agency_isp itc firearms services bureau is BINARY
	law_enforcement_agency_isp metro enf group of lake county is BINARY
	law_enforcement_agency_isp n central narcotics task force is BINARY
	law_enforcement_agency_isp n central narcotics task force-d is BINARY
	law_enforcement_agency_isp narcotics and currency interdiction unit [narcint] is BINARY
	law_enforcement_agency_isp n

	unit_unit 933 - patrol area 3 is BINARY
	unit_unit 934 - patrol area 4 is BINARY
	unit_unit 935 - patrol area 5 is BINARY
	unit_unknown is BINARY
	incident_city_addison is BINARY
	incident_city_albers is BINARY
	incident_city_algonquin is BINARY
	incident_city_alsip is BINARY
	incident_city_antioch is BINARY
	incident_city_arlington heights is BINARY
	incident_city_arlington hts is BINARY
	incident_city_aurora is BINARY
	incident_city_barrington is BINARY
	incident_city_barrington hills is BINARY
	incident_city_bartlett is BINARY
	incident_city_bedford park is BINARY
	incident_city_beecher is BINARY
	incident_city_bellwood is BINARY
	incident_city_bensenville is BINARY
	incident_city_berkeley is BINARY
	incident_city_berwyn is BINARY
	incident_city_bloomingdale is BINARY
	incident_city_bloomington is BINARY
	incident_city_blue island is BINARY
	incident_city_bluff springs is BINARY
	incident_city_bolingbrook is BINARY
	incident_city_boulder creek is BINARY
	incident_city_bourbonnais i

	incident_city_schaumburg is BINARY
	incident_city_schererville is BINARY
	incident_city_schiller park is BINARY
	incident_city_sheridan is BINARY
	incident_city_skokie is BINARY
	incident_city_south barrington is BINARY
	incident_city_south bend is BINARY
	incident_city_south chicago heights is BINARY
	incident_city_south holland is BINARY
	incident_city_spring grove is BINARY
	incident_city_springfield is BINARY
	incident_city_st charles is BINARY
	incident_city_steger is BINARY
	incident_city_steward is BINARY
	incident_city_stickney is BINARY
	incident_city_stone park is BINARY
	incident_city_streamwood is BINARY
	incident_city_summit is BINARY
	incident_city_summit argo is BINARY
	incident_city_thornton is BINARY
	incident_city_tinley park is BINARY
	incident_city_university park is BINARY
	incident_city_unknown is BINARY
	incident_city_vernon hills is BINARY
	incident_city_warren is BINARY
	incident_city_waukegan is BINARY
	incident_city_west chicago is BINARY
	incident_city_west

charge_offense_title_agg poss 21-30 stolen firearms  is BINARY
charge_offense_title_agg poss 6-10 stolen firearms  is BINARY
charge_offense_title_agg poss/2-5 converted firearm  is BINARY
	charge_offense_title_agg poss/2-5 stolen firearms is NOT binary
	charge_offense_title_agg reck drvg/child/crsng gd is NOT binary
charge_offense_title_agg reck drvg/incline/bod harm  is BINARY
	charge_offense_title_agg reck drvg/incline/child is NOT binary
	charge_offense_title_agg reckless drvg/bodily harm is NOT binary
charge_offense_title_agg unconscn agrmnt > $5k  is BINARY
charge_offense_title_agg use commu device/death  is BINARY
	charge_offense_title_agg uuw/loaded/no fcca/foid is NOT binary
	charge_offense_title_agg uuw/loaded/no fcca/foid/2+ is NOT binary
	charge_offense_title_agg uuw/unloaded/no fcca is NOT binary
	charge_offense_title_agg uuw/veh/fir loaded/no foid is NOT binary
	charge_offense_title_agg uuw/vehicle/loaded firearm is NOT binary
	charge_offense_title_aggravated arson is NOT 

charge_offense_title_attempt obstruct just/destroy evidence  is BINARY
charge_offense_title_attempt poss fraudulent id card  is BINARY
charge_offense_title_attempt possess lost credit/debit card  is BINARY
charge_offense_title_attempt possession of a controlled substance  is BINARY
	charge_offense_title_attempt possession of a stolen motor vehicle is NOT binary
charge_offense_title_attempt possession of cannabis  is BINARY
charge_offense_title_attempt possession of explosives  is BINARY
charge_offense_title_attempt possession of look-alike substance with intent to deliver/delivery of look-alike substance  is BINARY
	charge_offense_title_attempt predatory criminal sexual assault is NOT binary
	charge_offense_title_attempt predatory criminal sexual assault of a victim less than 13 years of age is NOT binary
	charge_offense_title_attempt residential arson is NOT binary
	charge_offense_title_attempt residential burglary is NOT binary
charge_offense_title_attempt retail theft  is BINARY
	ch

charge_offense_title_conspiracy possession of unstamped cigarettes  is BINARY
	charge_offense_title_conspiracy to commit aggravated battery is NOT binary
charge_offense_title_conspiracy to commit armed robbery  is BINARY
charge_offense_title_conspiracy to commit arson  is BINARY
charge_offense_title_conspiracy to commit first degree murder  is BINARY
charge_offense_title_conspiracy to commit insurance fraud  is BINARY
charge_offense_title_conspiracy to commit robbery  is BINARY
charge_offense_title_conspiracy to commit theft  is BINARY
charge_offense_title_conspiracy to committ battery  is BINARY
	charge_offense_title_conspiracy to committ first degree murder is NOT binary
charge_offense_title_conspiracy to committ terrorism  is BINARY
	charge_offense_title_conspiracy(aggravated battery) is NOT binary
	charge_offense_title_cont financial crimes entrprs is NOT binary
	charge_offense_title_continuing fin crime entrprs is NOT binary
	charge_offense_title_contrib crim delinq minor/cl 3 is 

	charge_offense_title_elec hrsmt/obscene proposal is NOT binary
charge_offense_title_elec hrsmt/obscene/3+  is BINARY
charge_offense_title_elec hrsmt/obscene/forc fel  is BINARY
charge_offense_title_elec hrsmt/obscene/same vic  is BINARY
charge_offense_title_elec hrsmt/threat pers/2+  is BINARY
charge_offense_title_elec hrsmt/threat pers/propb  is BINARY
charge_offense_title_electronic fencing/>$300  is BINARY
charge_offense_title_emp bring alcohol/penal inst  is BINARY
charge_offense_title_emp bring cannabis/penal inst  is BINARY
	charge_offense_title_emp bring con sub/penal inst is NOT binary
charge_offense_title_emp bring fir/expl penal inst  is BINARY
	charge_offense_title_emp bring weapon/tl penal inst is NOT binary
	charge_offense_title_emp del alcohol/penal inst is NOT binary
	charge_offense_title_emp del cannabis/penal inst is NOT binary
charge_offense_title_emp poss alcohol/penal inst  is BINARY
charge_offense_title_emp poss con sub/penal inst  is BINARY
charge_offense_title_e

	charge_offense_title_id theft/access personal info is NOT binary
	charge_offense_title_id theft/know id stol/mlty vic is NOT binary
	charge_offense_title_id theft/know stolen/2nd+ is NOT binary
	charge_offense_title_id theft/known stolen id/2+ is NOT binary
charge_offense_title_id theft/ob info/fel/mlty vic  is BINARY
	charge_offense_title_id theft/obtain id info/fel/2+ is NOT binary
charge_offense_title_id theft/poss rfid/3+ indiv  is BINARY
	charge_offense_title_id theft/use id info/felony/2+ is NOT binary
	charge_offense_title_id theft/use id/3 indiv 12 mos is NOT binary
	charge_offense_title_id theft/use id/commit felony is NOT binary
	charge_offense_title_id theft/use id/fel/3+ indiv is NOT binary
	charge_offense_title_id theft/use id/fel/mlty vic is NOT binary
charge_offense_title_id thft/access info/3+ indiv  is BINARY
	charge_offense_title_id thft/aid/abet another/2nd+ is NOT binary
	charge_offense_title_id thft/gain access pers info is NOT binary
	charge_offense_title_id thft

charge_offense_title_person >18 compel org mbr <18  is BINARY
	charge_offense_title_personation pub off/comm fel is NOT binary
	charge_offense_title_phone harassment/forcible fel is NOT binary
charge_offense_title_phone harassment/repeat calls  is BINARY
charge_offense_title_phone hrsmt/lewd pri forc fel  is BINARY
charge_offense_title_phone hrsmt/lewd/no cnt  is BINARY
charge_offense_title_phone hrsmt/lewd/same vic  is BINARY
	charge_offense_title_phone hrsmt/lewd/thrt kill is NOT binary
	charge_offense_title_phone hrsmt/make ring/no cnt is NOT binary
charge_offense_title_phone hrsmt/make ring/pri fel  is BINARY
charge_offense_title_phone hrsmt/no conversation  is BINARY
charge_offense_title_phone hrsmt/repeat/no cnt  is BINARY
charge_offense_title_phone hrsmt/repeat/same vic  is BINARY
charge_offense_title_phone hrsmt/repeat/thrt kill  is BINARY
charge_offense_title_phone hrsmt/ring/same vic  is BINARY
charge_offense_title_phone hrsmt/ring/thrt kill  is BINARY
charge_offense_title_pi

	charge_offense_title_ptrn minor pro/intel disabled is NOT binary
charge_offense_title_pub indecency/exposure/school  is BINARY
	charge_offense_title_pub indecency/sex conduct 3+ is NOT binary
charge_offense_title_pub indecency/sex/sch grounds  is BINARY
	charge_offense_title_pub official/emp tamper w/ rec is NOT binary
	charge_offense_title_public aid wire fraud is NOT binary
charge_offense_title_public indecency  is BINARY
	charge_offense_title_public indecency/exposure/3+ is NOT binary
charge_offense_title_public indecency/lewd exposure  is BINARY
charge_offense_title_public indecency/sex conduct  is BINARY
	charge_offense_title_purch 1 firearm/false info is NOT binary
	charge_offense_title_purch 1 firearm/intent is NOT binary
charge_offense_title_purch 11<20 firearm/false info  is BINARY
	charge_offense_title_purch 2-5 firearms/false info is NOT binary
charge_offense_title_purch 2-5 firearms/intent  is BINARY
charge_offense_title_purch 2<5 firearms/false info  is BINARY
charge_offe

	charge_offense_title_unlawful sale firearm to felon is NOT binary
	charge_offense_title_unlawful sale of firearm/felon is NOT binary
	charge_offense_title_unlawful use id card/theft is NOT binary
	charge_offense_title_unlawful use of a reencoder to defraud is NOT binary
	charge_offense_title_unlawful use of a scanning device is NOT binary
	charge_offense_title_unlawful use of a weapon is NOT binary
	charge_offense_title_unlawful use of recorded sounds or images is NOT binary
	charge_offense_title_unlawful use of unidentified sound or audio visual recordings is NOT binary
	charge_offense_title_unlawful use or possession of a weapon by a felon is NOT binary
charge_offense_title_unlawful use or possession of a weapon by a person in the custody of a facility of the department of corrections  is BINARY
	charge_offense_title_unlawful use vehicle id is NOT binary
	charge_offense_title_unlawful video/residence is NOT binary
	charge_offense_title_unlawful videotaping/victim<18 is NOT binary
ch

	section_11-1.50(a)(1) is NOT binary
	section_11-1.50(a)(2) is NOT binary
	section_11-1.60(a)(1) is NOT binary
	section_11-1.60(a)(2) is NOT binary
	section_11-1.60(a)(3) is NOT binary
	section_11-1.60(a)(4) is NOT binary
	section_11-1.60(a)(5) is NOT binary
	section_11-1.60(a)(6) is NOT binary
	section_11-1.60(a)(7) is NOT binary
	section_11-1.60(b) is NOT binary
	section_11-1.60(c)(1)(i) is NOT binary
	section_11-1.60(c)(1)(ii) is NOT binary
	section_11-1.60(c)(2)(i) is NOT binary
	section_11-1.60(c)(2)(ii) is NOT binary
	section_11-1.60(d) is NOT binary
	section_11-1.60(e) is NOT binary
	section_11-1.60(f) is NOT binary
section_11-1006(a)  is BINARY
	section_11-11 is NOT binary
section_11-1301.6(b)(2)  is BINARY
section_11-1303  is BINARY
	section_11-14(a) is NOT binary
section_11-14.1(a)  is BINARY
	section_11-14.3(a)(1) is NOT binary
section_11-14.3(a)(2)(a)  is BINARY
	section_11-14.3(a)(2)(b) is NOT binary
section_11-14.3(a)(2)(c)  is BINARY
	section_11-14.4(a)(1) is NOT binary


	section_12-4(b)(3) is NOT binary
section_12-4(b)(6)  is BINARY
	section_12-4(b)(7) is NOT binary
	section_12-4(b)(8) is NOT binary
	section_12-4(b)(9) is NOT binary
	section_12-4(c) is NOT binary
	section_12-4(d-6) is NOT binary
	section_12-4.1 is NOT binary
	section_12-4.2(a)(1) is NOT binary
	section_12-4.3(a) is NOT binary
	section_12-4.3(a-5) is NOT binary
	section_12-4.4a(a)(1) is NOT binary
	section_12-4.4a(a)(2) is NOT binary
	section_12-4.4a(b)(1) is NOT binary
	section_12-4.6(a) is NOT binary
section_12-5(a)  is BINARY
	section_12-5(a)(2) is NOT binary
	section_12-5(a-5) is NOT binary
section_12-5.01(a)(1)  is BINARY
section_12-5.01(a)(2)  is BINARY
	section_12-6(a)(1) is NOT binary
	section_12-6(a)(2) is NOT binary
	section_12-6(a)(3) is NOT binary
	section_12-6(a)(4) is NOT binary
	section_12-6(a)(5) is NOT binary
	section_12-6(a)(6) is NOT binary
	section_12-6.2(a)(1) is NOT binary
	section_12-6.2(a)(3) is NOT binary
	section_12-6.5 is NOT binary
section_12-603.1  is BINAR

	section_17-9(a)(i) is NOT binary
section_17-9(b)(i)  is BINARY
section_17.01(a)  is BINARY
section_17.02(a)  is BINARY
section_18(d)(4)  is BINARY
section_18(d)(7)  is BINARY
section_18(d)(8)  is BINARY
	section_18-1(a) is NOT binary
	section_18-1(b)(1) is NOT binary
section_18-2(a)  is BINARY
	section_18-2(a)(1) is NOT binary
	section_18-2(a)(2) is NOT binary
	section_18-2(a)(3) is NOT binary
	section_18-2(a)(4) is NOT binary
	section_18-3(a) is NOT binary
	section_18-4(a)(1) is NOT binary
	section_18-4(a)(2) is NOT binary
	section_18-4(a)(3) is NOT binary
	section_18-4(a)(4) is NOT binary
	section_18-4(a)(5) is NOT binary
	section_18-4(a)(6) is NOT binary
	section_18-5(a) is NOT binary
	section_18-6(a) is NOT binary
section_18b-103  is BINARY
section_18b-103,392.mcs  is BINARY
	section_18b-103,395.mcs is NOT binary
section_18c-7502(a)(i)  is BINARY
section_18c-7502(a)(ii)  is BINARY
section_18c-7502(a)(iii)  is BINARY
section_18c-7502(a)(iv)  is BINARY
	section_19-1(a) is NOT binary

	section_4-103(a)(5) is NOT binary
section_4-103(a)(6)  is BINARY
	section_4-103.1 is NOT binary
section_4-103.2(8)  is BINARY
	section_4-103.2(a)(1) is NOT binary
section_4-103.2(a)(2)  is BINARY
	section_4-103.2(a)(3) is NOT binary
	section_4-103.2(a)(5) is NOT binary
section_4-103.2(a)(6)  is BINARY
	section_4-103.2(a)(7)(a) is NOT binary
	section_4-103.2(a)(7)(b) is NOT binary
section_4-103.3  is BINARY
	section_4-104(a)(1) is NOT binary
	section_4-104(a)(2) is NOT binary
	section_4-104(a)(3) is NOT binary
section_4-104(a)(4)  is BINARY
section_4-104(a)(5)  is BINARY
	section_4-105(a)(1) is NOT binary
section_4-105(a)(2)  is BINARY
	section_4-105(a)(3) is NOT binary
	section_4-105(a)(4) is NOT binary
	section_4-105(a)(5) is NOT binary
section_4-105(a)(6)  is BINARY
	section_4.04 is NOT binary
	section_401(a)(1)(a) is NOT binary
	section_401(a)(1)(b) is NOT binary
	section_401(a)(1)(c) is NOT binary
	section_401(a)(1)(d) is NOT binary
	section_401(a)(1.5)(a) is NOT binary
	section_4

	section_8-4(11-1.20(a)(1)) is NOT binary
	section_8-4(11-1.20(a)(2)) is NOT binary
	section_8-4(11-1.20(a)(3)) is NOT binary
section_8-4(11-1.20(a)(4))  is BINARY
	section_8-4(11-1.30(a)(1)) is NOT binary
	section_8-4(11-1.30(a)(2)) is NOT binary
	section_8-4(11-1.30(a)(3)) is NOT binary
	section_8-4(11-1.30(a)(4)) is NOT binary
section_8-4(11-1.30(a)(5))  is BINARY
	section_8-4(11-1.30(a)(6)) is NOT binary
section_8-4(11-1.30(a)(8))  is BINARY
	section_8-4(11-1.60(a)(1)) is NOT binary
section_8-4(11-1.60(a)(2))  is BINARY
	section_8-4(11-1.60(a)(6)) is NOT binary
	section_8-4(11-1.60(c)(1)(i)) is NOT binary
section_8-4(11-24(b)(3))  is BINARY
section_8-4(12-11(a)(1))  is BINARY
section_8-4(12-11.1)  is BINARY
	section_8-4(12-13(a)(2)) is NOT binary
section_8-4(12-13(a)4)  is BINARY
	section_8-4(12-14.1(a)(1)) is NOT binary
section_8-4(12-16(c)(1)(i))  is BINARY
section_8-4(12-16(d))  is BINARY
section_8-4(12-3.05(a)(2))  is BINARY
	section_8-4(12-3.05(a)(5)) is NOT binary
	section_8-

	aoic_0011390 is NOT binary
	aoic_0011391 is NOT binary
aoic_0011394  is BINARY
aoic_0011403  is BINARY
aoic_0011404  is BINARY
	aoic_0011405 is NOT binary
	aoic_0011406 is NOT binary
	aoic_0011407 is NOT binary
	aoic_0011408 is NOT binary
	aoic_0011409 is NOT binary
	aoic_0011410 is NOT binary
	aoic_0011412 is NOT binary
aoic_0011414  is BINARY
	aoic_0011416 is NOT binary
	aoic_0011417 is NOT binary
	aoic_0011421 is NOT binary
	aoic_0011422 is NOT binary
	aoic_0011423 is NOT binary
	aoic_0011436 is NOT binary
aoic_0011437  is BINARY
	aoic_0011438 is NOT binary
	aoic_0011439 is NOT binary
	aoic_0011440 is NOT binary
	aoic_0011441 is NOT binary
	aoic_0011442 is NOT binary
aoic_0011443  is BINARY
aoic_0011456  is BINARY
	aoic_0011462 is NOT binary
	aoic_0011463 is NOT binary
aoic_0011464  is BINARY
aoic_0011470  is BINARY
	aoic_0011472 is NOT binary
aoic_0011473  is BINARY
aoic_0011474  is BINARY
aoic_0011475  is BINARY
	aoic_0011476 is NOT binary
aoic_0011481  is BINARY
	aoic_0011489 is

	aoic_0013582 is NOT binary
aoic_0013596  is BINARY
aoic_0013598  is BINARY
	aoic_0013619 is NOT binary
	aoic_0013626 is NOT binary
aoic_0013637  is BINARY
	aoic_0013638 is NOT binary
	aoic_0013639 is NOT binary
aoic_0013640  is BINARY
aoic_0013664  is BINARY
aoic_0013665  is BINARY
	aoic_0013696 is NOT binary
	aoic_0013697 is NOT binary
	aoic_0013698 is NOT binary
	aoic_0013699 is NOT binary
	aoic_0013700 is NOT binary
	aoic_0013701 is NOT binary
	aoic_0013702 is NOT binary
	aoic_0013703 is NOT binary
	aoic_0013704 is NOT binary
	aoic_0013705 is NOT binary
	aoic_0013706 is NOT binary
	aoic_0013707 is NOT binary
	aoic_0013709 is NOT binary
	aoic_0013710 is NOT binary
aoic_0013711  is BINARY
	aoic_0013712 is NOT binary
	aoic_0013723 is NOT binary
aoic_0013728  is BINARY
	aoic_0013729 is NOT binary
	aoic_0013730 is NOT binary
	aoic_0013731 is NOT binary
	aoic_0013732 is NOT binary
aoic_0013734  is BINARY
	aoic_0013735 is NOT binary
	aoic_0013736 is NOT binary
aoic_0013738  is BINARY
	aoi

aoic_0014990  is BINARY
aoic_0014991  is BINARY
	aoic_0014992 is NOT binary
aoic_0014993  is BINARY
aoic_0014994  is BINARY
	aoic_0014995 is NOT binary
aoic_0014996  is BINARY
	aoic_0014997 is NOT binary
aoic_0014998  is BINARY
aoic_0014999  is BINARY
aoic_0015001  is BINARY
aoic_0015002  is BINARY
aoic_0015011  is BINARY
aoic_0015013  is BINARY
aoic_0015017  is BINARY
aoic_0015029  is BINARY
aoic_0015035  is BINARY
aoic_0015047  is BINARY
	aoic_0015051 is NOT binary
aoic_0015059  is BINARY
	aoic_0015061 is NOT binary
	aoic_0015062 is NOT binary
	aoic_0015079 is NOT binary
	aoic_0015080 is NOT binary
	aoic_0015081 is NOT binary
	aoic_0015082 is NOT binary
aoic_0015087  is BINARY
aoic_0015088  is BINARY
aoic_0015089  is BINARY
	aoic_0015091 is NOT binary
aoic_0015092  is BINARY
aoic_0015094  is BINARY
	aoic_0015103 is NOT binary
	aoic_0015106 is NOT binary
aoic_0015107  is BINARY
	aoic_0015109 is NOT binary
	aoic_0015110 is NOT binary
	aoic_0015111 is NOT binary
aoic_0015112  is BINARY


	aoic_0016106 is NOT binary
	aoic_0016107 is NOT binary
	aoic_0016108 is NOT binary
	aoic_0016109 is NOT binary
	aoic_0016110 is NOT binary
	aoic_0016111 is NOT binary
	aoic_0016112 is NOT binary
	aoic_0016113 is NOT binary
	aoic_0016114 is NOT binary
	aoic_0016115 is NOT binary
	aoic_0016118 is NOT binary
	aoic_0016122 is NOT binary
	aoic_0016123 is NOT binary
aoic_0016124  is BINARY
aoic_0016125  is BINARY
	aoic_0016126 is NOT binary
	aoic_0016127 is NOT binary
	aoic_0016128 is NOT binary
	aoic_0016129 is NOT binary
	aoic_0016131 is NOT binary
aoic_0016132  is BINARY
aoic_0016133  is BINARY
aoic_0016140  is BINARY
aoic_0016141  is BINARY
	aoic_0016142 is NOT binary
	aoic_0016143 is NOT binary
	aoic_0016144 is NOT binary
	aoic_0016146 is NOT binary
	aoic_0016147 is NOT binary
	aoic_0016152 is NOT binary
	aoic_0016153 is NOT binary
	aoic_0016154 is NOT binary
	aoic_0016155 is NOT binary
aoic_0016156  is BINARY
	aoic_0016168 is NOT binary
	aoic_0016169 is NOT binary
	aoic_0016170 is NOT

aoic_0017230  is BINARY
aoic_0017231  is BINARY
aoic_0017234  is BINARY
	aoic_0017240 is NOT binary
	aoic_0017241 is NOT binary
	aoic_0017242 is NOT binary
aoic_0017246  is BINARY
	aoic_0017252 is NOT binary
aoic_0017253  is BINARY
	aoic_0017254 is NOT binary
	aoic_0017276 is NOT binary
	aoic_0017277 is NOT binary
	aoic_0017278 is NOT binary
	aoic_0017279 is NOT binary
	aoic_0017280 is NOT binary
aoic_0017281  is BINARY
	aoic_0017282 is NOT binary
	aoic_0017283 is NOT binary
	aoic_0017284 is NOT binary
	aoic_0017285 is NOT binary
	aoic_0017286 is NOT binary
	aoic_0017287 is NOT binary
	aoic_0017294 is NOT binary
aoic_0017295  is BINARY
aoic_0017301  is BINARY
	aoic_0017303 is NOT binary
	aoic_0017304 is NOT binary
	aoic_0017305 is NOT binary
	aoic_0017313 is NOT binary
	aoic_0017314 is NOT binary
	aoic_0017316 is NOT binary
	aoic_0017318 is NOT binary
aoic_0017320  is BINARY
aoic_0017331  is BINARY
aoic_0017332  is BINARY
aoic_0017333  is BINARY
aoic_0017342  is BINARY
aoic_0017344  is

	aoic_1080200 is NOT binary
aoic_1095100  is BINARY
	aoic_1095530 is NOT binary
	aoic_1100000 is NOT binary
	aoic_1100100 is NOT binary
aoic_1105000  is BINARY
	aoic_1110000 is NOT binary
	aoic_1115000 is NOT binary
	aoic_1120000 is NOT binary
	aoic_1120600 is NOT binary
	aoic_1125000 is NOT binary
aoic_1125100  is BINARY
	aoic_1130000 is NOT binary
	aoic_1130100 is NOT binary
aoic_1140000  is BINARY
	aoic_1140100 is NOT binary
	aoic_1140125 is NOT binary
aoic_1140150  is BINARY
	aoic_1140225 is NOT binary
aoic_1140310  is BINARY
aoic_1140425  is BINARY
	aoic_1140700 is NOT binary
aoic_1155000  is BINARY
aoic_1170000  is BINARY
aoic_1175000  is BINARY
aoic_1195000  is BINARY
aoic_1195100  is BINARY
aoic_1210000  is BINARY
aoic_1210400  is BINARY
aoic_1210600  is BINARY
	aoic_1211100 is NOT binary
	aoic_1211500 is NOT binary
aoic_1211700  is BINARY
	aoic_1214500 is NOT binary
aoic_1214600  is BINARY
	aoic_1214825 is NOT binary
	aoic_1214850 is NOT binary
	aoic_1214875 is NOT binary
	aoi

In [82]:
# view columns that are unexpectedly binary
for x in bin_errors:
    print(x)

age_over_100
age_unknown
402
charge_offense_title_15<200 object/parts lsd/analog
charge_offense_title_5<15 grams meth/analog
charge_offense_title_[aggravated methamphetamine possession with intent to deliver/aggravated methamphetamine delivery]
charge_offense_title_[attempt possession of cannabis with intent to deliver/attempt delivery of cannabis]
charge_offense_title_adltrat sub intent defrd test
charge_offense_title_agg aslt pc off/fireman weapon
charge_offense_title_agg assault/peace off/fireman
charge_offense_title_agg assault/transit employee
charge_offense_title_agg battery/controlled sub
charge_offense_title_agg battery/process server
charge_offense_title_agg dui/3rd+
charge_offense_title_agg false personation police
charge_offense_title_agg flee/concl/alt regis plt
charge_offense_title_agg flee/concl/alt regis plt/2
charge_offense_title_agg fls personatn/forc fel
charge_offense_title_agg frd/2+ cntrt/misrep/disabl
charge_offense_title_agg hm frd/2+ cntrt/misrep/60+
charge_offe

aoic_0016922
aoic_0016924
aoic_0016925
aoic_0016927
aoic_0016970
aoic_0016975
aoic_0016977
aoic_0016982
aoic_0016988
aoic_0016996
aoic_0016997
aoic_0017004
aoic_0017007
aoic_0017013
aoic_0017014
aoic_0017015
aoic_0017020
aoic_0017022
aoic_0017035
aoic_0017037
aoic_0017038
aoic_0017040
aoic_0017043
aoic_0017047
aoic_0017048
aoic_0017051
aoic_0017053
aoic_0017057
aoic_0017064
aoic_0017066
aoic_0017068
aoic_0017070
aoic_0017071
aoic_0017072
aoic_0017074
aoic_0017078
aoic_0017079
aoic_0017084
aoic_0017085
aoic_0017086
aoic_0017087
aoic_0017089
aoic_0017090
aoic_0017091
aoic_0017096
aoic_0017097
aoic_0017101
aoic_0017102
aoic_0017104
aoic_0017120
aoic_0017124
aoic_0017125
aoic_0017126
aoic_0017194
aoic_0017203
aoic_0017204
aoic_0017205
aoic_0017212
aoic_0017214
aoic_0017217
aoic_0017222
aoic_0017223
aoic_0017230
aoic_0017231
aoic_0017234
aoic_0017246
aoic_0017253
aoic_0017281
aoic_0017295
aoic_0017301
aoic_0017320
aoic_0017331
aoic_0017332
aoic_0017333
aoic_0017342
aoic_0017344
aoic_0017348

In [81]:
# view columns that are unexpectedly non-binary
nonbin_errors

[]

Some values of charge_offense_title, chapter, act, section, class, and aoic end up being binary. But because other values from these columns are non-binary, this indicates that these column categories should be treated as non-binary, and should not be one hot de-coded. 

age_over_100, age_unknown, 402, MHI, and weekday are all flags, so they cannot be one-hot decoded. 

season should be one-hot decoded. 

In [83]:
# add 'season' to list of columns to be one-hot decoded
decode_cols.append('season')

In [84]:
decode_cols

['offense_category',
 'event',
 'gender',
 'race',
 'law_enforcement_agency',
 'unit',
 'incident_city',
 'updated_offense_category',
 'season']

In [137]:
#one-hot decode

#create copy of total_df to hold one-hot decoded dataset
decoded_df = total_df.copy()

#loop through list of features to be one-hot decoded
for catcol in decode_cols:
    
    #initialize list of dummy features derived from single categorical feature
    dummy_list = []
    
    #loop through all columns in total_df
    for dumbcol in total_df.columns:
        
        if catcol+'_' in dumbcol:
            
            #append dummy feature to list
            dummy_list.append(dumbcol)
            
    #create new column which holds actual string values
    decoded_df[catcol] = decoded_df[dummy_list].idxmax(axis=1)
    
    #trim column name and '_' from beginning of value
    decoded_df[catcol] = decoded_df[catcol].map(lambda x: x.replace(catcol+'_', ''))
    
    #drop dummy features
    decoded_df.drop(columns=dummy_list, inplace=True)

KeyError: "None of [Index(['updated_offense_category_aggravated assault police officer',\n       'updated_offense_category_aggravated battery',\n       'updated_offense_category_aggravated battery police officer',\n       'updated_offense_category_aggravated battery with a firearm',\n       'updated_offense_category_aggravated discharge firearm',\n       'updated_offense_category_aggravated dui',\n       'updated_offense_category_aggravated fleeing and eluding',\n       'updated_offense_category_aggravated identity theft',\n       'updated_offense_category_aggravated robbery',\n       'updated_offense_category_armed robbery',\n       'updated_offense_category_armed violence',\n       'updated_offense_category_arson',\n       'updated_offense_category_arson and attempt arson',\n       'updated_offense_category_attempt armed robbery',\n       'updated_offense_category_attempt arson',\n       'updated_offense_category_attempt homicide',\n       'updated_offense_category_attempt sex crimes',\n       'updated_offense_category_attempt vehicular hijacking',\n       'updated_offense_category_battery',\n       'updated_offense_category_bomb threat',\n       'updated_offense_category_bribery', 'updated_offense_category_burglary',\n       'updated_offense_category_child abduction',\n       'updated_offense_category_child pornography',\n       'updated_offense_category_communicating with witness',\n       'updated_offense_category_credit card cases',\n       'updated_offense_category_criminal damage to property',\n       'updated_offense_category_criminal trespass to residence',\n       'updated_offense_category_deceptive practice',\n       'updated_offense_category_disarming police officer',\n       'updated_offense_category_dog fighting',\n       'updated_offense_category_domestic battery',\n       'updated_offense_category_driving with suspended or revoked license',\n       'updated_offense_category_dui',\n       'updated_offense_category_escape - failure to return',\n       'updated_offense_category_failure to register as a sex offender',\n       'updated_offense_category_forgery', 'updated_offense_category_fraud',\n       'updated_offense_category_fraudulent id',\n       'updated_offense_category_gambling',\n       'updated_offense_category_gun - non uuw',\n       'updated_offense_category_gun running',\n       'updated_offense_category_hate crimes',\n       'updated_offense_category_home invasion',\n       'updated_offense_category_homicide',\n       'updated_offense_category_human trafficking',\n       'updated_offense_category_identity theft',\n       'updated_offense_category_impersonating police officer',\n       'updated_offense_category_intimidation',\n       'updated_offense_category_kidnapping',\n       'updated_offense_category_major accidents',\n       'updated_offense_category_narcotics',\n       'updated_offense_category_obstructing justice',\n       'updated_offense_category_official misconduct',\n       'updated_offense_category_other offense',\n       'updated_offense_category_pandering',\n       'updated_offense_category_perjury',\n       'updated_offense_category_police shooting',\n       'updated_offense_category_possession of burglary tools',\n       'updated_offense_category_possession of contraband in penal institution',\n       'updated_offense_category_possession of explosives',\n       'updated_offense_category_possession of stolen motor vehicle',\n       'updated_offense_category_prostitution',\n       'updated_offense_category_reckless discharge of firearm',\n       'updated_offense_category_reckless homicide',\n       'updated_offense_category_residential burglary',\n       'updated_offense_category_retail theft',\n       'updated_offense_category_robbery',\n       'updated_offense_category_sex crimes',\n       'updated_offense_category_stalking',\n       'updated_offense_category_tampering', 'updated_offense_category_theft',\n       'updated_offense_category_theft by deception',\n       'updated_offense_category_unlawful restraint',\n       'updated_offense_category_uuw - unlawful use of weapon',\n       'updated_offense_category_vehicular hijacking',\n       'updated_offense_category_vehicular invasion',\n       'updated_offense_category_violate bail bond',\n       'updated_offense_category_violation of sex offender registration',\n       'updated_offense_category_violation order of protection'],\n      dtype='object')] are in the [columns]"

In [136]:
decoded_df

Unnamed: 0_level_0,event_direct indictment,event_grand jury,event_indictment,event_preliminary hearing,event_re-indictment,event_unknown,age_at_incident,gender_female,gender_male,gender_unknown,...,received_date,weekday,season_fall,season_spring,season_summer,season_winter,incident_length,latitude,longitude,offense_category
case_participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260122253823,0,0,0,1,0,0,22.0,0,1,0,...,2011-05-24,0.0,0,1,0,0,0,0.000000,0.000000,promis conversion
272161011760,0,0,0,0,0,1,29.0,0,1,0,...,2012-01-27,1.0,0,0,1,0,0,0.000000,0.000000,promis conversion
864286527653,0,0,1,0,0,0,29.0,0,1,0,...,2011-01-31,1.0,0,1,0,0,0,0.000000,0.000000,promis conversion
882206007016,0,0,0,1,0,0,34.0,0,1,0,...,2011-01-31,1.0,1,0,0,0,0,41.875562,-87.624421,uuw - unlawful use of weapon
882242005211,0,0,1,0,0,0,41.0,0,1,0,...,2011-07-17,0.0,0,0,1,0,0,41.875562,-87.624421,homicide
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1101083006411,0,0,0,0,0,1,62.0,0,1,0,...,2019-11-21,1.0,1,0,0,0,0,41.875562,-87.624421,narcotics
1101084648023,0,0,0,0,0,1,31.0,0,1,0,...,2019-11-21,1.0,1,0,0,0,0,41.875562,-87.624421,narcotics
1101121467024,0,0,0,0,0,1,18.0,0,1,0,...,2019-11-22,1.0,1,0,0,0,0,41.875562,-87.624421,narcotics
1101123108635,0,0,0,0,0,1,27.0,0,1,0,...,2019-11-22,1.0,1,0,0,0,0,41.875562,-87.624421,narcotics


In [121]:
len(total_df.columns)

5618

In [122]:
len(decoded_df.columns)

5616