# Tree-based Classifier

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Merge additional features (geolocations) to the dataset

In [3]:
import requests, json
r = requests.get('https://data.cityofchicago.org/resource/crimes.json?$limit=500000')
data = r.json()
lst = []
for d in data:
    if ('longitude' in d) and ('latitude' in d):
        lst.append(d['case_number'])
        
print(len(lst))

491377


In [4]:
original_data = pd.read_csv('Arrests_cleaned.csv')

set_original_data = set(original_data["CASE NUMBER"])
num_in_new_data = 0

new_data = set(lst)

for case in set_original_data:
    if case in new_data:
        num_in_new_data +=1

print("There are ",num_in_new_data, "entries that overlap between datasets")

There are  51087 entries that overlap between datasets


In [5]:
location_data = pd.read_json(r.text)
location_data = location_data[["latitude","longitude","case_number"]]
location_data = location_data.dropna()
combined_data = pd.merge(original_data,location_data,'inner',left_on='CASE NUMBER',right_on='case_number')
combined_data

Unnamed: 0,ACB_NO,CASE NUMBER,ARREST MONTH,ARREST DATE,ARREST YEAR,ARREST TIME,ARREST MERIDIEM,RACE,CHARGE 1 STATUTE,CHARGE 1 DESCRIPTION,...,CHARGE 4 DESCRIPTION,CHARGE 4 TYPE,CHARGE 4 CLASS,CHARGES STATUTE,CHARGES DESCRIPTION,CHARGES TYPE,CHARGES CLASS,latitude,longitude,case_number
0,30039812,JE183770,3,30,2021,12:10:00,AM,WHITE HISPANIC,720 ILCS 5.0/12-3.2-A-1,DOMESTIC BATTERY - BODILY HARM,...,,,,720 ILCS 5.0/12-3.2-A-1 | | |,DOMESTIC BATTERY - BODILY HARM | | |,M | | |,A | | |,41.795251,-87.709642,JE183770
1,30063075,JE276251,6,23,2021,12:04:00,AM,BLACK,720 ILCS 5.0/12-3.2-A-2,DOMESTIC BATTERY - PHYSICAL CONTACT,...,,,,720 ILCS 5.0/12-3.2-A-2 | 720 ILCS 5.0/21-1-A-...,DOMESTIC BATTERY - PHYSICAL CONTACT | CRIM DAM...,M | M | |,A | A | |,41.850724,-87.732308,JE276251
2,30069116,JE252804,7,16,2021,09:36:00,PM,BLACK,720 ILCS 5.0/12-3.3-A,DOMESTIC BATTERY - AGGRAVATED,...,,,,720 ILCS 5.0/12-3.3-A | | |,DOMESTIC BATTERY - AGGRAVATED | | |,F | | |,2 | | |,41.751059,-87.561374,JE252804
3,30069020,JE303350,7,16,2021,12:19:00,PM,BLACK,720 ILCS 5.0/24-1.6-A-1,AGG UUW/VEH/FIR LOADED/NO FOID,...,DRIVER'S LICENSE/PERMIT - FAIL TO CARRY/DISPLAY,,P,720 ILCS 5.0/24-1.6-A-1 | 625 ILCS 5.0/11-708-...,AGG UUW/VEH/FIR LOADED/NO FOID | WRONG WAY ON ...,F | | |,4 | P | P | P,41.873054,-87.720349,JE303350
4,30068977,JE303006,7,16,2021,04:24:00,AM,BLACK HISPANIC,720 ILCS 5.0/12-1-A,ASSAULT - SIMPLE,...,,,,720 ILCS 5.0/12-1-A | | |,ASSAULT - SIMPLE | | |,M | | |,C | | |,41.777456,-87.755624,JE303006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55167,30183808,JF425991,10,7,2022,06:20:00,PM,BLACK HISPANIC,720 ILCS 5.0/24-1.6-A-2,AGG UUW/PERS/UNLOADED/NO FCCA,...,,,,720 ILCS 5.0/24-1.6-A-2 | 9-76-220(A) | |,AGG UUW/PERS/UNLOADED/NO FCCA | OBSTRUCTION DR...,F | | |,4 | L | |,41.842187,-87.708629,JF425991
55168,30183795,JF425939,10,7,2022,05:40:00,PM,BLACK,720 ILCS 5.0/12-3.2-A-1,DOMESTIC BATTERY/BODILY HARM,...,,,,720 ILCS 5.0/12-3.2-A-1 | | |,DOMESTIC BATTERY/BODILY HARM | | |,M | | |,A | | |,41.708090,-87.659191,JF425939
55169,30183860,JF426166,10,7,2022,09:35:00,PM,BLACK,720 ILCS 5.0/24-1.6-A-1,AGG UUW/VEHICLE/LOADED/NO FCCA,...,HEADLIGHT TWO REQUIRED-MOTOR VEHICLE,,L,720 ILCS 5.0/24-1.6-A-1 | 625 ILCS 5.0/6-303-A...,AGG UUW/VEHICLE/LOADED/NO FCCA | DRIVING ON SU...,F | M | M |,4 | A | A | L,41.707458,-87.605509,JF426166
55170,30183842,JF426400,10,7,2022,07:16:00,PM,WHITE HISPANIC,720 ILCS 5.0/24-1.6-A-1,AGG UUW/VEHICLE/NO FOID,...,DL EXPIRED MORE THAN A YEAR,M,B,720 ILCS 5.0/24-1.6-A-1 | 720 ILCS 570.0/402-C...,AGG UUW/VEHICLE/NO FOID | PCS - POSSESSION - P...,F | F | M | M,4 | 4 | A | B,41.765271,-87.579352,JF426400


In [6]:
df = combined_data
print(len(df))
df.columns

55172


Index(['ACB_NO', 'CASE NUMBER', 'ARREST MONTH', 'ARREST DATE', 'ARREST YEAR',
       'ARREST TIME', 'ARREST MERIDIEM', 'RACE', 'CHARGE 1 STATUTE',
       'CHARGE 1 DESCRIPTION', 'CHARGE 1 TYPE', 'CHARGE 1 CLASS',
       'CHARGE 2 STATUTE', 'CHARGE 2 DESCRIPTION', 'CHARGE 2 TYPE',
       'CHARGE 2 CLASS', 'CHARGE 3 STATUTE', 'CHARGE 3 DESCRIPTION',
       'CHARGE 3 TYPE', 'CHARGE 3 CLASS', 'CHARGE 4 STATUTE',
       'CHARGE 4 DESCRIPTION', 'CHARGE 4 TYPE', 'CHARGE 4 CLASS',
       'CHARGES STATUTE', 'CHARGES DESCRIPTION', 'CHARGES TYPE',
       'CHARGES CLASS', 'latitude', 'longitude', 'case_number'],
      dtype='object')

## Data Preprocessing

In [7]:
races = df.loc[:, "RACE"].unique()
c1_types = df.loc[:, "CHARGE 1 TYPE"].unique()
print("races: ", races)
print("charge 1 type: ", c1_types)

races:  ['WHITE HISPANIC' 'BLACK' 'BLACK HISPANIC' 'WHITE'
 'ASIAN / PACIFIC ISLANDER' 'AMER INDIAN / ALASKAN NATIVE'
 'UNKNOWN / REFUSED']
charge 1 type:  ['M' 'F' 'O' 'R' 'V']


In [8]:
race_dummies = pd.get_dummies(df, columns=["RACE"])
dummy_races = ['RACE_AMER INDIAN / ALASKAN NATIVE', \
               'RACE_ASIAN / PACIFIC ISLANDER', 'RACE_BLACK', 'RACE_BLACK HISPANIC', \
               'RACE_UNKNOWN / REFUSED', 'RACE_WHITE', 'RACE_WHITE HISPANIC']
race_dummies = race_dummies[dummy_races]
race_dummies

Unnamed: 0,RACE_AMER INDIAN / ALASKAN NATIVE,RACE_ASIAN / PACIFIC ISLANDER,RACE_BLACK,RACE_BLACK HISPANIC,RACE_UNKNOWN / REFUSED,RACE_WHITE,RACE_WHITE HISPANIC
0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
55167,0,0,0,1,0,0,0
55168,0,0,1,0,0,0,0
55169,0,0,1,0,0,0,0
55170,0,0,0,0,0,0,1


In [9]:
month_dummies = pd.get_dummies(df, columns=["ARREST MONTH"])
dummy_month = ['ARREST MONTH_1', 'ARREST MONTH_2', 'ARREST MONTH_3', 'ARREST MONTH_4', \
       'ARREST MONTH_5', 'ARREST MONTH_6', 'ARREST MONTH_7', 'ARREST MONTH_8', \
       'ARREST MONTH_9', 'ARREST MONTH_10', 'ARREST MONTH_11', 'ARREST MONTH_12']
month_dummies = month_dummies[dummy_month]
month_dummies

Unnamed: 0,ARREST MONTH_1,ARREST MONTH_2,ARREST MONTH_3,ARREST MONTH_4,ARREST MONTH_5,ARREST MONTH_6,ARREST MONTH_7,ARREST MONTH_8,ARREST MONTH_9,ARREST MONTH_10,ARREST MONTH_11,ARREST MONTH_12
0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
55167,0,0,0,0,0,0,0,0,0,1,0,0
55168,0,0,0,0,0,0,0,0,0,1,0,0
55169,0,0,0,0,0,0,0,0,0,1,0,0
55170,0,0,0,0,0,0,0,0,0,1,0,0


In [31]:
arrest_case_num = df.loc[:, "case_number"].str[2:].astype(int)
arrest_case_num

0        183770
1        276251
2        252804
3        303350
4        303006
          ...  
55167    425991
55168    425939
55169    426166
55170    426400
55171    426207
Name: case_number, Length: 55172, dtype: int64

In [10]:
arrest_ampm = df.loc[:, "ARREST MERIDIEM"].apply(lambda x: 1 if x == "am" else 0)
arrest_ampm

0        0
1        0
2        0
3        0
4        0
        ..
55167    0
55168    0
55169    0
55170    0
55171    0
Name: ARREST MERIDIEM, Length: 55172, dtype: int64

In [11]:
arrest_time = df.loc[:, "ARREST TIME"].str[:2].astype(int)
arrest_time

0        12
1        12
2         9
3        12
4         4
         ..
55167     6
55168     5
55169     9
55170     7
55171    10
Name: ARREST TIME, Length: 55172, dtype: int64

In [12]:
arrest_date = df.loc[:, "ARREST DATE"].astype(int)
arrest_date

0        30
1        23
2        16
3        16
4        16
         ..
55167     7
55168     7
55169     7
55170     7
55171     7
Name: ARREST DATE, Length: 55172, dtype: int64

In [13]:
arrest_long = df.loc[:, "longitude"]
arrest_long

0       -87.709642
1       -87.732308
2       -87.561374
3       -87.720349
4       -87.755624
           ...    
55167   -87.708629
55168   -87.659191
55169   -87.605509
55170   -87.579352
55171   -87.723880
Name: longitude, Length: 55172, dtype: float64

In [14]:
arrest_lat = df.loc[:, "latitude"]
arrest_lat

0        41.795251
1        41.850724
2        41.751059
3        41.873054
4        41.777456
           ...    
55167    41.842187
55168    41.708090
55169    41.707458
55170    41.765271
55171    41.879785
Name: latitude, Length: 55172, dtype: float64

### Binary Charge Types: Serious (Misdemeanor & Felony) vs Non-Serious

In [15]:
# binary charge types
# serious crime ('M' 'F' 'A' 'R' 'V') labelled as 1, 'O' labelled as 0
charge_types = df.loc[:, "CHARGE 1 TYPE"].apply(lambda x: 0 if x == "O" else 1)
charge_types.value_counts()

1    53359
0     1813
Name: CHARGE 1 TYPE, dtype: int64

### Multi-Class Charge Types: Misdemeanor, Felony & Others

In [17]:
# three charge types
m_charge_types = df.loc[:, "CHARGE 1 TYPE"].replace('A', 'M')
m_charge_types = m_charge_types.replace('R', 'O')
m_charge_types = m_charge_types.replace('V', 'O')
multi_class_types = pd.factorize(m_charge_types)
multi_class_types = multi_class_types[0]
m_charge_types.value_counts()

M    28563
F    24794
O     1815
Name: CHARGE 1 TYPE, dtype: int64

In [33]:
data = pd.concat([arrest_case_num, arrest_ampm, arrest_time, arrest_long, arrest_lat, month_dummies, race_dummies], axis=1)
data

Unnamed: 0,case_number,ARREST MERIDIEM,ARREST TIME,longitude,latitude,ARREST MONTH_1,ARREST MONTH_2,ARREST MONTH_3,ARREST MONTH_4,ARREST MONTH_5,...,ARREST MONTH_10,ARREST MONTH_11,ARREST MONTH_12,RACE_AMER INDIAN / ALASKAN NATIVE,RACE_ASIAN / PACIFIC ISLANDER,RACE_BLACK,RACE_BLACK HISPANIC,RACE_UNKNOWN / REFUSED,RACE_WHITE,RACE_WHITE HISPANIC
0,183770,0,12,-87.709642,41.795251,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,276251,0,12,-87.732308,41.850724,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,252804,0,9,-87.561374,41.751059,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,303350,0,12,-87.720349,41.873054,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,303006,0,4,-87.755624,41.777456,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55167,425991,0,6,-87.708629,41.842187,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
55168,425939,0,5,-87.659191,41.708090,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
55169,426166,0,9,-87.605509,41.707458,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
55170,426400,0,7,-87.579352,41.765271,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


### Train Testing Split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(data, charge_types, test_size = 0.2)
print("train-test-sizes: ", len(X_train), len(X_test))

train-test-sizes:  44137 11035


In [38]:
X_train, X_test, y_train_m, y_test_m = train_test_split(data, multi_class_types, test_size = 0.2)
print("train-test-sizes: ", len(X_train), len(X_test))

train-test-sizes:  44137 11035


## Random Forest

### Training (Binary Charge Types)

In [39]:
# creating a Random Forest classifier
clf = RandomForestClassifier(n_estimators = 100)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

RandomForestClassifier()

### Testing (Binary Charge Types)

In [40]:
# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.9651110104213865


### Training (Multi-Class Charge Types)

In [41]:
# creating a Random Forest classifier
clf_m = RandomForestClassifier(n_estimators = 100)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf_m.fit(X_train, y_train_m)

RandomForestClassifier()

### Testing (Multi-Class Charge Types)

In [42]:
# performing predictions on the test dataset
y_pred_m = clf_m.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test_m, y_pred_m))

ACCURACY OF THE MODEL:  0.6106932487539647


## XGBoost

### Binary Charge Types

In [43]:
# training
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

# testing
score = xgb_clf.score(X_test, y_test)
print("accuracy score: ", score)

accuracy score:  0.9678296329859538


### Multi-Class Charge Types

In [44]:
# training
xgb_clf_m = XGBClassifier(max_depth=9)
xgb_clf_m.fit(X_train, y_train_m)

# testing
score_m = xgb_clf_m.score(X_test, y_test_m)
print("accuracy score: ", score_m)

accuracy score:  0.6230176710466697


## Result Summary

There are two types of classification: one is binary (predict serious charge I types (M & F) vs others), and the other one is multi-class (charge I types: M, F, others). Both of them apply several features to do the tree-based training, which include case number, arrest meridian, arrest hour, arrest longitude, arrest latitude, arrest month and race. The training and testing data split ratio is 4:1.<br>

For the binary classification, random forest model with 100 estimators presents 96.51% accuracy, and XGBoost with a default tree depth of 6 presents 96.78% accuracy.<br>

For the multi-class classification, random forest model with 100 estimators presents 61.07% accuracy, and XGBoost with a tree depth of 9 presents 62.30% accuracy.<br>