# Tree-based Classifier

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [4]:
df = pd.read_csv("Arrests_cleaned_with_location.csv")
df = df.dropna(subset=["Latitude", "Longitude"])
df

Unnamed: 0,ACB_NO,CASE NUMBER,ARREST MONTH,ARREST DATE,ARREST YEAR,ARREST TIME,ARREST MERIDIEM,RACE,CHARGE 1 STATUTE,CHARGE 1 DESCRIPTION,...,CHARGE 4 STATUTE,CHARGE 4 DESCRIPTION,CHARGE 4 TYPE,CHARGE 4 CLASS,CHARGES STATUTE,CHARGES DESCRIPTION,CHARGES TYPE,CHARGES CLASS,Latitude,Longitude
2,18908465,HX290868,6,5,2014,11:58:00,AM,BLACK,720 ILCS 570.0/402-C,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)(D),...,,,,,720 ILCS 570.0/402-C | 720 ILCS 600.0/3.5-A | |,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)...,F | M | |,4 | A | |,41.877464,-87.763012
3,18914257,HX302763,6,14,2014,4:50:00,AM,BLACK,720 ILCS 5.0/12-3-A-1,BATTERY - CAUSE BODILY HARM,...,,,,,720 ILCS 5.0/12-3-A-1 | 520 ILCS 5.0/1.22 | 72...,BATTERY - CAUSE BODILY HARM | RESIST/OBSTRUCT ...,M | M | M |,A | A | A |,41.885393,-87.663548
5,18921756,HX318527,6,25,2014,4:35:00,PM,BLACK,720 ILCS 5.0/19-4-A-1,CRIM TRESPASS TO RESIDENCE,...,,,,,720 ILCS 5.0/19-4-A-1 | | |,CRIM TRESPASS TO RESIDENCE | | |,M | | |,A | | |,41.750435,-87.657916
6,18911533,HX297219,6,10,2014,12:15:00,AM,WHITE HISPANIC,720 ILCS 5.0/12-3.2-A-2,DOMESTIC BATTERY - PHYSICAL CONTACT,...,,,,,720 ILCS 5.0/12-3.2-A-2 | 720 ILCS 5.0/12-3.5-...,DOMESTIC BATTERY - PHYSICAL CONTACT | INTERF R...,M | M | |,A | A | |,41.935168,-87.747779
7,18919006,HX312887,6,21,2014,12:30:00,PM,BLACK,10-8-515,SOLICITING UNLAWFUL BUSINESS,...,,,,,10-8-515 | | |,SOLICITING UNLAWFUL BUSINESS | | |,| | |,L | | |,41.896812,-87.748911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569735,30178472,JF400508,9,18,2022,7:10:00,AM,BLACK,720 ILCS 5.0/21-3-A-1,CRIMINAL TRESPASS TO REAL PROPERTY,...,,,,,720 ILCS 5.0/21-3-A-1 | | |,CRIMINAL TRESPASS TO REAL PROPERTY | | |,M | | |,B | | |,0.000000,0.000000
570001,19263360,HZ148289,2,11,2016,5:38:00,PM,BLACK,720 ILCS 5.0/12-3-A-2,BATTERY - MAKE PHYSICAL CONTACT,...,,,,,720 ILCS 5.0/12-3-A-2 | 720 ILCS 5.0/16-25-A-1...,BATTERY - MAKE PHYSICAL CONTACT | RETAIL THEFT...,M | M | |,A | A | |,41.728227,-87.552855
570268,19028723,HX541848,12,14,2014,10:58:00,AM,BLACK,720 ILCS 5.0/24-1.1-A,UUW - WEAPON - FELON POSSES WEAPON/2ND+,...,625 ILCS 5.0/6-303-A,DRIVING ON SUSPENDED LICENSE,M,A,720 ILCS 5.0/24-1.1-A | 720 ILCS 5.0/24-1.1-A ...,UUW - WEAPON - FELON POSSES WEAPON/2ND+ | UUW ...,F | F | M | M,2 | 2 | A | A,41.879366,-87.755845
570673,18956377,HX389427,8,15,2014,10:55:00,PM,BLACK,720 ILCS 570.0/402-C,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)(D),...,,,,,720 ILCS 570.0/402-C | 720 ILCS 550.0/4-D | |,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)...,F | F | |,4 | 4 | |,41.762780,-87.651629


In [5]:
print("size of the dataset: ", len(df))
df.columns

size of the dataset:  102103


Index(['ACB_NO', 'CASE NUMBER', 'ARREST MONTH', 'ARREST DATE', 'ARREST YEAR',
       'ARREST TIME', 'ARREST MERIDIEM', 'RACE', 'CHARGE 1 STATUTE',
       'CHARGE 1 DESCRIPTION', 'CHARGE 1 TYPE', 'CHARGE 1 CLASS',
       'CHARGE 2 STATUTE', 'CHARGE 2 DESCRIPTION', 'CHARGE 2 TYPE',
       'CHARGE 2 CLASS', 'CHARGE 3 STATUTE', 'CHARGE 3 DESCRIPTION',
       'CHARGE 3 TYPE', 'CHARGE 3 CLASS', 'CHARGE 4 STATUTE',
       'CHARGE 4 DESCRIPTION', 'CHARGE 4 TYPE', 'CHARGE 4 CLASS',
       'CHARGES STATUTE', 'CHARGES DESCRIPTION', 'CHARGES TYPE',
       'CHARGES CLASS', 'Latitude', 'Longitude'],
      dtype='object')

## Data Preprocessing

In [6]:
races = df.loc[:, "RACE"].unique()
c1_types = df.loc[:, "CHARGE 1 TYPE"].unique()
print("races: ", races)
print("charge 1 type: ", c1_types)

races:  ['BLACK' 'WHITE HISPANIC' 'WHITE' 'BLACK HISPANIC'
 'ASIAN / PACIFIC ISLANDER' 'UNKNOWN / REFUSED'
 'AMER INDIAN / ALASKAN NATIVE']
charge 1 type:  ['F' 'M' 'O' 'A' 'R']


In [7]:
race_dummies = pd.get_dummies(df, columns=["RACE"])
dummy_races = ['RACE_AMER INDIAN / ALASKAN NATIVE', \
               'RACE_ASIAN / PACIFIC ISLANDER', 'RACE_BLACK', 'RACE_BLACK HISPANIC', \
               'RACE_UNKNOWN / REFUSED', 'RACE_WHITE', 'RACE_WHITE HISPANIC']
race_dummies = race_dummies[dummy_races]
race_dummies

Unnamed: 0,RACE_AMER INDIAN / ALASKAN NATIVE,RACE_ASIAN / PACIFIC ISLANDER,RACE_BLACK,RACE_BLACK HISPANIC,RACE_UNKNOWN / REFUSED,RACE_WHITE,RACE_WHITE HISPANIC
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
5,0,0,1,0,0,0,0
6,0,0,0,0,0,0,1
7,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
569735,0,0,1,0,0,0,0
570001,0,0,1,0,0,0,0
570268,0,0,1,0,0,0,0
570673,0,0,1,0,0,0,0


In [8]:
month_dummies = pd.get_dummies(df, columns=["ARREST MONTH"])
dummy_month = ['ARREST MONTH_1', 'ARREST MONTH_2', 'ARREST MONTH_3', 'ARREST MONTH_4', \
       'ARREST MONTH_5', 'ARREST MONTH_6', 'ARREST MONTH_7', 'ARREST MONTH_8', \
       'ARREST MONTH_9', 'ARREST MONTH_10', 'ARREST MONTH_11', 'ARREST MONTH_12']
month_dummies = month_dummies[dummy_month]
month_dummies

Unnamed: 0,ARREST MONTH_1,ARREST MONTH_2,ARREST MONTH_3,ARREST MONTH_4,ARREST MONTH_5,ARREST MONTH_6,ARREST MONTH_7,ARREST MONTH_8,ARREST MONTH_9,ARREST MONTH_10,ARREST MONTH_11,ARREST MONTH_12
2,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
569735,0,0,0,0,0,0,0,0,1,0,0,0
570001,0,1,0,0,0,0,0,0,0,0,0,0
570268,0,0,0,0,0,0,0,0,0,0,0,1
570673,0,0,0,0,0,0,0,1,0,0,0,0


In [9]:
arrest_case_num = df.loc[:, "CASE NUMBER"].str[2:].astype(int)
arrest_case_num

2         290868
3         302763
5         318527
6         297219
7         312887
           ...  
569735    400508
570001    148289
570268    541848
570673    389427
570885    373142
Name: CASE NUMBER, Length: 102103, dtype: int64

In [11]:
arrest_ampm = df.loc[:, "ARREST MERIDIEM"].apply(lambda x: 1 if x == "AM" else 0)
arrest_ampm

2         1
3         1
5         0
6         1
7         0
         ..
569735    1
570001    0
570268    1
570673    0
570885    0
Name: ARREST MERIDIEM, Length: 102103, dtype: int64

In [34]:
arrest_time = df.loc[:, "ARREST TIME"].str.split(":").apply(lambda x: x[0]).astype(int)
arrest_time

2         11
3          4
5          4
6         12
7         12
          ..
569735     7
570001     5
570268    10
570673    10
570885     8
Name: ARREST TIME, Length: 102103, dtype: int64

In [21]:
arrest_date = df.loc[:, "ARREST DATE"].astype(int)
arrest_date

2          5
3         14
5         25
6         10
7         21
          ..
569735    18
570001    11
570268    14
570673    15
570885     4
Name: ARREST DATE, Length: 102103, dtype: int64

In [22]:
arrest_long = df.loc[:, "Longitude"]
arrest_long

2        -87.763012
3        -87.663548
5        -87.657916
6        -87.747779
7        -87.748911
            ...    
569735     0.000000
570001   -87.552855
570268   -87.755845
570673   -87.651629
570885   -87.711219
Name: Longitude, Length: 102103, dtype: float64

In [23]:
arrest_lat = df.loc[:, "Latitude"]
arrest_lat

2         41.877464
3         41.885393
5         41.750435
6         41.935168
7         41.896812
            ...    
569735     0.000000
570001    41.728227
570268    41.879366
570673    41.762780
570885    41.847555
Name: Latitude, Length: 102103, dtype: float64

### Binary Charge Types: Serious (Misdemeanor & Felony) vs Non-Serious

In [24]:
# binary charge types
# serious crime ('M' 'F' 'A' 'R' 'V') labelled as 1, 'O' labelled as 0
charge_types = df.loc[:, "CHARGE 1 TYPE"].apply(lambda x: 0 if x == "O" else 1)
charge_types.value_counts()

1    97435
0     4668
Name: CHARGE 1 TYPE, dtype: int64

### Multi-Class Charge Types: Misdemeanor, Felony & Others

In [25]:
# three charge types
m_charge_types = df.loc[:, "CHARGE 1 TYPE"].replace('A', 'M')
m_charge_types = m_charge_types.replace('R', 'O')
m_charge_types = m_charge_types.replace('V', 'O')
multi_class_types = pd.factorize(m_charge_types)
multi_class_types = multi_class_types[0]
m_charge_types.value_counts()

M    62857
F    34577
O     4669
Name: CHARGE 1 TYPE, dtype: int64

In [38]:
data = pd.concat([arrest_case_num, arrest_ampm, arrest_time, arrest_long, arrest_lat, month_dummies, race_dummies], axis=1)
data

Unnamed: 0,CASE NUMBER,ARREST MERIDIEM,ARREST TIME,Longitude,Latitude,ARREST MONTH_1,ARREST MONTH_2,ARREST MONTH_3,ARREST MONTH_4,ARREST MONTH_5,...,ARREST MONTH_10,ARREST MONTH_11,ARREST MONTH_12,RACE_AMER INDIAN / ALASKAN NATIVE,RACE_ASIAN / PACIFIC ISLANDER,RACE_BLACK,RACE_BLACK HISPANIC,RACE_UNKNOWN / REFUSED,RACE_WHITE,RACE_WHITE HISPANIC
2,290868,1,11,-87.763012,41.877464,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,302763,1,4,-87.663548,41.885393,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,318527,0,4,-87.657916,41.750435,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,297219,1,12,-87.747779,41.935168,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,312887,0,12,-87.748911,41.896812,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569735,400508,1,7,0.000000,0.000000,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
570001,148289,0,5,-87.552855,41.728227,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
570268,541848,1,10,-87.755845,41.879366,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
570673,389427,0,10,-87.651629,41.762780,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Train Testing Split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(data, charge_types, test_size = 0.2)
print("train-test-sizes: ", len(X_train), len(X_test))

train-test-sizes:  81682 20421


In [40]:
X_train, X_test, y_train_m, y_test_m = train_test_split(data, multi_class_types, test_size = 0.2)
print("train-test-sizes: ", len(X_train), len(X_test))

train-test-sizes:  81682 20421


## Random Forest

### Training (Binary Charge Types)

In [29]:
# creating a Random Forest classifier
clf = RandomForestClassifier(n_estimators = 100)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

RandomForestClassifier()

### Testing (Binary Charge Types)

In [30]:
# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.9503452328485382


### Training (Multi-Class Charge Types)

In [31]:
# creating a Random Forest classifier
clf_m = RandomForestClassifier(n_estimators = 100)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf_m.fit(X_train, y_train_m)

RandomForestClassifier()

### Testing (Multi-Class Charge Types)

In [32]:
# performing predictions on the test dataset
y_pred_m = clf_m.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test_m, y_pred_m))

ACCURACY OF THE MODEL:  0.6614759316389991


## XGBoost

### Binary Charge Types

In [41]:
# training
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

# testing
score = xgb_clf.score(X_test, y_test)
print("accuracy score: ", score)

accuracy score:  0.9531364771558689


### Multi-Class Charge Types

In [42]:
# training
xgb_clf_m = XGBClassifier(max_depth=9)
xgb_clf_m.fit(X_train, y_train_m)

# testing
score_m = xgb_clf_m.score(X_test, y_test_m)
print("accuracy score: ", score_m)

accuracy score:  0.6791048430537192


## Result Summary

There are two types of classification: one is binary (predict serious charge I types (M & F) vs others), and the other one is multi-class (charge I types: M, F, others). Both of them apply several features to do the tree-based training, which include case number, arrest meridian, arrest hour, arrest longitude, arrest latitude, arrest month and race. The training and testing data split ratio is 4:1.<br>

For the binary classification, random forest model with 100 estimators presents 95.03% accuracy, and XGBoost with a default tree depth of 6 presents 95.31% accuracy.<br>

For the multi-class classification, random forest model with 100 estimators presents 66.15% accuracy, and XGBoost with a tree depth of 9 presents 67.91% accuracy.<br>