# <b> XGBoost Model

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
import os

df = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean.csv'))
df2 = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean_Demographics.csv'))

In [22]:
df

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,WINTER,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN_ISLAND
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,True,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,True,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,False,False,False,False,False,True
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,False,False,False,False,True,False
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,False,False,False,True,False,False
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,False,False,False,True,False,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,False,True,False,False,False,False


In [23]:
df2

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,...,AGE_GROUP_<18,PERP_SEX_F,PERP_SEX_M,PERP_RACE_AMERICAN INDIAN/ALASKAN NATIVE,PERP_RACE_ASIAN / PACIFIC ISLANDER,PERP_RACE_BLACK,PERP_RACE_BLACK HISPANIC,PERP_RACE_UNKNOWN,PERP_RACE_WHITE,PERP_RACE_WHITE HISPANIC
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,...,False,False,True,False,False,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,...,False,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,...,False,False,True,False,False,False,False,False,True,False
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,...,False,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,...,False,False,True,False,False,False,False,False,True,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,...,False,False,True,False,False,True,False,False,False,False


In [24]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='OFFENSE_LEVEL')
y = df['OFFENSE_LEVEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

X2 = df2.drop(columns='OFFENSE_LEVEL')
y2 = df2['OFFENSE_LEVEL']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=1234)

# <b> XGBoost for Data without Demographics

In [25]:
# run XGBoost on df
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=100, random_state=1234)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

accuracy_score(y_test, y_pred)

print('Accuracy of XGBoost:', accuracy_score(y_test, y_pred))

Accuracy of XGBoost: 0.9319766540984453


# <b> XGBoost for Data with Demographics

In [27]:
# Clean feature names
X_train2.columns = X_train2.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '')
X_test2.columns = X_test2.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '')

# Run XGBoost on df2
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb2 = XGBClassifier(n_estimators=100, random_state=1234)
xgb2.fit(X_train2, y_train2)
y_pred2 = xgb2.predict(X_test2)

accuracy = accuracy_score(y_test2, y_pred2)
print('Accuracy of XGBoost on df2:', accuracy)

Accuracy of XGBoost on df2: 0.9342664807258578


# <b> Hyperparameter Tuning

In [29]:
#run grid search on df
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [3, 4, 5]}
grid_search = GridSearchCV(XGBClassifier(random_state=1234), param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .....max_depth=3, n_estimators=100;, score=0.926 total time=   0.7s
[CV 2/5] END .....max_depth=3, n_estimators=100;, score=0.928 total time=   0.6s
[CV 3/5] END .....max_depth=3, n_estimators=100;, score=0.928 total time=   0.6s
[CV 4/5] END .....max_depth=3, n_estimators=100;, score=0.928 total time=   0.6s
[CV 5/5] END .....max_depth=3, n_estimators=100;, score=0.929 total time=   0.6s
[CV 1/5] END .....max_depth=3, n_estimators=200;, score=0.929 total time=   1.2s
[CV 2/5] END .....max_depth=3, n_estimators=200;, score=0.929 total time=   1.2s
[CV 3/5] END .....max_depth=3, n_estimators=200;, score=0.929 total time=   1.2s
[CV 4/5] END .....max_depth=3, n_estimators=200;, score=0.929 total time=   1.2s
[CV 5/5] END .....max_depth=3, n_estimators=200;, score=0.931 total time=   1.3s
[CV 1/5] END .....max_depth=3, n_estimators=300;, score=0.930 total time=   1.8s
[CV 2/5] END .....max_depth=3, n_estimators=300;,

In [30]:
# run grid search on df2
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [3, 4, 5]}
grid_search2 = GridSearchCV(XGBClassifier(random_state=1234), param_grid, cv=5, verbose=3)
grid_search2.fit(X_train2, y_train2)

print('Best parameters:', grid_search2.best_params_)
print('Best score:', grid_search2.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .....max_depth=3, n_estimators=100;, score=0.929 total time=   0.9s
[CV 2/5] END .....max_depth=3, n_estimators=100;, score=0.929 total time=   0.8s
[CV 3/5] END .....max_depth=3, n_estimators=100;, score=0.930 total time=   0.8s
[CV 4/5] END .....max_depth=3, n_estimators=100;, score=0.930 total time=   0.8s
[CV 5/5] END .....max_depth=3, n_estimators=100;, score=0.932 total time=   0.7s
[CV 1/5] END .....max_depth=3, n_estimators=200;, score=0.932 total time=   1.3s
[CV 2/5] END .....max_depth=3, n_estimators=200;, score=0.931 total time=   1.3s
[CV 3/5] END .....max_depth=3, n_estimators=200;, score=0.931 total time=   1.5s
[CV 4/5] END .....max_depth=3, n_estimators=200;, score=0.932 total time=   1.5s
[CV 5/5] END .....max_depth=3, n_estimators=200;, score=0.934 total time=   1.5s
[CV 1/5] END .....max_depth=3, n_estimators=300;, score=0.932 total time=   2.0s
[CV 2/5] END .....max_depth=3, n_estimators=300;,

# <b> Analysis, Accuracy, and Metrics

In [31]:
# print classificatioon report for df
from sklearn.metrics import classification_report

y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

# print classification report for df2
y_pred2 = grid_search2.predict(X_test2)
print(classification_report(y_test2, y_pred2))


              precision    recall  f1-score   support

           0       0.90      0.94      0.92     24823
           1       0.96      0.92      0.94     32721
           2       0.96      0.86      0.91       539

    accuracy                           0.93     58083
   macro avg       0.94      0.91      0.92     58083
weighted avg       0.93      0.93      0.93     58083

              precision    recall  f1-score   support

           0       0.91      0.94      0.92     24823
           1       0.96      0.93      0.94     32721
           2       0.97      0.86      0.91       539

    accuracy                           0.93     58083
   macro avg       0.94      0.91      0.92     58083
weighted avg       0.93      0.93      0.93     58083



In [32]:
# Generate classification report
report = classification_report(y_test, y_pred, digits=2, output_dict=True)
df = pd.DataFrame(report).T

# Convert support column to integer
df['support'] = df['support'].apply(int)

# Apply background gradient
styled_df = df.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df

Unnamed: 0,precision,recall,f1-score,support
0,0.899332,0.944366,0.921299,24823
1,0.956467,0.921916,0.938873,32721
2,0.964435,0.855288,0.906588,539
accuracy,0.930892,0.930892,0.930892,0
macro avg,0.940078,0.90719,0.922254,58083
weighted avg,0.932123,0.930892,0.931063,58083


In [34]:
# Generate classification report
report2 = classification_report(y_test2, y_pred2, digits=2, output_dict=True)
df2 = pd.DataFrame(report2).T

# Convert support column to integer
df2['support'] = df2['support'].apply(int)

# Apply background gradient
styled_df2 = df2.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df2

Unnamed: 0,precision,recall,f1-score,support
0,0.905841,0.944084,0.924567,24823
1,0.956514,0.927661,0.941866,32721
2,0.966527,0.857143,0.908555,539
accuracy,0.934025,0.934025,0.934025,0
macro avg,0.94296,0.909629,0.924996,58083
weighted avg,0.93495,0.934025,0.934164,58083
