# <b> Histogram Gradient Boosting Model

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
import os

df = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean.csv'))
df2 = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean_Demographics.csv'))

In [16]:
df

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,WINTER,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN_ISLAND
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,True,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,True,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,False,False,False,False,False,True
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,False,False,False,False,True,False
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,False,False,False,True,False,False
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,False,False,False,True,False,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,False,True,False,False,False,False


In [17]:
df2

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,...,AGE_GROUP_<18,PERP_SEX_F,PERP_SEX_M,PERP_RACE_AMERICAN INDIAN/ALASKAN NATIVE,PERP_RACE_ASIAN / PACIFIC ISLANDER,PERP_RACE_BLACK,PERP_RACE_BLACK HISPANIC,PERP_RACE_UNKNOWN,PERP_RACE_WHITE,PERP_RACE_WHITE HISPANIC
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,...,False,False,True,False,False,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,...,False,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,...,False,False,True,False,False,False,False,False,True,False
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,...,False,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,...,False,False,True,False,False,False,False,False,True,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,...,False,False,True,False,False,True,False,False,False,False


In [18]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='OFFENSE_LEVEL')
y = df['OFFENSE_LEVEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

X2 = df2.drop(columns='OFFENSE_LEVEL')
y2 = df2['OFFENSE_LEVEL']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=1234)

# <b> Histogram Gradient Classifier for Data without Demographics

In [19]:
# hist gradient booster classifier on df
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

hgb = HistGradientBoostingClassifier(max_iter=1000, random_state=1234)
hgb.fit(X_train, y_train)
y_pred = hgb.predict(X_test)

accuracy_score(y_test, y_pred)

print('Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))

Accuracy: 0.9301


# <b> Histogram Gradient Classifier for Data with Demographics

In [21]:
# hist gradient booster classifier on df2
hgb2 = HistGradientBoostingClassifier(max_iter=1000, random_state=1234)
hgb2.fit(X_train2, y_train2)
y_pred2 = hgb2.predict(X_test2)

accuracy_score(y_test2, y_pred2)

print('Accuracy: {:.4f}'.format(accuracy_score(y_test2, y_pred2)))

Accuracy: 0.9337


# <b> Hyperparameter Tuning

In [22]:
# gridsearchcv on df
from sklearn.model_selection import GridSearchCV

param_grid = {'learning_rate': [0.1, 0.01],
                'max_depth': [3, 5, 7],
                'l2_regularization': [0.1, 0.01]}
grid_search = GridSearchCV(hgb, param_grid, cv=3, verbose=3)
grid_search.fit(X_train, y_train)

print('Best parameters: {}'.format(grid_search.best_params_))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=3;, score=0.929 total time=   3.3s
[CV 2/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=3;, score=0.927 total time=   3.3s
[CV 3/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=3;, score=0.929 total time=   3.7s
[CV 1/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=5;, score=0.930 total time=   3.4s
[CV 2/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=5;, score=0.929 total time=   4.0s
[CV 3/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=5;, score=0.931 total time=   3.8s
[CV 1/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=7;, score=0.931 total time=   3.5s
[CV 2/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=7;, score=0.930 total time=   3.5s
[CV 3/3] END l2_regularization=0.1, learning_rate=0.1, max_depth=7;, score=0.931 total time=   3.8s
[CV 1/3] END l2_regularization=0.1, lea

# <b> Analysis, Accuracy, and Metrics

In [23]:
# Best parameters: {'l2_regularization': 0.1, 'learning_rate': 0.1, 'max_depth': 7}
# make Hist gradient booster classifier with best parameters
hgb_best = HistGradientBoostingClassifier(max_iter=1000, random_state=1234, l2_regularization=0.1, learning_rate=0.1, max_depth=7)
hgb_best.fit(X_train, y_train)
y_pred_best = hgb_best.predict(X_test)

accuracy_score(y_test, y_pred_best)

0.9314085016269821

In [24]:
# make a hist gradient booster classifier with best parameters on df2
hgb_best2 = HistGradientBoostingClassifier(max_iter=1000, random_state=1234, l2_regularization=0.1, learning_rate=0.1, max_depth=7)
hgb_best2.fit(X_train2, y_train2)
y_pred_best2 = hgb_best2.predict(X_test2)

accuracy_score(y_test2, y_pred_best2)

0.9338016287037515

In [25]:
# generate a classification report for df and df2
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_best))

print(classification_report(y_test2, y_pred_best2))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92     24823
           1       0.96      0.92      0.94     32721
           2       0.96      0.86      0.91       539

    accuracy                           0.93     58083
   macro avg       0.94      0.91      0.92     58083
weighted avg       0.93      0.93      0.93     58083

              precision    recall  f1-score   support

           0       0.90      0.95      0.92     24823
           1       0.96      0.93      0.94     32721
           2       0.97      0.86      0.91       539

    accuracy                           0.93     58083
   macro avg       0.94      0.91      0.93     58083
weighted avg       0.93      0.93      0.93     58083



In [26]:
# Generate classification report
report = classification_report(y_test, y_pred, digits=2, output_dict=True)
df = pd.DataFrame(report).T

# Convert support column to integer
df['support'] = df['support'].apply(int)

# Apply background gradient
styled_df = df.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df

Unnamed: 0,precision,recall,f1-score,support
0,0.887117,0.958949,0.921635,24823
1,0.967552,0.909477,0.937616,32721
2,0.935091,0.855288,0.893411,539
accuracy,0.930117,0.930117,0.930117,0
macro avg,0.92992,0.907905,0.917554,58083
weighted avg,0.932875,0.930117,0.930376,58083


In [28]:
# Generate classification report
report = classification_report(y_test2, y_pred2, digits=2, output_dict=True)
df = pd.DataFrame(report).T

# Convert support column to integer
df['support'] = df['support'].apply(int)

# Apply background gradient
styled_df = df.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df

Unnamed: 0,precision,recall,f1-score,support
0,0.9035,0.94634,0.924424,24823
1,0.958469,0.925369,0.941628,32721
2,0.941057,0.858998,0.898157,539
accuracy,0.933716,0.933716,0.933716,0
macro avg,0.934342,0.910236,0.921403,58083
weighted avg,0.934815,0.933716,0.933872,58083
