# <b> Gradient Boosting Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os

df = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean.csv'))
df2 = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean_Demographics.csv'))

In [3]:
df

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,WINTER,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN_ISLAND
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,True,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,True,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,False,False,False,False,False,True
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,False,False,False,False,True,False
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,False,False,False,True,False,False
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,False,False,False,True,False,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,False,True,False,False,False,False


In [4]:
df2

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,...,AGE_GROUP_<18,PERP_SEX_F,PERP_SEX_M,PERP_RACE_AMERICAN INDIAN/ALASKAN NATIVE,PERP_RACE_ASIAN / PACIFIC ISLANDER,PERP_RACE_BLACK,PERP_RACE_BLACK HISPANIC,PERP_RACE_UNKNOWN,PERP_RACE_WHITE,PERP_RACE_WHITE HISPANIC
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,...,False,False,True,False,False,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,...,False,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,...,False,False,True,False,False,False,False,False,True,False
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,...,False,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,...,False,False,True,False,False,False,False,False,True,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,...,False,False,True,False,False,True,False,False,False,False


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='OFFENSE_LEVEL')
y = df['OFFENSE_LEVEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

X2 = df2.drop(columns='OFFENSE_LEVEL')
y2 = df2['OFFENSE_LEVEL']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=1234)

# <b> Gradient Classifier for Data without Demographics

In [6]:
# gradient booster classifier on df
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=1234)
gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_test)
accuracy_score(y_test, y_pred)

print('Accuracy of Gradient Booster Classifier on df:', accuracy_score(y_test, y_pred))

Accuracy of Gradient Booster Classifier on df: 0.9104901606321988


# <b> Gradient Classifier for Data with Demographics

In [7]:
# gradient booster classifier on df2

gbc_2 = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0, max_depth=1, random_state=1234)
gbc_2.fit(X_train2, y_train2)

y_pred2 = gbc_2.predict(X_test2)
accuracy_score(y_test2, y_pred2)

print('Accuracy of Gradient Booster Classifier on df2:', accuracy_score(y_test2, y_pred2))


Accuracy of Gradient Booster Classifier on df2: 0.9089234371502849


# <b> Hyperparameter Tuning

In [10]:
# gridsearchcv on df
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(GradientBoostingClassifier(random_state=1234), param_grid, cv=3, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

print('Best parameters for df:', grid_search.best_params_)
print('Best score for df:', grid_search.best_score_)

#Best parameters for df: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 300}
#Best score for df: 0.9304419095119894

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[CV 3/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, n_estimators=100;, score=0.925 total time=  26.5s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, n_estimators=100;, score=0.924 total time=  27.4s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, n_estimators=100;, score=0.923 total time=  27.4s
[CV 2/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, n_estimators=100;, score=0.924 total time=  27.8s
[CV 2/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, n_estimators=100;, score=0.923 total time=  22.5s
[CV 2/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, n_estimators=200;, score=0.926 total time=  54.4s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, n_estimators=200;, score=0.926 total time=  54.8s
[CV 3/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, n_estimators=200;, score=0.928 total time=  55.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, n_estimators=10

In [17]:
# run gridsearch on df2
param_grid = {'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [1, 2, 4]
}
grid_search2 = GridSearchCV(GradientBoostingClassifier(random_state=1234), param_grid, cv=3, n_jobs=-1, verbose=3)
grid_search2.fit(X_train2, y_train2)

print('Best parameters for df2:', grid_search2.best_params_)
print('Best score for df2:', grid_search2.best_score_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV 2/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100;, score=0.878 total time=  38.1s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=2, n_estimators=100;, score=0.880 total time=  38.2s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100;, score=0.880 total time=  38.5s
[CV 3/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100;, score=0.880 total time=  38.7s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200;, score=0.891 total time= 1.2min
[CV 3/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=2, n_estimators=100;, score=0.880 total time=  37.6s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200;, score=0.892 total time= 1.3min
[CV 3/3] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200;, score=0.891 total time= 1.3min
[CV 2/3] E

# <b> Analysis, Accuracy, and Metrics

In [13]:
# check mse
from sklearn.metrics import mean_squared_error

gbc_best = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, min_samples_leaf=1, random_state=1234)
gbc_best.fit(X_train, y_train)

y_pred_best = gbc_best.predict(X_test)
mean_squared_error(y_test, y_pred_best)

print('Mean Squared Error for df:', mean_squared_error(y_test, y_pred_best))




Mean Squared Error for df: 0.07354991994215175


In [11]:
# print classification report on df with best model
from sklearn.metrics import classification_report

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92     24823
           1       0.96      0.92      0.94     32721
           2       0.94      0.86      0.90       539

    accuracy                           0.93     58083
   macro avg       0.93      0.91      0.92     58083
weighted avg       0.93      0.93      0.93     58083



In [14]:
# Generate classification report
report = classification_report(y_test, y_pred, digits=2, output_dict=True)
df = pd.DataFrame(report).T

# Convert support column to integer
df['support'] = df['support'].apply(int)

# Apply background gradient
styled_df = df.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df

Unnamed: 0,precision,recall,f1-score,support
0,0.89491,0.95061,0.92192,24823
1,0.96118,0.917117,0.938632,32721
2,0.94332,0.864564,0.902227,539
accuracy,0.930944,0.930944,0.930944,0
macro avg,0.933137,0.910764,0.920926,58083
weighted avg,0.932693,0.930944,0.931152,58083


In [18]:
# generate classification report on df2 with best model
best_model2 = grid_search2.best_estimator_
y_pred2 = best_model2.predict(X_test2)
print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

           0       0.90      0.95      0.93     24823
           1       0.96      0.93      0.94     32721
           2       0.96      0.86      0.91       539

    accuracy                           0.93     58083
   macro avg       0.94      0.91      0.93     58083
weighted avg       0.94      0.93      0.93     58083



In [20]:
# Generate classification report
report = classification_report(y_test2, y_pred2, digits=2, output_dict=True)
df = pd.DataFrame(report).T

# Convert support column to integer
df['support'] = df['support'].apply(int)

# Apply background gradient
styled_df = df.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df

Unnamed: 0,precision,recall,f1-score,support
0,0.90403,0.947951,0.92547,24823
1,0.959361,0.925644,0.942201,32721
2,0.962733,0.862709,0.90998,539
accuracy,0.934594,0.934594,0.934594,0
macro avg,0.942041,0.912101,0.925884,58083
weighted avg,0.935746,0.934594,0.934752,58083
