# <b> Bagging Classifier Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os

df = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean.csv'))
df2 = pd.read_csv(os.path.join('NYPD_Arrest_Data_Clean_Demographics.csv'))

In [3]:
df

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,WINTER,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN_ISLAND
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,True,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,True,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,False,False,False,False,False,True
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,False,False,False,False,True,False
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,False,False,False,True,False,False
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,False,False,False,True,False,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,False,True,False,False,False,False


In [4]:
df2

Unnamed: 0,OFNS_DESC,OFFENSE_LEVEL,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,MONTH,FALL,SPRING,SUMMER,...,AGE_GROUP_<18,PERP_SEX_F,PERP_SEX_M,PERP_RACE_AMERICAN INDIAN/ALASKAN NATIVE,PERP_RACE_ASIAN / PACIFIC ISLANDER,PERP_RACE_BLACK,PERP_RACE_BLACK HISPANIC,PERP_RACE_UNKNOWN,PERP_RACE_WHITE,PERP_RACE_WHITE HISPANIC
0,17,0,105,0,40.737043,-73.735514,1,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1,4,0,107,71,40.732881,-73.807899,2,False,False,False,...,False,False,True,False,False,False,False,False,True,False
2,17,0,48,0,40.855109,-73.892818,3,False,True,False,...,False,False,True,False,False,True,False,False,False,False
3,17,0,121,0,40.628967,-74.163275,5,False,True,False,...,False,False,True,False,False,False,False,False,True,False
4,17,0,100,0,40.591980,-73.800066,6,False,False,True,...,False,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193605,47,1,109,0,40.757691,-73.834115,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193606,5,1,34,0,40.856808,-73.928265,9,True,False,False,...,False,False,True,False,False,False,False,False,False,True
193607,7,0,23,0,40.786116,-73.942614,9,True,False,False,...,False,False,True,False,False,False,False,False,True,False
193608,43,1,44,1,40.827812,-73.925929,9,True,False,False,...,False,False,True,False,False,True,False,False,False,False


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='OFFENSE_LEVEL')
y = df['OFFENSE_LEVEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

X2 = df2.drop(columns='OFFENSE_LEVEL')
y2 = df2['OFFENSE_LEVEL']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=1234)

# <b> Histogram Gradient Classifier for Data without Demographics

In [6]:
# bagging classifier on df
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

bag = BaggingClassifier(n_estimators=100, random_state=1234)
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)

accuracy_score(y_test, y_pred)

print('Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))


Accuracy: 0.9328


# <b> Bagging Classifier for Data with Demographics

In [7]:
# run bagging classifier on df2
bag2 = BaggingClassifier(n_estimators=100, random_state=1234)
bag2.fit(X_train2, y_train2)
y_pred2 = bag2.predict(X_test2)

accuracy_score(y_test2, y_pred2)

print('Accuracy: {:.4f}'.format(accuracy_score(y_test2, y_pred2)))


Accuracy: 0.9320


# <b> Hyperparameter Tuning

In [9]:
# run gridsearch on df for bagging classifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500],
              'max_samples': [0.1, 0.5, 1.0]}
grid = GridSearchCV(BaggingClassifier(random_state=1234), param_grid, cv=5, verbose=3)
grid.fit(X_train, y_train)

print('Best parameters: {}'.format(grid.best_params_))
print('Best cross-validation: {:.4f}'.format(grid.best_score_))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END ..max_samples=0.1, n_estimators=50;, score=0.928 total time=   1.4s
[CV 2/5] END ..max_samples=0.1, n_estimators=50;, score=0.932 total time=   1.3s
[CV 3/5] END ..max_samples=0.1, n_estimators=50;, score=0.931 total time=   1.3s
[CV 4/5] END ..max_samples=0.1, n_estimators=50;, score=0.932 total time=   1.3s
[CV 5/5] END ..max_samples=0.1, n_estimators=50;, score=0.932 total time=   1.3s
[CV 1/5] END .max_samples=0.1, n_estimators=100;, score=0.929 total time=   2.6s
[CV 2/5] END .max_samples=0.1, n_estimators=100;, score=0.932 total time=   2.6s
[CV 3/5] END .max_samples=0.1, n_estimators=100;, score=0.931 total time=   2.6s
[CV 4/5] END .max_samples=0.1, n_estimators=100;, score=0.932 total time=   2.6s
[CV 5/5] END .max_samples=0.1, n_estimators=100;, score=0.933 total time=   2.6s
[CV 1/5] END .max_samples=0.1, n_estimators=200;, score=0.931 total time=   5.2s
[CV 2/5] END .max_samples=0.1, n_estimators=200;

# <b> Analysis, Accuracy, and Metrics

In [11]:
# print classification report on df with best model
from sklearn.metrics import classification_report

# Best parameters: {'max_samples': 0.5, 'n_estimators': 400}
# Best cross-validation: 0.9323

y_pred = grid.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92     24823
           1       0.95      0.94      0.94     32721
           2       0.95      0.86      0.91       539

    accuracy                           0.93     58083
   macro avg       0.94      0.91      0.92     58083
weighted avg       0.93      0.93      0.93     58083



In [12]:
# Generate classification report
report = classification_report(y_test, y_pred, digits=2, output_dict=True)
df = pd.DataFrame(report).T

# Convert support column to integer
df['support'] = df['support'].apply(int)

# Apply background gradient
styled_df = df.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df

Unnamed: 0,precision,recall,f1-score,support
0,0.913929,0.934657,0.924177,24823
1,0.950166,0.93524,0.942644,32721
2,0.95102,0.864564,0.905734,539
accuracy,0.934335,0.934335,0.934335,0
macro avg,0.938372,0.911487,0.924185,58083
weighted avg,0.934687,0.934335,0.934409,58083


In [15]:
# print classification report on df2 with {'max_samples': 0.5, 'n_estimators': 400}
bag2 = BaggingClassifier(n_estimators=400, max_samples=0.5, random_state=1234)
bag2.fit(X_train2, y_train2)
y_pred2 = bag2.predict(X_test2)

print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92     24823
           1       0.95      0.93      0.94     32721
           2       0.96      0.86      0.91       539

    accuracy                           0.93     58083
   macro avg       0.94      0.91      0.92     58083
weighted avg       0.93      0.93      0.93     58083



In [16]:
# Generate classification report
report = classification_report(y_test2, y_pred2, digits=2, output_dict=True)
df = pd.DataFrame(report).T

# Convert support column to integer
df['support'] = df['support'].apply(int)

# Apply background gradient
styled_df = df.style.background_gradient(cmap='viridis', subset=pd.IndexSlice['0':'9', :'f1-score'])

# Display the styled DataFrame
styled_df

Unnamed: 0,precision,recall,f1-score,support
0,0.912496,0.93639,0.924288,24823
1,0.951232,0.93411,0.942593,32721
2,0.964435,0.855288,0.906588,539
accuracy,0.934353,0.934353,0.934353,0
macro avg,0.942721,0.908596,0.92449,58083
weighted avg,0.9348,0.934353,0.934436,58083
