# Module 2 - Supervised Learning - Decision Trees

---

## Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score
import time

%matplotlib inline

---

## Load Data

In [2]:
# Two-letter code selecting which state's data set to load.
state = 'GA'

current_dir = os.getcwd()
# Build paths with os.path.join instead of hand-written separators so the
# notebook behaves the same on Windows, macOS and Linux.
load_dir = os.path.join(current_dir, 'module1_data', state)

# Choose exactly one feature matrix: leave a single `data = ...` line uncommented.
# data = 'Xstate_f194.npy' #original
# data = 'Xforest_f102.npy' #random forest feature importance
data = 'Xpca_f194_c049_v9713.npy' #principal components

X = np.load(os.path.join(load_dir, data))
print(X.shape)

y = np.load(os.path.join(load_dir, 'ystate.npy'))
# Shift every label down by one.
# NOTE(review): presumably the stored labels are 1-based and this makes them
# zero-based class ids -- confirm against the module 1 export.
y = y - 1
print(y.shape)

(83620, 49)
(83620, 1)


In [3]:
#DO NOT CHANGE

# Partition the data into a 70% training set and a 30% test set.
# The rows are shuffled with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    train_size=0.70,
    random_state=0,
    shuffle=True,
)

# Report the shape of each partition.
for caption, part in (
    ('X train:', X_train),
    ('y train:', y_train),
    ('X test: ', X_test),
    ('y test: ', y_test),
):
    print(caption, part.shape)

X train: (58533, 49)
y train: (58533, 1)
X test:  (25086, 49)
y test:  (25086, 1)


---

## Decision Trees

### Setup (Single Example)

In [4]:
#Setup Classifier, with parameters
# random_state is pinned so the fit is repeatable: depending on the installed
# scikit-learn version the estimator may hold out an internal validation split
# (early stopping) and/or subsample rows for binning, both of which are random.
# NOTE(review): newer scikit-learn releases rename this loss to 'log_loss';
# the name is left unchanged to match the version this notebook was written for.
histClassifier = HistGradientBoostingClassifier(
    loss='categorical_crossentropy',
    min_samples_leaf=50,
    max_depth=8,
    learning_rate=0.1,
    max_iter=100,
    random_state=0,
)
# y_train is a column vector; flatten it to the 1-D shape fit() expects.
gbdt = histClassifier.fit(X_train, np.ravel(y_train))

### Evaluation (Train and Test Sets)

In [9]:
#confusion matrices:

#train data
y_pred1 = gbdt.predict(X_train)

#test data
y_pred2 = gbdt.predict(X_test)

# Rows are the true classes, columns are the predicted classes.
cf_train = confusion_matrix(y_train,y_pred1)
cf_test = confusion_matrix(y_test,y_pred2)

cm1 = sb.light_palette("blue", as_cmap=True)
cm2 = sb.light_palette("green", as_cmap=True)

# Colour-graded tables for inline display in the notebook.
cf_train_plot = pd.DataFrame(cf_train).style.background_gradient(cmap=cm1)
cf_test_plot = pd.DataFrame(cf_test).style.background_gradient(cmap=cm2)

# One entry per partition: label, styled table, raw matrix, colour map,
# true labels, predicted labels, output image name.
evaluation_runs = (
    ('Train', cf_train_plot, cf_train, cm1, y_train, y_pred1, 'cf_train_plot.png'),
    ('Test', cf_test_plot, cf_test, cm2, y_test, y_pred2, 'cf_test_plot.png'),
)

for split_name, styled, matrix, cmap, y_true, y_hat, png_name in evaluation_runs:
    display(styled)

    # A pandas Styler never draws on a matplotlib figure, so calling
    # plt.savefig() right after display() wrote an empty image. Draw the
    # matrix on an explicit figure and save that figure instead.
    fig, ax = plt.subplots()
    sb.heatmap(matrix, annot=True, fmt='d', cmap=cmap, cbar=False, ax=ax)
    ax.set_title('Confusion matrix (' + split_name + ')')
    ax.set_xlabel('Predicted class')
    ax.set_ylabel('True class')
    fig.savefig(png_name, bbox_inches='tight')
    # The styled table above is the inline view; close the figure so it is
    # not rendered a second time and does not linger in memory.
    plt.close(fig)

    # average=None reports one precision / F1 value per class.
    print("Accuracy " + split_name + ":", accuracy_score(y_true, y_hat))
    print("Precision " + split_name + ":", precision_score(y_true, y_hat, average=None))
    print("F1 " + split_name + ":", f1_score(y_true, y_hat, average=None))


Unnamed: 0,0,1,2,3
0,18,0,0,0
1,0,15478,8640,86
2,0,4456,25149,118
3,0,1482,852,2254


Accuracy Train: 0.7329028069635932
Precision Train: [1.         0.72273067 0.72598943 0.9170057 ]
F1 Train: [1.         0.67856203 0.78146169 0.63979563]


Unnamed: 0,0,1,2,3
0,0,2,7,0
1,0,4644,5532,225
2,0,3459,8992,218
3,0,1072,657,278


Accuracy Test: 0.5546519971298732
Precision Test: [0.         0.50604773 0.59204635 0.38557559]
F1 Test: [0.         0.47441005 0.6455828  0.20381232]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


<Figure size 432x288 with 0 Axes>

### Hyperparameter optimization/GridSearch

In [6]:
# Candidate values for every tuned hyperparameter
# (3 * 4 * 4 * 3 = 144 combinations in total).
params = dict(
    min_samples_leaf=[30, 50, 100],
    max_depth=[5, 6, 7, 8],
    learning_rate=[0.05, 0.1, 0.15, 0.2],
    max_iter=[100, 500, 1000],
)

# Exhaustive search over the grid, using GridSearchCV's default
# cross-validation and scoring settings.
search = GridSearchCV(histClassifier, params)
# y_train is a column vector; flatten it to 1-D before fitting.
gridsearch = search.fit(X_train, y_train.ravel())

# Predictions of the fitted search on the training data ...
search_pred1 = gridsearch.predict(X_train)

# ... and on the held-out test data.
search_pred2 = gridsearch.predict(X_test)



In [10]:
# Hyperparameter combination selected by the grid search.
bestparams = gridsearch.best_params_
print('Best Parameters:',bestparams)

# Rows are the true classes, columns are the predicted classes.
cf_train_search = confusion_matrix(y_train,search_pred1)
cf_test_search = confusion_matrix(y_test,search_pred2)

cm1 = sb.light_palette("blue", as_cmap=True)
cm2 = sb.light_palette("green", as_cmap=True)

# Colour-graded tables for inline display in the notebook.
cf_train_plot = pd.DataFrame(cf_train_search).style.background_gradient(cmap=cm1)
cf_test_plot = pd.DataFrame(cf_test_search).style.background_gradient(cmap=cm2)

# One entry per partition: label, styled table, raw matrix, colour map,
# true labels, predicted labels, output image name.
search_runs = (
    ('Train', cf_train_plot, cf_train_search, cm1, y_train, search_pred1,
     'cf_train_plot_search.png'),
    ('Test', cf_test_plot, cf_test_search, cm2, y_test, search_pred2,
     'cf_test_plot_search.png'),
)

for split_name, styled, matrix, cmap, y_true, y_hat, png_name in search_runs:
    display(styled)

    # A pandas Styler never draws on a matplotlib figure, so calling
    # plt.savefig() right after display() wrote an empty image. Draw the
    # matrix on an explicit figure and save that figure instead.
    fig, ax = plt.subplots()
    sb.heatmap(matrix, annot=True, fmt='d', cmap=cmap, cbar=False, ax=ax)
    ax.set_title('Grid search confusion matrix (' + split_name + ')')
    ax.set_xlabel('Predicted class')
    ax.set_ylabel('True class')
    fig.savefig(png_name, bbox_inches='tight')
    # The styled table above is the inline view; close the figure so it is
    # not rendered a second time and does not linger in memory.
    plt.close(fig)

    # average=None reports one precision / F1 value per class.
    print("Accuracy " + split_name + ":", accuracy_score(y_true, y_hat))
    print("Precision " + split_name + ":", precision_score(y_true, y_hat, average=None))
    print("F1 " + split_name + ":", f1_score(y_true, y_hat, average=None))


Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'max_iter': 100, 'min_samples_leaf': 100}


Unnamed: 0,0,1,2,3
0,18,0,0,0
1,0,11348,12708,148
2,0,6085,23461,177
3,0,2529,1340,719


Accuracy Train: 0.607281362650129
Precision Train: [1.         0.56848011 0.62547655 0.68869732]
F1 Train: [1.         0.51387945 0.69791171 0.2553267 ]


Unnamed: 0,0,1,2,3
0,0,2,7,0
1,0,4401,5892,108
2,0,3055,9506,108
3,0,1213,632,162


Accuracy Test: 0.5608307422466714
Precision Test: [0.         0.50755392 0.59275426 0.42857143]
F1 Test: [0.         0.46151426 0.66230056 0.13584906]


<Figure size 432x288 with 0 Axes>

### Save Model / Results

In [5]:
# Output directory for this module's models and results:
# <cwd>/module2_data/<state>/DecisionTrees.
# os.path.join picks the right separator for the host OS; the previous
# hard-coded '\\' only produced a nested directory on Windows.
save_dir = os.path.join(current_dir, 'module2_data', state, 'DecisionTrees')
print(save_dir)
# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

C:\Users\alexa\Documents\GitHub\traffic-accident-weather-analysis\code\module2_data\GA\DecisionTrees
