# Decision Trees

In [5]:
import pandas as pd
import numpy as np

In [6]:
#loading in the datasets
df_training = pd.read_csv('../data/archive/train_pca.csv')
df_testing = pd.read_csv('../data/archive/test_pca.csv')

In [7]:
x_train = df_training.iloc[:, :-1]
y_train = df_training.iloc[:,-1]

x_test = df_testing.iloc[:, :-1]
y_test = df_testing.iloc[:,-1]

In [8]:
# imports from sklearn
from sklearn import tree
from sklearn.model_selection import GridSearchCV, StratifiedKFold

dt = tree.DecisionTreeClassifier()

In [9]:
#Making a grid of values we want our grid search to test to find the best parameters

grid_values = [{'criterion': ['gini', 'entropy'], 'max_depth' :list(range(1,104,4)), 'min_samples_leaf':list(range(10,100,10))}]

dt_classifier = GridSearchCV(estimator = dt, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

dt_model = dt_classifier.fit(x_train, y_train)

In [10]:
#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
roc_auc_best_params = []
f1_best_params = []

accuracy_best_params.append(dt_model.cv_results_['params'][ np.argmin(dt_model.cv_results_['rank_test_accuracy'])])
roc_auc_best_params.append(dt_model.cv_results_['params'][ np.argmin(dt_model.cv_results_['rank_test_roc_auc_ovr'])])
f1_best_params.append(dt_model.cv_results_['params'][ np.argmin(dt_model.cv_results_['rank_test_f1_micro']) ])

In [11]:
accuracy_best_params

[{'criterion': 'entropy', 'max_depth': 13, 'min_samples_leaf': 20}]

In [12]:
roc_auc_best_params

[{'criterion': 'gini', 'max_depth': 61, 'min_samples_leaf': 70}]

In [13]:
f1_best_params

[{'criterion': 'entropy', 'max_depth': 13, 'min_samples_leaf': 20}]

In [14]:
#finding the accuracy metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
accuracy_test_score = []

dt_clf = tree.DecisionTreeClassifier(criterion= 'entropy', max_depth = 9, min_samples_leaf = 70)
model = dt_clf.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy_test_score.append(accuracy_score(y_test, y_pred))

In [15]:
accuracy_test_score

[0.8045469969460468]