# Decision Trees

In [1]:
import pandas as pd
import numpy as np
# imports from sklearn
from sklearn import tree
from sklearn.model_selection import GridSearchCV, StratifiedKFold

dt = tree.DecisionTreeClassifier()

In [2]:
#loading in the datasets
#pca datasets
df_pca_training = pd.read_csv('../data/archive/train_pca.csv')
df_pca_testing = pd.read_csv('../data/archive/test_pca.csv')

In [3]:
#defining x and y with pca dataset
x_pca_train = df_pca_training.iloc[:, :-1]
y_pca_train = df_pca_training.iloc[:,-1]

x_pca_test = df_pca_testing.iloc[:, :-1]
y_pca_test = df_pca_testing.iloc[:,-1]

In [5]:
#Making a grid of values we want our grid search to test to find the best parameters

grid_values = [{'criterion': ['gini', 'entropy'], 'max_depth' :list(range(1,104,4)), 'min_samples_leaf':list(range(10,100,10))}]

dt_classifier = GridSearchCV(estimator = dt, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

dt_pca_model = dt_classifier.fit(x_pca_train, y_pca_train)

In [6]:
#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
precision_best_params = []
f1_best_params = []

def params(model):
    accuracy_best_params.append(model.cv_results_['params'][ np.argmin(model.cv_results_['rank_test_accuracy'])])
    precision_best_params.append(model.cv_results_['params'][ np.argmin(model.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params.append(model.cv_results_['params'][ np.argmin(model.cv_results_['rank_test_f1_micro']) ])
    
    print(accuracy_best_params)
    print(precision_best_params)
    print(f1_best_params)

In [8]:
#shows best params for the model
params(dt_pca_model)

[{'criterion': 'entropy', 'max_depth': 37, 'min_samples_leaf': 80}, {'criterion': 'entropy', 'max_depth': 37, 'min_samples_leaf': 80}]
[{'criterion': 'gini', 'max_depth': 97, 'min_samples_leaf': 60}, {'criterion': 'gini', 'max_depth': 97, 'min_samples_leaf': 60}]
[{'criterion': 'entropy', 'max_depth': 37, 'min_samples_leaf': 80}, {'criterion': 'entropy', 'max_depth': 37, 'min_samples_leaf': 80}]


In [9]:
#finding the accuracy metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score
    
def accuracy_metric(xtrain,ytrain,xtest,ytest):
    accuracy_test_score = []
    dt_clf = tree.DecisionTreeClassifier(criterion=accuracy_best_params[0]['criterion'], 
                                         max_depth = accuracy_best_params[0]['max_depth'], 
                                         min_samples_leaf = accuracy_best_params[0]['min_samples_leaf'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    accuracy_test_score.append(accuracy_score(ytest, y_pred))
    return accuracy_test_score

def f1_metric(xtrain,ytrain,xtest,ytest):
    f1_test_score = []
    dt_clf = tree.DecisionTreeClassifier(criterion=f1_best_params[0]['criterion'], 
                                         max_depth = f1_best_params[0]['max_depth'], 
                                         min_samples_leaf = f1_best_params[0]['min_samples_leaf'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    f1_test_score.append(f1_score(ytest, y_pred,average = 'micro'))
    return f1_test_score

def precision_metric(xtrain,ytrain,xtest,ytest):
    precision_test_score = []
    dt_clf = tree.DecisionTreeClassifier(criterion=precision_best_params[0]['criterion'], 
                                         max_depth = precision_best_params[0]['max_depth'], 
                                         min_samples_leaf = precision_best_params[0]['min_samples_leaf'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    precision_test_score.append(precision_score(ytest, y_pred,average = 'micro'))
    return precision_test_score

In [11]:
#Printing the accuracy metrics
print(accuracy_metric(x_pca_train,y_pca_train,x_pca_test,y_pca_test))
print(f1_metric(x_pca_train,y_pca_train,x_pca_test,y_pca_test))
print(precision_metric(x_pca_train,y_pca_train,x_pca_test,y_pca_test))

[0.8089582626399728]
[0.808958262639973]
[0.7991177468612148]


## Bootstrap

In [12]:
#Separating X and y from testing since this won't be used in the for loop
x_train_full = df_pca_training.iloc[:,:-1]
y_train_full = df_pca_training.iloc[:,-1]

x_test = df_pca_testing.iloc[:, :-1]
y_test = df_pca_testing.iloc[:,-1]

In [13]:
#getting all of our randomized samples from the training
all_samples = []

for sample in range(0,5):
    all_samples.append(df_pca_training.sample(n=5000, replace= True))

In [14]:
#Making a grid of values we want our grid search to test to find the best parameters
#looping this 5 times to get 5 bootstrap samples


accuracy_best_params = []
precision_best_params = []
f1_best_params = []


for sample in all_samples:
    
    
    #Making a grid of values we want our grid search to test to find the best parameters
    grid_values = [{'criterion': ['gini', 'entropy'], 'max_depth' :list(range(1,104,4)), 'min_samples_leaf':list(range(10,100,10))}]

    dt_classifier = GridSearchCV(estimator = dt, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

    x_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:,-1]
    
    dt_pca_model = dt_classifier.fit(x_pca_train, y_pca_train)
    
    accuracy_best_params.append(dt_pca_model.cv_results_['params'][ np.argmin(dt_pca_model.cv_results_['rank_test_accuracy'])])
    precision_best_params.append(dt_pca_model.cv_results_['params'][ np.argmin(dt_pca_model.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params.append(dt_pca_model.cv_results_['params'][ np.argmin(dt_pca_model.cv_results_['rank_test_f1_micro']) ])
    

In [15]:
#best accuracy parms for bootstrap samples
accuracy_best_params

[{'criterion': 'entropy', 'max_depth': 17, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 65, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 33, 'min_samples_leaf': 20},
 {'criterion': 'entropy', 'max_depth': 65, 'min_samples_leaf': 20},
 {'criterion': 'entropy', 'max_depth': 25, 'min_samples_leaf': 10}]

In [16]:
#best precision parms for bootstrap samples
precision_best_params

[{'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 70},
 {'criterion': 'gini', 'max_depth': 101, 'min_samples_leaf': 70},
 {'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 70},
 {'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 70},
 {'criterion': 'gini', 'max_depth': 81, 'min_samples_leaf': 70}]

In [17]:
#best f1 parms for bootstrap samples
f1_best_params

[{'criterion': 'entropy', 'max_depth': 17, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 65, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 33, 'min_samples_leaf': 20},
 {'criterion': 'entropy', 'max_depth': 65, 'min_samples_leaf': 20},
 {'criterion': 'entropy', 'max_depth': 25, 'min_samples_leaf': 10}]

In [19]:
#finding the accuracy score for the 5 bootstraped samples
accuracy_test_score = []

i = 0
for param in accuracy_best_params:
 
    dt_clf = tree.DecisionTreeClassifier(criterion=accuracy_best_params[i]['criterion'], 
                                         max_depth = accuracy_best_params[i]['max_depth'], 
                                         min_samples_leaf = accuracy_best_params[i]['min_samples_leaf'])
    model = dt_clf.fit(x_train_full, y_train_full)
    y_pred = model.predict(x_test)
    accuracy_test_score.append(accuracy_score(y_test, y_pred))
    i += 1

In [20]:
accuracy_test_score

[0.8242280285035629,
 0.825924669155073,
 0.831014591109603,
 0.831014591109603,
 0.8245673566338649]

In [21]:
#finding the f1 score for the 5 bootstraped samples
f1_test_score = []

i = 0
for param in f1_best_params:
    
    dt_clf = tree.DecisionTreeClassifier(criterion=f1_best_params[i]['criterion'], 
                                         max_depth = f1_best_params[i]['max_depth'], 
                                         min_samples_leaf = f1_best_params[i]['min_samples_leaf'])
    model = dt_clf.fit(x_train_full, y_train_full)
    y_pred = model.predict(x_test)
    f1_test_score.append(f1_score(y_test, y_pred,average = 'micro'))
    i += 1

In [22]:
f1_test_score

[0.825585341024771,
 0.824906684764167,
 0.8310145911096029,
 0.829996606718697,
 0.825585341024771]

In [23]:
#finding the precision score for the 5 bootstraped samples
precision_test_score = []

i = 0
for param in precision_best_params:
    
    dt_clf = tree.DecisionTreeClassifier(criterion=precision_best_params[i]['criterion'], 
                                         max_depth = precision_best_params[i]['max_depth'], 
                                         min_samples_leaf = precision_best_params[i]['min_samples_leaf'])
    model = dt_clf.fit(x_train_full, y_train_full)
    y_pred = model.predict(x_test)
    precision_test_score.append(precision_score(y_test, y_pred,average = 'micro'))
    i += 1

In [24]:
precision_test_score

[0.7964031218187988,
 0.7974211062097047,
 0.7974211062097047,
 0.7964031218187988,
 0.7964031218187988]