In [5]:
import pandas as pd
from scipy.io import arff
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [6]:
data = arff.loadarff('Dry_Bean_dataset.arff')
bean_data = pd.DataFrame(data[0])
bean_data.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,b'SEKER'
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,b'SEKER'
2,29380.0,624.11,212.82613,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,b'SEKER'
3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,b'SEKER'
4,30140.0,620.134,201.847882,190.279279,1.060798,0.33368,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,b'SEKER'


In [9]:
bean_data['Class'].value_counts()

b'DERMASON'    3546
b'SIRA'        2636
b'SEKER'       2027
b'HOROZ'       1928
b'CALI'        1630
b'BARBUNYA'    1322
b'BOMBAY'       522
Name: Class, dtype: int64

In [16]:
#For the purpose of making this abinary problem,
#I will consider the two most popular classes 'DERMASON' and 'SIRA' +1
bean_data['Class'] = np.where((bean_data['Class'] == b'DERMASON'), 1, bean_data['Class'])
bean_data['Class'] = np.where((bean_data['Class'] == b'SIRA'), 1, bean_data['Class'])

#I will consider the rest of the class -1
bean_data['Class'] = np.where((bean_data['Class'] == b'SEKER'), -1, bean_data['Class'])
bean_data['Class'] = np.where((bean_data['Class'] == b'HOROZ'), -1, bean_data['Class'])
bean_data['Class'] = np.where((bean_data['Class'] == b'CALI'), -1, bean_data['Class'])
bean_data['Class'] = np.where((bean_data['Class'] == b'BARBUNYA'), -1, bean_data['Class'])
bean_data['Class'] = np.where((bean_data['Class'] == b'BOMBAY'), -1, bean_data['Class'])



In [17]:
bean_data['Class'].value_counts()

-1    7429
 1    6182
Name: Class, dtype: int64

In [18]:
bean_data.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,-1
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,-1
2,29380.0,624.11,212.82613,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,-1
3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,-1
4,30140.0,620.134,201.847882,190.279279,1.060798,0.33368,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,-1


In [19]:
#Standard Scaler on numerical data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

def scale_columns(dataset, cols_scale):
    for col in cols_scale:
        dataset[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(bean_data[col])), columns=[col])
    return dataset

In [20]:
scaled_bean_data = scale_columns(bean_data, [i for i in list(bean_data.columns) if i not in ['Class']])
scaled_bean_data.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,-0.840749,-1.143319,-1.306598,-0.631153,-1.565053,-2.18572,-0.841451,-1.063341,0.289087,0.367613,1.423867,1.839116,0.680786,2.402173,1.925723,0.838371,-1
1,-0.829188,-1.013924,-1.395911,-0.434445,-1.969784,-3.68604,-0.826102,-1.044217,0.697477,-0.462907,0.231054,2.495449,0.367967,3.100893,2.689702,0.771138,-1
2,-0.807157,-1.078829,-1.252357,-0.585735,-1.514291,-2.045336,-0.808704,-1.008084,0.578195,0.518417,1.252865,1.764843,0.603129,2.235091,1.841356,0.916755,-1
3,-0.785741,-0.977215,-1.278825,-0.43929,-1.741618,-2.742211,-0.773975,-0.973337,0.67126,-2.241767,0.515049,2.081715,0.401718,2.515075,2.20425,-0.197985,-1
4,-0.781239,-1.097384,-1.380471,-0.266663,-2.117993,-4.535028,-0.784286,-0.96608,0.47602,0.804772,1.874992,2.76533,0.118268,3.270983,3.013462,0.93964,-1


In [21]:
scaled_bean_data['Class'].dtypes

dtype('O')

In [22]:
scaled_bean_data['Class'] = pd.to_numeric(scaled_bean_data['Class'])

In [24]:
#Collecting all the samples we will need for each algorithm
all_samples = []

for sample in range(0,5):
    all_samples.append(scaled_bean_data.sample(n=5000, replace=False))

In [25]:
len(all_samples)

5

In [58]:
len(scaled_bean_data)

13611

In [59]:
scaled_bean_data['Class'].value_counts()

-1    7429
 1    6182
Name: Class, dtype: int64

# Logistic Regression

In [26]:
#Lists to save models and training/testing sets
all_models = []
X_training_sets = []
y_training_sets = []
X_testing_sets = []
y_testing_sets = []

#lists to save best params
accuracy_best_params = []
roc_auc_best_params = []
f1_best_params = []

#looping across each sample for each trial
for sample in all_samples:
    X_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:, -1]
    X_training_sets.append(X_train)
    y_training_sets.append(y_train)
    
    #Separating rows for the test set that were not in the sample
    ix = [i for i in scaled_bean_data.index if i not in sample.index]
    test_set = scaled_bean_data.loc[ix]
    X_test = test_set.iloc[:, :-1]
    y_test = test_set.iloc[:, -1]
    X_testing_sets.append(X_test)
    y_testing_sets.append(y_test)
    
    #Initiating classifier
    log_reg = LogisticRegression(max_iter=10000)
    
    grid_values = [{'solver':['saga'],
                   'penalty': ['l1', 'l2'],
                   'C': [.00000001, .0000001, .000001, .00001, .0001, .001, .01, .1, 1, 10, 100, 1000, 10000]},
                   {'solver':['lbfgs'],
                   'penalty': ['l2'],
                   'C': [.00000001, .0000001, .000001, .00001, .0001, .001, .01, .1, 1, 10, 100, 1000, 10000]},
                   {'solver':['lbfgs', 'saga'],
                   'penalty': ['none'],}
                   ]
    clf = GridSearchCV(estimator = log_reg, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model = clf.fit(X_train, y_train)
    
    accuracy_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_f1_micro'])])
    
    all_models.append(model)

In [27]:
accuracy_best_params

[{'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'}]

In [28]:
roc_auc_best_params

[{'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'C': 10000, 'penalty': 'l2', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'}]

In [29]:
f1_best_params

[{'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'},
 {'penalty': 'none', 'solver': 'lbfgs'}]

In [60]:
accuracy_test_errors = []
accuracy_train_errors = []
for param in accuracy_best_params:
    
    n = 0
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred = model.predict(X_testing_sets[n])
    accuracy_test_errors.append(accuracy_score(y_testing_sets[n], y_pred))
    
    y_pred_train = model.predict(X_training_sets[n])
    accuracy_train_errors.append(accuracy_score(y_training_sets[n], y_pred_train))
    
    n += 1

In [61]:
accuracy_test_errors

[0.9736383695273487,
 0.9736383695273487,
 0.9736383695273487,
 0.9736383695273487,
 0.9736383695273487]

In [62]:
accuracy_train_errors

[0.9742, 0.9742, 0.9742, 0.9742, 0.9742]

In [63]:
roc_auc_test_errors = []
roc_auc_train_errors = []
for param in roc_auc_best_params:
    
    n = 0
    
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred2 = model.predict(X_testing_sets[n])
    roc_auc_test_errors.append(roc_auc_score(y_testing_sets[n], y_pred2))
    
    y_pred_train2 = model.predict(X_training_sets[n])
    roc_auc_train_errors.append(roc_auc_score(y_training_sets[n], y_pred_train2))
    
    n += 1

In [64]:
roc_auc_test_errors 

[0.9740346039397707,
 0.9740346039397707,
 0.9727472494804082,
 0.9740346039397707,
 0.9740346039397707]

In [65]:
roc_auc_train_errors

[0.9747103562738684,
 0.9747103562738684,
 0.9725793196796596,
 0.9747103562738684,
 0.9747103562738684]

In [66]:
f1_score_test_errors = []
f1_score_train_errors = []
for param in f1_best_params:
    
    n = 0
    
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred3 = model.predict(X_testing_sets[n])
    f1_score_test_errors.append(f1_score(y_testing_sets[n], y_pred3))
    
    y_pred_train3 = model.predict(X_training_sets[n])
    f1_score_train_errors.append(f1_score(y_training_sets[n], y_pred_train3))
    
    
    n += 1

In [67]:
f1_score_test_errors

[0.9710496110190027,
 0.9710496110190027,
 0.9710496110190027,
 0.9710496110190027,
 0.9710496110190027]

In [68]:
f1_score_train_errors

[0.9720839645098464,
 0.9720839645098464,
 0.9720839645098464,
 0.9720839645098464,
 0.9720839645098464]

# KNN

In [37]:
#Lists to save models and training/testing sets
all_models_knn = []
X_training_sets_knn = []
y_training_sets_knn = []
X_testing_sets_knn = []
y_testing_sets_knn = []

#lists to save best params
accuracy_best_params_knn = []
roc_auc_best_params_knn = []
f1_best_params_knn = []

#looping across each sample for each trial
for sample in all_samples:
    X_train_knn = sample.iloc[:, :-1]
    y_train_knn = sample.iloc[:, -1]
    X_training_sets_knn.append(X_train_knn)
    y_training_sets_knn.append(y_train_knn)
    
    #Separating rows for the test set that were not in the sample
    ix_knn = [i for i in scaled_bean_data.index if i not in sample.index]
    test_set_knn = scaled_bean_data.loc[ix]
    X_test_knn = test_set_knn.iloc[:, :-1]
    y_test_knn = test_set_knn.iloc[:, -1]
    X_testing_sets_knn.append(X_test_knn)
    y_testing_sets_knn.append(y_test_knn)
    
    #Initiating classifier
    knn = KNeighborsClassifier()
    
    grid_values = {'n_neighbors' : list(range(1,105,4))}
    
    clf_knn = GridSearchCV(estimator = knn, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model_knn = clf_knn.fit(X_train_knn, y_train_knn)
    
    accuracy_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_f1_micro'])])
    
    all_models_knn.append(model_knn)

In [39]:
accuracy_best_params_knn

[{'n_neighbors': 13},
 {'n_neighbors': 9},
 {'n_neighbors': 9},
 {'n_neighbors': 9},
 {'n_neighbors': 25}]

In [40]:
roc_auc_best_params_knn

[{'n_neighbors': 77},
 {'n_neighbors': 93},
 {'n_neighbors': 89},
 {'n_neighbors': 21},
 {'n_neighbors': 93}]

In [41]:
f1_best_params_knn

[{'n_neighbors': 13},
 {'n_neighbors': 9},
 {'n_neighbors': 9},
 {'n_neighbors': 9},
 {'n_neighbors': 25}]

In [69]:
#Now we need to loop across all models from each metric, train on entire sample, and then predict on test set
accuracy_test_errors_knn = []
accuracy_train_errors_knn = []
for param in accuracy_best_params_knn:
    
    n_knn = 0
    
    knn_clf = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn = model.predict(X_testing_sets_knn[n_knn])
    accuracy_test_errors_knn.append(accuracy_score(y_testing_sets_knn[n_knn], y_pred_knn))
    
    y_pred_train_knn = model.predict(X_training_sets_knn[n_knn])
    accuracy_train_errors_knn.append(accuracy_score(y_training_sets_knn[n_knn], y_pred_train_knn))
    
    n_knn += 1

In [70]:
accuracy_test_errors_knn

[0.9738706305887818,
 0.9744512832423644,
 0.9744512832423644,
 0.9744512832423644,
 0.9727093252816166]

In [71]:
accuracy_train_errors_knn

[0.9762, 0.977, 0.977, 0.977, 0.9752]

In [72]:
roc_auc_test_errors_knn = []
roc_auc_train_errors_knn = []
for param in roc_auc_best_params_knn:
    
    n_knn = 0
    
    knn_clf2 = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf2.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn2 = model.predict(X_testing_sets_knn[n_knn])
    roc_auc_test_errors_knn.append(roc_auc_score(y_testing_sets_knn[n_knn], y_pred_knn2))
    
    y_pred_train_knn2 = model.predict(X_training_sets_knn[n_knn])
    roc_auc_train_errors_knn.append(roc_auc_score(y_training_sets_knn[n_knn], y_pred_train_knn2))
    
    n_knn += 1


In [73]:
roc_auc_test_errors_knn

[0.969880306449445,
 0.969370863359658,
 0.9698176463642187,
 0.9743672093216875,
 0.969370863359658]

In [74]:
roc_auc_train_errors_knn 

[0.9716568104545675,
 0.9709526418408289,
 0.9715061473758843,
 0.9770258947130956,
 0.9709526418408289]

In [75]:
f1_score_test_errors_knn = []
f1_score_train_errors_knn = []
for param in f1_best_params_knn:
    
    n_knn = 0
    
    knn_clf3 = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf3.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn3 = model.predict(X_testing_sets_knn[n_knn])
    f1_score_test_errors_knn.append(f1_score(y_testing_sets_knn[n_knn], y_pred_knn3))
    
    y_pred_train_knn3 = model.predict(X_training_sets_knn[n_knn])
    f1_score_train_errors_knn.append(f1_score(y_training_sets_knn[n_knn], y_pred_train_knn3))
    
    
    n_knn += 1

In [76]:
f1_score_test_errors_knn

[0.9713631156930125,
 0.9719887955182073,
 0.9719887955182073,
 0.9719887955182073,
 0.9701207883026064]

In [77]:
f1_score_train_errors_knn

[0.9742033383915022,
 0.9750596399913251,
 0.9750596399913251,
 0.9750596399913251,
 0.9731601731601732]

In [48]:
#Lists to save models and training/testing sets
all_models_dt = []
X_training_sets_dt = []
y_training_sets_dt = []
X_testing_sets_dt = []
y_testing_sets_dt = []

#lists to save best params
accuracy_best_params_dt = []
roc_auc_best_params_dt = []
f1_best_params_dt = []

#looping across each sample for each trial
for sample in all_samples:
    X_train_dt = sample.iloc[:, :-1]
    y_train_dt = sample.iloc[:, -1]
    X_training_sets_dt.append(X_train_dt)
    y_training_sets_dt.append(y_train_dt)
    
    #Separating rows for the test set that were not in the sample
    ix_dt = [i for i in scaled_bean_data.index if i not in sample.index]
    test_set_dt = scaled_bean_data.loc[ix_dt]
    X_test_dt = test_set_dt.iloc[:, :-1]
    y_test_dt = test_set_dt.iloc[:, -1]
    X_testing_sets_dt.append(X_test_dt)
    y_testing_sets_dt.append(y_test_dt)
    
    #Initiating classifier
    dt = DecisionTreeClassifier()
    
    grid_values = [{'criterion': ['gini', 'entropy'], 'max_depth':list(range(1,100,3)), 'min_samples_leaf': list(range(10,100,10))}]
    
    clf_dt = GridSearchCV(estimator = dt, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model_dt = clf_dt.fit(X_train_dt, y_train_dt)
    
    accuracy_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_f1_micro'])])
    
    all_models_dt.append(model_dt)

In [49]:
accuracy_best_params_dt

[{'criterion': 'entropy', 'max_depth': 22, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 16, 'min_samples_leaf': 10},
 {'criterion': 'gini', 'max_depth': 22, 'min_samples_leaf': 10}]

In [50]:
roc_auc_best_params_dt

[{'criterion': 'entropy', 'max_depth': 64, 'min_samples_leaf': 40},
 {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 30},
 {'criterion': 'gini', 'max_depth': 73, 'min_samples_leaf': 30},
 {'criterion': 'gini', 'max_depth': 37, 'min_samples_leaf': 30},
 {'criterion': 'gini', 'max_depth': 91, 'min_samples_leaf': 40}]

In [51]:
f1_best_params_dt

[{'criterion': 'entropy', 'max_depth': 22, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 16, 'min_samples_leaf': 10},
 {'criterion': 'gini', 'max_depth': 22, 'min_samples_leaf': 10}]

In [78]:
#Now we need to loop across all models from each metric, train on entire sample, and then predict on test set
accuracy_test_errors_dt = []
accuracy_train_errors_dt = []
for param in accuracy_best_params_dt:
    
    n_dt = 0
    
    dt_clf = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt = model.predict(X_testing_sets_dt[n_dt])
    accuracy_test_errors_dt.append(accuracy_score(y_testing_sets_dt[n_dt], y_pred_dt))
    
    y_pred_train_dt = model.predict(X_training_sets_dt[n_dt])
    accuracy_train_errors_dt.append(accuracy_score(y_training_sets_dt[n_dt], y_pred_train_dt))
    
    n_dt += 1

In [79]:
accuracy_test_errors_dt 

[0.9649285797236093,
 0.965741493438625,
 0.965741493438625,
 0.9650447102543258,
 0.9667866682150738]

In [80]:
accuracy_train_errors_dt 

[0.9818, 0.979, 0.979, 0.9818, 0.9812]

In [81]:
roc_auc_test_errors_dt = []
roc_auc_train_errors_dt = []
for param in roc_auc_best_params_dt:
    
    n_dt = 0
    
    dt_clf2 = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf2.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt2 = model.predict(X_testing_sets_dt[n_dt])
    roc_auc_test_errors_dt.append(roc_auc_score(y_testing_sets_dt[n_dt], y_pred_dt2))
    
    y_pred_train_dt2 = model.predict(X_training_sets_dt[n_dt])
    roc_auc_train_errors_dt.append(roc_auc_score(y_training_sets_dt[n_dt], y_pred_train_dt2))
    
    n_dt += 1

In [82]:
roc_auc_test_errors_dt 

[0.9643808883508033,
 0.9641504124826556,
 0.9627161316991271,
 0.9625651490432393,
 0.9586369320672787]

In [83]:
roc_auc_train_errors_dt

[0.970076056655763,
 0.9721611692099454,
 0.9694637361220774,
 0.9694637361220774,
 0.9641558839169176]

In [84]:
f1_score_test_errors_dt = []
f1_score_train_errors_dt = []
for param in f1_best_params_dt:
    
    n_dt = 0
    
    dt_clf3 = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf3.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt3 = model.predict(X_testing_sets_dt[n_dt])
    f1_score_test_errors_dt.append(f1_score(y_testing_sets_dt[n_dt], y_pred_dt3))
    
    y_pred_train_dt3 = model.predict(X_training_sets_dt[n_dt])
    f1_score_train_errors_dt.append(f1_score(y_training_sets_dt[n_dt], y_pred_train_dt3))
    
    n_dt += 1

In [85]:
f1_score_test_errors_dt

[0.9612762125305544,
 0.9621524201853758,
 0.9622860084953018,
 0.9614197530864197,
 0.9626875240415438]

In [86]:
f1_score_train_errors_dt

[0.980144010473489,
 0.9770892428540258,
 0.9770892428540258,
 0.980144010473489,
 0.9797957853573755]