In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
#First going to import three datasets 
occupancy_data1 = pd.read_csv('datatest.txt')
occupancy_data2 = pd.read_csv('datatest2.txt')
occupancy_data3 = pd.read_csv('datatraining.txt')

In [4]:
#Let's look at the head of each data
occupancy_data1.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
140,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
141,2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1
142,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1
143,2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1
144,2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1


In [5]:
occupancy_data2.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-11 14:48:00,21.76,31.133333,437.333333,1029.666667,0.005021,1
2,2015-02-11 14:49:00,21.79,31.0,437.333333,1000.0,0.005009,1
3,2015-02-11 14:50:00,21.7675,31.1225,434.0,1003.75,0.005022,1
4,2015-02-11 14:51:00,21.7675,31.1225,439.0,1009.5,0.005022,1
5,2015-02-11 14:51:59,21.79,31.133333,437.333333,1005.666667,0.00503,1


In [6]:
occupancy_data3.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [7]:
#Creating list of three dataframes so I can merge into one dataframe
dataframes = [occupancy_data1, occupancy_data2, occupancy_data3]

In [8]:
occupancy_data = pd.concat(dataframes).reset_index()
occupancy_data.head()

Unnamed: 0,index,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,140,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
1,141,2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1
2,142,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1
3,143,2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1
4,144,2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1


In [9]:
#Dropping index column and returning top 5 rows of new dataset
occupancy_data = occupancy_data.drop(columns='index')
occupancy_data.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
1,2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1
2,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1
3,2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1
4,2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1


In [10]:
#Turning all '0' values of 'Occupancy' to -1 
occupancy_data['Occupancy'] = np.where((occupancy_data['Occupancy'] == 0), -1, occupancy_data['Occupancy'])
occupancy_data['Occupancy'].value_counts()

-1    15810
 1     4750
Name: Occupancy, dtype: int64

In [15]:
#Now have to standardize the numerical columns excluding 'date' and 'occupancy'
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

def scale_columns(dataset, cols_to_scale):
    for col in cols_to_scale:
        dataset[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(occupancy_data[col])), columns =[col])
    return dataset

In [16]:
scaled_occupancy_data = scale_columns(occupancy_data, [i for i in list(occupancy_data.columns) if i not in ['date', 'Occupancy']])
scaled_occupancy_data.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2015-02-02 14:19:00,2.647415,-0.277783,2.159638,0.188457,0.697856,1
1,2015-02-02 14:19:59,2.664472,-0.27417,2.127322,0.224448,0.708923,1
2,2015-02-02 14:21:00,2.675843,-0.286213,2.100076,0.254226,0.699145,1
3,2015-02-02 14:22:00,2.668736,-0.307289,1.725043,0.27056,0.671302,1
4,2015-02-02 14:23:00,2.698586,-0.292235,1.700568,0.284218,0.701022,1


In [17]:
#changing date column so it only reflects number of day of the week
scaled_occupancy_data['date'] = pd.to_datetime(scaled_occupancy_data['date']).dt.dayofweek
scaled_occupancy_data.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,0,2.647415,-0.277783,2.159638,0.188457,0.697856,1
1,0,2.664472,-0.27417,2.127322,0.224448,0.708923,1
2,0,2.675843,-0.286213,2.100076,0.254226,0.699145,1
3,0,2.668736,-0.307289,1.725043,0.27056,0.671302,1
4,0,2.698586,-0.292235,1.700568,0.284218,0.701022,1


In [19]:
#Collecting all the samples we will need for each algorithm
all_samples = []

for sample in range(0,5):
    all_samples.append(scaled_occupancy_data.sample(n=5000, replace=False))

In [20]:
len(all_samples)

5

In [61]:
len(scaled_occupancy_data)

20560

In [62]:
scaled_occupancy_data['Occupancy'].value_counts()

-1    15810
 1     4750
Name: Occupancy, dtype: int64

# Logistic Regression

In [21]:

#Lists to save models and training/testing sets
all_models = []
X_training_sets = []
y_training_sets = []
X_testing_sets = []
y_testing_sets = []

#lists to save best params
accuracy_best_params = []
roc_auc_best_params = []
f1_best_params = []

#looping across each sample for each trial
for sample in all_samples:
    X_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:, -1]
    X_training_sets.append(X_train)
    y_training_sets.append(y_train)
    
    #Separating rows for the test set that were not in the sample
    ix = [i for i in scaled_occupancy_data.index if i not in sample.index]
    test_set = scaled_occupancy_data.loc[ix]
    X_test = test_set.iloc[:, :-1]
    y_test = test_set.iloc[:, -1]
    X_testing_sets.append(X_test)
    y_testing_sets.append(y_test)
    
    #Initiating classifier
    log_reg = LogisticRegression(max_iter=10000)
    
    grid_values = [{'solver':['saga'],
                   'penalty': ['l1', 'l2'],
                   'C': [.00000001, .0000001, .000001, .00001, .0001, .001, .01, .1, 1, 10, 100, 1000, 10000]},
                   {'solver':['lbfgs'],
                   'penalty': ['l2'],
                   'C': [.00000001, .0000001, .000001, .00001, .0001, .001, .01, .1, 1, 10, 100, 1000, 10000]},
                   {'solver':['lbfgs', 'saga'],
                   'penalty': ['none'],}
                   ]
    clf = GridSearchCV(estimator = log_reg, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model = clf.fit(X_train, y_train)
    
    accuracy_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_f1_micro'])])
    
    all_models.append(model)
    

In [22]:
accuracy_best_params

[{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}]

In [23]:
roc_auc_best_params

[{'C': 0.01, 'penalty': 'l2', 'solver': 'saga'},
 {'C': 1000, 'penalty': 'l2', 'solver': 'lbfgs'},
 {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'},
 {'C': 100, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 10000, 'penalty': 'l2', 'solver': 'lbfgs'}]

In [24]:
f1_best_params

[{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}]

In [65]:
#Now we need to loop across all models from each metric, train on entire sample, and then predict on test set
accuracy_test_errors = []
accuracy_train_errors = []
for param in accuracy_best_params:
    
    n = 0
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred = model.predict(X_testing_sets[n])
    accuracy_test_errors.append(accuracy_score(y_testing_sets[n], y_pred))
    
    y_pred_train = model.predict(X_training_sets[n])
    accuracy_train_errors.append(accuracy_score(y_training_sets[n], y_pred_train))
    
    n += 1

In [66]:
accuracy_test_errors

[0.9900385604113111,
 0.9900385604113111,
 0.9900385604113111,
 0.9900385604113111,
 0.9900385604113111]

In [67]:
accuracy_train_errors

[0.9886, 0.9886, 0.9886, 0.9886, 0.9886]

In [68]:
roc_auc_test_errors = []
roc_auc_train_errors = []
for param in roc_auc_best_params:
    
    n = 0
    
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred2 = model.predict(X_testing_sets[n])
    roc_auc_test_errors.append(roc_auc_score(y_testing_sets[n], y_pred2))
    
    y_pred_train2 = model.predict(X_training_sets[n])
    roc_auc_train_errors.append(roc_auc_score(y_training_sets[n], y_pred_train2))
    
    n += 1

In [69]:
roc_auc_test_errors

[0.9871554100374157,
 0.9914761210575027,
 0.9871554100374157,
 0.9914761210575027,
 0.9914761210575027]

In [70]:
roc_auc_train_errors

[0.9881614048627007,
 0.9920316969735197,
 0.9881614048627007,
 0.9920316969735197,
 0.9921601656271681]

In [71]:
f1_score_test_errors = []
f1_score_train_errors = []
for param in f1_best_params:
    
    n = 0
    
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred3 = model.predict(X_testing_sets[n])
    f1_score_test_errors.append(f1_score(y_testing_sets[n], y_pred3))
    
    y_pred_train3 = model.predict(X_training_sets[n])
    f1_score_train_errors.append(f1_score(y_training_sets[n], y_pred_train3))
    
    
    n += 1

In [72]:
f1_score_test_errors

[0.9790907864562256,
 0.9790907864562256,
 0.9790907864562256,
 0.9790907864562256,
 0.9790907864562256]

In [73]:
f1_score_train_errors

[0.9748344370860927,
 0.9748344370860927,
 0.9748344370860927,
 0.9748344370860927,
 0.9748344370860927]

# KNN

In [36]:

#Lists to save models and training/testing sets
all_models_knn = []
X_training_sets_knn = []
y_training_sets_knn = []
X_testing_sets_knn = []
y_testing_sets_knn = []

#lists to save best params
accuracy_best_params_knn = []
roc_auc_best_params_knn = []
f1_best_params_knn = []

#looping across each sample for each trial
for sample in all_samples:
    X_train_knn = sample.iloc[:, :-1]
    y_train_knn = sample.iloc[:, -1]
    X_training_sets_knn.append(X_train_knn)
    y_training_sets_knn.append(y_train_knn)
    
    #Separating rows for the test set that were not in the sample
    ix_knn = [i for i in scaled_occupancy_data.index if i not in sample.index]
    test_set_knn = scaled_occupancy_data.loc[ix]
    X_test_knn = test_set_knn.iloc[:, :-1]
    y_test_knn = test_set_knn.iloc[:, -1]
    X_testing_sets_knn.append(X_test_knn)
    y_testing_sets_knn.append(y_test_knn)
    
    #Initiating classifier
    knn = KNeighborsClassifier()
    
    grid_values = {'n_neighbors' : list(range(1,105,4))}
    
    clf_knn = GridSearchCV(estimator = knn, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model_knn = clf_knn.fit(X_train_knn, y_train_knn)
    
    accuracy_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_f1_micro'])])
    
    all_models_knn.append(model_knn)
    

In [37]:
accuracy_best_params_knn

[{'n_neighbors': 5},
 {'n_neighbors': 1},
 {'n_neighbors': 5},
 {'n_neighbors': 13},
 {'n_neighbors': 5}]

In [38]:
roc_auc_best_params_knn

[{'n_neighbors': 9},
 {'n_neighbors': 9},
 {'n_neighbors': 41},
 {'n_neighbors': 17},
 {'n_neighbors': 9}]

In [40]:
f1_best_params_knn

[{'n_neighbors': 5},
 {'n_neighbors': 1},
 {'n_neighbors': 5},
 {'n_neighbors': 13},
 {'n_neighbors': 5}]

In [74]:
#Now we need to loop across all models from each metric, train on entire sample, and then predict on test set
accuracy_test_errors_knn = []
accuracy_train_errors_knn = []
for param in accuracy_best_params_knn:
    
    n_knn = 0
    
    knn_clf = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn = model.predict(X_testing_sets_knn[n_knn])
    accuracy_test_errors_knn.append(accuracy_score(y_testing_sets_knn[n_knn], y_pred_knn))
    
    y_pred_train_knn = model.predict(X_training_sets_knn[n_knn])
    accuracy_train_errors_knn.append(accuracy_score(y_training_sets_knn[n_knn], y_pred_train_knn))
    
    n_knn += 1

In [75]:
accuracy_test_errors_knn

[0.990681233933162,
 0.991452442159383,
 0.990681233933162,
 0.9902313624678664,
 0.990681233933162]

In [76]:
accuracy_train_errors_knn

[0.992, 1.0, 0.992, 0.9902, 0.992]

In [77]:
roc_auc_test_errors_knn = []
roc_auc_train_errors_knn = []
for param in roc_auc_best_params_knn:
    
    n_knn = 0
    
    knn_clf2 = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf2.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn2 = model.predict(X_testing_sets_knn[n_knn])
    roc_auc_test_errors_knn.append(roc_auc_score(y_testing_sets_knn[n_knn], y_pred_knn2))
    
    y_pred_train_knn2 = model.predict(X_training_sets_knn[n_knn])
    roc_auc_train_errors_knn.append(roc_auc_score(y_training_sets_knn[n_knn], y_pred_train_knn2))
    
    n_knn += 1

In [78]:
roc_auc_test_errors_knn

[0.9903330916025821,
 0.9903330916025821,
 0.9918264963834647,
 0.9906374606142785,
 0.9903330916025821]

In [79]:
roc_auc_train_errors_knn

[0.9929903421254744,
 0.9929903421254744,
 0.9917089020892621,
 0.9917682666656773,
 0.9929903421254744]

In [80]:
f1_score_test_errors_knn = []
f1_score_train_errors_knn = []
for param in f1_best_params_knn:
    
    n_knn = 0
    
    knn_clf3 = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf3.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn3 = model.predict(X_testing_sets_knn[n_knn])
    f1_score_test_errors_knn.append(f1_score(y_testing_sets_knn[n_knn], y_pred_knn3))
    
    y_pred_train_knn3 = model.predict(X_training_sets_knn[n_knn])
    f1_score_train_errors_knn.append(f1_score(y_training_sets_knn[n_knn], y_pred_train_knn3))
    
    
    n_knn += 1

In [81]:
f1_score_test_errors_knn

[0.980013783597519,
 0.981401202629003,
 0.980013783597519,
 0.9791552386176632,
 0.980013783597519]

In [82]:
f1_score_train_errors_knn

[0.9821268990169795,
 1.0,
 0.9821268990169795,
 0.9782512205947627,
 0.9821268990169795]

# Decision Tree

In [50]:
#Lists to save models and training/testing sets
all_models_dt = []
X_training_sets_dt = []
y_training_sets_dt = []
X_testing_sets_dt = []
y_testing_sets_dt = []

#lists to save best params
accuracy_best_params_dt = []
roc_auc_best_params_dt = []
f1_best_params_dt = []

#looping across each sample for each trial
for sample in all_samples:
    X_train_dt = sample.iloc[:, :-1]
    y_train_dt = sample.iloc[:, -1]
    X_training_sets_dt.append(X_train_dt)
    y_training_sets_dt.append(y_train_dt)
    
    #Separating rows for the test set that were not in the sample
    ix_dt = [i for i in scaled_occupancy_data.index if i not in sample.index]
    test_set_dt = scaled_occupancy_data.loc[ix_dt]
    X_test_dt = test_set_dt.iloc[:, :-1]
    y_test_dt = test_set_dt.iloc[:, -1]
    X_testing_sets_dt.append(X_test_dt)
    y_testing_sets_dt.append(y_test_dt)
    
    #Initiating classifier
    dt = DecisionTreeClassifier()
    
    grid_values = [{'criterion': ['gini', 'entropy'], 'max_depth':list(range(1,100,3)), 'min_samples_leaf': list(range(10,100,10))}]
    
    clf_dt = GridSearchCV(estimator = dt, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model_dt = clf_dt.fit(X_train_dt, y_train_dt)
    
    accuracy_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_f1_micro'])])
    
    all_models_dt.append(model_dt)

In [51]:
accuracy_best_params_dt

[{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10},
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 30},
 {'criterion': 'entropy', 'max_depth': 16, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10}]

In [52]:
roc_auc_best_params_dt

[{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 90},
 {'criterion': 'gini', 'max_depth': 55, 'min_samples_leaf': 50},
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 20},
 {'criterion': 'entropy', 'max_depth': 55, 'min_samples_leaf': 30},
 {'criterion': 'gini', 'max_depth': 97, 'min_samples_leaf': 20}]

In [53]:
f1_best_params_dt

[{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10},
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 30},
 {'criterion': 'entropy', 'max_depth': 16, 'min_samples_leaf': 10},
 {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 10}]

In [83]:
#Now we need to loop across all models from each metric, train on entire sample, and then predict on test set
accuracy_test_errors_dt = []
accuracy_train_errors_dt = []
for param in accuracy_best_params_dt:
    
    n_dt = 0
    
    dt_clf = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt = model.predict(X_testing_sets_dt[n_dt])
    accuracy_test_errors_dt.append(accuracy_score(y_testing_sets_dt[n_dt], y_pred_dt))
    
    y_pred_train_dt = model.predict(X_training_sets_dt[n_dt])
    accuracy_train_errors_dt.append(accuracy_score(y_training_sets_dt[n_dt], y_pred_train_dt))
    
    n_dt += 1

In [84]:
accuracy_test_errors_dt

[0.9897172236503856,
 0.9865681233933162,
 0.9897172236503856,
 0.9857969151670951,
 0.9865681233933162]

In [85]:
accuracy_train_errors_dt

[0.9878, 0.99, 0.9878, 0.991, 0.99]

In [86]:
roc_auc_test_errors_dt = []
roc_auc_train_errors_dt = []
for param in roc_auc_best_params_dt:
    
    n_dt = 0
    
    dt_clf2 = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf2.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt2 = model.predict(X_testing_sets_dt[n_dt])
    roc_auc_test_errors_dt.append(roc_auc_score(y_testing_sets_dt[n_dt], y_pred_dt2))
    
    y_pred_train_dt2 = model.predict(X_training_sets_dt[n_dt])
    roc_auc_train_errors_dt.append(roc_auc_score(y_training_sets_dt[n_dt], y_pred_train_dt2))
    
    n_dt += 1

In [87]:
roc_auc_test_errors_dt

[0.9915714549144581,
 0.9915714549144581,
 0.9915714549144581,
 0.9816946093012116,
 0.9915714549144581]

In [88]:
roc_auc_train_errors_dt

[0.9899038479376374,
 0.9899038479376374,
 0.9899038479376374,
 0.9857701255189762,
 0.9899038479376374]

In [89]:
f1_score_test_errors_dt = []
f1_score_train_errors_dt = []
for param in f1_best_params_dt:
    
    n_dt = 0
    
    dt_clf3 = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf3.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt3 = model.predict(X_testing_sets_dt[n_dt])
    f1_score_test_errors_dt.append(f1_score(y_testing_sets_dt[n_dt], y_pred_dt3))
    
    y_pred_train_dt3 = model.predict(X_training_sets_dt[n_dt])
    f1_score_train_errors_dt.append(f1_score(y_training_sets_dt[n_dt], y_pred_train_dt3))
    
    n_dt += 1

In [90]:
f1_score_test_errors_dt

[0.9784017278617712,
 0.9713030344638198,
 0.9784017278617712,
 0.9692031487363625,
 0.9713030344638198]

In [91]:
f1_score_train_errors_dt

[0.9730446310207688,
 0.9776785714285714,
 0.9730446310207688,
 0.9797205948625508,
 0.9776785714285714]