# 1. Import Libraries and modules

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import RepeatedKFold

import multiprocessing


import joblib

# how can i import a function to measure balanced accuracy?
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix


import sys
sys.path.append('../../python')
import splitDataset
np.set_printoptions(suppress=True)

# 2. Load data

In [2]:
data1= pd.read_csv('E:/Documentos\/PCEO\/5\/Informatica/TFG/scripts/TFG-Informatica/Outputs/featuresOutput_percentil1.csv', sep=";")
data2= pd.read_csv('E:/Documentos\/PCEO\/5\/Informatica/TFG/scripts/TFG-Informatica/Outputs/featuresOutput_percentil2.csv', sep=";")
data3= pd.read_csv('E:/Documentos\/PCEO\/5\/Informatica/TFG/scripts/TFG-Informatica/Outputs/featuresOutput_percentil3.csv', sep=";")

# want to eliminate the rows where feature bestSubmit is 100.0

data1 = data1[data1.bestSubmit != 100.0]
data2 = data2[data2.bestSubmit != 100.0]
data3 = data3[data3.bestSubmit != 100.0]

In [3]:
counts = data1['completed'].value_counts()

print(counts)

1.0    3460
0.0    1450
Name: completed, dtype: int64


# 3. Split data into training and test sets

In [3]:
X_train1, X_test1, y_train1, y_test1 = splitDataset.splitDataset(data1)
X_train2, X_test2, y_train2, y_test2 = splitDataset.splitDataset(data2)
X_train3, X_test3, y_train3, y_test3 = splitDataset.splitDataset(data3)

# show the number of rows of X_train1
print("Number of rows X_train1 dataset: ", X_train1.shape[0])
print("Number of rows X_test1 dataset: ", X_test1.shape[0])

print("Number of rows X_train2 dataset: ", X_train2.shape[0])
print("Number of rows X_test2 dataset: ", X_test2.shape[0])

print("Number of rows X_train3 dataset: ", X_train3.shape[0])
print("Number of rows X_test3 dataset: ", X_test3.shape[0])


Number of rows X_train1 dataset:  3295
Number of rows X_test1 dataset:  1615
Number of rows X_train2 dataset:  2551
Number of rows X_test2 dataset:  1286
Number of rows X_train3 dataset:  2079
Number of rows X_test3 dataset:  971


# 4. Declare data preprocessing steps

# 5. Declare hyperparameters to tune

In [4]:
param_grid_lr = { 'penalty' : ['l1', 'l2','elasticnet',None],
                    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

param_grid_rfc = {'n_estimators': [150],
                    'max_features': [5, 7, 9],
                    'max_depth'   : [None, 3, 10, 20],
                    'criterion'   : ['gini', 'entropy']
                    }

# another param grid for rfc but with more parameters
param_grid_rfc2 = {'n_estimators': [150, 200, 250],
                    'max_features': ['sqrt', 'log2', None, 5, 7, 9],
                    'max_depth'   : [None, 3, 10, 20],
                    'criterion'   : ['gini', 'entropy'],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 3, 5]
                    }

# i want to make a param grid for the SGD Classifier
param_grid_sgd = {
    'loss': ['hinge', 'log', 'modified_huber'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'max_iter': [100, 500, 1000],
    'random_state': [42]
}

param_grid_mlp = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

#param_grid_mlp = {'solver': ['lbfgs'], 'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ], 'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes':np.arange(10, 15), 'random_state':[0,1,2,3,4,5,6,7,8,9]}

param_grid_adaboost = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.001, 0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

param_grid_svc = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

param_grid_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'p': [1, 2]
}


# 6. Tune model using cross-validation pipeline

## 6.1 Logistic Regression

In [6]:
grid_lr = GridSearchCV(
    estimator  = LogisticRegression(random_state = 33),
    param_grid = param_grid_lr,
    scoring    = 'balanced_accuracy',
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 0,
    return_train_score = True
)


grid_lr_1 = grid_lr.fit(X_train1, y_train1)
model_lr_1 = grid_lr_1.best_estimator_
grid_lr_2 = grid_lr.fit(X_train2, y_train2)
model_lr_2 = grid_lr_2.best_estimator_
grid_lr_3 = grid_lr.fit(X_train3, y_train3)
model_lr_3 = grid_lr_3.best_estimator_

140 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\alvar\anaconda3\envs\xAI\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\alvar\anaconda3\envs\xAI\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\alvar\anaconda3\envs\xAI\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



## 6.2 Random Forest classifier

In [7]:
grid_rfc = GridSearchCV(
    estimator  = RandomForestClassifier(random_state = 123),
    param_grid = param_grid_rfc,
    scoring    = 'balanced_accuracy',
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_rfc_1 = grid_rfc.fit(X = X_train1, y = y_train1)
model_rfc_1 = grid_rfc_1.best_estimator_
grid_rfc_2 = grid_rfc.fit(X = X_train2, y = y_train2)
model_rfc_2 = grid_rfc_2.best_estimator_
grid_rfc_3 = grid_rfc.fit(X = X_train3, y = y_train3)
model_rfc_3 = grid_rfc_3.best_estimator_

In [8]:
resultados_rfc_1 = pd.DataFrame(grid_rfc_1.cv_results_)
resultados_rfc_1 = resultados_rfc_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_rfc_1.to_csv("../../Outputs/grids/grid_rfc_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo random forest para el percentil 1 es: ', grid_rfc_1.best_estimator_)

resultados_rfc_2 = pd.DataFrame(grid_rfc_2.cv_results_)
resultados_rfc_2 = resultados_rfc_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_rfc_2.to_csv("../../Outputs/grids/grid_rfc_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo random forest para el percentil 2 es: ', grid_rfc_2.best_estimator_)

resultados_rfc_3 = pd.DataFrame(grid_rfc_3.cv_results_)
resultados_rfc_3 = resultados_rfc_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_rfc_3.to_csv("../../Outputs/grids/grid_rfc_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo random forest para el percentil 3 es: ', grid_rfc_3.best_estimator_)

El mejor modelo random forest para el percentil 1 es:  RandomForestClassifier(criterion='entropy', max_depth=20, max_features=5,
                       n_estimators=150, random_state=123)
El mejor modelo random forest para el percentil 2 es:  RandomForestClassifier(criterion='entropy', max_depth=20, max_features=5,
                       n_estimators=150, random_state=123)
El mejor modelo random forest para el percentil 3 es:  RandomForestClassifier(criterion='entropy', max_depth=20, max_features=5,
                       n_estimators=150, random_state=123)


## 6.3 Sthocastic Gradient Descent

In [9]:
grid_sgd = GridSearchCV(
    estimator  = SGDClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_sgd,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 0,
    return_train_score = True
)



grid_sgd_1 = grid_sgd.fit(X = X_train1, y = y_train1)
model_sgd_1 = grid_sgd_1.best_estimator_
grid_sgd_2 = grid_sgd.fit(X = X_train2, y = y_train2)
model_sgd_2 = grid_sgd_2.best_estimator_
grid_sgd_3 = grid_sgd.fit(X = X_train3, y = y_train3)
model_sgd_3 = grid_sgd_3.best_estimator_



In [10]:
resultados_sgd_1 = pd.DataFrame(grid_sgd_1.cv_results_)
resultados_sgd_1 = resultados_sgd_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_sgd_1.to_csv("../../Outputs/grids/grid_sgd_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SGD para el percentil 1 es: ', grid_sgd_1.best_estimator_)

resultados_sgd_2 = pd.DataFrame(grid_sgd_2.cv_results_)
resultados_sgd_2 = resultados_sgd_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_sgd_2.to_csv("../../Outputs/grids/grid_sgd_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SGD para el percentil 2 es: ', grid_sgd_2.best_estimator_)

resultados_sgd_3 = pd.DataFrame(grid_sgd_3.cv_results_)
resultados_sgd_3 = resultados_sgd_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_sgd_3.to_csv("../../Outputs/grids/grid_sgd_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SGD para el percentil 3 es: ', grid_sgd_3.best_estimator_)


El mejor modelo SGD para el percentil 1 es:  SGDClassifier(loss='modified_huber', max_iter=500, penalty='l1',
              random_state=42)
El mejor modelo SGD para el percentil 2 es:  SGDClassifier(loss='modified_huber', max_iter=500, penalty='l1',
              random_state=42)
El mejor modelo SGD para el percentil 3 es:  SGDClassifier(loss='modified_huber', max_iter=500, penalty='l1',
              random_state=42)


## 6.4 MLP

In [5]:
grid_mlp = GridSearchCV(
    estimator  = MLPClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_mlp,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_mlp_1 = grid_mlp.fit(X = X_train1, y = y_train1)
model_mlp_1 = grid_mlp_1.best_estimator_
grid_mlp_2 = grid_mlp.fit(X = X_train2, y = y_train2)
model_mlp_2 = grid_mlp_2.best_estimator_
grid_mlp_3 = grid_mlp.fit(X = X_train3, y = y_train3)
model_mlp_3 = grid_mlp_3.best_estimator_

In [None]:
resultados_mlp_1 = pd.DataFrame(grid_mlp_1.cv_results_)
resultados_mlp_1 = resultados_mlp_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_mlp_1.to_csv("../../Outputs/grids/grid_mlp_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo MLP para el percentil 1 es: ', grid_mlp_1.best_estimator_)

resultados_mlp_2 = pd.DataFrame(grid_mlp_2.cv_results_)
resultados_mlp_2 = resultados_mlp_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_mlp_2.to_csv("../../Outputs/grids/grid_mlp_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo MLP para el percentil 2 es: ', grid_mlp_2.best_estimator_)

resultados_mlp_3 = pd.DataFrame(grid_mlp_3.cv_results_)
resultados_mlp_3 = resultados_mlp_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_mlp_3.to_csv("../../Outputs/grids/grid_mlp_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo MLP para el percentil 3 es: ', grid_mlp_3.best_estimator_)
print('Los mejores parámetros para el modelo MLP son: ', grid_mlp_3.best_params_)

## 6.5 AdaBoost

In [11]:
grid_adaboost = GridSearchCV(
    estimator  = AdaBoostClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_adaboost,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 0,
    return_train_score = True
)


grid_adaboost_1 = grid_adaboost.fit(X = X_train1, y = y_train1)
model_adaboost_1 = grid_adaboost_1.best_estimator_
grid_adaboost_2 = grid_adaboost.fit(X = X_train2, y = y_train2)
model_adaboost_2 = grid_adaboost_2.best_estimator_
grid_adaboost_3 = grid_adaboost.fit(X = X_train3, y = y_train3)
model_adaboost_3 = grid_adaboost_3.best_estimator_

In [12]:
resultados_adaboost_1 = pd.DataFrame(grid_adaboost_1.cv_results_)
resultados_adaboost_1 = resultados_adaboost_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_adaboost_1.to_csv("../../Outputs/grids/grid_adaboost_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo AdaBoost para el percentil 1 es: ', grid_adaboost_1.best_estimator_)

resultados_adaboost_2 = pd.DataFrame(grid_adaboost_2.cv_results_)
resultados_adaboost_2 = resultados_adaboost_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_adaboost_2.to_csv("../../Outputs/grids/grid_adaboost_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo AdaBoost para el percentil 2 es: ', grid_adaboost_2.best_estimator_)

resultados_adaboost_3 = pd.DataFrame(grid_adaboost_3.cv_results_)
resultados_adaboost_3 = resultados_adaboost_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_adaboost_3.to_csv("../../Outputs/grids/grid_adaboost_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo AdaBoost para el percentil 3 es: ', grid_adaboost_3.best_estimator_)
print('Los mejores parámetros para el modelo AdaBoost son: ', grid_adaboost_3.best_params_)

El mejor modelo AdaBoost para el percentil 1 es:  AdaBoostClassifier(learning_rate=0.1, n_estimators=500, random_state=34)
El mejor modelo AdaBoost para el percentil 2 es:  AdaBoostClassifier(learning_rate=0.1, n_estimators=500, random_state=34)
El mejor modelo AdaBoost para el percentil 3 es:  AdaBoostClassifier(learning_rate=0.1, n_estimators=500, random_state=34)
Los mejores parámetros para el modelo AdaBoost son:  {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 500}


## 6.6 K-Nearest Neighbors

In [13]:
grid_knn = GridSearchCV(
    estimator  = KNeighborsClassifier(),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_knn,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_knn_1 = grid_knn.fit(X = X_train1, y = y_train1)
model_knn_1 = grid_knn_1.best_estimator_
grid_knn_2 = grid_knn.fit(X = X_train2, y = y_train2)
model_knn_2 = grid_knn_2.best_estimator_
grid_knn_3 = grid_knn.fit(X = X_train3, y = y_train3)
model_knn_3 = grid_knn_3.best_estimator_

In [14]:
resultados_knn_1 = pd.DataFrame(grid_knn_1.cv_results_)
resultados_knn_1 = resultados_knn_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_knn_1.to_csv("../../Outputs/grids/grid_knn_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo KNN para el percentil 1 es: ', grid_knn_1.best_estimator_)

resultados_knn_2 = pd.DataFrame(grid_knn_2.cv_results_)
resultados_knn_2 = resultados_knn_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_knn_2.to_csv("../../Outputs/grids/grid_knn_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo KNN para el percentil 2 es: ', grid_knn_2.best_estimator_)

resultados_knn_3 = pd.DataFrame(grid_knn_3.cv_results_)
resultados_knn_3 = resultados_knn_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_knn_3.to_csv("../../Outputs/grids/grid_knn_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo KNN para el percentil 3 es: ', grid_knn_3.best_estimator_)
print('Los mejores parametros para KNN para el percentil 3 es: ', grid_knn_3.best_params_)

El mejor modelo KNN para el percentil 1 es:  KNeighborsClassifier(n_neighbors=9, p=1, weights='distance')
El mejor modelo KNN para el percentil 2 es:  KNeighborsClassifier(n_neighbors=9, p=1, weights='distance')
El mejor modelo KNN para el percentil 3 es:  KNeighborsClassifier(n_neighbors=9, p=1, weights='distance')
Los mejores parametros para KNN para el percentil 3 es:  {'algorithm': 'auto', 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}


## 6.7 Tree decision classifier

In [15]:
grid_tree = GridSearchCV(
    estimator  = DecisionTreeClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_tree,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_tree_1 = grid_tree.fit(X = X_train1, y = y_train1)
model_tree_1 = grid_tree_1.best_estimator_
grid_tree_2 = grid_tree.fit(X = X_train2, y = y_train2)
model_tree_2 = grid_tree_2.best_estimator_
grid_tree_3 = grid_tree.fit(X = X_train3, y = y_train3)
model_tree_3 = grid_tree_3.best_estimator_

In [16]:
resultados_tree_1 = pd.DataFrame(grid_tree_1.cv_results_)
resultados_tree_1 = resultados_tree_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_tree_1.to_csv("../../Outputs/grids/grid_tree_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo Árbol de Decisión para el percentil 1 es: ', grid_tree_1.best_estimator_)

resultados_tree_2 = pd.DataFrame(grid_tree_2.cv_results_)
resultados_tree_2 = resultados_tree_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_tree_2.to_csv("../../Outputs/grids/grid_tree_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo Árbol de Decisión para el percentil 2 es: ', grid_tree_2.best_estimator_)

resultados_tree_3 = pd.DataFrame(grid_tree_3.cv_results_)
resultados_tree_3 = resultados_tree_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_tree_3.to_csv("../../Outputs/grids/grid_tree_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo Árbol de Decisión para el percentil 3 es: ', grid_tree_3.best_estimator_)

El mejor modelo Árbol de Decisión para el percentil 1 es:  DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=34)
El mejor modelo Árbol de Decisión para el percentil 2 es:  DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=34)
El mejor modelo Árbol de Decisión para el percentil 3 es:  DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=34)


## 6.8 Support vector classifier

In [6]:
grid_svc = GridSearchCV(
    estimator  = SVC(random_state = 34,max_iter=10000),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_svc,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = 10,
    refit      = True,
    verbose    = 1,
    return_train_score = True
)

grid_svc_1 = grid_svc.fit(X = X_train1, y = y_train1)
model_svc_1 = grid_svc_1.best_estimator_
grid_svc_2 = grid_svc.fit(X = X_train2, y = y_train2)
model_svc_2 = grid_svc_2.best_estimator_
grid_svc_3 = grid_svc.fit(X = X_train3, y = y_train3)
model_svc_3 = grid_svc_3.best_estimator_

Fitting 10 folds for each of 48 candidates, totalling 480 fits




Fitting 10 folds for each of 48 candidates, totalling 480 fits




Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [None]:
resultados_svc_1 = pd.DataFrame(grid_svc_1.cv_results_)
resultados_svc_1 = resultados_svc_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_svc_1.to_csv("../../Outputs/grids/grid_svc_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SVC para el percentil 1 es: ', grid_svc_1.best_estimator_)

resultados_svc_2 = pd.DataFrame(grid_svc_2.cv_results_)
resultados_svc_2 = resultados_svc_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_svc_2.to_csv("../../Outputs/grids/grid_svc_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SVC para el percentil 2 es: ', grid_svc_2.best_estimator_)

resultados_svc_3 = pd.DataFrame(grid_svc_3.cv_results_)
resultados_svc_3 = resultados_svc_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_svc_3.to_csv("../../Outputs/grids/grid_svc_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SVC para el percentil 3 es: ', grid_svc_3.best_estimator_)

# Refit on the entire training set
# No additional code needed if clf.refit == True (default is True)

# Evaluate model pipeline on test data

In [17]:
y_pred_lr_1 = model_lr_1.predict(X_test1)
y_pred_binary_lr_1 = np.where(y_pred_lr_1 > 0.5, 1, 0)

y_pred_lr_2 = model_lr_2.predict(X_test2)
y_pred_binary_lr_2 = np.where(y_pred_lr_2 > 0.5, 1, 0)

y_pred_lr_3 = model_lr_3.predict(X_test3)
y_pred_binary_lr_3 = np.where(y_pred_lr_3 > 0.5, 1, 0)

In [18]:
print('The balanced accuracy for percentile 1 using lr is : ' + str(balanced_accuracy_score(y_test1, y_pred_binary_lr_1)))
print('The balanced accuracy for percentile 2 using lr is : ' + str(balanced_accuracy_score(y_test2, y_pred_binary_lr_2)))
print('The balanced accuracy for percentile 3 using lr is : ' + str(balanced_accuracy_score(y_test3, y_pred_binary_lr_3)))

print('The f1 score for percentile 1 using lr is : ' + str(f1_score(y_test1, y_pred_binary_lr_1)))
print('The f1 score for percentile 2 using lr is : ' + str(f1_score(y_test2, y_pred_binary_lr_2)))
print('The f1 score for percentile 3 using lr is : ' + str(f1_score(y_test3, y_pred_binary_lr_3)))

print('The cohen kappa score for percentile 1 using lr is : ' + str(cohen_kappa_score(y_test1, y_pred_binary_lr_1)))
print('The cohen kappa score for percentile 2 using lr is : ' + str(cohen_kappa_score(y_test2, y_pred_binary_lr_2)))
print('The cohen kappa score for percentile 3 using lr is : ' + str(cohen_kappa_score(y_test3, y_pred_binary_lr_3)))

print('The confusion matrix for percentile 1 using lr is : ' + str(confusion_matrix(y_test1, y_pred_binary_lr_1)))
print('The confusion matrix for percentile 2 using lr is : ' + str(confusion_matrix(y_test2, y_pred_binary_lr_2)))
print('The confusion matrix for percentile 3 using lr is : ' + str(confusion_matrix(y_test3, y_pred_binary_lr_3)))

The balanced accuracy for percentile 1 using lr is : 0.6842594963721724
The balanced accuracy for percentile 2 using lr is : 0.7256444313262496
The balanced accuracy for percentile 3 using lr is : 0.7432330380477208
The f1 score for percentile 1 using lr is : 0.8361629881154499
The f1 score for percentile 2 using lr is : 0.7932118164676304
The f1 score for percentile 3 using lr is : 0.7598842815814851
The cohen kappa score for percentile 1 using lr is : 0.40917060780392567
The cohen kappa score for percentile 2 using lr is : 0.4644691574920954
The cohen kappa score for percentile 3 using lr is : 0.48678102240911225
The confusion matrix for percentile 1 using lr is : [[244 306]
 [ 80 985]]
The confusion matrix for percentile 2 using lr is : [[326 234]
 [ 95 631]]
The confusion matrix for percentile 3 using lr is : [[328 155]
 [ 94 394]]


In [19]:
y_pred_rfc_1 = model_rfc_1.predict(X_test1)
y_pred_rfc_2 = model_rfc_2.predict(X_test2)
y_pred_rfc_3 = model_rfc_3.predict(X_test3)

In [20]:
print('The balanced accuracy for percentile 1 using rfc is : ' + str(balanced_accuracy_score(y_test1, y_pred_rfc_1)))
print('The balanced accuracy for percentile 2 using rfc is : ' + str(balanced_accuracy_score(y_test2, y_pred_rfc_2)))
print('The balanced accuracy for percentile 3 using rfc is : ' + str(balanced_accuracy_score(y_test3, y_pred_rfc_3)))

print('The f1 score for percentile 1 using rfc is : ' + str(f1_score(y_test1, y_pred_rfc_1)))
print('The f1 score for percentile 2 using rfc is : ' + str(f1_score(y_test2, y_pred_rfc_2)))
print('The f1 score for percentile 3 using rfc is : ' + str(f1_score(y_test3, y_pred_rfc_3)))

print('The cohen kappa score for percentile 1 using rfc is : ' + str(cohen_kappa_score(y_test1, y_pred_rfc_1)))
print('The cohen kappa score for percentile 2 using rfc is : ' + str(cohen_kappa_score(y_test2, y_pred_rfc_2)))
print('The cohen kappa score for percentile 3 using rfc is : ' + str(cohen_kappa_score(y_test3, y_pred_rfc_3)))

print('The confusion matrix for percentile 1 using rfc is : ' + str(confusion_matrix(y_test1, y_pred_rfc_1)))
print('The confusion matrix for percentile 2 using rfc is : ' + str(confusion_matrix(y_test2, y_pred_rfc_2)))
print('The confusion matrix for percentile 3 using rfc is : ' + str(confusion_matrix(y_test3, y_pred_rfc_3)))

The balanced accuracy for percentile 1 using rfc is : 0.7536491677336747
The balanced accuracy for percentile 2 using rfc is : 0.7839334907516726
The balanced accuracy for percentile 3 using rfc is : 0.7978226928690222
The f1 score for percentile 1 using rfc is : 0.8701298701298701
The f1 score for percentile 2 using rfc is : 0.8356687898089172
The f1 score for percentile 3 using rfc is : 0.8108108108108109
The cohen kappa score for percentile 1 using rfc is : 0.5508898776418243
The cohen kappa score for percentile 2 using rfc is : 0.5818845364774199
The cohen kappa score for percentile 3 using rfc is : 0.5960246907290897
The confusion matrix for percentile 1 using rfc is : [[ 310  240]
 [  60 1005]]
The confusion matrix for percentile 2 using rfc is : [[372 188]
 [ 70 656]]
The confusion matrix for percentile 3 using rfc is : [[355 128]
 [ 68 420]]


In [21]:
y_pred_sgd_1 = model_sgd_1.predict(X_test1)
y_pred_sgd_2 = model_sgd_2.predict(X_test2)
y_pred_sgd_3 = model_sgd_3.predict(X_test3)

In [22]:
print('The balanced accuracy for percentile 1 using sgd is : ' + str(balanced_accuracy_score(y_test1, y_pred_sgd_1)))
print('The balanced accuracy for percentile 2 using sgd is : ' + str(balanced_accuracy_score(y_test2, y_pred_sgd_2)))
print('The balanced accuracy for percentile 3 using sgd is : ' + str(balanced_accuracy_score(y_test3, y_pred_sgd_3)))

print('The f1 score for percentile 1 using sgd is : ' + str(f1_score(y_test1, y_pred_sgd_1)))
print('The f1 score for percentile 2 using sgd is : ' + str(f1_score(y_test2, y_pred_sgd_2)))
print('The f1 score for percentile 3 using sgd is : ' + str(f1_score(y_test3, y_pred_sgd_3)))

print('The cohen kappa score for percentile 1 using sgd is : ' + str(cohen_kappa_score(y_test1, y_pred_sgd_1)))
print('The cohen kappa score for percentile 2 using sgd is : ' + str(cohen_kappa_score(y_test2, y_pred_sgd_2)))
print('The cohen kappa score for percentile 3 using sgd is : ' + str(cohen_kappa_score(y_test3, y_pred_sgd_3)))

print('The confusion matrix for percentile 1 using sgd is : ' + str(confusion_matrix(y_test1, y_pred_sgd_1)))
print('The confusion matrix for percentile 2 using sgd is : ' + str(confusion_matrix(y_test2, y_pred_sgd_2)))
print('The confusion matrix for percentile 3 using sgd is : ' + str(confusion_matrix(y_test3, y_pred_sgd_3)))

The balanced accuracy for percentile 1 using sgd is : 0.7241912078531797
The balanced accuracy for percentile 2 using sgd is : 0.7272087760724124
The balanced accuracy for percentile 3 using sgd is : 0.7140460068560568
The f1 score for percentile 1 using sgd is : 0.84108012394865
The f1 score for percentile 2 using sgd is : 0.7438494934876989
The f1 score for percentile 3 using sgd is : 0.7488667271078876
The cohen kappa score for percentile 1 using sgd is : 0.47533855474564846
The cohen kappa score for percentile 2 using sgd is : 0.44801513204161314
The cohen kappa score for percentile 3 using sgd is : 0.4286694436596832
The confusion matrix for percentile 1 using sgd is : [[306 244]
 [115 950]]
The confusion matrix for percentile 2 using sgd is : [[418 142]
 [212 514]]
The confusion matrix for percentile 3 using sgd is : [[281 202]
 [ 75 413]]


In [7]:
y_pred_mlp_1 = model_mlp_1.predict(X_test1)
y_pred_mlp_2 = model_mlp_2.predict(X_test2)
y_pred_mlp_3 = model_mlp_3.predict(X_test3)

In [8]:
print('The balanced accuracy for percentile 1 using mlp is : ' + str(balanced_accuracy_score(y_test1, y_pred_mlp_1)))
print('The balanced accuracy for percentile 2 using mlp is : ' + str(balanced_accuracy_score(y_test2, y_pred_mlp_2)))
print('The balanced accuracy for percentile 3 using mlp is : ' + str(balanced_accuracy_score(y_test3, y_pred_mlp_3)))

print('The f1 score for percentile 1 using mlp is : ' + str(f1_score(y_test1, y_pred_mlp_1)))
print('The f1 score for percentile 2 using mlp is : ' + str(f1_score(y_test2, y_pred_mlp_2)))
print('The f1 score for percentile 3 using mlp is : ' + str(f1_score(y_test3, y_pred_mlp_3)))

print('The cohen kappa score for percentile 1 using mlp is : ' + str(cohen_kappa_score(y_test1, y_pred_mlp_1)))
print('The cohen kappa score for percentile 2 using mlp is : ' + str(cohen_kappa_score(y_test2, y_pred_mlp_2)))
print('The cohen kappa score for percentile 3 using mlp is : ' + str(cohen_kappa_score(y_test3, y_pred_mlp_3)))

print('The confusion matrix for percentile 1 using mlp is : ' + str(confusion_matrix(y_test1, y_pred_mlp_1)))
print('The confusion matrix for percentile 2 using mlp is : ' + str(confusion_matrix(y_test2, y_pred_mlp_2)))
print('The confusion matrix for percentile 3 using mlp is : ' + str(confusion_matrix(y_test3, y_pred_mlp_3)))

The balanced accuracy for percentile 1 using mlp is : 0.7420614596670935
The balanced accuracy for percentile 2 using mlp is : 0.7785050177095632
The balanced accuracy for percentile 3 using mlp is : 0.7976105624002987
The f1 score for percentile 1 using mlp is : 0.8608695652173913
The f1 score for percentile 2 using mlp is : 0.8055363321799308
The f1 score for percentile 3 using mlp is : 0.8178438661710038
The cohen kappa score for percentile 1 using mlp is : 0.523225241016652
The cohen kappa score for percentile 2 using mlp is : 0.5562151687007695
The cohen kappa score for percentile 3 using mlp is : 0.5958531178064506
The confusion matrix for percentile 1 using mlp is : [[305 245]
 [ 75 990]]
The confusion matrix for percentile 2 using mlp is : [[423 137]
 [144 582]]
The confusion matrix for percentile 3 using mlp is : [[335 148]
 [ 48 440]]


In [23]:
y_pred_adaboost_1 = model_adaboost_1.predict(X_test1)
y_pred_adaboost_2 = model_adaboost_2.predict(X_test2)
y_pred_adaboost_3 = model_adaboost_3.predict(X_test3)

In [24]:
print('RESULTS FOR ADABOOST')
print()
print('-----Balanced accuracy-----')
print('percentile 1 : ' + str(balanced_accuracy_score(y_test1, y_pred_adaboost_1)))
print('percentile 2 : ' + str(balanced_accuracy_score(y_test2, y_pred_adaboost_2)))
print('percentile 3 : ' + str(balanced_accuracy_score(y_test3, y_pred_adaboost_3)))
print()
print('-----F1 score-----')
print('percentile 1 : ' + str(f1_score(y_test1, y_pred_adaboost_1)))
print('percentile 2 : ' + str(f1_score(y_test2, y_pred_adaboost_2)))
print('percentile 3 : ' + str(f1_score(y_test3, y_pred_adaboost_3)))
print()
print('-----Cohen kappa score-----')
print('percentile 1 : ' + str(cohen_kappa_score(y_test1, y_pred_adaboost_1)))
print('percentile 2 : ' + str(cohen_kappa_score(y_test2, y_pred_adaboost_2)))
print('percentile 3 : ' + str(cohen_kappa_score(y_test3, y_pred_adaboost_3)))
print()
print('-----Confusion matrix-----')
print('percentile 1 : ' + str(confusion_matrix(y_test1, y_pred_adaboost_1)))
print('percentile 2 : ' + str(confusion_matrix(y_test2, y_pred_adaboost_2)))
print('percentile 3 : ' + str(confusion_matrix(y_test3, y_pred_adaboost_3)))

RESULTS FOR ADABOOST

-----Balanced accuracy-----
percentile 1 : 0.7697737942808365
percentile 2 : 0.7764339826839827
percentile 3 : 0.8010343481654957

-----F1 score-----
percentile 1 : 0.8721340388007055
percentile 2 : 0.8288633461047253
percentile 3 : 0.8094768015794668

-----Cohen kappa score-----
percentile 1 : 0.5743936461201531
percentile 2 : 0.5660416320404535
percentile 3 : 0.6023050657544304

-----Confusion matrix-----
percentile 1 : [[336 214]
 [ 76 989]]
percentile 2 : [[369 191]
 [ 77 649]]
percentile 3 : [[368 115]
 [ 78 410]]


In [25]:
y_pred_knn_1 = model_knn_1.predict(X_test1)
y_pred_knn_2 = model_knn_2.predict(X_test2)
y_pred_knn_3 = model_knn_3.predict(X_test3)

In [26]:
print('The balanced accuracy for percentile 1 using knn is : ' + str(balanced_accuracy_score(y_test1, y_pred_knn_1)))
print('The balanced accuracy for percentile 2 using knn is : ' + str(balanced_accuracy_score(y_test2, y_pred_knn_2)))
print('The balanced accuracy for percentile 3 using knn is : ' + str(balanced_accuracy_score(y_test3, y_pred_knn_3)))

print('The f1 score for percentile 1 using knn is : ' + str(f1_score(y_test1, y_pred_knn_1)))
print('The f1 score for percentile 2 using knn is : ' + str(f1_score(y_test2, y_pred_knn_2)))
print('The f1 score for percentile 3 using knn is : ' + str(f1_score(y_test3, y_pred_knn_3)))

print('The cohen kappa score for percentile 1 using knn is : ' + str(cohen_kappa_score(y_test1, y_pred_knn_1)))
print('The cohen kappa score for percentile 2 using knn is : ' + str(cohen_kappa_score(y_test2, y_pred_knn_2)))
print('The cohen kappa score for percentile 3 using knn is : ' + str(cohen_kappa_score(y_test3, y_pred_knn_3)))

print('The confusion matrix for percentile 1 using knn is : ' + str(confusion_matrix(y_test1, y_pred_knn_1)))
print('The confusion matrix for percentile 2 using knn is : ' + str(confusion_matrix(y_test2, y_pred_knn_2)))
print('The confusion matrix for percentile 3 using knn is : ' + str(confusion_matrix(y_test3, y_pred_knn_3)))

The balanced accuracy for percentile 1 using knn is : 0.6754588134869824
The balanced accuracy for percentile 2 using knn is : 0.6930416174734357
The balanced accuracy for percentile 3 using knn is : 0.7409823337745647
The f1 score for percentile 1 using knn is : 0.8292890591741165
The f1 score for percentile 2 using knn is : 0.7795992714025501
The f1 score for percentile 3 using knn is : 0.766076421248835
The cohen kappa score for percentile 1 using knn is : 0.3883009119545864
The cohen kappa score for percentile 2 using knn is : 0.402090297790586
The cohen kappa score for percentile 3 using knn is : 0.4824610392781835
The confusion matrix for percentile 1 using knn is : [[240 310]
 [ 91 974]]
The confusion matrix for percentile 2 using knn is : [[281 279]
 [ 84 642]]
The confusion matrix for percentile 3 using knn is : [[309 174]
 [ 77 411]]


In [27]:
y_pred_tree_1 = model_tree_1.predict(X_test1)
y_pred_tree_2 = model_tree_2.predict(X_test2)
y_pred_tree_3 = model_tree_3.predict(X_test3)

In [28]:
print('The balanced accuracy for percentile 1 using tree is : ' + str(balanced_accuracy_score(y_test1, y_pred_tree_1)))
print('The balanced accuracy for percentile 2 using tree is : ' + str(balanced_accuracy_score(y_test2, y_pred_tree_2)))
print('The balanced accuracy for percentile 3 using tree is : ' + str(balanced_accuracy_score(y_test3, y_pred_tree_3)))

print('The f1 score for percentile 1 using tree is : ' + str(f1_score(y_test1, y_pred_tree_1)))
print('The f1 score for percentile 2 using tree is : ' + str(f1_score(y_test2, y_pred_tree_2)))
print('The f1 score for percentile 3 using tree is : ' + str(f1_score(y_test3, y_pred_tree_3)))

print('The cohen kappa score for percentile 1 using tree is : ' + str(cohen_kappa_score(y_test1, y_pred_tree_1)))
print('The cohen kappa score for percentile 2 using tree is : ' + str(cohen_kappa_score(y_test2, y_pred_tree_2)))
print('The cohen kappa score for percentile 3 using tree is : ' + str(cohen_kappa_score(y_test3, y_pred_tree_3)))

print('The confusion matrix for percentile 1 using tree is : ' + str(confusion_matrix(y_test1, y_pred_tree_1)))
print('The confusion matrix for percentile 2 using tree is : ' + str(confusion_matrix(y_test2, y_pred_tree_2)))
print('The confusion matrix for percentile 3 using tree is : ' + str(confusion_matrix(y_test3, y_pred_tree_3)))

The balanced accuracy for percentile 1 using tree is : 0.724950917626974
The balanced accuracy for percentile 2 using tree is : 0.7526834907516726
The balanced accuracy for percentile 3 using tree is : 0.7608525947798934
The f1 score for percentile 1 using tree is : 0.8381631743201069
The f1 score for percentile 2 using tree is : 0.8174454828660436
The f1 score for percentile 3 using tree is : 0.7716535433070867
The cohen kappa score for percentile 1 using tree is : 0.473419233723014
The cohen kappa score for percentile 2 using tree is : 0.5216611951932281
The cohen kappa score for percentile 3 using tree is : 0.5219266226379858
The confusion matrix for percentile 1 using tree is : [[312 238]
 [125 940]]
The confusion matrix for percentile 2 using tree is : [[337 223]
 [ 70 656]]
The confusion matrix for percentile 3 using tree is : [[347 136]
 [ 96 392]]


In [9]:
y_pred_svc_1 = model_svc_1.predict(X_test1)
y_pred_svc_2 = model_svc_2.predict(X_test2)
y_pred_svc_3 = model_svc_3.predict(X_test3)

In [10]:
print('The balanced accuracy for percentile 1 using svc is : ' + str(balanced_accuracy_score(y_test1, y_pred_svc_1)))
print('The balanced accuracy for percentile 2 using svc is : ' + str(balanced_accuracy_score(y_test2, y_pred_svc_2)))
print('The balanced accuracy for percentile 3 using svc is : ' + str(balanced_accuracy_score(y_test3, y_pred_svc_3)))

print('The f1 score for percentile 1 using svc is : ' + str(f1_score(y_test1, y_pred_svc_1)))
print('The f1 score for percentile 2 using svc is : ' + str(f1_score(y_test2, y_pred_svc_2)))
print('The f1 score for percentile 3 using svc is : ' + str(f1_score(y_test3, y_pred_svc_3)))

print('The cohen kappa score for percentile 1 using svc is : ' + str(cohen_kappa_score(y_test1, y_pred_svc_1)))
print('The cohen kappa score for percentile 2 using svc is : ' + str(cohen_kappa_score(y_test2, y_pred_svc_2)))
print('The cohen kappa score for percentile 3 using svc is : ' + str(cohen_kappa_score(y_test3, y_pred_svc_3)))

print('The confusion matrix for percentile 1 using svc is : ' + str(confusion_matrix(y_test1, y_pred_svc_1)))
print('The confusion matrix for percentile 2 using svc is : ' + str(confusion_matrix(y_test2, y_pred_svc_2)))
print('The confusion matrix for percentile 3 using svc is : ' + str(confusion_matrix(y_test3, y_pred_svc_3)))

The balanced accuracy for percentile 1 using svc is : 0.7015023474178403
The balanced accuracy for percentile 2 using svc is : 0.7312303227075954
The balanced accuracy for percentile 3 using svc is : 0.7451019074771748
The f1 score for percentile 1 using svc is : 0.842330762639246
The f1 score for percentile 2 using svc is : 0.800747198007472
The f1 score for percentile 3 using svc is : 0.7698042870456664
The cohen kappa score for percentile 1 using svc is : 0.4427065751472188
The cohen kappa score for percentile 2 using svc is : 0.4774720781760281
The cohen kappa score for percentile 3 using svc is : 0.4907086721183719
The confusion matrix for percentile 1 using svc is : [[264 286]
 [ 82 983]]
The confusion matrix for percentile 2 using svc is : [[323 237]
 [ 83 643]]
The confusion matrix for percentile 3 using svc is : [[311 172]
 [ 75 413]]


# Importancia de los predictores

##

In [None]:
importancia_predictores_1 = pd.DataFrame(
    {'predictor': X_train1.columns,
     'importancia': model_rfc_1.feature_importances_}
)
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
print(importancia_predictores_1.sort_values('importancia', ascending=False))

importancia_predictores_2 = pd.DataFrame(
    {'predictor': X_train2.columns,
     'importancia': model_rfc_2.feature_importances_}
)
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
print(importancia_predictores_2.sort_values('importancia', ascending=False))

importancia_predictores_3 = pd.DataFrame(
    {'predictor': X_train3.columns,
     'importancia': model_rfc_3.feature_importances_}
)
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
print(importancia_predictores_3.sort_values('importancia', ascending=False))

# Save model for future use

In [12]:
joblib.dump(model_lr_1, './models2/lr_1.pkl')
joblib.dump(model_lr_2, './models2/lr_2.pkl')
joblib.dump(model_lr_3, './models2/lr_3.pkl')

joblib.dump(model_rfc_1, './models2/rfc_1.pkl')
joblib.dump(model_rfc_2, './models2/rfc_2.pkl')
joblib.dump(model_rfc_3, './models2/rfc_3.pkl')

joblib.dump(model_tree_1, './models2/tree_1.pkl')
joblib.dump(model_tree_2, './models2/tree_2.pkl')
joblib.dump(model_tree_3, './models2/tree_3.pkl')

joblib.dump(model_svc_1, './models2/svc_1.pkl')
joblib.dump(model_svc_2, './models2/svc_2.pkl')
joblib.dump(model_svc_3, './models2/svc_3.pkl')

joblib.dump(model_knn_1, './models2/knn_1.pkl')
joblib.dump(model_knn_2, './models2/knn_2.pkl')
joblib.dump(model_knn_3, './models2/knn_3.pkl')

joblib.dump(model_mlp_1,'./models2/mlp_1.pkl')
joblib.dump(model_mlp_2,'./models2/mlp_2.pkl')
joblib.dump(model_mlp_3,'./models2/mlp_3.pkl')

joblib.dump(model_sgd_1,'./models2/sgd_1.pkl')
joblib.dump(model_sgd_2,'./models2/sgd_2.pkl')
joblib.dump(model_sgd_3,'./models2/sgd_3.pkl')

joblib.dump(model_adaboost_1,'./models2/adaboost_1.pkl')
joblib.dump(model_adaboost_2,'./models2/adaboost_2.pkl')
joblib.dump(model_adaboost_3,'./models2/adaboost_3.pkl')


['./models2/mlp_3.pkl']

In [None]:
model_lr_1 = joblib.load('./models/lr_1.pkl')
model_lr_2 = joblib.load('./models/lr_2.pkl')
model_lr_3 = joblib.load('./models/lr_3.pkl')

model_rfc_1 = joblib.load('./models/rfc_1.pkl')
model_rfc_2 = joblib.load('./models/rfc_2.pkl')
model_rfc_3 = joblib.load('./models/rfc_3.pkl')

model_tree_1 = joblib.load('./models/tree_1.pkl')
model_tree_2 = joblib.load('./models/tree_2.pkl')
model_tree_3 = joblib.load('./models/tree_3.pkl')

model_svc_1 = joblib.load('./models/svc_1.pkl')
model_svc_2 = joblib.load('./models/svc_2.pkl')
model_svc_3 = joblib.load('./models/svc_3.pkl')

model_knn_1 = joblib.load('./models/knn_1.pkl')
model_knn_2 = joblib.load('./models/knn_2.pkl')
model_knn_3 = joblib.load('./models/knn_3.pkl')

model_mlp_1 = joblib.load('./models/mlp_1.pkl')
model_mlp_2 = joblib.load('./models/mlp_2.pkl')
model_mlp_3 = joblib.load('./models/mlp_3.pkl')

model_sgd_1 = joblib.load('./models/sgd_1.pkl')
model_sgd_2 = joblib.load('./models/sgd_2.pkl')
model_sgd_3 = joblib.load('./models/sgd_3.pkl')

model_adaboost_1 = joblib.load('./models/adaboost_1.pkl')
model_adaboost_2 = joblib.load('./models/adaboost_2.pkl')
model_adaboost_3 = joblib.load('./models/adaboost_3.pkl')