# 1. Import Libraries and modules

In [7]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold


import multiprocessing


import joblib



import sys
sys.path.append('../../python')
import splitDataset
import splitDataset2
np.set_printoptions(suppress=True)

# 2. Load data

In [8]:
data1= pd.read_csv('E:/Documentos\/PCEO\/5\/Informatica/TFG/scripts/TFG-Informatica/Outputs/featuresOutput_percentil1.csv', sep=";")
data2= pd.read_csv('E:/Documentos\/PCEO\/5\/Informatica/TFG/scripts/TFG-Informatica/Outputs/featuresOutput_percentil2.csv', sep=";")
data3= pd.read_csv('E:/Documentos\/PCEO\/5\/Informatica/TFG/scripts/TFG-Informatica/Outputs/featuresOutput_percentil3.csv', sep=";")

# want to eliminate the rows where feature bestSubmit is 100.0

data1 = data1[data1.bestSubmit != 100.0]
data2 = data2[data2.bestSubmit != 100.0]
data3 = data3[data3.bestSubmit != 100.0]

In [9]:
counts = data1['completed'].value_counts()

print(counts)

1.0    3460
0.0    1450
Name: completed, dtype: int64


# 3. Split data into training and test sets

In [10]:
X_train1, X_test1, y_train1, y_test1 = splitDataset.splitDataset(data1)
X_train2, X_test2, y_train2, y_test2 = splitDataset.splitDataset(data2)
X_train3, X_test3, y_train3, y_test3 = splitDataset.splitDataset(data3)

groups1 = splitDataset2.splitDataset2(X_train1)
groups2 = splitDataset2.splitDataset2(X_train2)
groups3 = splitDataset2.splitDataset2(X_train3)

X_train1 = X_train1.drop(['user'], axis=1)
X_test1 = X_test1.drop(['user'], axis=1)

X_train2 = X_train2.drop(['user'], axis=1)
X_test2 = X_test2.drop(['user'], axis=1)

X_train3 = X_train3.drop(['user'], axis=1)
X_test3 = X_test3.drop(['user'], axis=1)



# show the number of rows of X_train1
print("Number of rows X_train1 dataset: ", X_train1.shape[0])
print("Number of rows X_test1 dataset: ", X_test1.shape[0])

print("Number of rows X_train2 dataset: ", X_train2.shape[0])
print("Number of rows X_test2 dataset: ", X_test2.shape[0])

print("Number of rows X_train3 dataset: ", X_train3.shape[0])
print("Number of rows X_test3 dataset: ", X_test3.shape[0])


Number of rows X_train1 dataset:  3295
Number of rows X_test1 dataset:  1615
Number of rows X_train2 dataset:  2551
Number of rows X_test2 dataset:  1286
Number of rows X_train3 dataset:  2079
Number of rows X_test3 dataset:  971


# 4. Declare data preprocessing steps

# 5. Declare hyperparameters to tune

In [11]:
param_grid_rfc = {'n_estimators': [150],
                    'max_features': [5, 7, 9],
                    'max_depth'   : [None, 3, 10, 20],
                    'criterion'   : ['gini', 'entropy','log_loss'],
                    'max_leaf_nodes' : [3, 10, 20],
                    }


param_grid_rfc2 = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced']
}


# i want to make a param grid for the SGD Classifier
param_grid_sgd = {
    'loss': ['hinge', 'log', 'modified_huber'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'max_iter': [100, 500, 1000],
    'random_state': [42]
}

param_grid_sgd2 = {
    'loss': ['hinge', 'log', 'modified_huber'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01],
    'l1_ratio': [0.15, 0.3, 0.5],
    'fit_intercept': [True, False],
    'max_iter': [1000, 2000],
    'tol': [0.0001, 0.001],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
    'eta0': [0.01, 0.1, 1.0],
    'class_weight': [None, 'balanced']
}


param_grid_mlp = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

param_grid_mlp2 = {
    'hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'activation': ['relu', 'logistic', 'tanh'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [200, 500, 1000],
    'tol': [0.0001, 0.001],
    'early_stopping': [True, False],
    'validation_fraction': [0.1, 0.2, 0.3],
    'beta_1': [0.9, 0.95, 0.99],
    'beta_2': [0.999, 0.9, 0.99]
}



param_grid_adaboost = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.001, 0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

param_grid_adaboost2 = {
    'base_estimator': [DecisionTreeClassifier(max_depth=1), LogisticRegression()],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0],
    'algorithm': ['SAMME', 'SAMME.R'],
    'random_state': [42]
}


param_grid_svc = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

param_grid_svc2 = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'coef0': [0.0, 0.5, 1.0],
    'shrinking': [True, False],
    'probability': [True, False],
    'tol': [0.001, 0.0001],
    'class_weight': [None, 'balanced']
}


param_grid_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

param_grid_tree2 = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}


param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'p': [1, 2]
}

param_grid_knn2 = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size': [30, 40, 50],
    'p': [1, 2],
    'metric': ['euclidean', 'manhattan'],
}

# 6. Tune model using cross-validation pipeline

## 6.1 Logistic Regression

## 6.2 Random Forest classifier

In [6]:
grid_rfc_1 = GridSearchCV(
    estimator  = RandomForestClassifier(random_state = 123),
    param_grid = param_grid_rfc2,
    scoring    = 'balanced_accuracy',
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 1,
    return_train_score = True
)

grid_rfc_2 = GridSearchCV(
    estimator  = RandomForestClassifier(random_state = 123),
    param_grid = param_grid_rfc2,
    scoring    = 'balanced_accuracy',
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 1,
    return_train_score = True
)

grid_rfc_3 = GridSearchCV(
    estimator  = RandomForestClassifier(random_state = 123),
    param_grid = param_grid_rfc2,
    scoring    = 'balanced_accuracy',
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 1,
    return_train_score = True
)

grid_rfc_1 = grid_rfc_1.fit(X = X_train1, y = y_train1, groups = groups1)
model_rfc_1 = grid_rfc_1.best_estimator_
grid_rfc_2 = grid_rfc_2.fit(X = X_train2, y = y_train2, groups = groups2)
model_rfc_2 = grid_rfc_2.best_estimator_
grid_rfc_3 = grid_rfc_3.fit(X = X_train3, y = y_train3, groups = groups3)
model_rfc_3 = grid_rfc_3.best_estimator_

Fitting 10 folds for each of 1944 candidates, totalling 19440 fits


KeyboardInterrupt: 

In [9]:
resultados_rfc_1 = pd.DataFrame(grid_rfc_1.cv_results_)
resultados_rfc_1 = resultados_rfc_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_rfc_1.to_csv("../../Outputs/grids/grid_rfc_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo random forest para el percentil 1 es: ', grid_rfc_1.best_estimator_)

resultados_rfc_2 = pd.DataFrame(grid_rfc_2.cv_results_)
resultados_rfc_2 = resultados_rfc_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_rfc_2.to_csv("../../Outputs/grids/grid_rfc_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo random forest para el percentil 2 es: ', grid_rfc_2.best_estimator_)

resultados_rfc_3 = pd.DataFrame(grid_rfc_3.cv_results_)
resultados_rfc_3 = resultados_rfc_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_rfc_3.to_csv("../../Outputs/grids/grid_rfc_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo random forest para el percentil 3 es: ', grid_rfc_3.best_estimator_)

El mejor modelo random forest para el percentil 1 es:  RandomForestClassifier(max_features=9, max_leaf_nodes=20, n_estimators=150,
                       random_state=123)
    param_criterion param_max_depth param_max_features param_max_leaf_nodes  \
8              gini            None                  9                   20   
35             gini              20                  9                   20   
26             gini              10                  9                   20   
107        log_loss              20                  9                   20   
80         log_loss            None                  9                   20   
..              ...             ...                ...                  ...   
72         log_loss            None                  5                    3   
63          entropy              20                  5                    3   
45          entropy               3                  5                    3   
36          entropy            None   

## 6.3 Sthocastic Gradient Descent

In [12]:
grid_sgd_1 = GridSearchCV(
    estimator  = SGDClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_sgd2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_sgd_2 = GridSearchCV(
    estimator  = SGDClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_sgd2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_sgd_3 = GridSearchCV(
    estimator  = SGDClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_sgd2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_sgd_1 = grid_sgd_1.fit(X = X_train1, y = y_train1, groups = groups1)
model_sgd_1 = grid_sgd_1.best_estimator_
grid_sgd_2 = grid_sgd_2.fit(X = X_train2, y = y_train2, groups = groups2)
model_sgd_2 = grid_sgd_2.best_estimator_
grid_sgd_3 = grid_sgd_3.fit(X = X_train3, y = y_train3, groups = groups3)
model_sgd_3 = grid_sgd_3.best_estimator_



In [11]:
resultados_sgd_1 = pd.DataFrame(grid_sgd_1.cv_results_)
resultados_sgd_1 = resultados_sgd_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_sgd_1.to_csv("../../Outputs/grids/grid_sgd_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SGD para el percentil 1 es: ', grid_sgd_1.best_estimator_)

resultados_sgd_2 = pd.DataFrame(grid_sgd_2.cv_results_)
resultados_sgd_2 = resultados_sgd_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_sgd_2.to_csv("../../Outputs/grids/grid_sgd_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SGD para el percentil 2 es: ', grid_sgd_2.best_estimator_)

resultados_sgd_3 = pd.DataFrame(grid_sgd_3.cv_results_)
resultados_sgd_3 = resultados_sgd_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_sgd_3.to_csv("../../Outputs/grids/grid_sgd_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SGD para el percentil 3 es: ', grid_sgd_3.best_estimator_)

# I want to know the balanced accuracy reached by the best model of every perecentile

El mejor modelo SGD para el percentil 1 es:  SGDClassifier(loss='log', max_iter=500, penalty='l1', random_state=42)
El mejor modelo SGD para el percentil 2 es:  SGDClassifier(alpha=0.001, loss='modified_huber', max_iter=100, penalty='l1',
              random_state=42)
El mejor modelo SGD para el percentil 3 es:  SGDClassifier(alpha=0.001, loss='log', max_iter=100, penalty='l1',
              random_state=42)


## 6.4 MLP

In [59]:
grid_mlp_1 = GridSearchCV(
    estimator  = MLPClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_mlp2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_mlp_2 = GridSearchCV(
    estimator  = MLPClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_mlp2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_mlp_3 = GridSearchCV(
    estimator  = MLPClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_mlp2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_mlp_1 = grid_mlp_1.fit(X = X_train1, y = y_train1, groups = groups1)
model_mlp_1 = grid_mlp_1.best_estimator_
grid_mlp_2 = grid_mlp_2.fit(X = X_train2, y = y_train2, groups = groups2)
model_mlp_2 = grid_mlp_2.best_estimator_
grid_mlp_3 = grid_mlp_3.fit(X = X_train3, y = y_train3, groups = groups3)
model_mlp_3 = grid_mlp_3.best_estimator_

In [13]:
resultados_mlp_1 = pd.DataFrame(grid_mlp_1.cv_results_)
resultados_mlp_1 = resultados_mlp_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_mlp_1.to_csv("../../Outputs/grids/grid_mlp_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo MLP para el percentil 1 es: ', grid_mlp_1.best_estimator_)

resultados_mlp_2 = pd.DataFrame(grid_mlp_2.cv_results_)
resultados_mlp_2 = resultados_mlp_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_mlp_2.to_csv("../../Outputs/grids/grid_mlp_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo MLP para el percentil 2 es: ', grid_mlp_2.best_estimator_)

resultados_mlp_3 = pd.DataFrame(grid_mlp_3.cv_results_)
resultados_mlp_3 = resultados_mlp_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_mlp_3.to_csv("../../Outputs/grids/grid_mlp_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo MLP para el percentil 3 es: ', grid_mlp_3.best_estimator_)
print('Los mejores parámetros para el modelo MLP son: ', grid_mlp_3.best_params_)

El mejor modelo MLP para el percentil 1 es:  MLPClassifier(alpha=0.05, hidden_layer_sizes=(50, 100, 50), random_state=34)
El mejor modelo MLP para el percentil 2 es:  MLPClassifier(random_state=34)
El mejor modelo MLP para el percentil 3 es:  MLPClassifier(random_state=34)
Los mejores parámetros para el modelo MLP son:  {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}


## 6.5 AdaBoost

In [62]:
grid_adaboost_1 = GridSearchCV(
    estimator  = AdaBoostClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_adaboost2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_adaboost_2 = GridSearchCV(
    estimator  = AdaBoostClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_adaboost2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_adaboost_3 = GridSearchCV(
    estimator  = AdaBoostClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_adaboost2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_adaboost_1 = grid_adaboost_1.fit(X = X_train1, y = y_train1, groups = groups1)
model_adaboost_1 = grid_adaboost_1.best_estimator_
grid_adaboost_2 = grid_adaboost_2.fit(X = X_train2, y = y_train2, groups = groups2)
model_adaboost_2 = grid_adaboost_2.best_estimator_
grid_adaboost_3 = grid_adaboost_3.fit(X = X_train3, y = y_train3, groups = groups3)
model_adaboost_3 = grid_adaboost_3.best_estimator_

In [15]:
resultados_adaboost_1 = pd.DataFrame(grid_adaboost_1.cv_results_)
resultados_adaboost_1 = resultados_adaboost_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_adaboost_1.to_csv("../../Outputs/grids/grid_adaboost_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo AdaBoost para el percentil 1 es: ', grid_adaboost_1.best_estimator_)

resultados_adaboost_2 = pd.DataFrame(grid_adaboost_2.cv_results_)
resultados_adaboost_2 = resultados_adaboost_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_adaboost_2.to_csv("../../Outputs/grids/grid_adaboost_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo AdaBoost para el percentil 2 es: ', grid_adaboost_2.best_estimator_)

resultados_adaboost_3 = pd.DataFrame(grid_adaboost_3.cv_results_)
resultados_adaboost_3 = resultados_adaboost_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_adaboost_3.to_csv("../../Outputs/grids/grid_adaboost_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo AdaBoost para el percentil 3 es: ', grid_adaboost_3.best_estimator_)
print('Los mejores parámetros para el modelo AdaBoost son: ', grid_adaboost_3.best_params_)

El mejor modelo AdaBoost para el percentil 1 es:  AdaBoostClassifier(algorithm='SAMME', n_estimators=500, random_state=34)
El mejor modelo AdaBoost para el percentil 2 es:  AdaBoostClassifier(n_estimators=100, random_state=34)
El mejor modelo AdaBoost para el percentil 3 es:  AdaBoostClassifier(random_state=34)
Los mejores parámetros para el modelo AdaBoost son:  {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 50}


## 6.6 K-Nearest Neighbors

In [16]:
grid_knn_1 = GridSearchCV(
    estimator  = KNeighborsClassifier(),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_knn2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_knn_2 = GridSearchCV(
    estimator  = KNeighborsClassifier(),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_knn2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_knn_3 = GridSearchCV(
    estimator  = KNeighborsClassifier(),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_knn2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_knn_1 = grid_knn_1.fit(X = X_train1, y = y_train1, groups = groups1)
model_knn_1 = grid_knn_1.best_estimator_
grid_knn_2 = grid_knn_2.fit(X = X_train2, y = y_train2, groups = groups2)
model_knn_2 = grid_knn_2.best_estimator_
grid_knn_3 = grid_knn_3.fit(X = X_train3, y = y_train3, groups = groups3)
model_knn_3 = grid_knn_3.best_estimator_

In [17]:
resultados_knn_1 = pd.DataFrame(grid_knn_1.cv_results_)
resultados_knn_1 = resultados_knn_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_knn_1.to_csv("../../Outputs/grids/grid_knn_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo KNN para el percentil 1 es: ', grid_knn_1.best_estimator_)

resultados_knn_2 = pd.DataFrame(grid_knn_2.cv_results_)
resultados_knn_2 = resultados_knn_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_knn_2.to_csv("../../Outputs/grids/grid_knn_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo KNN para el percentil 2 es: ', grid_knn_2.best_estimator_)

resultados_knn_3 = pd.DataFrame(grid_knn_3.cv_results_)
resultados_knn_3 = resultados_knn_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_knn_3.to_csv("../../Outputs/grids/grid_knn_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo KNN para el percentil 3 es: ', grid_knn_3.best_estimator_)
print('Los mejores parametros para KNN para el percentil 3 es: ', grid_knn_3.best_params_)

El mejor modelo KNN para el percentil 1 es:  KNeighborsClassifier(n_neighbors=3, p=1)
El mejor modelo KNN para el percentil 2 es:  KNeighborsClassifier(n_neighbors=3, p=1, weights='distance')
El mejor modelo KNN para el percentil 3 es:  KNeighborsClassifier(n_neighbors=9, p=1, weights='distance')
Los mejores parametros para KNN para el percentil 3 es:  {'algorithm': 'auto', 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}


## 6.7 Tree decision classifier

In [18]:
grid_tree_1 = GridSearchCV(
    estimator  = DecisionTreeClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_tree2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_tree_2 = GridSearchCV(
    estimator  = DecisionTreeClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_tree2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_tree_3 = GridSearchCV(
    estimator  = DecisionTreeClassifier(random_state = 34),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_tree2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 0,
    return_train_score = True
)

grid_tree_1 = grid_tree_1.fit(X = X_train1, y = y_train1, groups=groups1)
model_tree_1 = grid_tree_1.best_estimator_
grid_tree_2 = grid_tree_2.fit(X = X_train2, y = y_train2, groups=groups2)
model_tree_2 = grid_tree_2.best_estimator_
grid_tree_3 = grid_tree_3.fit(X = X_train3, y = y_train3, groups=groups3)
model_tree_3 = grid_tree_3.best_estimator_

In [19]:
resultados_tree_1 = pd.DataFrame(grid_tree_1.cv_results_)
resultados_tree_1 = resultados_tree_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_tree_1.to_csv("../../Outputs/grids/grid_tree_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo Árbol de Decisión para el percentil 1 es: ', grid_tree_1.best_estimator_)

resultados_tree_2 = pd.DataFrame(grid_tree_2.cv_results_)
resultados_tree_2 = resultados_tree_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_tree_2.to_csv("../../Outputs/grids/grid_tree_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo Árbol de Decisión para el percentil 2 es: ', grid_tree_2.best_estimator_)

resultados_tree_3 = pd.DataFrame(grid_tree_3.cv_results_)
resultados_tree_3 = resultados_tree_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_tree_3.to_csv("../../Outputs/grids/grid_tree_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo Árbol de Decisión para el percentil 3 es: ', grid_tree_3.best_estimator_)

El mejor modelo Árbol de Decisión para el percentil 1 es:  DecisionTreeClassifier(max_depth=20, min_samples_leaf=3, min_samples_split=10,
                       random_state=34)
El mejor modelo Árbol de Decisión para el percentil 2 es:  DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=5,
                       random_state=34)
El mejor modelo Árbol de Decisión para el percentil 3 es:  DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=10,
                       random_state=34)


## 6.8 Support vector classifier

In [20]:
grid_svc_1 = GridSearchCV(
    estimator  = SVC(random_state = 34,max_iter=10000),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_svc2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 1,
    return_train_score = True
)

grid_svc_2 = GridSearchCV(
    estimator  = SVC(random_state = 34,max_iter=10000),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_svc2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 1,
    return_train_score = True
)

grid_svc_3 = GridSearchCV(
    estimator  = SVC(random_state = 34,max_iter=10000),
    scoring    = 'balanced_accuracy',
    param_grid = param_grid_svc2,
    n_jobs     = multiprocessing.cpu_count() - 1,
    cv         = GroupKFold(n_splits = 10),
    refit      = True,
    verbose    = 1,
    return_train_score = True
)

grid_svc_1 = grid_svc_1.fit(X = X_train1, y = y_train1, groups=groups1)
model_svc_1 = grid_svc_1.best_estimator_
grid_svc_2 = grid_svc_2.fit(X = X_train2, y = y_train2, groups=groups2)
model_svc_2 = grid_svc_2.best_estimator_
grid_svc_3 = grid_svc_3.fit(X = X_train3, y = y_train3, groups=groups3)
model_svc_3 = grid_svc_3.best_estimator_

Fitting 10 folds for each of 48 candidates, totalling 480 fits




Fitting 10 folds for each of 48 candidates, totalling 480 fits




Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [21]:
resultados_svc_1 = pd.DataFrame(grid_svc_1.cv_results_)
resultados_svc_1 = resultados_svc_1.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_svc_1.to_csv("../../Outputs/grids/grid_svc_1.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SVC para el percentil 1 es: ', grid_svc_1.best_estimator_)

resultados_svc_2 = pd.DataFrame(grid_svc_2.cv_results_)
resultados_svc_2 = resultados_svc_2.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_svc_2.to_csv("../../Outputs/grids/grid_svc_2.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SVC para el percentil 2 es: ', grid_svc_2.best_estimator_)

resultados_svc_3 = pd.DataFrame(grid_svc_3.cv_results_)
resultados_svc_3 = resultados_svc_3.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False)
resultados_svc_3.to_csv("../../Outputs/grids/grid_svc_3.csv", decimal=".", sep=";", mode='w')
print('El mejor modelo SVC para el percentil 3 es: ', grid_svc_3.best_estimator_)

El mejor modelo SVC para el percentil 1 es:  SVC(C=100, gamma=0.001, max_iter=10000, random_state=34)
El mejor modelo SVC para el percentil 2 es:  SVC(C=100, gamma=0.001, max_iter=10000, random_state=34)
El mejor modelo SVC para el percentil 3 es:  SVC(C=10, gamma=0.001, max_iter=10000, random_state=34)


# Comparación de modelos

In [53]:
file = open("resultados-entrenamiento.txt", "w")  # Abre el archivo en modo de escritura

print("#### Comparación de modelos ####", file=file)
print("#################################", file=file)
print("RandomForestClassifier", file=file)
print("Balanced Accuracy percentil 1: ", str(grid_rfc_1.best_score_), file=file)
print("Balanced Accuracy percentil 2: ", str(grid_rfc_2.best_score_), file=file)
print("Balanced Accuracy percentil 3: ", str(grid_rfc_3.best_score_), file=file)
print("#################################", file=file)
print("Stochastic Gradient Descent", file=file)
print("Balanced Accuracy percentil 1: ", str(grid_sgd_1.best_score_), file=file)
print("Balanced Accuracy percentil 2: ", str(grid_sgd_2.best_score_), file=file)
print("Balanced Accuracy percentil 3: ", str(grid_sgd_3.best_score_), file=file)
print("#################################", file=file)
print("Support Vector Classifier", file=file)
print("Balanced Accuracy percentil 1: ", str(grid_svc_1.best_score_), file=file)
print("Balanced Accuracy percentil 2: ", str(grid_svc_2.best_score_), file=file)
print("Balanced Accuracy percentil 3: ", str(grid_svc_3.best_score_), file=file)
print("#################################", file=file)
print("Decision Tree Classifier", file=file)
print("Balanced Accuracy percentil 1: ", str(grid_tree_1.best_score_), file=file)
print("Balanced Accuracy percentil 2: ", str(grid_tree_2.best_score_), file=file)
print("Balanced Accuracy percentil 3: ", str(grid_tree_3.best_score_), file=file)
print("#################################", file=file)
print("KNeighborsClassifier", file=file)
print("Balanced Accuracy percentil 1: ", str(grid_knn_1.best_score_), file=file)
print("Balanced Accuracy percentil 2: ", str(grid_knn_2.best_score_), file=file)
print("Balanced Accuracy percentil 3: ", str(grid_knn_3.best_score_), file=file)
print("#################################", file=file)
print("Multilayer Perceptron", file=file)
print("Balanced Accuracy percentil 1: ", str(grid_mlp_1.best_score_), file=file)
print("Balanced Accuracy percentil 2: ", str(grid_mlp_2.best_score_), file=file)
print("Balanced Accuracy percentil 3: ", str(grid_mlp_3.best_score_), file=file)
print("#################################", file=file)
print("AdaBoostClassifier", file=file)
print("Balanced Accuracy percentil 1: ", str(grid_adaboost_1.best_score_), file=file)
print("Balanced Accuracy percentil 2: ", str(grid_adaboost_2.best_score_), file=file)
print("Balanced Accuracy percentil 3: ", str(grid_adaboost_3.best_score_), file=file)

file.close()  # Cierra el archivo cuando hayas terminado de escribir en él

#### Comparación de modelos ####
#################################
RandomForestClassifier
Balanced Accuracy percentil 1:  0.6975260971028372
Balanced Accuracy percentil 2:  0.7353960569198085
Balanced Accuracy percentil 3:  0.7922757417690903
#################################
Stochastic Gradient Descent
Balanced Accuracy percentil 1:  0.691967105316122
Balanced Accuracy percentil 2:  0.6997300747490411
Balanced Accuracy percentil 3:  0.692401610854762
#################################
Support Vector Classifier
Balanced Accuracy percentil 1:  0.6908438422016241
Balanced Accuracy percentil 2:  0.7185847187485483
Balanced Accuracy percentil 3:  0.7338226664867661
#################################
Decision Tree Classifier
Balanced Accuracy percentil 1:  0.7307185146396312
Balanced Accuracy percentil 2:  0.7606199213180149
Balanced Accuracy percentil 3:  0.7527115538451107
#################################
KNeighborsClassifier
Balanced Accuracy percentil 1:  0.6744955663961238
Balanced Accu

In [48]:
joblib.dump(model_rfc_1, './models/rfc_1.pkl')
joblib.dump(model_rfc_2, './models/rfc_2.pkl')
joblib.dump(model_rfc_3, './models/rfc_3.pkl')

joblib.dump(model_tree_1, './models/tree_1.pkl')
joblib.dump(model_tree_2, './models/tree_2.pkl')
joblib.dump(model_tree_3, './models/tree_3.pkl')

joblib.dump(model_svc_1, './models/svc_1.pkl')
joblib.dump(model_svc_2, './models/svc_2.pkl')
joblib.dump(model_svc_3, './models/svc_3.pkl')

joblib.dump(model_knn_1, './models/knn_1.pkl')
joblib.dump(model_knn_2, './models/knn_2.pkl')
joblib.dump(model_knn_3, './models/knn_3.pkl')

joblib.dump(model_mlp_1,'./models/mlp_1.pkl')
joblib.dump(model_mlp_2,'./models/mlp_2.pkl')
joblib.dump(model_mlp_3,'./models/mlp_3.pkl')

joblib.dump(model_sgd_1,'./models/sgd_1.pkl')
joblib.dump(model_sgd_2,'./models/sgd_2.pkl')
joblib.dump(model_sgd_3,'./models/sgd_3.pkl')

joblib.dump(model_adaboost_1,'./models/adaboost_1.pkl')
joblib.dump(model_adaboost_2,'./models/adaboost_2.pkl')
joblib.dump(model_adaboost_3,'./models/adaboost_3.pkl')

['./models2/adaboost_3.pkl']