In [1]:
import os

import pandas as pd
from time import time
from joblib import load, dump

In [2]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import SGDClassifier, SGDRegressor, Ridge, LogisticRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, r2_score, mean_squared_error, mean_absolute_error

In [3]:
seed = 42

In [4]:
csv_path = os.path.join('data', 'fires_transformed.csv')
df = pd.read_csv(csv_path)
df.sample(5, random_state=seed)

Unnamed: 0,elevacion,erodi,slope,orientacion_sen,orientacion_cos,altura,lfcc,inflam,mcroth,anomalia,dpv,vel_media_viento,severidad_real,severidad_discreta,coord_x_etrs89,coord_y_etrs89,incendio,provincia
119340,1024.904053,4,15.177,0.99465,0.103298,9.0,46.956501,3,2,73.561211,2.246882,2.916394,0.845614,3,490110.0,4538190.0,Cogolludo,Guadalajara
58023,505.352997,1,25.718,0.206331,0.978482,3.0,19.892401,4,3,86.961823,2.385298,4.876472,0.270908,1,604950.0,4257480.0,Talave,Albacete
46997,388.710999,2,31.593,-0.984,0.178167,1.0,0.9569,4,3,79.168137,2.319516,5.026659,0.412787,1,617730.0,4243860.0,Donceles,Albacete
67214,1063.177002,1,22.364,0.901304,-0.433188,2.0,7.424201,4,1,50.321835,0.349119,2.470344,0.576249,2,561090.0,4256790.0,Yeste,Albacete
153590,507.516998,3,13.911,0.872081,0.489362,3.0,32.275799,4,3,72.89386,1.469779,3.393716,0.310006,1,407340.0,4412550.0,Montesion,Toledo


In [5]:
numerical_variables = ['elevacion', 'slope', 'orientacion_sen', 'orientacion_cos', 'altura', 'lfcc', 'anomalia', 'dpv', 'vel_media_viento']
categorical_variables = ['erodi', 'inflam', 'mcroth']
variables = numerical_variables + categorical_variables
target_real = 'severidad_real'
target_discrete = 'severidad_discreta'

# Regresión

In [6]:
X = df[variables]
y = df[target_real]

In [7]:
X.sample(5, random_state=seed)

Unnamed: 0,elevacion,slope,orientacion_sen,orientacion_cos,altura,lfcc,anomalia,dpv,vel_media_viento,erodi,inflam,mcroth
119340,1024.904053,15.177,0.99465,0.103298,9.0,46.956501,73.561211,2.246882,2.916394,4,3,2
58023,505.352997,25.718,0.206331,0.978482,3.0,19.892401,86.961823,2.385298,4.876472,1,4,3
46997,388.710999,31.593,-0.984,0.178167,1.0,0.9569,79.168137,2.319516,5.026659,2,4,3
67214,1063.177002,22.364,0.901304,-0.433188,2.0,7.424201,50.321835,0.349119,2.470344,1,4,1
153590,507.516998,13.911,0.872081,0.489362,3.0,32.275799,72.89386,1.469779,3.393716,3,4,3


In [8]:
y.sample(5, random_state=seed)

119340    0.845614
58023     0.270908
46997     0.412787
67214     0.576249
153590    0.310006
Name: severidad_real, dtype: float64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

In [10]:
def optimize_params(estimator, X, y, cv, scoring=None, refit=True, **param_grid):
    t=time()
    
    # GridSearch over specified parameter values for an estimator
    grid_search_cv = GridSearchCV(estimator,
                                  param_grid,
                                  scoring=scoring,
                                  refit=refit,
                                  cv=cv,
                                  verbose=1,
                                  n_jobs=10,
                                  return_train_score=True).fit(X, y)

    cv_results = pd.DataFrame(grid_search_cv.cv_results_)

    # Drop the results for each validation split and sort by the validation metric
    labels = cv_results.filter(regex="split")
    by = cv_results.filter(regex="rank_test").columns[0]
    cv_results = cv_results.drop(labels, axis=1).sort_values(by)

    print(f'Time: {round(time()-t, 2)} seg.')
    display(cv_results)

    return grid_search_cv

In [11]:
n_splits = 4
n_repeats = 2

cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

In [12]:
preprocessing = make_column_transformer(
        (StandardScaler(), numerical_variables),
        (OneHotEncoder(handle_unknown='ignore'), categorical_variables),
        remainder='passthrough'
)

## Grid Search

### Linear Regression

In [13]:
poly = PolynomialFeatures(include_bias=False)
linear_reg = Ridge(random_state=seed)
linear_reg_pipeline = make_pipeline(preprocessing, poly, linear_reg)

param_grid = {
    'polynomialfeatures__degree': [1, 2, 3],
    'ridge__alpha': [0.01, 0.1, 0.5, 1, 2, 10]
}

linear_reg_gs = optimize_params(linear_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 18 candidates, totalling 144 fits
Time: 412.78 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomialfeatures__degree,param_ridge__alpha,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
13,75.829907,103.074499,1.383585,0.373411,3,0.1,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.377136,0.003145,1,0.39353,0.000562
14,37.345602,12.251588,1.173967,0.137924,3,0.5,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.374343,0.002965,2,0.389748,0.000529
15,42.889975,16.4928,1.525825,0.251134,3,1.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.373488,0.002121,3,0.387783,0.000524
16,55.070395,8.820388,2.4974,0.572462,3,2.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.372326,0.001986,4,0.385726,0.000547
17,40.658198,15.023955,1.084017,0.373859,3,10.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.368162,0.002225,5,0.380343,0.000656
12,177.65321,156.100592,1.148886,0.420077,3,0.01,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.363046,0.01385,6,0.396016,0.000531
7,1.191994,0.04611,0.201425,0.052889,2,0.1,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294344,0.003153,7,0.296912,0.001047
6,1.20549,0.045761,0.205072,0.058121,2,0.01,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294339,0.003155,8,0.296932,0.001048
8,1.214597,0.062313,0.203282,0.054007,2,0.5,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294311,0.003154,9,0.296877,0.001047
9,1.206393,0.076548,0.216131,0.028443,2,1.0,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294295,0.003158,10,0.296856,0.001047


### KNeighbors

In [14]:
k_neighbors_reg = KNeighborsRegressor()
k_neighbours_reg_pipeline = make_pipeline(preprocessing, k_neighbors_reg)

param_grid = {
    'kneighborsregressor__n_neighbors': [5, 10, 20, 50, 100],
    'kneighborsregressor__weights': ['uniform', 'distance']
}

k_neighbors_reg_gs = optimize_params(k_neighbours_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 10 candidates, totalling 80 fits
Time: 975.94 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsregressor__n_neighbors,param_kneighborsregressor__weights,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
5,0.083662,0.017594,30.349784,0.594223,20,distance,"{'kneighborsregressor__n_neighbors': 20, 'knei...",0.402597,0.003619,1,1.0,3.3159e-14
3,0.081026,0.007687,30.20086,0.27428,10,distance,"{'kneighborsregressor__n_neighbors': 10, 'knei...",0.39418,0.003806,2,1.0,1.089603e-14
7,0.08004,0.00609,29.890787,0.470342,50,distance,"{'kneighborsregressor__n_neighbors': 50, 'knei...",0.390295,0.002747,3,1.0,1.432198e-13
4,0.078418,0.008341,29.914341,0.588444,20,uniform,"{'kneighborsregressor__n_neighbors': 20, 'knei...",0.390282,0.003577,4,0.450484,0.001433049
2,0.082693,0.007258,30.960135,0.628392,10,uniform,"{'kneighborsregressor__n_neighbors': 10, 'knei...",0.38587,0.003866,5,0.500988,0.001063657
6,0.083855,0.007811,30.241276,1.05672,50,uniform,"{'kneighborsregressor__n_neighbors': 50, 'knei...",0.374378,0.002643,6,0.400057,0.000934739
9,0.0867,0.00665,30.65568,0.443233,100,distance,"{'kneighborsregressor__n_neighbors': 100, 'kne...",0.373607,0.002664,7,1.0,4.100923e-13
8,0.088301,0.00735,30.880084,0.557419,100,uniform,"{'kneighborsregressor__n_neighbors': 100, 'kne...",0.356726,0.002653,8,0.369827,0.000878502
1,0.084762,0.006408,30.927618,0.550506,5,distance,"{'kneighborsregressor__n_neighbors': 5, 'kneig...",0.3567,0.005074,9,1.0,3.858923e-15
0,0.117182,0.005129,31.104398,0.391777,5,uniform,"{'kneighborsregressor__n_neighbors': 5, 'kneig...",0.35256,0.005337,10,0.572631,0.001512176


### Decision Tree

In [15]:
decision_tree_reg = DecisionTreeRegressor(random_state=seed)
decision_tree_reg_pipeline = make_pipeline(preprocessing, decision_tree_reg)

param_grid = {
    'decisiontreeregressor__max_depth': [3, 5, 7, 10, None],
    'decisiontreeregressor__min_samples_split': [2, 50, 200],
    'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'poisson'],
    'decisiontreeregressor__ccp_alpha': [0, 0.0001, 0.001, 0.01, 0.1]
}

decision_tree_reg_gs = optimize_params(decision_tree_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 225 candidates, totalling 1800 fits
Time: 1327.67 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decisiontreeregressor__ccp_alpha,param_decisiontreeregressor__criterion,param_decisiontreeregressor__max_depth,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
28,4.346671,0.097910,0.026741,0.003859,0,friedman_mse,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.526107,0.006673,1,7.199219e-01,2.555821e-03
13,4.356798,0.108957,0.028819,0.004048,0,squared_error,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.526011,0.006812,2,7.199219e-01,2.555821e-03
43,5.104722,0.191381,0.026566,0.004836,0,poisson,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.525571,0.007287,3,7.217258e-01,3.646081e-03
44,4.330285,0.150079,0.028007,0.002989,0,poisson,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.492008,0.006260,4,5.633251e-01,3.397572e-03
14,3.748897,0.152828,0.024795,0.004253,0,squared_error,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.491750,0.006309,5,5.612105e-01,1.714298e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,3.070820,0.066390,0.020045,0.003455,0.01,friedman_mse,10,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16
159,3.068889,0.026608,0.019112,0.001702,0.01,friedman_mse,10,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16
158,2.202245,0.017484,0.021293,0.001507,0.01,friedman_mse,7,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16
156,2.391590,0.135455,0.021785,0.002523,0.01,friedman_mse,7,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16


### Random Forest

In [16]:
random_forest_reg = RandomForestRegressor(random_state=seed)
random_forest_reg_pipeline = make_pipeline(preprocessing, random_forest_reg)

param_grid = {
    'randomforestregressor__n_estimators': [10, 25, 50],
    'randomforestregressor__max_depth': [3, 5, 7, 10, None]
}

random_forest_reg_gs = optimize_params(random_forest_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 15 candidates, totalling 120 fits
Time: 914.97 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestregressor__max_depth,param_randomforestregressor__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
14,205.446483,6.953326,0.540484,0.096241,,50,"{'randomforestregressor__max_depth': None, 'ra...",0.696175,0.00327,1,0.955699,0.000138
13,117.04166,0.659441,0.369089,0.026215,,25,"{'randomforestregressor__max_depth': None, 'ra...",0.688685,0.003413,2,0.951485,0.000133
12,47.609875,0.38449,0.157735,0.011014,,10,"{'randomforestregressor__max_depth': None, 'ra...",0.665966,0.004384,3,0.938738,0.000295
11,112.623868,0.819149,0.202577,0.003966,10.0,50,"{'randomforestregressor__max_depth': 10, 'rand...",0.503961,0.004757,4,0.545528,0.002908
10,57.583311,0.215842,0.115772,0.003956,10.0,25,"{'randomforestregressor__max_depth': 10, 'rand...",0.501109,0.005459,5,0.542642,0.003379
9,23.304336,0.218505,0.056929,0.003871,10.0,10,"{'randomforestregressor__max_depth': 10, 'rand...",0.49448,0.0063,6,0.536179,0.004224
8,83.598183,0.190003,0.145744,0.010834,7.0,50,"{'randomforestregressor__max_depth': 7, 'rando...",0.36343,0.004306,7,0.376014,0.002262
7,41.956758,0.263319,0.086886,0.009545,7.0,25,"{'randomforestregressor__max_depth': 7, 'rando...",0.361419,0.005385,8,0.3741,0.003398
6,16.781558,0.053491,0.04617,0.00524,7.0,10,"{'randomforestregressor__max_depth': 7, 'rando...",0.357213,0.006144,9,0.369984,0.004554
5,60.642174,0.356147,0.107334,0.009066,5.0,50,"{'randomforestregressor__max_depth': 5, 'rando...",0.267666,0.003105,10,0.272108,0.001521


### XGBoost

In [17]:
xgb_reg = XGBRegressor(random_state=seed)
xgb_reg_pipeline = make_pipeline(preprocessing, xgb_reg)

param_grid = {
    'xgbregressor__n_estimators': [10, 25, 50, 100],
    'xgbregressor__max_depth': [3, 5, 7, 10, None],
    'xgbregressor__learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
}

xgb_reg_gs = optimize_params(xgb_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 120 candidates, totalling 960 fits
Time: 216.52 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbregressor__learning_rate,param_xgbregressor__max_depth,param_xgbregressor__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
115,5.465034,0.121549,0.284260,0.037699,0.2,10,100,"{'xgbregressor__learning_rate': 0.2, 'xgbregre...",0.646875,0.004021,1,0.837901,0.003577
95,6.090098,0.085865,0.353832,0.029728,0.1,10,100,"{'xgbregressor__learning_rate': 0.1, 'xgbregre...",0.634160,0.003125,2,0.769770,0.002125
114,3.372739,0.112558,0.147926,0.014054,0.2,10,50,"{'xgbregressor__learning_rate': 0.2, 'xgbregre...",0.627759,0.004357,3,0.767062,0.004974
75,7.277454,0.133802,0.374869,0.038699,0.05,10,100,"{'xgbregressor__learning_rate': 0.05, 'xgbregr...",0.607905,0.002535,4,0.712821,0.003404
94,4.004450,0.051419,0.178938,0.012474,0.1,10,50,"{'xgbregressor__learning_rate': 0.1, 'xgbregre...",0.607413,0.003199,5,0.713196,0.002370
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,0.701782,0.032674,0.047892,0.005162,0.001,3,25,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.007196,0.000049,116,0.007266,0.000071
8,0.833344,0.063007,0.050288,0.007422,0.001,7,10,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.006350,0.000164,117,0.006502,0.000217
16,0.791126,0.025585,0.044900,0.006428,0.001,,10,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.005555,0.000189,118,0.005662,0.000230
4,0.702626,0.051680,0.040527,0.005429,0.001,5,10,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.004798,0.000107,119,0.004877,0.000129


### SGD

In [18]:
sgd_reg = SGDRegressor(random_state=seed)
sgd_reg_pipeline = make_pipeline(preprocessing, sgd_reg)

param_grid = {
    'sgdregressor__alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.8],
    'sgdregressor__max_iter': [50, 100, 500, 1000],
    'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
}

sgd_reg_gs = optimize_params(sgd_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 112 candidates, totalling 896 fits
Time: 41.62 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgdregressor__alpha,param_sgdregressor__loss,param_sgdregressor__max_iter,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
0,0.283873,0.020260,0.028547,0.006413,0.0001,squared_error,50,"{'sgdregressor__alpha': 0.0001, 'sgdregressor_...",0.158093,0.003777,1,0.158000,0.001418
1,0.299962,0.015654,0.028330,0.003544,0.0001,squared_error,100,"{'sgdregressor__alpha': 0.0001, 'sgdregressor_...",0.158093,0.003777,1,0.158000,0.001418
2,0.312899,0.012363,0.031897,0.002571,0.0001,squared_error,500,"{'sgdregressor__alpha': 0.0001, 'sgdregressor_...",0.158093,0.003777,1,0.158000,0.001418
3,0.320403,0.012524,0.030787,0.005139,0.0001,squared_error,1000,"{'sgdregressor__alpha': 0.0001, 'sgdregressor_...",0.158093,0.003777,1,0.158000,0.001418
19,0.320402,0.009974,0.032108,0.005047,0.001,squared_error,1000,"{'sgdregressor__alpha': 0.001, 'sgdregressor__...",0.158084,0.003755,5,0.157990,0.001406
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0.335991,0.009660,0.031035,0.003569,0.5,huber,50,"{'sgdregressor__alpha': 0.5, 'sgdregressor__lo...",0.076538,0.004778,105,0.076619,0.004766
101,0.333453,0.013886,0.032463,0.004232,0.8,huber,100,"{'sgdregressor__alpha': 0.8, 'sgdregressor__lo...",0.058141,0.004859,109,0.058209,0.004743
100,0.337987,0.004914,0.030806,0.001406,0.8,huber,50,"{'sgdregressor__alpha': 0.8, 'sgdregressor__lo...",0.058141,0.004859,109,0.058209,0.004743
102,0.336595,0.010583,0.031892,0.002256,0.8,huber,500,"{'sgdregressor__alpha': 0.8, 'sgdregressor__lo...",0.058141,0.004859,109,0.058209,0.004743


## Evaluación

In [19]:
reg_dict = {
    'Regresion lineal': linear_reg_gs,
    'KNeighbors' : k_neighbors_reg_gs,
    'Arbol de decision': decision_tree_reg_gs,
    'Random Forest': random_forest_reg_gs,
    'XGBoost': xgb_reg_gs,
    'SGD': sgd_reg_gs
}


df_results_class = pd.DataFrame({
    'Modelo': reg_dict.keys(),
    'Mejor puntuación': [gs.best_score_ for gs in reg_dict.values()]
})
df_results_class = df_results_class.sort_values(by='Mejor puntuación', ascending=False)
df_results_class

Unnamed: 0,Modelo,Mejor puntuación
3,Random Forest,0.696175
4,XGBoost,0.646875
2,Arbol de decision,0.526107
1,KNeighbors,0.402597
0,Regresion lineal,0.377136
5,SGD,0.158093


In [20]:
model = random_forest_reg_gs.best_estimator_
model_path = os.path.join('models', 'experiment_1', 'model_reg.joblib')
dump(model, model_path)

['models\\experiment_1\\model_reg.joblib']

In [21]:
model_path = os.path.join('models', 'experiment_1', 'model_reg.joblib')
model = load(model_path)
pred = model.predict(X_test)

print(f"""Métricas predicción del test
R2:  {r2_score(y_test, pred)}
MSE: {mean_squared_error(y_test, pred)}
MAE: {mean_absolute_error(y_test, pred)}""")

Métricas predicción del test
R2:  0.724605944426127
MSE: 0.008783141496746506
MAE: 0.06940586333961964


# Clasificación

In [22]:
X = df[variables]
y = df[target_discrete]

In [23]:
X.sample(5, random_state=seed)

Unnamed: 0,elevacion,slope,orientacion_sen,orientacion_cos,altura,lfcc,anomalia,dpv,vel_media_viento,erodi,inflam,mcroth
119340,1024.904053,15.177,0.99465,0.103298,9.0,46.956501,73.561211,2.246882,2.916394,4,3,2
58023,505.352997,25.718,0.206331,0.978482,3.0,19.892401,86.961823,2.385298,4.876472,1,4,3
46997,388.710999,31.593,-0.984,0.178167,1.0,0.9569,79.168137,2.319516,5.026659,2,4,3
67214,1063.177002,22.364,0.901304,-0.433188,2.0,7.424201,50.321835,0.349119,2.470344,1,4,1
153590,507.516998,13.911,0.872081,0.489362,3.0,32.275799,72.89386,1.469779,3.393716,3,4,3


In [24]:
y.sample(5, random_state=seed)

119340    3
58023     1
46997     1
67214     2
153590    1
Name: severidad_discreta, dtype: int64

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=seed)

In [26]:
def optimize_params(estimator, X, y, cv, scoring=None, refit=True, **param_grid):
    t=time()
    """Exhaustive search over specified parameter values for an estimator."""
    grid_search_cv = GridSearchCV(estimator,
                                  param_grid,
                                  scoring=scoring,
                                  refit=refit,
                                  cv=cv,
                                  verbose=1,
                                  n_jobs=10,
                                  return_train_score=True).fit(X, y)

    cv_results = pd.DataFrame(grid_search_cv.cv_results_)

    # Drop the results for each validation split and sort by the refit metric
    labels = cv_results.filter(regex="split")
    by = cv_results.filter(regex="rank_test").columns[0]
    cv_results = cv_results.drop(labels, axis=1).sort_values(by)

    print(f'Time: {round(time()-t, 2)} seg.')
    display(cv_results)

    return grid_search_cv

In [27]:
n_splits = 4
n_repeats = 2

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

In [28]:
preprocessing = make_column_transformer(
        (StandardScaler(), numerical_variables),
        (OneHotEncoder(handle_unknown='ignore'), categorical_variables),
        remainder='passthrough'
)

## Grid Search

### Logistic Regression

In [29]:
poly = PolynomialFeatures(include_bias=False)
logistic_reg = LogisticRegression(random_state=seed)
logistic_reg_pipeline = make_pipeline(preprocessing, poly, logistic_reg)

param_grid = {
    'polynomialfeatures__degree': [1, 2],
    'logisticregression__C': [0.01, 0.1, 1, 2, 5, 10],
    'logisticregression__max_iter': [100, 200]
}

logistic_reg_gs = optimize_params(logistic_reg_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 24 candidates, totalling 192 fits
Time: 281.49 seg.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_logisticregression__max_iter,param_polynomialfeatures__degree,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
15,30.820185,0.437455,0.205498,0.018195,2.0,200,2,"{'logisticregression__C': 2, 'logisticregressi...",0.493238,0.002226,1,0.496808,0.001036
23,25.663738,1.562366,0.109945,0.034109,10.0,200,2,"{'logisticregression__C': 10, 'logisticregress...",0.4931,0.002047,2,0.496823,0.001038
11,30.507444,0.414901,0.205945,0.030887,1.0,200,2,"{'logisticregression__C': 1, 'logisticregressi...",0.492868,0.002104,3,0.496766,0.000953
19,30.784516,0.577427,0.216177,0.031001,5.0,200,2,"{'logisticregression__C': 5, 'logisticregressi...",0.492839,0.001952,4,0.49668,0.001029
7,30.692747,0.473382,0.236708,0.049162,0.1,200,2,"{'logisticregression__C': 0.1, 'logisticregres...",0.492375,0.002319,5,0.49607,0.000893
9,15.65135,0.318693,0.205065,0.026444,1.0,100,2,"{'logisticregression__C': 1, 'logisticregressi...",0.49113,0.002166,6,0.494298,0.000957
21,15.62453,0.259628,0.199408,0.011773,10.0,100,2,"{'logisticregression__C': 10, 'logisticregress...",0.491078,0.002376,7,0.494353,0.001037
5,15.419031,0.318179,0.195657,0.019747,0.1,100,2,"{'logisticregression__C': 0.1, 'logisticregres...",0.490963,0.002481,8,0.494123,0.000883
13,15.696041,0.281077,0.19333,0.019439,2.0,100,2,"{'logisticregression__C': 2, 'logisticregressi...",0.49094,0.002424,9,0.494368,0.000806
17,15.567282,0.324095,0.215859,0.018823,5.0,100,2,"{'logisticregression__C': 5, 'logisticregressi...",0.490691,0.002485,10,0.494085,0.000836


### KNeighbors

In [30]:
k_neighbors_class = KNeighborsClassifier()
k_neighbours_class_pipeline = make_pipeline(preprocessing, k_neighbors_class)

param_grid = {
    'kneighborsclassifier__n_neighbors': [5, 10, 20, 50, 100],
    'kneighborsclassifier__weights': ['uniform', 'distance']
}

k_neighbors_class_gs = optimize_params(k_neighbours_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 10 candidates, totalling 80 fits
Time: 940.98 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,param_kneighborsclassifier__weights,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
5,0.092311,0.012136,28.326503,0.485484,20,distance,"{'kneighborsclassifier__n_neighbors': 20, 'kne...",0.52621,0.0021,1,1.0,0.0
7,0.090106,0.006129,29.467259,0.343622,50,distance,"{'kneighborsclassifier__n_neighbors': 50, 'kne...",0.524804,0.00256,2,1.0,0.0
4,0.092563,0.005926,29.066204,0.580566,20,uniform,"{'kneighborsclassifier__n_neighbors': 20, 'kne...",0.521357,0.002475,3,0.576024,0.000939
9,0.094815,0.003761,30.627608,0.326247,100,distance,"{'kneighborsclassifier__n_neighbors': 100, 'kn...",0.518494,0.002023,4,1.0,0.0
3,0.09038,0.007927,28.940787,0.552591,10,distance,"{'kneighborsclassifier__n_neighbors': 10, 'kne...",0.517919,0.002027,5,1.0,0.0
6,0.090239,0.009356,29.314803,0.306098,50,uniform,"{'kneighborsclassifier__n_neighbors': 50, 'kne...",0.517525,0.002922,6,0.543023,0.000762
2,0.086856,0.010215,28.89673,0.582043,10,uniform,"{'kneighborsclassifier__n_neighbors': 10, 'kne...",0.514632,0.001903,7,0.608842,0.001116
8,0.095518,0.011936,30.382068,0.20947,100,uniform,"{'kneighborsclassifier__n_neighbors': 100, 'kn...",0.509992,0.002022,8,0.523552,0.000835
1,0.092393,0.010765,28.823852,0.689589,5,distance,"{'kneighborsclassifier__n_neighbors': 5, 'knei...",0.503075,0.002736,9,1.0,0.0
0,0.088082,0.017359,29.014254,0.518967,5,uniform,"{'kneighborsclassifier__n_neighbors': 5, 'knei...",0.49974,0.002709,10,0.658221,0.000993


### Decision Tree

In [31]:
decision_tree_class = DecisionTreeClassifier(random_state=seed)
decision_tree_class_pipeline = make_pipeline(preprocessing, decision_tree_class)

param_grid = {
    'decisiontreeclassifier__max_depth': [3, 5, 7, 10, None],
    'decisiontreeclassifier__min_samples_split': [2, 50, 200],
    'decisiontreeclassifier__criterion': ['entropy', 'gini'],
    'decisiontreeclassifier__ccp_alpha': [0, 0.0001, 0.001, 0.01, 0.1]
}

decision_tree_class_gs = optimize_params(decision_tree_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 150 candidates, totalling 1200 fits
Time: 410.35 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decisiontreeclassifier__ccp_alpha,param_decisiontreeclassifier__criterion,param_decisiontreeclassifier__max_depth,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
42,9.510453,0.139261,0.041882,0.005889,0.0001,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.0001, ...",0.581348,0.003109,1,0.698796,0.002230
43,6.206828,0.144965,0.043873,0.004095,0.0001,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.0001, ...",0.574542,0.002661,2,0.663367,0.002057
13,6.157456,0.082327,0.042433,0.003232,0,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0, 'deci...",0.571682,0.001683,3,0.691800,0.001180
28,4.661161,0.110850,0.045339,0.004460,0,gini,,"{'decisiontreeclassifier__ccp_alpha': 0, 'deci...",0.568135,0.002630,4,0.705379,0.001407
12,7.139856,0.075566,0.043459,0.003595,0,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0, 'deci...",0.567436,0.002638,5,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,1.464324,0.046950,0.036248,0.002377,0.1,entropy,3,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004
120,1.417351,0.030197,0.032559,0.003655,0.1,entropy,3,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004
148,4.691471,0.114044,0.032351,0.002911,0.1,gini,,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004
133,6.212103,0.145059,0.034349,0.002846,0.1,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004


### Random Forest

In [32]:
random_forest_class = RandomForestClassifier(random_state=seed)
random_forest_class_pipeline = make_pipeline(preprocessing, random_forest_class)

param_grid = {
    'randomforestclassifier__n_estimators': [10, 25, 50],
    'randomforestclassifier__max_depth': [3, 5, 7, None],
    'randomforestclassifier__criterion': ['entropy', 'gini'],
}

random_forest_class_gs = optimize_params(random_forest_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 24 candidates, totalling 192 fits
Time: 330.89 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestclassifier__criterion,param_randomforestclassifier__max_depth,param_randomforestclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
11,58.902877,0.514842,0.721421,0.06603,entropy,,50,{'randomforestclassifier__criterion': 'entropy...,0.64798,0.001786,1,0.999907,4.2e-05
23,40.702484,1.512994,0.516995,0.106107,gini,,50,"{'randomforestclassifier__criterion': 'gini', ...",0.646736,0.002364,2,0.999897,2.7e-05
10,28.760443,0.320177,0.34421,0.023182,entropy,,25,{'randomforestclassifier__criterion': 'entropy...,0.63886,0.002775,3,0.99884,0.000175
22,23.289972,0.308006,0.341139,0.020542,gini,,25,"{'randomforestclassifier__criterion': 'gini', ...",0.63784,0.002499,4,0.998774,0.000141
9,11.96643,0.198125,0.165894,0.01027,entropy,,10,{'randomforestclassifier__criterion': 'entropy...,0.616763,0.001657,5,0.986164,0.000234
21,9.745839,0.177481,0.157161,0.010747,gini,,10,"{'randomforestclassifier__criterion': 'gini', ...",0.613562,0.002704,6,0.986058,0.000366
7,12.42304,0.169794,0.148695,0.01398,entropy,7.0,25,{'randomforestclassifier__criterion': 'entropy...,0.501909,0.003719,7,0.510395,0.002805
8,24.029045,0.270825,0.247931,0.013175,entropy,7.0,50,{'randomforestclassifier__criterion': 'entropy...,0.500546,0.00192,8,0.510031,0.001668
20,18.489924,0.25067,0.260746,0.010536,gini,7.0,50,"{'randomforestclassifier__criterion': 'gini', ...",0.500192,0.001494,9,0.510646,0.002393
19,9.608556,0.116836,0.148158,0.009573,gini,7.0,25,"{'randomforestclassifier__criterion': 'gini', ...",0.499314,0.002744,10,0.508579,0.002553


### XGBoost

In [33]:
xgb_class = XGBClassifier(random_state=seed)
xgb_class_pipeline = make_pipeline(preprocessing, xgb_class)

param_grid = {
    'xgbclassifier__n_estimators': [10, 25, 50, 100],
    'xgbclassifier__max_depth': [3, 5, 7, 10, None],
    'xgbclassifier__learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
}

xgb_class_gs = optimize_params(xgb_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 120 candidates, totalling 960 fits
Time: 786.03 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbclassifier__learning_rate,param_xgbclassifier__max_depth,param_xgbclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
115,19.243033,0.152537,1.372628,0.080234,0.2,10,100,"{'xgbclassifier__learning_rate': 0.2, 'xgbclas...",0.627195,0.003514,1,0.868331,0.003146
95,22.017713,0.261306,1.436079,0.043014,0.1,10,100,"{'xgbclassifier__learning_rate': 0.1, 'xgbclas...",0.618593,0.002975,2,0.793995,0.002288
114,11.673194,0.147223,0.760592,0.078066,0.2,10,50,"{'xgbclassifier__learning_rate': 0.2, 'xgbclas...",0.617582,0.002737,3,0.797319,0.002549
94,13.702497,0.178398,0.864906,0.059900,0.1,10,50,"{'xgbclassifier__learning_rate': 0.1, 'xgbclas...",0.608337,0.002692,4,0.742797,0.002302
75,26.452406,0.405867,1.569868,0.117355,0.05,10,100,"{'xgbclassifier__learning_rate': 0.05, 'xgbcla...",0.608276,0.003364,5,0.743276,0.001778
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,2.289976,0.115208,0.104203,0.013824,0.01,3,25,"{'xgbclassifier__learning_rate': 0.01, 'xgbcla...",0.422515,0.006108,116,0.424656,0.003539
1,2.383615,0.062610,0.113119,0.007993,0.001,3,25,"{'xgbclassifier__learning_rate': 0.001, 'xgbcl...",0.421914,0.005311,117,0.423883,0.005458
2,4.242704,0.073246,0.157601,0.007924,0.001,3,50,"{'xgbclassifier__learning_rate': 0.001, 'xgbcl...",0.421907,0.005528,118,0.423886,0.005106
20,1.384505,0.048156,0.089041,0.006841,0.005,3,10,"{'xgbclassifier__learning_rate': 0.005, 'xgbcl...",0.421899,0.005463,119,0.423806,0.005166


### SGD

In [34]:
sgd_class = SGDClassifier(random_state=seed)
sgd_class_pipeline = make_pipeline(preprocessing, sgd_class)

param_grid = {
    'sgdclassifier__alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.8],
    'sgdclassifier__max_iter': [50, 100, 500, 1000],
    'sgdclassifier__loss': ['log_loss', 'hinge', 'modified_huber']
}

sgd_class_gs = optimize_params(sgd_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 84 candidates, totalling 672 fits
Time: 105.87 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgdclassifier__alpha,param_sgdclassifier__loss,param_sgdclassifier__max_iter,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
0,1.947597,0.105173,0.050724,0.005230,0.0001,log_loss,50,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
1,2.033723,0.106121,0.052463,0.004320,0.0001,log_loss,100,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
2,2.001328,0.075890,0.044670,0.004214,0.0001,log_loss,500,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
3,1.986116,0.079868,0.048939,0.004389,0.0001,log_loss,1000,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
15,1.192703,0.031961,0.047316,0.003387,0.001,log_loss,1000,"{'sgdclassifier__alpha': 0.001, 'sgdclassifier...",0.404558,0.005395,5,0.406124,0.005101
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.982017,0.018025,0.046475,0.005037,0.5,log_loss,50,"{'sgdclassifier__alpha': 0.5, 'sgdclassifier__...",0.237286,0.003161,77,0.237681,0.002962
72,1.003806,0.018261,0.047953,0.004083,0.8,log_loss,50,"{'sgdclassifier__alpha': 0.8, 'sgdclassifier__...",0.212366,0.003170,81,0.212948,0.003425
73,1.009316,0.017137,0.041318,0.005585,0.8,log_loss,100,"{'sgdclassifier__alpha': 0.8, 'sgdclassifier__...",0.212366,0.003170,81,0.212948,0.003425
74,1.006612,0.030697,0.048519,0.005013,0.8,log_loss,500,"{'sgdclassifier__alpha': 0.8, 'sgdclassifier__...",0.212366,0.003170,81,0.212948,0.003425


## Evaluación

In [35]:
class_dict = {
    'Regresion logistica': logistic_reg_gs,
    'KNeighbors' : k_neighbors_class_gs,
    'Arbol de decision': decision_tree_class_gs,
    'Random Forest': random_forest_class_gs,
    'XGBoost': xgb_class_gs,
    'SGD': sgd_class_gs
}


df_results_class = pd.DataFrame({
    'Modelo': class_dict.keys(),
    'Mejor puntuacion': [gs.best_score_ for gs in class_dict.values()]
})
df_results_class = df_results_class.sort_values(by='Mejor puntuacion', ascending=False)
df_results_class

Unnamed: 0,Modelo,Mejor puntuacion
3,Random Forest,0.64798
4,XGBoost,0.627195
2,Arbol de decision,0.581348
1,KNeighbors,0.52621
0,Regresion logistica,0.493238
5,SGD,0.409246


In [36]:
model = random_forest_class_gs.best_estimator_
model_path = os.path.join('models', 'experiment_1', 'model_class.joblib')
dump(model, model_path)

['models\\experiment_1\\model_class.joblib']

In [37]:
model_path = os.path.join('models', 'experiment_1', 'model_class.joblib')
model = load(model_path)
pred = model.predict(X_test)

print(f"""Métricas predicción del test
F1:        {f1_score(y_test, pred, average='weighted')}
Recall:    {recall_score(y_test, pred, average='weighted')}
Precision: {precision_score(y_test, pred, average='weighted')}
Accuracy:  {accuracy_score(y_test, pred)}""")

Métricas predicción del test
F1:        0.660617605233689
Recall:    0.6600650010156409
Precision: 0.6633829307943221
Accuracy:  0.6600650010156409
