In [1]:
import os

import pandas as pd
from time import time
from joblib import load, dump

In [2]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import SGDClassifier, SGDRegressor, Ridge, LogisticRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, r2_score, mean_squared_error, mean_absolute_error

In [3]:
seed = 42

In [4]:
csv_path = os.path.join('data', 'fires_transformed.csv')
df = pd.read_csv(csv_path)
df.sample(5, random_state=seed)

Unnamed: 0,elevacion,erodi,slope,orientacion_sen,orientacion_cos,altura,lfcc,inflam,mcroth,anomalia,dpv,vel_media_viento,severidad_real,severidad_discreta,coord_x_etrs89,coord_y_etrs89,incendio,provincia
119340,1024.904053,4,15.177,0.99465,0.103298,9.0,46.956501,3,2,73.561211,2.246882,2.916394,0.845614,3,490110.0,4538190.0,Cogolludo,Guadalajara
58023,505.352997,1,25.718,0.206331,0.978482,3.0,19.892401,4,3,86.961823,2.385298,4.876472,0.270908,1,604950.0,4257480.0,Talave,Albacete
46997,388.710999,2,31.593,-0.984,0.178167,1.0,0.9569,4,3,79.168137,2.319516,5.026659,0.412787,1,617730.0,4243860.0,Donceles,Albacete
67214,1063.177002,1,22.364,0.901304,-0.433188,2.0,7.424201,4,1,50.321835,0.349119,2.470344,0.576249,2,561090.0,4256790.0,Yeste,Albacete
153590,507.516998,3,13.911,0.872081,0.489362,3.0,32.275799,4,3,72.89386,1.469779,3.393716,0.310006,1,407340.0,4412550.0,Montesion,Toledo


In [10]:
numerical_variables = ['elevacion', 'slope', 'orientacion_sen', 'orientacion_cos', 'altura', 'lfcc', 'anomalia', 'dpv', 'vel_media_viento']
categorical_variables = ['erodi', 'inflam', 'mcroth']
variables = numerical_variables + categorical_variables
target_real = 'severidad_real'
target_discrete = 'severidad_discreta'

# Regresión

In [28]:
X = df[variables]
y = df[target_real]

In [29]:
X.sample(5, random_state=seed)

Unnamed: 0,elevacion,slope,orientacion_sen,orientacion_cos,altura,lfcc,anomalia,dpv,vel_media_viento,erodi,inflam,mcroth
119340,1024.904053,15.177,0.99465,0.103298,9.0,46.956501,73.561211,2.246882,2.916394,4,3,2
58023,505.352997,25.718,0.206331,0.978482,3.0,19.892401,86.961823,2.385298,4.876472,1,4,3
46997,388.710999,31.593,-0.984,0.178167,1.0,0.9569,79.168137,2.319516,5.026659,2,4,3
67214,1063.177002,22.364,0.901304,-0.433188,2.0,7.424201,50.321835,0.349119,2.470344,1,4,1
153590,507.516998,13.911,0.872081,0.489362,3.0,32.275799,72.89386,1.469779,3.393716,3,4,3


In [30]:
y.sample(5, random_state=seed)

119340    0.845614
58023     0.270908
46997     0.412787
67214     0.576249
153590    0.310006
Name: severidad_real, dtype: float64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

In [32]:
def optimize_params(estimator, X, y, cv, scoring=None, refit=True, **param_grid):
    t=time()
    
    # GridSearch over specified parameter values for an estimator
    grid_search_cv = GridSearchCV(estimator,
                                  param_grid,
                                  scoring=scoring,
                                  refit=refit,
                                  cv=cv,
                                  verbose=1,
                                  n_jobs=10,
                                  return_train_score=True).fit(X, y)

    cv_results = pd.DataFrame(grid_search_cv.cv_results_)

    # Drop the results for each validation split and sort by the validation metric
    labels = cv_results.filter(regex="split")
    by = cv_results.filter(regex="rank_test").columns[0]
    cv_results = cv_results.drop(labels, axis=1).sort_values(by)

    print(f'Time: {round(time()-t, 2)} seg.')
    display(cv_results)

    return grid_search_cv

In [33]:
n_splits = 4
n_repeats = 2

cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

In [34]:
preprocessing = make_column_transformer(
        (StandardScaler(), numerical_variables),
        (OneHotEncoder(handle_unknown='ignore'), categorical_variables),
        remainder='passthrough'
)

## Grid Search

### Linear Regression

In [35]:
poly = PolynomialFeatures(include_bias=False)
linear_reg = Ridge(random_state=seed)
linear_reg_pipeline = make_pipeline(preprocessing, poly, linear_reg)

param_grid = {
    'polynomialfeatures__degree': [1, 2, 3],
    'ridge__alpha': [0.01, 0.1, 0.5, 1, 2, 10]
}

linear_reg_gs = optimize_params(linear_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 18 candidates, totalling 144 fits
Time: 565.69 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomialfeatures__degree,param_ridge__alpha,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
13,40.040815,20.915225,1.178427,0.296734,3,0.1,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.377136,0.003145,1,0.39353,0.000562
14,30.10678,3.333619,1.112334,0.064227,3,0.5,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.374343,0.002965,2,0.389748,0.000529
15,37.367767,4.954834,1.919485,1.198218,3,1.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.373488,0.002121,3,0.387783,0.000524
16,37.071441,6.28254,2.173924,0.966095,3,2.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.372326,0.001986,4,0.385726,0.000547
17,52.082184,16.74888,1.914617,1.498325,3,10.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.368162,0.002225,5,0.380343,0.000656
12,387.790188,164.849716,1.14733,0.295897,3,0.01,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.363046,0.01385,6,0.396016,0.000531
7,1.391587,0.067497,0.196363,0.026523,2,0.1,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294344,0.003153,7,0.296912,0.001047
6,1.45919,0.064801,0.21464,0.018513,2,0.01,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294339,0.003155,8,0.296932,0.001048
8,1.462244,0.082832,0.21976,0.016667,2,0.5,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294311,0.003154,9,0.296877,0.001047
9,1.482839,0.054846,0.226309,0.018567,2,1.0,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.294295,0.003158,10,0.296856,0.001047


### KNeighbors

In [36]:
k_neighbors_reg = KNeighborsRegressor()
k_neighbours_reg_pipeline = make_pipeline(preprocessing, k_neighbors_reg)

param_grid = {
    'kneighborsregressor__n_neighbors': [5, 10, 20, 50, 100],
    'kneighborsregressor__weights': ['uniform', 'distance']
}

k_neighbors_reg_gs = optimize_params(k_neighbours_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 10 candidates, totalling 80 fits
Time: 1204.7 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsregressor__n_neighbors,param_kneighborsregressor__weights,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
5,0.109064,0.010895,36.104413,0.627193,20,distance,"{'kneighborsregressor__n_neighbors': 20, 'knei...",0.402597,0.003619,1,1.0,3.3159e-14
3,0.176094,0.064612,37.31927,0.694311,10,distance,"{'kneighborsregressor__n_neighbors': 10, 'knei...",0.39418,0.003806,2,1.0,1.089603e-14
7,0.115325,0.010626,36.145114,1.063845,50,distance,"{'kneighborsregressor__n_neighbors': 50, 'knei...",0.390295,0.002747,3,1.0,1.432198e-13
4,0.126334,0.017113,36.125306,0.603631,20,uniform,"{'kneighborsregressor__n_neighbors': 20, 'knei...",0.390282,0.003577,4,0.450484,0.001433049
2,0.146398,0.053391,39.052291,1.775319,10,uniform,"{'kneighborsregressor__n_neighbors': 10, 'knei...",0.38587,0.003866,5,0.500988,0.001063657
6,0.115714,0.011893,36.79807,0.520943,50,uniform,"{'kneighborsregressor__n_neighbors': 50, 'knei...",0.374378,0.002643,6,0.400057,0.000934739
9,0.122661,0.008741,36.688955,1.014024,100,distance,"{'kneighborsregressor__n_neighbors': 100, 'kne...",0.373607,0.002664,7,1.0,4.100923e-13
8,0.114096,0.008244,36.743261,1.640566,100,uniform,"{'kneighborsregressor__n_neighbors': 100, 'kne...",0.356726,0.002653,8,0.369827,0.000878502
1,0.186924,0.052661,41.261918,0.657586,5,distance,"{'kneighborsregressor__n_neighbors': 5, 'kneig...",0.3567,0.005074,9,1.0,3.858923e-15
0,0.143028,0.015617,39.954793,0.330159,5,uniform,"{'kneighborsregressor__n_neighbors': 5, 'kneig...",0.35256,0.005337,10,0.572631,0.001512176


### Decision Tree

In [37]:
decision_tree_reg = DecisionTreeRegressor(random_state=seed)
decision_tree_reg_pipeline = make_pipeline(preprocessing, decision_tree_reg)

param_grid = {
    'decisiontreeregressor__max_depth': [3, 5, 7, 10, None],
    'decisiontreeregressor__min_samples_split': [2, 50, 200],
    'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'poisson'],
    'decisiontreeregressor__ccp_alpha': [0, 0.0001, 0.001, 0.01, 0.1]
}

decision_tree_reg_gs = optimize_params(decision_tree_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 225 candidates, totalling 1800 fits
Time: 1502.32 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decisiontreeregressor__ccp_alpha,param_decisiontreeregressor__criterion,param_decisiontreeregressor__max_depth,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
28,5.046501,0.151272,0.033228,0.008927,0,friedman_mse,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.526107,0.006673,1,7.199219e-01,2.555821e-03
13,5.100879,0.110822,0.032455,0.006109,0,squared_error,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.526011,0.006812,2,7.199219e-01,2.555821e-03
43,6.159351,0.115350,0.041011,0.005640,0,poisson,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.525571,0.007287,3,7.217258e-01,3.646081e-03
44,5.130538,0.169351,0.034449,0.009753,0,poisson,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.492008,0.006260,4,5.633251e-01,3.397572e-03
14,4.366225,0.232686,0.036934,0.006553,0,squared_error,,"{'decisiontreeregressor__ccp_alpha': 0, 'decis...",0.491750,0.006309,5,5.612105e-01,1.714298e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,3.433759,0.072381,0.026728,0.006654,0.01,friedman_mse,10,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16
159,3.458890,0.065902,0.028911,0.006616,0.01,friedman_mse,10,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16
158,2.587472,0.056617,0.023014,0.002128,0.01,friedman_mse,7,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16
156,2.636034,0.074822,0.028827,0.005116,0.01,friedman_mse,7,"{'decisiontreeregressor__ccp_alpha': 0.01, 'de...",-0.000031,0.000036,136,1.387779e-17,1.029204e-16


### Random Forest

In [38]:
random_forest_reg = RandomForestRegressor(random_state=seed)
random_forest_reg_pipeline = make_pipeline(preprocessing, random_forest_reg)

param_grid = {
    'randomforestregressor__n_estimators': [10, 25, 50],
    'randomforestregressor__max_depth': [3, 5, 7, 10, None]
}

random_forest_reg_gs = optimize_params(random_forest_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 15 candidates, totalling 120 fits
Time: 1080.51 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestregressor__max_depth,param_randomforestregressor__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
14,217.987472,9.501452,0.640658,0.072109,,50,"{'randomforestregressor__max_depth': None, 'ra...",0.696175,0.00327,1,0.955699,0.000138
13,134.171816,1.191305,0.451335,0.042784,,25,"{'randomforestregressor__max_depth': None, 'ra...",0.688685,0.003413,2,0.951485,0.000133
12,55.913423,0.534889,0.200781,0.020153,,10,"{'randomforestregressor__max_depth': None, 'ra...",0.665966,0.004384,3,0.938738,0.000295
11,133.636051,0.735355,0.245775,0.020334,10.0,50,"{'randomforestregressor__max_depth': 10, 'rand...",0.503961,0.004757,4,0.545528,0.002908
10,68.015156,0.245512,0.137034,0.01541,10.0,25,"{'randomforestregressor__max_depth': 10, 'rand...",0.501109,0.005459,5,0.542642,0.003379
9,27.572605,0.184246,0.069719,0.012505,10.0,10,"{'randomforestregressor__max_depth': 10, 'rand...",0.49448,0.0063,6,0.536179,0.004224
8,98.202944,0.387704,0.186118,0.016881,7.0,50,"{'randomforestregressor__max_depth': 7, 'rando...",0.36343,0.004306,7,0.376014,0.002262
7,49.977098,0.193271,0.101131,0.016585,7.0,25,"{'randomforestregressor__max_depth': 7, 'rando...",0.361419,0.005385,8,0.3741,0.003398
6,19.95404,0.317576,0.060874,0.013096,7.0,10,"{'randomforestregressor__max_depth': 7, 'rando...",0.357213,0.006144,9,0.369984,0.004554
5,72.085611,0.376602,0.147878,0.0155,5.0,50,"{'randomforestregressor__max_depth': 5, 'rando...",0.267666,0.003105,10,0.272108,0.001521


### XGBoost

In [39]:
xgb_reg = XGBRegressor(random_state=seed)
xgb_reg_pipeline = make_pipeline(preprocessing, xgb_reg)

param_grid = {
    'xgbregressor__n_estimators': [10, 25, 50, 100],
    'xgbregressor__max_depth': [3, 5, 7, 10, None],
    'xgbregressor__learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
}

xgb_reg_gs = optimize_params(xgb_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 120 candidates, totalling 960 fits
Time: 251.98 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbregressor__learning_rate,param_xgbregressor__max_depth,param_xgbregressor__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
115,6.384209,0.737279,0.380800,0.044614,0.2,10,100,"{'xgbregressor__learning_rate': 0.2, 'xgbregre...",0.646875,0.004021,1,0.837901,0.003577
95,6.838453,0.616383,0.399817,0.046864,0.1,10,100,"{'xgbregressor__learning_rate': 0.1, 'xgbregre...",0.634160,0.003125,2,0.769770,0.002125
114,4.363920,0.420120,0.201637,0.026717,0.2,10,50,"{'xgbregressor__learning_rate': 0.2, 'xgbregre...",0.627759,0.004357,3,0.767062,0.004974
75,8.806289,0.826498,0.457196,0.049709,0.05,10,100,"{'xgbregressor__learning_rate': 0.05, 'xgbregr...",0.607905,0.002535,4,0.712821,0.003404
94,4.575071,0.378477,0.213526,0.023979,0.1,10,50,"{'xgbregressor__learning_rate': 0.1, 'xgbregre...",0.607413,0.003199,5,0.713196,0.002370
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,0.814251,0.018260,0.042013,0.004644,0.001,3,25,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.007196,0.000049,116,0.007266,0.000071
8,0.717513,0.035032,0.043664,0.009188,0.001,7,10,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.006350,0.000164,117,0.006502,0.000217
16,0.939293,0.019620,0.070404,0.010968,0.001,,10,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.005555,0.000189,118,0.005662,0.000230
4,0.572232,0.063041,0.038998,0.006472,0.001,5,10,"{'xgbregressor__learning_rate': 0.001, 'xgbreg...",0.004798,0.000107,119,0.004877,0.000129


### SGD

In [40]:
sgd_reg = SGDRegressor(random_state=seed)
sgd_reg_pipeline = make_pipeline(preprocessing, sgd_reg)

param_grid = {
    'sgdregressor__alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.8],
    'sgdregressor__max_iter': [50, 100, 500, 1000],
    'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
}

sgd_reg_gs = optimize_params(sgd_reg_pipeline, X_train, y_train, cv, scoring='r2', **param_grid)

Fitting 8 folds for each of 240 candidates, totalling 1920 fits
Time: 111.33 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgdregressor__alpha,param_sgdregressor__loss,param_sgdregressor__max_iter,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
0,0.367657,0.019628,0.062427,0.034286,0.000001,squared_error,50,"{'sgdregressor__alpha': 1e-06, 'sgdregressor__...",0.158093,0.003779,1,0.158000,0.001420
1,0.404428,0.024598,0.042106,0.013183,0.000001,squared_error,100,"{'sgdregressor__alpha': 1e-06, 'sgdregressor__...",0.158093,0.003779,1,0.158000,0.001420
2,0.389496,0.036760,0.040738,0.009770,0.000001,squared_error,300,"{'sgdregressor__alpha': 1e-06, 'sgdregressor__...",0.158093,0.003779,1,0.158000,0.001420
3,0.432062,0.047194,0.047520,0.009703,0.000001,squared_error,500,"{'sgdregressor__alpha': 1e-06, 'sgdregressor__...",0.158093,0.003779,1,0.158000,0.001420
4,0.426448,0.047917,0.036334,0.005066,0.000001,squared_error,1000,"{'sgdregressor__alpha': 1e-06, 'sgdregressor__...",0.158093,0.003779,1,0.158000,0.001420
...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,0.412119,0.051163,0.038739,0.013656,0.1,huber,500,"{'sgdregressor__alpha': 0.1, 'sgdregressor__lo...",0.125366,0.004004,235,0.125486,0.004313
224,0.441676,0.056620,0.036228,0.004448,0.1,huber,300,"{'sgdregressor__alpha': 0.1, 'sgdregressor__lo...",0.125366,0.004004,235,0.125486,0.004313
223,0.391161,0.042848,0.039938,0.009523,0.1,huber,100,"{'sgdregressor__alpha': 0.1, 'sgdregressor__lo...",0.125366,0.004004,235,0.125486,0.004313
227,0.406121,0.047070,0.037053,0.007755,0.1,huber,2000,"{'sgdregressor__alpha': 0.1, 'sgdregressor__lo...",0.125366,0.004004,235,0.125486,0.004313


## Evaluación

In [42]:
reg_dict = {
    'Regresion lineal': linear_reg_gs,
    'KNeighbors' : k_neighbors_reg_gs,
    'Arbol de decision': decision_tree_reg_gs,
    'Random Forest': random_forest_reg_gs,
    'XGBoost': xgb_reg_gs,
    'SGD': sgd_reg_gs
}


df_results_class = pd.DataFrame({
    'Modelo': reg_dict.keys(),
    'Mejor puntuación': [gs.best_score_ for gs in reg_dict.values()]
})
df_results_class = df_results_class.sort_values(by='Mejor puntuación', ascending=False)
df_results_class

Unnamed: 0,Modelo,Mejor puntuacion
3,Random Forest,0.696175
4,XGBoost,0.646875
2,Arbol de decision,0.526107
1,KNeighbors,0.402597
0,Regresion lineal,0.377136
5,SGD,0.158093


In [43]:
model = random_forest_reg_gs.best_estimator_
model_path = os.path.join('models', 'experiment_1', 'model_reg.joblib')
dump(model, model_path)

['models\\model_1_reg.joblib']

In [47]:
model_path = os.path.join('models', 'experiment_1', 'model_reg.joblib')
model = load(model_path)
pred = model.predict(X_test)

print(f"""Métricas predicción del test
R2:  {r2_score(y_test, pred)}
MSE: {mean_squared_error(y_test, pred)}
MAE: {mean_absolute_error(y_test, pred)}""")

Métricas predicción del test
R2:  0.724605944426127
MSE: 0.008783141496746506
MAE: 0.06940586333961964


# Clasificación

In [49]:
X = df[variables]
y = df[target_discrete]

In [50]:
X.sample(5, random_state=seed)

Unnamed: 0,elevacion,slope,orientacion_sen,orientacion_cos,altura,lfcc,anomalia,dpv,vel_media_viento,erodi,inflam,mcroth
119340,1024.904053,15.177,0.99465,0.103298,9.0,46.956501,73.561211,2.246882,2.916394,4,3,2
58023,505.352997,25.718,0.206331,0.978482,3.0,19.892401,86.961823,2.385298,4.876472,1,4,3
46997,388.710999,31.593,-0.984,0.178167,1.0,0.9569,79.168137,2.319516,5.026659,2,4,3
67214,1063.177002,22.364,0.901304,-0.433188,2.0,7.424201,50.321835,0.349119,2.470344,1,4,1
153590,507.516998,13.911,0.872081,0.489362,3.0,32.275799,72.89386,1.469779,3.393716,3,4,3


In [51]:
y.sample(5, random_state=seed)

119340    3
58023     1
46997     1
67214     2
153590    1
Name: severidad_discreta, dtype: int64

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=seed)

In [15]:
def optimize_params(estimator, X, y, cv, scoring=None, refit=True, **param_grid):
    t=time()
    """Exhaustive search over specified parameter values for an estimator."""
    grid_search_cv = GridSearchCV(estimator,
                                  param_grid,
                                  scoring=scoring,
                                  refit=refit,
                                  cv=cv,
                                  verbose=1,
                                  n_jobs=10,
                                  return_train_score=True).fit(X, y)

    cv_results = pd.DataFrame(grid_search_cv.cv_results_)

    # Drop the results for each validation split and sort by the refit metric
    labels = cv_results.filter(regex="split")
    by = cv_results.filter(regex="rank_test").columns[0]
    cv_results = cv_results.drop(labels, axis=1).sort_values(by)

    print(f'Time: {round(time()-t, 2)} seg.')
    display(cv_results)

    return grid_search_cv

In [17]:
n_splits = 4
n_repeats = 2

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

In [18]:
preprocessing = make_column_transformer(
        (StandardScaler(), numerical_variables),
        (OneHotEncoder(handle_unknown='ignore'), categorical_variables),
        remainder='passthrough'
)

## Grid Search

### Logistic Regression

In [19]:
poly = PolynomialFeatures(include_bias=False)
logistic_reg = LogisticRegression(random_state=seed)
logistic_reg_pipeline = make_pipeline(preprocessing, poly, logistic_reg)

param_grid = {
    'polynomialfeatures__degree': [1, 2],
    'logisticregression__C': [0.01, 0.1, 1, 2, 5, 10],
    'logisticregression__max_iter': [100, 200]
}

logistic_reg_gs = optimize_params(logistic_reg_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 30 candidates, totalling 240 fits
Time: 2149.01 seg.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_logisticregression__max_iter,param_polynomialfeatures__degree,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
11,443.240402,458.673276,1.720291,0.541496,1.0,200,3,"{'logisticregression__C': 1, 'logisticregressi...",0.518241,0.00196,1,0.530237,0.000945
29,228.512113,40.517195,1.033737,0.3224,10.0,200,3,"{'logisticregression__C': 10, 'logisticregress...",0.517944,0.002143,2,0.530564,0.000884
5,311.529017,379.944671,3.414598,1.811152,0.1,200,3,"{'logisticregression__C': 0.1, 'logisticregres...",0.51787,0.002063,3,0.530275,0.001022
17,230.082301,157.178977,1.828555,1.312539,2.0,200,3,"{'logisticregression__C': 2, 'logisticregressi...",0.51786,0.001967,4,0.530779,0.000679
23,244.779746,189.291701,2.55895,1.69942,5.0,200,3,"{'logisticregression__C': 5, 'logisticregressi...",0.517744,0.001749,5,0.530293,0.001097
8,92.223264,12.85066,1.586185,0.574931,1.0,100,3,"{'logisticregression__C': 1, 'logisticregressi...",0.512918,0.002082,6,0.523292,0.000575
20,133.406392,108.706183,3.424112,2.151539,5.0,100,3,"{'logisticregression__C': 5, 'logisticregressi...",0.512706,0.001557,7,0.523476,0.000826
26,136.369322,111.959254,1.570215,0.290275,10.0,100,3,"{'logisticregression__C': 10, 'logisticregress...",0.512531,0.002074,8,0.523189,0.000789
14,128.17535,92.807931,2.451749,1.182782,2.0,100,3,"{'logisticregression__C': 2, 'logisticregressi...",0.512311,0.002036,9,0.523351,0.000603
2,285.077466,231.863905,1.689844,0.226057,0.1,100,3,"{'logisticregression__C': 0.1, 'logisticregres...",0.511939,0.001989,10,0.523031,0.001014


### KNeighbors

In [20]:
k_neighbors_class = KNeighborsClassifier()
k_neighbours_class_pipeline = make_pipeline(preprocessing, k_neighbors_class)

param_grid = {
    'kneighborsclassifier__n_neighbors': [5, 10, 20, 50, 100],
    'kneighborsclassifier__weights': ['uniform', 'distance']
}

k_neighbors_class_gs = optimize_params(k_neighbours_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 10 candidates, totalling 80 fits
Time: 1368.94 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,param_kneighborsclassifier__weights,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
5,0.1264,0.01278,42.660717,0.423654,20,distance,"{'kneighborsclassifier__n_neighbors': 20, 'kne...",0.52621,0.0021,1,1.0,0.0
7,0.129149,0.019783,41.841677,0.432103,50,distance,"{'kneighborsclassifier__n_neighbors': 50, 'kne...",0.524804,0.00256,2,1.0,0.0
4,0.122472,0.012842,43.237787,0.223263,20,uniform,"{'kneighborsclassifier__n_neighbors': 20, 'kne...",0.521357,0.002475,3,0.576024,0.000939
9,0.127509,0.017777,43.637817,0.285656,100,distance,"{'kneighborsclassifier__n_neighbors': 100, 'kn...",0.518494,0.002023,4,1.0,0.0
3,0.12811,0.009401,43.875219,0.403632,10,distance,"{'kneighborsclassifier__n_neighbors': 10, 'kne...",0.517919,0.002027,5,1.0,0.0
6,0.127023,0.009336,42.316413,0.852796,50,uniform,"{'kneighborsclassifier__n_neighbors': 50, 'kne...",0.517525,0.002922,6,0.543023,0.000762
2,0.122265,0.007661,42.922589,1.112874,10,uniform,"{'kneighborsclassifier__n_neighbors': 10, 'kne...",0.514632,0.001903,7,0.608842,0.001116
8,0.134658,0.012356,42.982573,0.607287,100,uniform,"{'kneighborsclassifier__n_neighbors': 100, 'kn...",0.509992,0.002022,8,0.523552,0.000835
1,0.120996,0.012453,41.818116,0.517395,5,distance,"{'kneighborsclassifier__n_neighbors': 5, 'knei...",0.503075,0.002736,9,1.0,0.0
0,0.135024,0.023822,42.122886,0.337946,5,uniform,"{'kneighborsclassifier__n_neighbors': 5, 'knei...",0.49974,0.002709,10,0.658221,0.000993


### Decision Tree

In [21]:
decision_tree_class = DecisionTreeClassifier(random_state=seed)
decision_tree_class_pipeline = make_pipeline(preprocessing, decision_tree_class)

param_grid = {
    'decisiontreeclassifier__max_depth': [3, 5, 7, 10, None],
    'decisiontreeclassifier__min_samples_split': [2, 50, 200],
    'decisiontreeclassifier__criterion': ['entropy', 'gini'],
    'decisiontreeclassifier__ccp_alpha': [0, 0.0001, 0.001, 0.01, 0.1]
}

decision_tree_class_gs = optimize_params(decision_tree_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 150 candidates, totalling 1200 fits
Time: 535.27 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decisiontreeclassifier__ccp_alpha,param_decisiontreeclassifier__criterion,param_decisiontreeclassifier__max_depth,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
42,11.770180,0.160472,0.054351,0.010750,0.0001,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.0001, ...",0.581348,0.003109,1,0.698796,0.002230
43,7.806605,0.211261,0.054392,0.010480,0.0001,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.0001, ...",0.574542,0.002661,2,0.663367,0.002057
13,7.461764,0.124419,0.057015,0.009566,0,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0, 'deci...",0.571682,0.001683,3,0.691800,0.001180
28,5.518354,0.083520,0.055263,0.005500,0,gini,,"{'decisiontreeclassifier__ccp_alpha': 0, 'deci...",0.568135,0.002630,4,0.705379,0.001407
12,8.581525,0.110795,0.053660,0.010436,0,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0, 'deci...",0.567436,0.002638,5,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,1.741134,0.044919,0.045847,0.007936,0.1,entropy,3,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004
120,1.727384,0.034572,0.043254,0.007419,0.1,entropy,3,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004
148,5.690282,0.074390,0.045387,0.009387,0.1,gini,,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004
133,7.855267,0.140384,0.046464,0.006132,0.1,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.1, 'de...",0.206183,0.000013,121,0.206183,0.000004


### Random Forest

In [22]:
random_forest_class = RandomForestClassifier(random_state=seed)
random_forest_class_pipeline = make_pipeline(preprocessing, random_forest_class)

param_grid = {
    'randomforestclassifier__n_estimators': [10, 25, 50],
    'randomforestclassifier__max_depth': [3, 5, 7, None],
    'randomforestclassifier__criterion': ['entropy', 'gini'],
}

random_forest_class_gs = optimize_params(random_forest_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 24 candidates, totalling 192 fits
Time: 394.78 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestclassifier__criterion,param_randomforestclassifier__max_depth,param_randomforestclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
11,67.804655,0.54745,0.808632,0.064649,entropy,,50,{'randomforestclassifier__criterion': 'entropy...,0.64798,0.001786,1,0.999907,4.2e-05
23,42.825474,1.69221,0.607352,0.064526,gini,,50,"{'randomforestclassifier__criterion': 'gini', ...",0.646736,0.002364,2,0.999897,2.7e-05
10,32.633367,0.759254,0.409323,0.023843,entropy,,25,{'randomforestclassifier__criterion': 'entropy...,0.63886,0.002775,3,0.99884,0.000175
22,26.550581,0.243329,0.425691,0.027004,gini,,25,"{'randomforestclassifier__criterion': 'gini', ...",0.63784,0.002499,4,0.998774,0.000141
9,13.56657,0.529937,0.18469,0.021004,entropy,,10,{'randomforestclassifier__criterion': 'entropy...,0.616763,0.001657,5,0.986164,0.000234
21,11.265668,0.151762,0.191543,0.017305,gini,,10,"{'randomforestclassifier__criterion': 'gini', ...",0.613562,0.002704,6,0.986058,0.000366
7,14.567633,0.164105,0.200347,0.024862,entropy,7.0,25,{'randomforestclassifier__criterion': 'entropy...,0.501909,0.003719,7,0.510395,0.002805
8,28.424646,0.20676,0.310892,0.015575,entropy,7.0,50,{'randomforestclassifier__criterion': 'entropy...,0.500546,0.00192,8,0.510031,0.001668
20,21.790407,0.170052,0.305204,0.017882,gini,7.0,50,"{'randomforestclassifier__criterion': 'gini', ...",0.500192,0.001494,9,0.510646,0.002393
19,11.153449,0.158048,0.175459,0.004652,gini,7.0,25,"{'randomforestclassifier__criterion': 'gini', ...",0.499314,0.002744,10,0.508579,0.002553


### XGBoost

In [23]:
xgb_class = XGBClassifier(random_state=seed)
xgb_class_pipeline = make_pipeline(preprocessing, xgb_class)

param_grid = {
    'xgbclassifier__n_estimators': [10, 25, 50, 100],
    'xgbclassifier__max_depth': [3, 5, 7, 10, None],
    'xgbclassifier__learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
}

xgb_class_gs = optimize_params(xgb_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 120 candidates, totalling 960 fits
Time: 860.55 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbclassifier__learning_rate,param_xgbclassifier__max_depth,param_xgbclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
115,21.124042,0.302633,1.561242,0.059328,0.2,10,100,"{'xgbclassifier__learning_rate': 0.2, 'xgbclas...",0.627195,0.003514,1,0.868331,0.003146
95,24.412941,0.293710,1.685831,0.070886,0.1,10,100,"{'xgbclassifier__learning_rate': 0.1, 'xgbclas...",0.618593,0.002975,2,0.793995,0.002288
114,12.925971,0.162241,0.897412,0.047517,0.2,10,50,"{'xgbclassifier__learning_rate': 0.2, 'xgbclas...",0.617582,0.002737,3,0.797319,0.002549
94,15.209643,0.172570,1.013881,0.038459,0.1,10,50,"{'xgbclassifier__learning_rate': 0.1, 'xgbclas...",0.608337,0.002692,4,0.742797,0.002302
75,29.610080,0.277972,1.838876,0.147562,0.05,10,100,"{'xgbclassifier__learning_rate': 0.05, 'xgbcla...",0.608276,0.003364,5,0.743276,0.001778
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,2.447839,0.100219,0.114016,0.008543,0.01,3,25,"{'xgbclassifier__learning_rate': 0.01, 'xgbcla...",0.422515,0.006108,116,0.424656,0.003539
1,2.223206,0.031152,0.101435,0.009668,0.001,3,25,"{'xgbclassifier__learning_rate': 0.001, 'xgbcl...",0.421914,0.005311,117,0.423883,0.005458
2,3.881504,0.052472,0.141498,0.013096,0.001,3,50,"{'xgbclassifier__learning_rate': 0.001, 'xgbcl...",0.421907,0.005528,118,0.423886,0.005106
20,1.412193,0.029793,0.096262,0.014096,0.005,3,10,"{'xgbclassifier__learning_rate': 0.005, 'xgbcl...",0.421899,0.005463,119,0.423806,0.005166


### SGD

In [24]:
sgd_class = SGDClassifier(random_state=seed)
sgd_class_pipeline = make_pipeline(preprocessing, sgd_class)

param_grid = {
    'sgdclassifier__alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.8],
    'sgdclassifier__max_iter': [50, 100, 500, 1000],
    'sgdclassifier__loss': ['log_loss', 'hinge', 'modified_huber']
}

sgd_class_gs = optimize_params(sgd_class_pipeline, X_train, y_train, cv, scoring='f1_weighted', **param_grid)

Fitting 8 folds for each of 180 candidates, totalling 1440 fits
Time: 1377.26 seg.


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgdclassifier__alpha,param_sgdclassifier__loss,param_sgdclassifier__max_iter,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
112,2.515418,0.120805,0.059986,0.006223,0.0001,log_loss,1000,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
111,2.511172,0.110801,0.059513,0.009936,0.0001,log_loss,500,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
110,2.498599,0.094907,0.057342,0.008019,0.0001,log_loss,300,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
109,2.528776,0.106622,0.054987,0.007011,0.0001,log_loss,100,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
108,2.590369,0.143261,0.057712,0.011486,0.0001,log_loss,50,"{'sgdclassifier__alpha': 0.0001, 'sgdclassifie...",0.409246,0.004959,1,0.409885,0.004468
...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,1.198717,0.040953,0.062393,0.010062,0.1,log_loss,1000,"{'sgdclassifier__alpha': 0.1, 'sgdclassifier__...",0.319164,0.001996,175,0.319228,0.001576
167,1.210795,0.057262,0.064693,0.010072,0.1,log_loss,2000,"{'sgdclassifier__alpha': 0.1, 'sgdclassifier__...",0.319164,0.001996,175,0.319228,0.001576
163,1.205162,0.038457,0.053555,0.005227,0.1,log_loss,100,"{'sgdclassifier__alpha': 0.1, 'sgdclassifier__...",0.319164,0.001996,175,0.319228,0.001576
162,1.208990,0.040598,0.063363,0.011684,0.1,log_loss,50,"{'sgdclassifier__alpha': 0.1, 'sgdclassifier__...",0.319164,0.001996,175,0.319228,0.001576


## Evaluación

In [25]:
class_dict = {
    'Regresion logistica': logistic_reg_gs,
    'KNeighbors' : k_neighbors_class_gs,
    'Arbol de decision': decision_tree_class_gs,
    'Random Forest': random_forest_class_gs,
    'XGBoost': xgb_class_gs,
    'SGD': sgd_class_gs
}


df_results_class = pd.DataFrame({
    'Modelo': class_dict.keys(),
    'Mejor puntuacion': [gs.best_score_ for gs in class_dict.values()]
})
df_results_class = df_results_class.sort_values(by='Mejor puntuacion', ascending=False)
df_results_class

Unnamed: 0,Modelo,Mejor puntuacion
3,Random Forest,0.64798
4,XGBoost,0.627195
2,Arbol de decision,0.581348
1,KNeighbors,0.52621
0,Regresion logistica,0.518241
5,SGD,0.409246


In [26]:
model = random_forest_class_gs.best_estimator_
model_path = os.path.join('models', 'experiment_1', 'model_class.joblib')
dump(model, model_path)

['models\\model_1_class.joblib']

In [53]:
model_path = os.path.join('models', 'experiment_1', 'model_class.joblib')
model = load(model_path)
pred = model.predict(X_test)

print(f"""Métricas predicción del test
F1:        {f1_score(y_test, pred, average='weighted')}
Recall:    {recall_score(y_test, pred, average='weighted')}
Precision: {precision_score(y_test, pred, average='weighted')}
Accuracy:  {accuracy_score(y_test, pred)}""")

Métricas predicción del test
F1:        0.660617605233689
Recall:    0.6600650010156409
Precision: 0.6633829307943221
Accuracy:  0.6600650010156409
