# Analysis of Models

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

## Preparing Data

In [2]:
df = pd.read_csv('../dataset.csv')
df.head(5)

Unnamed: 0,LOTE,Fecha/hora inicio_preinoculo,Fecha/hora fin_preinoculo,línea 1,línea 2,línea 3,línea 1.1,línea 2.1,línea 3.1,línea 1.2,...,Viabilidad final cultivo_cultivo_final,ID Centrífuga,Centrifugación 1 turbidez,Centrifugación 2 turbidez,Producto 1,Producto 2,media_PV,media_temp_bioreactor,media_ph_biorreactor,media_PO_biorreactor
0,23019,2023-03-19 05:00:00,2023-03-20 07:24:00,,,,,,,,...,184000000,17825,,,1747.92,6.0,0.409767,29.689535,6.191826,24.35514
1,23020,2023-03-19 05:00:00,2023-03-20 07:24:00,,,,,,,,...,181600000,14246,,,1676.16,6.56,0.426746,29.689535,6.191826,24.35514
2,23021,2023-03-20 05:00:00,2023-03-21 06:37:00,,,,,,,,...,248000000,17825,,,1928.496,8.08,0.737972,29.439842,6.028036,25.462923
3,23022,2023-03-20 05:00:00,2023-03-21 06:37:00,,,,,,,,...,229600000,12912,,,1782.8,5.92,0.147557,29.439842,6.028036,25.462923
4,23023,2023-03-26 05:00:00,2023-03-27 07:21:00,5.496,5.504,5.5200000000000005,28.32,27.92,32.0,1.0,...,132800000,17825,26.56,20.88,1861.84,2.96,0.408323,29.442474,6.002998,23.396945


In [3]:
target = df.iloc[:,31]
columns_to_drop = df.iloc[:, [0,1,2,12,13,14,20,21,22,23,28,31,32]].columns
df_dropped = df.drop(columns=columns_to_drop)

In [4]:
df.iloc[:,0]

0      23019
1      23020
2      23021
3      23022
4      23023
       ...  
128    24045
129    24044
130    24049
131    24050
132    24051
Name: LOTE, Length: 133, dtype: int64

In [5]:
data = df_dropped
data.replace('N.A', np.nan, inplace=True)

In [6]:
data['línea 3'] = data['línea 3'].astype(float)
data['línea 3.1'] = data['línea 3.1'].astype(float)
data['Viabilidad final cultivo_cultivo_final'] = pd.to_numeric(data['Viabilidad final cultivo_cultivo_final'], errors='coerce')
data['Viabilidad final cultivo_cultivo_final'] = data['Viabilidad final cultivo_cultivo_final'].astype(float)

In [7]:
def clean_and_convert(column):
    # Reemplazar ',' por '.' y eliminar el signo '+'
    column = column.str.replace(',', '.', regex=False)
    column = column.str.replace('+', '', regex=False)
    # Convertir a float, forzando errores a NaN
    return pd.to_numeric(column, errors='coerce')

# Aplicar la función a todas las columnas de tipo object
for col in data.select_dtypes(include=['object']).columns:
    data[col] = clean_and_convert(data[col])

In [8]:
for column in data.select_dtypes(include=[np.number]).columns:
    mean_value = data[column].mean()
    print(mean_value)
    data[column].fillna(mean_value, inplace=True) 

5.47023622047244
5.473310344827587
5.498252427184466
32.12409448818898
30.317241379310346
29.89669902912621
0.8217054263565892
0.6589147286821705
0.40310077519379844
150.33984962406018
15.338582677165354
26.21593984962406
98829227.06766917
1.0
81.60123076923077
17.337142857142858
73.51578947368422
169149494.94949493
30.14899224806202
23.60603174603175
0.5136521196513597
29.458078188226544
7.076652084297919
27.246767458045756


In [9]:
target.dtypes

dtype('float64')

In [10]:
# Definir características (X) y objetivo (y)
X = data
y = target

In [11]:
# Dividir el dataset en entrenamiento, validación y prueba
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)  # 60% entrenamiento
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 20% validación, 20% prueba

## Model 1: XGBRegressor

In [12]:
# Definir el pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),               # Escalado de las características
    ('regressor', XGBRegressor(random_state=42))  # Modelo XGBoost
])

# Definir los hiperparámetros a ajustar
param_distributions = {
    'regressor__n_estimators': [100, 200, 300, 500],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7, 10],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0]
}

# Realizar la búsqueda aleatoria (Random Search) para encontrar los mejores parámetros
random_search = GridSearchCV(pipeline, param_distributions, cv=5, 
                                   scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Entrenar el modelo
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


### Model 1 Results

In [13]:
print("Mejores parámetros encontrados:", random_search.best_params_)

y_val_pred = random_search.predict(X_val)

mse_val = mean_squared_error(y_val, y_val_pred)
print("Root Mean Squared Error en el conjunto de validación:", np.sqrt(mse_val))

Mejores parámetros encontrados: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.2, 'regressor__max_depth': 10, 'regressor__min_child_weight': 3, 'regressor__n_estimators': 100, 'regressor__subsample': 0.8}
Root Mean Squared Error en el conjunto de validación: 283.98945200991284


In [14]:
y_test_pred = random_search.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print("Root Mean Squared Error en el conjunto de prueba:", np.sqrt(mse_test))

Root Mean Squared Error en el conjunto de prueba: 319.3649662816627


## Model 2: 

In [15]:
# Definir el pipeline
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),              
    ('model', Ridge())
])

ridge_param_grid = {'model__alpha': [0.1, 1.0, 10.0, 100.0]}
ridge_grid_search = GridSearchCV(estimator=ridge_pipeline, param_grid=ridge_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

ridge_grid_search.fit(X_train, y_train)


### Model 2 Results

In [16]:
y_pred_ridge = ridge_grid_search.best_estimator_.predict(X_val)

rmse_ridge = np.sqrt(mean_squared_error(y_val, y_pred_ridge))
r2_ridge = r2_score(y_val, y_pred_ridge)

print(f"Ridge Regression - Best Params: {ridge_grid_search.best_params_}, RMSE: {rmse_ridge}, R2: {r2_ridge}")

Ridge Regression - Best Params: {'model__alpha': 100.0}, RMSE: 274.7414546467676, R2: -0.017557793326383964


In [17]:
y_test_ridge = ridge_grid_search.best_estimator_.predict(X_test)

rmse_ridge_test = np.sqrt(mean_squared_error(y_test, y_test_ridge))
r2_ridge_test = r2_score(y_test, y_test_ridge)

print(f"Ridge Regression: RMSE: {rmse_ridge_test}, R2: {r2_ridge_test}")

Ridge Regression: RMSE: 339.88775301378314, R2: 0.12099662433362035


## Model 3: Random Forest Regressor

In [18]:
# Random Forest Regressor
rf_pipeline = Pipeline([
    ('model', RandomForestRegressor(random_state=42))
])

# GridSearchCV for Random Forest
rf_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['auto', 'sqrt']
}

rf_grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=rf_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

rf_grid_search.fit(X_train, y_train)

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_para

## Model 3 Results

In [19]:
y_pred_rf = rf_grid_search.best_estimator_.predict(X_val)

rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred_rf))
r2_rf = r2_score(y_val, y_pred_rf)

print(f"Random Forest Regression - Best Params: {rf_grid_search.best_params_}, RMSE: {rmse_rf}, R2: {r2_rf}")

Random Forest Regression - Best Params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}, RMSE: 257.43550149319833, R2: 0.10659665574950106


In [20]:
y_test_rf = rf_grid_search.best_estimator_.predict(X_test)

rmse_rf_test = np.sqrt(mean_squared_error(y_test, y_test_rf))
r2_rf_test = r2_score(y_test, y_test_rf)

print(f"Ridge Regression: RMSE: {rmse_rf_test}, R2: {r2_rf_test}")

Ridge Regression: RMSE: 306.83665091269296, R2: 0.2836356281562781


## Model 4: Gradient Boosting Regressor

In [21]:
# Gradient Boosting Regressor
gbr_pipeline = Pipeline([
    ('model', GradientBoostingRegressor(random_state=42))
])

# GridSearchCV for Gradient Boosting
gbr_param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 5],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['auto', 'sqrt', 'log2']
}

gbr_grid_search = GridSearchCV(estimator=gbr_pipeline, param_grid=gbr_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

gbr_grid_search.fit(X_train, y_train)

160 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_para

### Model 4 Results

In [22]:
y_pred_gbr = gbr_grid_search.best_estimator_.predict(X_val)

rmse_gbr = np.sqrt(mean_squared_error(y_val, y_pred_gbr))
r2_gbr = r2_score(y_val, y_pred_gbr)

print(f"Gradient Boosting - Best Params: {gbr_grid_search.best_params_}, RMSE: {rmse_gbr}, R2: {r2_gbr}")

Gradient Boosting - Best Params: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}, RMSE: 257.3364216410847, R2: 0.10728421621872508


In [23]:
y_test_gbr = gbr_grid_search.best_estimator_.predict(X_test)

rmse_gbr_test = np.sqrt(mean_squared_error(y_test, y_test_gbr))
r2_gbr_test = r2_score(y_test, y_test_gbr)

print(f"Gradient Boosting: RMSE: {rmse_gbr_test}, R2: {r2_gbr_test}")

Gradient Boosting: RMSE: 312.15755694471096, R2: 0.2585750154380769


## Model 5: Support Vector Regression

In [24]:
# Support Vector Regression (SVR)
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVR())
])

# GridSearchCV for SVR
svr_param_grid = {
    'model__C': [0.1, 1.0, 10.0],
    'model__epsilon': [0.01, 0.1, 1.0],
    'model__kernel': ['linear', 'rbf']
}

svr_grid_search = GridSearchCV(estimator=svr_pipeline, param_grid=svr_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

svr_grid_search.fit(X_train, y_train)

### Model 5 Results

In [25]:
y_pred_svr = svr_grid_search.best_estimator_.predict(X_val)

rmse_svr = np.sqrt(mean_squared_error(y_val, y_pred_svr))
r2_svr = r2_score(y_val, y_pred_svr)

print(f"Support Vector Regression - Best Params: {svr_grid_search.best_params_}, RMSE: {rmse_svr}, R2: {r2_svr}")

Support Vector Regression - Best Params: {'model__C': 10.0, 'model__epsilon': 1.0, 'model__kernel': 'rbf'}, RMSE: 274.2327926248016, R2: -0.013793426102816086


In [26]:
y_test_svr = svr_grid_search.best_estimator_.predict(X_test)

rmse_svr_test = np.sqrt(mean_squared_error(y_test, y_test_svr))
r2_svr_test = r2_score(y_test, y_test_svr)

print(f"Support Vector Regression: RMSE: {rmse_svr_test}, R2: {r2_svr_test}")

Support Vector Regression: RMSE: 359.88078755701576, R2: 0.014544914630394423


## Model 6: KNN Regression

In [27]:
 # K-Nearest Neighbors (KNN) Regressor
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsRegressor())
])

# GridSearchCV for KNN
knn_param_grid = {
    'model__n_neighbors': [3, 5, 7, 9],
    'model__weights': ['uniform', 'distance']
}

knn_grid_search = GridSearchCV(estimator=knn_pipeline, param_grid=knn_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

knn_grid_search.fit(X_train, y_train)

### Model 6 Results

In [28]:
y_pred_knn = knn_grid_search.best_estimator_.predict(X_val)

rmse_knn = np.sqrt(mean_squared_error(y_val, y_pred_knn))
r2_knn = r2_score(y_val, y_pred_knn)

print(f"KNN - Best Params: {knn_grid_search.best_params_}, RMSE: {rmse_knn}, R2: {r2_knn}")

KNN - Best Params: {'model__n_neighbors': 9, 'model__weights': 'distance'}, RMSE: 282.515964081912, R2: -0.0759613844750433


In [29]:
y_test_knn = knn_grid_search.best_estimator_.predict(X_test)

rmse_knn_test = np.sqrt(mean_squared_error(y_test, y_test_knn))
r2_knn_test = r2_score(y_test, y_test_knn)

print(f"KNN: RMSE: {rmse_knn_test}, R2: {r2_knn_test}")

KNN: RMSE: 346.8211167260098, R2: 0.08476930906128599


## More models

In [30]:
models = [
    {
        'name': 'Linear Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', LinearRegression())
        ]),
        'param_grid': {
            # LinearRegression hyperparams
        }
    },
    {
        'name': 'Ridge Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', Ridge())
        ]),
        'param_grid': {
            'model__alpha': [0.1, 1.0, 10.0, 100.0]
        }
    },
    {
        'name': 'Lasso Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', Lasso(max_iter=10000))
        ]),
        'param_grid': {
            'model__alpha': [0.01, 0.1, 1.0, 10.0]
        }
    },
    {
        'name': 'Random Forest',
        'estimator': Pipeline([
            ('model', RandomForestRegressor(random_state=42))
        ]),
        'param_grid': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5],
            'model__min_samples_leaf': [1, 2],
            'model__max_features': ['auto', 'sqrt']
        }
    },
    {
        'name': 'Gradient Boosting',
        'estimator': Pipeline([
            ('model', GradientBoostingRegressor(random_state=42))
        ]),
        'param_grid': {
            'model__n_estimators': [100, 200],
            'model__learning_rate': [0.01, 0.1],
            'model__max_depth': [3, 5],
            'model__min_samples_split': [2, 5],
            'model__min_samples_leaf': [1, 2],
            'model__max_features': ['auto', 'sqrt', 'log2']
        }
    },
    {
        'name': 'Support Vector Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', SVR())
        ]),
        'param_grid': {
            'model__C': [0.1, 1.0, 10.0],
            'model__epsilon': [0.01, 0.1, 1.0],
            'model__kernel': ['linear', 'rbf']
        }
    },
    {
        'name': 'K-Nearest Neighbors',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', KNeighborsRegressor())
        ]),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7, 9],
            'model__weights': ['uniform', 'distance']
        }
    }
]

In [31]:

best_models = {}


for m in models:
    print(f"Entrenando y optimizando modelo: {m['name']}")
    grid_search = GridSearchCV(
        estimator=m['estimator'],
        param_grid=m['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    
    try:
        grid_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error al entrenar {m['name']}: {e}")
        continue
    
  
    best_models[m['name']] = {
        'model': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'best_score': -grid_search.best_score_  
    }
    
    y_pred_val = grid_search.best_estimator_.predict(X_val)
    mse_val = mean_squared_error(y_val, y_pred_val)
    r2_val = r2_score(y_val, y_pred_val)
    rmse_val = np.sqrt(mse_val)
    best_models[m['name']]['rmse_val'] = rmse_val
    print(f"Mejores hiperparámetros para {m['name']}: {grid_search.best_params_}")
    print(f"MSE en validación para {m['name']}: {mse_val}")
    print(f"RMSE en validación para {m['name']}: {rmse_val}")
    print(f"R2 en validación para {m['name']}: {r2_val}")
    print("-" * 50)

Entrenando y optimizando modelo: Linear Regression
Mejores hiperparámetros para Linear Regression: {}
MSE en validación para Linear Regression: 164135.9581766447
RMSE en validación para Linear Regression: 405.13696224443
R2 en validación para Linear Regression: -1.2126587166576228
--------------------------------------------------
Entrenando y optimizando modelo: Ridge Regression
Mejores hiperparámetros para Ridge Regression: {'model__alpha': 100.0}
MSE en validación para Ridge Regression: 75482.86690142186
RMSE en validación para Ridge Regression: 274.7414546467676
R2 en validación para Ridge Regression: -0.017557793326383964
--------------------------------------------------
Entrenando y optimizando modelo: Lasso Regression
Mejores hiperparámetros para Lasso Regression: {'model__alpha': 10.0}
MSE en validación para Lasso Regression: 112494.18464967328
RMSE en validación para Lasso Regression: 335.40152750050686
R2 en validación para Lasso Regression: -0.5164942588053194
-------------

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
44 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_para

Mejores hiperparámetros para Random Forest: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
MSE en validación para Random Forest: 66273.03742905453
RMSE en validación para Random Forest: 257.43550149319833
R2 en validación para Random Forest: 0.10659665574950106
--------------------------------------------------
Entrenando y optimizando modelo: Gradient Boosting


160 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
104 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_par

Mejores hiperparámetros para Gradient Boosting: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}
MSE en validación para Gradient Boosting: 66222.03390303813
RMSE en validación para Gradient Boosting: 257.3364216410847
R2 en validación para Gradient Boosting: 0.10728421621872508
--------------------------------------------------
Entrenando y optimizando modelo: Support Vector Regression
Mejores hiperparámetros para Support Vector Regression: {'model__C': 10.0, 'model__epsilon': 1.0, 'model__kernel': 'rbf'}
MSE en validación para Support Vector Regression: 75203.62455079744
RMSE en validación para Support Vector Regression: 274.2327926248016
R2 en validación para Support Vector Regression: -0.013793426102816086
--------------------------------------------------
Entrenando y optimizando modelo: K-Nearest Neighbors
Mejores hiperparámetros para K-Nearest Neighbors: {'

In [32]:
# Comparar modelos basados en RMSE
print("Comparación de modelos por RMSE en validación:\n")
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE (cross-validation): {info['best_score']}")
    print(f"RMSE en validación: {info['rmse_val']}")
    print(f"Mejores hiperparámetros: {info['best_params']}")
    print("-" * 50)

# Comparar y elegir el mejor modelo basado en el RMSE más bajo
mejor_modelo = min(best_models, key=lambda x: best_models[x]['rmse_val'])
print(f"El mejor modelo basado en RMSE en validación es: {mejor_modelo} con {best_models[mejor_modelo]['rmse_val']}")

Comparación de modelos por RMSE en validación:

Modelo: Linear Regression
MSE (cross-validation): 2041894.3898869082
RMSE en validación: 405.13696224443
Mejores hiperparámetros: {}
--------------------------------------------------
Modelo: Ridge Regression
MSE (cross-validation): 100988.90782242679
RMSE en validación: 274.7414546467676
Mejores hiperparámetros: {'model__alpha': 100.0}
--------------------------------------------------
Modelo: Lasso Regression
MSE (cross-validation): 203423.0667036909
RMSE en validación: 335.40152750050686
Mejores hiperparámetros: {'model__alpha': 10.0}
--------------------------------------------------
Modelo: Random Forest
MSE (cross-validation): 85430.76735885642
RMSE en validación: 257.43550149319833
Mejores hiperparámetros: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
--------------------------------------------------
Modelo: Gradient Boosting
MSE (

In [33]:
# Comparing models
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE (cross-validation): {info['best_score']}")
    
    print(f"Mejores hiperparámetros: {info['best_params']}")
    print("-" * 50)


Modelo: Linear Regression
MSE (cross-validation): 2041894.3898869082
Mejores hiperparámetros: {}
--------------------------------------------------
Modelo: Ridge Regression
MSE (cross-validation): 100988.90782242679
Mejores hiperparámetros: {'model__alpha': 100.0}
--------------------------------------------------
Modelo: Lasso Regression
MSE (cross-validation): 203423.0667036909
Mejores hiperparámetros: {'model__alpha': 10.0}
--------------------------------------------------
Modelo: Random Forest
MSE (cross-validation): 85430.76735885642
Mejores hiperparámetros: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
--------------------------------------------------
Modelo: Gradient Boosting
MSE (cross-validation): 91679.90977260828
Mejores hiperparámetros: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_spl

In [34]:
best_model_name = min(best_models, key=lambda k: best_models[k]['best_score'])
best_model = best_models[best_model_name]['model']

# Predecir en el conjunto de prueba
y_pred_test = best_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

print(f"Rendimiento en el conjunto de prueba con {best_model_name}:")
print(f"MSE en prueba: {mse_test}")
print(f"R2 en prueba: {r2_test}")
print(f"RMSE en prueba: {rmse_test}")

Rendimiento en el conjunto de prueba con Random Forest:
MSE en prueba: 94148.73034331782
R2 en prueba: 0.2836356281562781
RMSE en prueba: 306.83665091269296


In [35]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Evaluar todos los modelos en el conjunto de prueba
for name, info in best_models.items():
    best_model = info['model']
    
    # Predecir en el conjunto de prueba
    y_pred_test = best_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    
    # Guardar los resultados de prueba
    best_models[name]['mse_test'] = mse_test
    best_models[name]['rmse_test'] = rmse_test
    best_models[name]['r2_test'] = r2_test
    
    # Imprimir resultados
    print(f"Rendimiento en el conjunto de prueba con {name}:")
    print(f"MSE en prueba: {mse_test}")
    print(f"RMSE en prueba: {rmse_test}")
    print(f"R2 en prueba: {r2_test}")
    print("-" * 50)

# Comparar modelos basados en RMSE en el conjunto de prueba
mejor_modelo_test = min(best_models, key=lambda x: best_models[x]['rmse_test'])
print(f"El mejor modelo basado en RMSE en el conjunto de prueba es: {mejor_modelo_test} con {best_models[mejor_modelo_test]['rmse_test']}")

Rendimiento en el conjunto de prueba con Linear Regression:
MSE en prueba: 206888.96602465305
RMSE en prueba: 454.85048755019824
R2 en prueba: -0.5741888780358548
--------------------------------------------------
Rendimiento en el conjunto de prueba con Ridge Regression:
MSE en prueba: 115523.68464875846
RMSE en prueba: 339.88775301378314
R2 en prueba: 0.12099662433362035
--------------------------------------------------
Rendimiento en el conjunto de prueba con Lasso Regression:
MSE en prueba: 143126.06641665922
RMSE en prueba: 378.320058173842
R2 en prueba: -0.08902599418123724
--------------------------------------------------
Rendimiento en el conjunto de prueba con Random Forest:
MSE en prueba: 94148.73034331782
RMSE en prueba: 306.83665091269296
R2 en prueba: 0.2836356281562781
--------------------------------------------------
Rendimiento en el conjunto de prueba con Gradient Boosting:
MSE en prueba: 97442.34035769045
RMSE en prueba: 312.15755694471096
R2 en prueba: 0.258575015

In [36]:
# Importancia de las características del BEST MODEL 
if hasattr(best_model.named_steps['model'], 'feature_importances_'):
    import matplotlib.pyplot as plt

    feature_importances = best_model.named_steps['model'].feature_importances_
    features = X.columns

    # Crear un DataFrame para visualizar
    importances_df = pd.DataFrame({
        'Feature': features,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Graficar
    plt.figure(figsize=(10, 6))
    plt.barh(importances_df['Feature'], importances_df['Importance'])
    plt.gca().invert_yaxis()
    plt.xlabel('Importancia')
    plt.title('Importancia de las Características')
    plt.show()
else:
    print(f"El modelo {best_model_name} no proporciona importancias de características.")

El modelo Random Forest no proporciona importancias de características.


In [37]:
 # Make a df with the validation and test results to see it clearly
results = pd.DataFrame(best_models).T
results


Unnamed: 0,model,best_params,best_score,rmse_val,mse_test,rmse_test,r2_test
Linear Regression,"(StandardScaler(), LinearRegression())",{},2041894.389887,405.136962,206888.966025,454.850488,-0.574189
Ridge Regression,"(StandardScaler(), Ridge(alpha=100.0))",{'model__alpha': 100.0},100988.907822,274.741455,115523.684649,339.887753,0.120997
Lasso Regression,"(StandardScaler(), Lasso(alpha=10.0, max_iter=...",{'model__alpha': 10.0},203423.066704,335.401528,143126.066417,378.320058,-0.089026
Random Forest,"((DecisionTreeRegressor(max_features='sqrt', r...","{'model__max_depth': None, 'model__max_feature...",85430.767359,257.435501,94148.730343,306.836651,0.283636
Gradient Boosting,(([DecisionTreeRegressor(criterion='friedman_m...,"{'model__learning_rate': 0.01, 'model__max_dep...",91679.909773,257.336422,97442.340358,312.157557,0.258575
Support Vector Regression,"(StandardScaler(), SVR(C=10.0, epsilon=1.0))","{'model__C': 10.0, 'model__epsilon': 1.0, 'mod...",111095.185735,274.232793,129514.181253,359.880788,0.014545
K-Nearest Neighbors,"(StandardScaler(), KNeighborsRegressor(n_neigh...","{'model__n_neighbors': 9, 'model__weights': 'd...",97968.552306,282.515964,120284.887007,346.821117,0.084769
