# Analysis of Models

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

## Preparing Data

In [2]:
df = pd.read_csv('../dataset_new.csv')
df.head(5)

Unnamed: 0,LOTE,Fecha/hora inicio_preinoculo,Fecha/hora fin_preinoculo,línea 1,línea 2,línea 3,línea 1.1,línea 2.1,línea 3.1,línea 1.2,...,Centrifugación 1 turbidez,Centrifugación 2 turbidez,Producto 1,Producto 2,media_PV,temperature_preinoculo,humedad_preinoculo,media_temp_bioreactor,media_ph_biorreactor,media_PO_biorreactor
0,23019,2023-03-19 05:00:00,2023-03-20 07:24:00,,,,,,,,...,,,1747.92,6.0,0.330522,20.157241,50.33905,29.689535,6.191826,24.35514
1,23020,2023-03-19 05:00:00,2023-03-20 07:24:00,,,,,,,,...,,,1676.16,6.56,0.426711,20.157241,50.33905,29.689535,6.191826,24.35514
2,23021,2023-03-20 05:00:00,2023-03-21 06:37:00,,,,,,,,...,,,1928.496,8.08,0.409436,20.173527,52.099099,29.439842,6.028036,25.462923
3,23022,2023-03-20 05:00:00,2023-03-21 06:37:00,,,,,,,,...,,,1782.8,5.92,0.080985,20.173527,52.099099,29.439842,6.028036,25.462923
4,23023,2023-03-26 05:00:00,2023-03-27 07:21:00,5.496,5.504,5.5200000000000005,28.32,27.92,32.0,1.0,...,26.56,20.88,1861.84,2.96,0.368708,20.251146,48.900195,29.442474,6.002998,23.396945


In [3]:
target = df.iloc[:,41]
columns_to_drop = df.iloc[:, [0,1,2,13,14,12,22,23,24,30,31,32,33,38,41,42]].columns

df_dropped = df.drop(columns=columns_to_drop)

In [4]:
data = df_dropped
data.replace('N.A', np.nan, inplace=True)

In [5]:
data['línea 3'] = data['línea 3'].astype(float)
data['línea 3.1'] = data['línea 3.1'].astype(float)
data['Viabilidad final cultivo_cultivo_final'] = data['Viabilidad final cultivo_cultivo_final'].astype(float)

In [6]:
def clean_and_convert(column):
    # Reemplazar ',' por '.' y eliminar el signo '+'
    column = column.str.replace(',', '.', regex=False)
    column = column.str.replace('+', '', regex=False)
    # Convertir a float, forzando errores a NaN
    return pd.to_numeric(column, errors='coerce')

# Aplicar la función a todas las columnas de tipo object
for col in data.select_dtypes(include=['object']).columns:
    data[col] = clean_and_convert(data[col])

In [7]:
for column in data.select_dtypes(include=[np.number]).columns:
    mean_value = data[column].mean()
    print(mean_value)
    data[column].fillna(mean_value, inplace=True) 
for column in data.select_dtypes(exclude=[np.number]).columns:
    mode_value = data[column].mode()[0]  # Use the most frequent value
    data[column].fillna(mode_value, inplace=True)

5.468736
5.470035087719299
5.498217821782178
32.061440000000005
30.212631578947367
29.801188118811883
0.8188976377952756
0.6456692913385826
0.41732283464566927
150.5514503816794
15.36064
26.362137404580157
99092268.70229007
13.696641221374044
20.1720479825518
44926560.913380995
150.5514503816794
15.36064
26.362137404580157
99092268.70229007
1.0
81.55658914728681
17.380763358778626
74.06106870229007
168240203.56234095
30.287401574803148
23.35516129032258
0.3661848971527833
20.16984734592939
47.94785228808695
29.45899697760463
6.965512947242943
27.171243680776026


In [8]:
# Definir características (X) y objetivo (y)
X = data
y = target

In [9]:
# Dividir el dataset en entrenamiento, validación y prueba
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)  # 60% entrenamiento
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 20% validación, 20% prueba

## Model 1: XGBRegressor

In [11]:
# Definir el pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),               # Escalado de las características
    ('regressor', XGBRegressor(random_state=42))  # Modelo XGBoost
])

# Definir los hiperparámetros a ajustar
param_distributions = {
    'regressor__n_estimators': [100, 200, 300, 500],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7, 10],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0]
}

# Realizar la búsqueda aleatoria (Random Search) para encontrar los mejores parámetros
random_search = GridSearchCV(pipeline, param_distributions, cv=5, 
                                   scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Entrenar el modelo
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


### Model 1 Results

In [14]:
print("Mejores parámetros encontrados:", random_search.best_params_)

y_val_pred = random_search.predict(X_val)

mse_val = mean_squared_error(y_val, y_val_pred)
print("Root Mean Squared Error en el conjunto de validación:", np.sqrt(mse_val))

Mejores parámetros encontrados: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__min_child_weight': 3, 'regressor__n_estimators': 300, 'regressor__subsample': 0.8}
Root Mean Squared Error en el conjunto de validación: 284.7568073732479


In [15]:
y_test_pred = random_search.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print("Root Mean Squared Error en el conjunto de prueba:", np.sqrt(mse_test))

Root Mean Squared Error en el conjunto de prueba: 276.52240735733375


## Model 2: 

In [16]:
# Definir el pipeline
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),              
    ('model', Ridge())
])

ridge_param_grid = {'model__alpha': [0.1, 1.0, 10.0, 100.0]}
ridge_grid_search = GridSearchCV(estimator=ridge_pipeline, param_grid=ridge_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

ridge_grid_search.fit(X_train, y_train)


### Model 2 Results

In [17]:
y_pred_ridge = ridge_grid_search.best_estimator_.predict(X_val)

rmse_ridge = np.sqrt(mean_squared_error(y_val, y_pred_ridge))
r2_ridge = r2_score(y_val, y_pred_ridge)

print(f"Ridge Regression - Best Params: {ridge_grid_search.best_params_}, RMSE: {rmse_ridge}, R2: {r2_ridge}")

Ridge Regression - Best Params: {'model__alpha': 100.0}, RMSE: 314.1177444561765, R2: 0.2651666931495331


In [18]:
y_test_ridge = ridge_grid_search.best_estimator_.predict(X_test)

rmse_ridge_test = np.sqrt(mean_squared_error(y_test, y_test_ridge))
r2_ridge_test = r2_score(y_test, y_test_ridge)

print(f"Ridge Regression: RMSE: {rmse_ridge_test}, R2: {r2_ridge_test}")

Ridge Regression: RMSE: 230.17278206055036, R2: 0.22022343548541612


## Model 3: Random Forest Regressor

In [19]:
# Random Forest Regressor
rf_pipeline = Pipeline([
    ('model', RandomForestRegressor(random_state=42))
])

# GridSearchCV for Random Forest
rf_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['auto', 'sqrt']
}

rf_grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=rf_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

rf_grid_search.fit(X_train, y_train)

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_para

## Model 3 Results

In [20]:
y_pred_rf = rf_grid_search.best_estimator_.predict(X_val)

rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred_rf))
r2_rf = r2_score(y_val, y_pred_rf)

print(f"Random Forest Regression - Best Params: {rf_grid_search.best_params_}, RMSE: {rmse_rf}, R2: {r2_rf}")

Random Forest Regression - Best Params: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}, RMSE: 312.96747803889343, R2: 0.27053860526803364


In [21]:
y_test_rf = rf_grid_search.best_estimator_.predict(X_test)

rmse_rf_test = np.sqrt(mean_squared_error(y_test, y_test_rf))
r2_rf_test = r2_score(y_test, y_test_rf)

print(f"Ridge Regression: RMSE: {rmse_rf_test}, R2: {r2_rf_test}")

Ridge Regression: RMSE: 252.11541795986676, R2: 0.06446281167212597


## Model 4: Gradient Boosting Regressor

In [22]:
# Gradient Boosting Regressor
gbr_pipeline = Pipeline([
    ('model', GradientBoostingRegressor(random_state=42))
])

# GridSearchCV for Gradient Boosting
gbr_param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 5],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['auto', 'sqrt', 'log2']
}

gbr_grid_search = GridSearchCV(estimator=gbr_pipeline, param_grid=gbr_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

gbr_grid_search.fit(X_train, y_train)

160 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
115 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_par

### Model 4 Results

In [23]:
y_pred_gbr = gbr_grid_search.best_estimator_.predict(X_val)

rmse_gbr = np.sqrt(mean_squared_error(y_val, y_pred_gbr))
r2_gbr = r2_score(y_val, y_pred_gbr)

print(f"Gradient Boosting - Best Params: {gbr_grid_search.best_params_}, RMSE: {rmse_gbr}, R2: {r2_gbr}")

Gradient Boosting - Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 100}, RMSE: 310.66571262826653, R2: 0.2812290106140467


In [24]:
y_test_gbr = gbr_grid_search.best_estimator_.predict(X_test)

rmse_gbr_test = np.sqrt(mean_squared_error(y_test, y_test_gbr))
r2_gbr_test = r2_score(y_test, y_test_gbr)

print(f"Gradient Boosting: RMSE: {rmse_gbr_test}, R2: {r2_gbr_test}")

Gradient Boosting: RMSE: 282.7652341633496, R2: -0.17683145785499588


## Model 5: Support Vector Regression

In [25]:
# Support Vector Regression (SVR)
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVR())
])

# GridSearchCV for SVR
svr_param_grid = {
    'model__C': [0.1, 1.0, 10.0],
    'model__epsilon': [0.01, 0.1, 1.0],
    'model__kernel': ['linear', 'rbf']
}

svr_grid_search = GridSearchCV(estimator=svr_pipeline, param_grid=svr_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

svr_grid_search.fit(X_train, y_train)

### Model 5 Results

In [26]:
y_pred_svr = svr_grid_search.best_estimator_.predict(X_val)

rmse_svr = np.sqrt(mean_squared_error(y_val, y_pred_svr))
r2_svr = r2_score(y_val, y_pred_svr)

print(f"Support Vector Regression - Best Params: {svr_grid_search.best_params_}, RMSE: {rmse_svr}, R2: {r2_svr}")

Support Vector Regression - Best Params: {'model__C': 1.0, 'model__epsilon': 1.0, 'model__kernel': 'linear'}, RMSE: 336.5858742295102, R2: 0.1562852075702741


In [27]:
y_test_svr = svr_grid_search.best_estimator_.predict(X_test)

rmse_svr_test = np.sqrt(mean_squared_error(y_test, y_test_svr))
r2_svr_test = r2_score(y_test, y_test_svr)

print(f"Support Vector Regression: RMSE: {rmse_svr_test}, R2: {r2_svr_test}")

Support Vector Regression: RMSE: 245.80066369898537, R2: 0.11074084054108657


## Model 6: KNN Regression

In [28]:
 # K-Nearest Neighbors (KNN) Regressor
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsRegressor())
])

# GridSearchCV for KNN
knn_param_grid = {
    'model__n_neighbors': [3, 5, 7, 9],
    'model__weights': ['uniform', 'distance']
}

knn_grid_search = GridSearchCV(estimator=knn_pipeline, param_grid=knn_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

knn_grid_search.fit(X_train, y_train)

### Model 6 Results

In [29]:
y_pred_knn = knn_grid_search.best_estimator_.predict(X_val)

rmse_knn = np.sqrt(mean_squared_error(y_val, y_pred_knn))
r2_knn = r2_score(y_val, y_pred_knn)

print(f"KNN - Best Params: {knn_grid_search.best_params_}, RMSE: {rmse_knn}, R2: {r2_knn}")

KNN - Best Params: {'model__n_neighbors': 7, 'model__weights': 'distance'}, RMSE: 314.900217072142, R2: 0.26150116898753906


In [30]:
y_test_knn = knn_grid_search.best_estimator_.predict(X_test)

rmse_knn_test = np.sqrt(mean_squared_error(y_test, y_test_knn))
r2_knn_test = r2_score(y_test, y_test_knn)

print(f"KNN: RMSE: {rmse_knn_test}, R2: {r2_knn_test}")

KNN: RMSE: 288.5244885170533, R2: -0.22525816230810936


## More models

In [31]:
models = [
    {
        'name': 'Linear Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', LinearRegression())
        ]),
        'param_grid': {
            # LinearRegression hyperparams
        }
    },
    {
        'name': 'Ridge Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', Ridge())
        ]),
        'param_grid': {
            'model__alpha': [0.1, 1.0, 10.0, 100.0]
        }
    },
    {
        'name': 'Lasso Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', Lasso(max_iter=10000))
        ]),
        'param_grid': {
            'model__alpha': [0.01, 0.1, 1.0, 10.0]
        }
    },
    {
        'name': 'Random Forest',
        'estimator': Pipeline([
            ('model', RandomForestRegressor(random_state=42))
        ]),
        'param_grid': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5],
            'model__min_samples_leaf': [1, 2],
            'model__max_features': ['auto', 'sqrt']
        }
    },
    {
        'name': 'Gradient Boosting',
        'estimator': Pipeline([
            ('model', GradientBoostingRegressor(random_state=42))
        ]),
        'param_grid': {
            'model__n_estimators': [100, 200],
            'model__learning_rate': [0.01, 0.1],
            'model__max_depth': [3, 5],
            'model__min_samples_split': [2, 5],
            'model__min_samples_leaf': [1, 2],
            'model__max_features': ['auto', 'sqrt', 'log2']
        }
    },
    {
        'name': 'Support Vector Regression',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', SVR())
        ]),
        'param_grid': {
            'model__C': [0.1, 1.0, 10.0],
            'model__epsilon': [0.01, 0.1, 1.0],
            'model__kernel': ['linear', 'rbf']
        }
    },
    {
        'name': 'K-Nearest Neighbors',
        'estimator': Pipeline([
            ('scaler', StandardScaler()),
            ('model', KNeighborsRegressor())
        ]),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7, 9],
            'model__weights': ['uniform', 'distance']
        }
    }
]

In [32]:

best_models = {}


for m in models:
    print(f"Entrenando y optimizando modelo: {m['name']}")
    grid_search = GridSearchCV(
        estimator=m['estimator'],
        param_grid=m['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    
    try:
        grid_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error al entrenar {m['name']}: {e}")
        continue
    
  
    best_models[m['name']] = {
        'model': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'best_score': -grid_search.best_score_  
    }
    
    y_pred_val = grid_search.best_estimator_.predict(X_val)
    mse_val = mean_squared_error(y_val, y_pred_val)
    r2_val = r2_score(y_val, y_pred_val)
    rmse_val = np.sqrt(mse_val)
    best_models[m['name']]['rmse_val'] = rmse_val
    print(f"Mejores hiperparámetros para {m['name']}: {grid_search.best_params_}")
    print(f"MSE en validación para {m['name']}: {mse_val}")
    print(f"RMSE en validación para {m['name']}: {rmse_val}")
    print(f"R2 en validación para {m['name']}: {r2_val}")
    print("-" * 50)

Entrenando y optimizando modelo: Linear Regression
Mejores hiperparámetros para Linear Regression: {}
MSE en validación para Linear Regression: 76226.30271724031
RMSE en validación para Linear Regression: 276.0911130718269
R2 en validación para Linear Regression: 0.4323132635225104
--------------------------------------------------
Entrenando y optimizando modelo: Ridge Regression
Mejores hiperparámetros para Ridge Regression: {'model__alpha': 100.0}
MSE en validación para Ridge Regression: 98669.95738223582
RMSE en validación para Ridge Regression: 314.1177444561765
R2 en validación para Ridge Regression: 0.2651666931495331
--------------------------------------------------
Entrenando y optimizando modelo: Lasso Regression
Mejores hiperparámetros para Lasso Regression: {'model__alpha': 10.0}
MSE en validación para Lasso Regression: 63914.68496471499
RMSE en validación para Lasso Regression: 252.81353793797314
R2 en validación para Lasso Regression: 0.5240026391520158
-----------------

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
114 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_par

Mejores hiperparámetros para Random Forest: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
MSE en validación para Random Forest: 97948.64231002526
RMSE en validación para Random Forest: 312.96747803889343
R2 en validación para Random Forest: 0.27053860526803364
--------------------------------------------------
Entrenando y optimizando modelo: Gradient Boosting


160 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
154 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_par

Mejores hiperparámetros para Gradient Boosting: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 100}
MSE en validación para Gradient Boosting: 96513.18500282869
RMSE en validación para Gradient Boosting: 310.66571262826653
R2 en validación para Gradient Boosting: 0.2812290106140467
--------------------------------------------------
Entrenando y optimizando modelo: Support Vector Regression
Mejores hiperparámetros para Support Vector Regression: {'model__C': 1.0, 'model__epsilon': 1.0, 'model__kernel': 'linear'}
MSE en validación para Support Vector Regression: 113290.05073084365
RMSE en validación para Support Vector Regression: 336.5858742295102
R2 en validación para Support Vector Regression: 0.1562852075702741
--------------------------------------------------
Entrenando y optimizando modelo: K-Nearest Neighbors
Mejores hiperparámetros para K-Nearest Neighbors: {'m

In [33]:
# Comparar modelos basados en RMSE
print("Comparación de modelos por RMSE en validación:\n")
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE (cross-validation): {info['best_score']}")
    print(f"RMSE en validación: {info['rmse_val']}")
    print(f"Mejores hiperparámetros: {info['best_params']}")
    print("-" * 50)

# Comparar y elegir el mejor modelo basado en el RMSE más bajo
mejor_modelo = min(best_models, key=lambda x: best_models[x]['rmse_val'])
print(f"El mejor modelo basado en RMSE en validación es: {mejor_modelo} con {best_models[mejor_modelo]['rmse_val']}")

Comparación de modelos por RMSE en validación:

Modelo: Linear Regression
MSE (cross-validation): 623102.8687794165
RMSE en validación: 276.0911130718269
Mejores hiperparámetros: {}
--------------------------------------------------
Modelo: Ridge Regression
MSE (cross-validation): 118807.03704963179
RMSE en validación: 314.1177444561765
Mejores hiperparámetros: {'model__alpha': 100.0}
--------------------------------------------------
Modelo: Lasso Regression
MSE (cross-validation): 278819.66989288357
RMSE en validación: 252.81353793797314
Mejores hiperparámetros: {'model__alpha': 10.0}
--------------------------------------------------
Modelo: Random Forest
MSE (cross-validation): 78246.59208113936
RMSE en validación: 312.96747803889343
Mejores hiperparámetros: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
--------------------------------------------------
Modelo: Gradient Boosting
MSE (

In [34]:
# Comparing models
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE (cross-validation): {info['best_score']}")
    
    print(f"Mejores hiperparámetros: {info['best_params']}")
    print("-" * 50)


Modelo: Linear Regression
MSE (cross-validation): 623102.8687794165
Mejores hiperparámetros: {}
--------------------------------------------------
Modelo: Ridge Regression
MSE (cross-validation): 118807.03704963179
Mejores hiperparámetros: {'model__alpha': 100.0}
--------------------------------------------------
Modelo: Lasso Regression
MSE (cross-validation): 278819.66989288357
Mejores hiperparámetros: {'model__alpha': 10.0}
--------------------------------------------------
Modelo: Random Forest
MSE (cross-validation): 78246.59208113936
Mejores hiperparámetros: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
--------------------------------------------------
Modelo: Gradient Boosting
MSE (cross-validation): 72039.41814732771
Mejores hiperparámetros: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split'

In [35]:
best_model_name = min(best_models, key=lambda k: best_models[k]['best_score'])
best_model = best_models[best_model_name]['model']

# Predecir en el conjunto de prueba
y_pred_test = best_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

print(f"Rendimiento en el conjunto de prueba con {best_model_name}:")
print(f"MSE en prueba: {mse_test}")
print(f"R2 en prueba: {r2_test}")
print(f"RMSE en prueba: {rmse_test}")

Rendimiento en el conjunto de prueba con Gradient Boosting:
MSE en prueba: 79956.17765145392
R2 en prueba: -0.17683145785499588
RMSE en prueba: 282.7652341633496


In [36]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Evaluar todos los modelos en el conjunto de prueba
for name, info in best_models.items():
    best_model = info['model']
    
    # Predecir en el conjunto de prueba
    y_pred_test = best_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    
    # Guardar los resultados de prueba
    best_models[name]['mse_test'] = mse_test
    best_models[name]['rmse_test'] = rmse_test
    best_models[name]['r2_test'] = r2_test
    
    # Imprimir resultados
    print(f"Rendimiento en el conjunto de prueba con {name}:")
    print(f"MSE en prueba: {mse_test}")
    print(f"RMSE en prueba: {rmse_test}")
    print(f"R2 en prueba: {r2_test}")
    print("-" * 50)

# Comparar modelos basados en RMSE en el conjunto de prueba
mejor_modelo_test = min(best_models, key=lambda x: best_models[x]['rmse_test'])
print(f"El mejor modelo basado en RMSE en el conjunto de prueba es: {mejor_modelo_test} con {best_models[mejor_modelo_test]['rmse_test']}")

Rendimiento en el conjunto de prueba con Linear Regression:
MSE en prueba: 82733.86122394586
RMSE en prueba: 287.63494437210835
R2 en prueba: -0.2177146704358328
--------------------------------------------------
Rendimiento en el conjunto de prueba con Ridge Regression:
MSE en prueba: 52979.50960149361
RMSE en prueba: 230.17278206055036
R2 en prueba: 0.22022343548541612
--------------------------------------------------
Rendimiento en el conjunto de prueba con Lasso Regression:
MSE en prueba: 52300.27647526243
RMSE en prueba: 228.69253699074315
R2 en prueba: 0.23022069815660784
--------------------------------------------------
Rendimiento en el conjunto de prueba con Random Forest:
MSE en prueba: 63562.1839730783
RMSE en prueba: 252.11541795986676
R2 en prueba: 0.06446281167212597
--------------------------------------------------
Rendimiento en el conjunto de prueba con Gradient Boosting:
MSE en prueba: 79956.17765145392
RMSE en prueba: 282.7652341633496
R2 en prueba: -0.17683145785

In [37]:
# Importancia de las características del BEST MODEL 
if hasattr(best_model.named_steps['model'], 'feature_importances_'):
    import matplotlib.pyplot as plt

    feature_importances = best_model.named_steps['model'].feature_importances_
    features = X.columns

    # Crear un DataFrame para visualizar
    importances_df = pd.DataFrame({
        'Feature': features,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Graficar
    plt.figure(figsize=(10, 6))
    plt.barh(importances_df['Feature'], importances_df['Importance'])
    plt.gca().invert_yaxis()
    plt.xlabel('Importancia')
    plt.title('Importancia de las Características')
    plt.show()
else:
    print(f"El modelo {best_model_name} no proporciona importancias de características.")

El modelo Gradient Boosting no proporciona importancias de características.


In [38]:
#Trying with Neural Networks

from sklearn.neural_network import MLPRegressor

# Neural Network Regressor
nn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', MLPRegressor(random_state=42))
])

# GridSearchCV for Neural Network
nn_param_grid = {
    'model__hidden_layer_sizes': [(100,), (50, 50), (100, 50, 100)],
    'model__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'model__solver': ['lbfgs', 'sgd', 'adam'],
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__learning_rate': ['constant', 'invscaling', 'adaptive']
}

nn_grid_search = GridSearchCV(estimator=nn_pipeline, param_grid=nn_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

nn_grid_search.fit(X_train, y_train)

171 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
171 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(esti

In [39]:

y_pred_nn = nn_grid_search.best_estimator_.predict(X_val)

rmse_nn = np.sqrt(mean_squared_error(y_val, y_pred_nn))
r2_nn = r2_score(y_val, y_pred_nn)

print(f"Neural Network - Best Params: {nn_grid_search.best_params_}, RMSE: {rmse_nn}, R2: {r2_nn}")

Neural Network - Best Params: {'model__activation': 'logistic', 'model__alpha': 0.01, 'model__hidden_layer_sizes': (100, 50, 100), 'model__learning_rate': 'adaptive', 'model__solver': 'sgd'}, RMSE: 367.7476411948219, R2: -0.007172092934610852


In [40]:

y_test_nn = nn_grid_search.best_estimator_.predict(X_test)

rmse_nn_test = np.sqrt(mean_squared_error(y_test, y_test_nn))
r2_nn_test = r2_score(y_test, y_test_nn)

print(f"Neural Network: RMSE: {rmse_nn_test}, R2: {r2_nn_test}")


Neural Network: RMSE: 265.716759771132, R2: -0.039202398686891016


In [41]:

# Comparar modelos basados en RMSE en validación
print("Comparación de modelos por RMSE en validación:\n")
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE (cross-validation): {info['best_score']}")
    print(f"RMSE en validación: {info['rmse_val']}")
    print(f"Mejores hiperparámetros: {info['best_params']}")
    print("-" * 50)
    
# Comparar y elegir el mejor modelo basado en el RMSE más bajo
mejor_modelo = min(best_models, key=lambda x: best_models[x]['rmse_val'])
print(f"El mejor modelo basado en RMSE en validación es: {mejor_modelo} con {best_models[mejor_modelo]['rmse_val']}")

Comparación de modelos por RMSE en validación:

Modelo: Linear Regression
MSE (cross-validation): 623102.8687794165
RMSE en validación: 276.0911130718269
Mejores hiperparámetros: {}
--------------------------------------------------
Modelo: Ridge Regression
MSE (cross-validation): 118807.03704963179
RMSE en validación: 314.1177444561765
Mejores hiperparámetros: {'model__alpha': 100.0}
--------------------------------------------------
Modelo: Lasso Regression
MSE (cross-validation): 278819.66989288357
RMSE en validación: 252.81353793797314
Mejores hiperparámetros: {'model__alpha': 10.0}
--------------------------------------------------
Modelo: Random Forest
MSE (cross-validation): 78246.59208113936
RMSE en validación: 312.96747803889343
Mejores hiperparámetros: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
--------------------------------------------------
Modelo: Gradient Boosting
MSE (

In [42]:
# Comparar modelos basados en RMSE en el conjunto de prueba
print("Comparación de modelos por RMSE en el conjunto de prueba:\n")
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE en prueba: {info['mse_test']}")
    print(f"RMSE en prueba: {info['rmse_test']}")
    print(f"R2 en prueba: {info['r2_test']}")
    print("-" * 50)
    
# Comparar y elegir el mejor modelo basado en el RMSE más bajo en el conjunto de prueba
mejor_modelo_test = min(best_models, key=lambda x: best_models[x]['rmse_test'])
print(f"El mejor modelo basado en RMSE en el conjunto de prueba es: {mejor_modelo_test} con {best_models[mejor_modelo_test]['rmse_test']}")

Comparación de modelos por RMSE en el conjunto de prueba:

Modelo: Linear Regression
MSE en prueba: 82733.86122394586
RMSE en prueba: 287.63494437210835
R2 en prueba: -0.2177146704358328
--------------------------------------------------
Modelo: Ridge Regression
MSE en prueba: 52979.50960149361
RMSE en prueba: 230.17278206055036
R2 en prueba: 0.22022343548541612
--------------------------------------------------
Modelo: Lasso Regression
MSE en prueba: 52300.27647526243
RMSE en prueba: 228.69253699074315
R2 en prueba: 0.23022069815660784
--------------------------------------------------
Modelo: Random Forest
MSE en prueba: 63562.1839730783
RMSE en prueba: 252.11541795986676
R2 en prueba: 0.06446281167212597
--------------------------------------------------
Modelo: Gradient Boosting
MSE en prueba: 79956.17765145392
RMSE en prueba: 282.7652341633496
R2 en prueba: -0.17683145785499588
--------------------------------------------------
Modelo: Support Vector Regression
MSE en prueba: 6041

In [43]:

# Importancia de las características del mejor modelo
if hasattr(best_models[mejor_modelo_test]['model'].named_steps['model'], 'feature_importances_'):
    import matplotlib.pyplot as plt

    feature_importances = best_models[mejor_modelo_test]['model'].named_steps['model'].feature_importances_
    features = X.columns

    # Crear un DataFrame para visualizar
    importances_df = pd.DataFrame({
        'Feature': features,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Graficar
    plt.figure(figsize=(10, 6))
    plt.barh(importances_df['Feature'], importances_df['Importance'])
    plt.gca().invert_yaxis()
    plt.xlabel('Importancia')
    plt.title('Importancia de las Características')
    plt.show()
else:
    print(f"El modelo {mejor_modelo_test} no proporciona importancias de características.")

El modelo Lasso Regression no proporciona importancias de características.


In [44]:
# Trying a gaussian model
from sklearn.gaussian_process import GaussianProcessRegressor


# Gaussian Process Regressor
gp_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GaussianProcessRegressor(random_state=42))
])

# GridSearchCV for Gaussian Process
gp_param_grid = {
    'model__kernel': [None, 'rbf', 'linear', 'poly', 'sigmoid']
}

gp_grid_search = GridSearchCV(estimator=gp_pipeline, param_grid=gp_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

gp_grid_search.fit(X_train, y_train)

20 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\jaime\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params(

In [45]:

y_pred_gp = gp_grid_search.best_estimator_.predict(X_val)

rmse_gp = np.sqrt(mean_squared_error(y_val, y_pred_gp))
r2_gp = r2_score(y_val, y_pred_gp)

print(f"Gaussian Process - Best Params: {gp_grid_search.best_params_}, RMSE: {rmse_gp}, R2: {r2_gp}")

Gaussian Process - Best Params: {'model__kernel': None}, RMSE: 1446.9118488009426, R2: -14.591505002123116


In [46]:

y_test_gp = gp_grid_search.best_estimator_.predict(X_test)

rmse_gp_test = np.sqrt(mean_squared_error(y_test, y_test_gp))
r2_gp_test = r2_score(y_test, y_test_gp)

print(f"Gaussian Process: RMSE: {rmse_gp_test}, R2: {r2_gp_test}")

Gaussian Process: RMSE: 1484.5712589515892, R2: -31.438767246953674


In [47]:

# Comparar modelos basados en RMSE en validación
print("Comparación de modelos por RMSE en validación:\n")
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE (cross-validation): {info['best_score']}")
    print(f"RMSE en validación: {info['rmse_val']}")
    print(f"Mejores hiperparámetros: {info['best_params']}")
    print("-" * 50)
    
    
# Comparar y elegir el mejor modelo basado en el RMSE más bajo
mejor_modelo = min(best_models, key=lambda x: best_models[x]['rmse_val'])
print(f"El mejor modelo basado en RMSE en validación es: {mejor_modelo} con {best_models[mejor_modelo]['rmse_val']}")

Comparación de modelos por RMSE en validación:

Modelo: Linear Regression
MSE (cross-validation): 623102.8687794165
RMSE en validación: 276.0911130718269
Mejores hiperparámetros: {}
--------------------------------------------------
Modelo: Ridge Regression
MSE (cross-validation): 118807.03704963179
RMSE en validación: 314.1177444561765
Mejores hiperparámetros: {'model__alpha': 100.0}
--------------------------------------------------
Modelo: Lasso Regression
MSE (cross-validation): 278819.66989288357
RMSE en validación: 252.81353793797314
Mejores hiperparámetros: {'model__alpha': 10.0}
--------------------------------------------------
Modelo: Random Forest
MSE (cross-validation): 78246.59208113936
RMSE en validación: 312.96747803889343
Mejores hiperparámetros: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
--------------------------------------------------
Modelo: Gradient Boosting
MSE (

In [48]:
# Comparar modelos basados en RMSE en el conjunto de prueba
print("Comparación de modelos por RMSE en el conjunto de prueba:\n")
for name, info in best_models.items():
    print(f"Modelo: {name}")
    print(f"MSE en prueba: {info['mse_test']}")
    print(f"RMSE en prueba: {info['rmse_test']}")
    print(f"R2 en prueba: {info['r2_test']}")
    print("-" * 50)
    
# Comparar y elegir el mejor modelo basado en el RMSE más bajo en el conjunto de prueba 
mejor_modelo_test = min(best_models, key=lambda x: best_models[x]['rmse_test'])
print(f"El mejor modelo basado en RMSE en el conjunto de prueba es: {mejor_modelo_test} con {best_models[mejor_modelo_test]['rmse_test']}")

Comparación de modelos por RMSE en el conjunto de prueba:

Modelo: Linear Regression
MSE en prueba: 82733.86122394586
RMSE en prueba: 287.63494437210835
R2 en prueba: -0.2177146704358328
--------------------------------------------------
Modelo: Ridge Regression
MSE en prueba: 52979.50960149361
RMSE en prueba: 230.17278206055036
R2 en prueba: 0.22022343548541612
--------------------------------------------------
Modelo: Lasso Regression
MSE en prueba: 52300.27647526243
RMSE en prueba: 228.69253699074315
R2 en prueba: 0.23022069815660784
--------------------------------------------------
Modelo: Random Forest
MSE en prueba: 63562.1839730783
RMSE en prueba: 252.11541795986676
R2 en prueba: 0.06446281167212597
--------------------------------------------------
Modelo: Gradient Boosting
MSE en prueba: 79956.17765145392
RMSE en prueba: 282.7652341633496
R2 en prueba: -0.17683145785499588
--------------------------------------------------
Modelo: Support Vector Regression
MSE en prueba: 6041

In [49]:

# Importancia de las características del mejor modelo
if hasattr(best_models[mejor_modelo_test]['model'].named_steps['model'], 'feature_importances_'):
    import matplotlib.pyplot as plt

    feature_importances = best_models[mejor_modelo_test]['model'].named_steps['model'].feature_importances_
    features = X.columns

    # Crear un DataFrame para visualizar
    importances_df = pd.DataFrame({
        'Feature': features,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Graficar
    plt.figure(figsize=(10, 6))
    plt.barh(importances_df['Feature'], importances_df['Importance'])
    plt.gca().invert_yaxis()
    plt.xlabel('Importancia')
    plt.title('Importancia de las Características')
    plt.show()
else:
    print(f"El modelo {mejor_modelo_test} no proporciona importancias de características.")

El modelo Lasso Regression no proporciona importancias de características.


In [50]:
 # Make a df with the validation and test results to see it clearly
results = pd.DataFrame(best_models).T
results


Unnamed: 0,model,best_params,best_score,rmse_val,mse_test,rmse_test,r2_test
Linear Regression,"(StandardScaler(), LinearRegression())",{},623102.868779,276.091113,82733.861224,287.634944,-0.217715
Ridge Regression,"(StandardScaler(), Ridge(alpha=100.0))",{'model__alpha': 100.0},118807.03705,314.117744,52979.509601,230.172782,0.220223
Lasso Regression,"(StandardScaler(), Lasso(alpha=10.0, max_iter=...",{'model__alpha': 10.0},278819.669893,252.813538,52300.276475,228.692537,0.230221
Random Forest,"((DecisionTreeRegressor(max_depth=10, max_feat...","{'model__max_depth': 10, 'model__max_features'...",78246.592081,312.967478,63562.183973,252.115418,0.064463
Gradient Boosting,(([DecisionTreeRegressor(criterion='friedman_m...,"{'model__learning_rate': 0.1, 'model__max_dept...",72039.418147,310.665713,79956.177651,282.765234,-0.176831
Support Vector Regression,"(StandardScaler(), SVR(epsilon=1.0, kernel='li...","{'model__C': 1.0, 'model__epsilon': 1.0, 'mode...",101346.145554,336.585874,60417.966275,245.800664,0.110741
K-Nearest Neighbors,"(StandardScaler(), KNeighborsRegressor(n_neigh...","{'model__n_neighbors': 7, 'model__weights': 'd...",96986.446102,314.900217,83246.380474,288.524489,-0.225258


In [53]:
# give a line with the creation of the best model with best hyperparameters
model = best_models[mejor_modelo_test]['model']
model = Lasso(alpha=10.0, max_iter = 10000)