In [42]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [43]:
df = pd.read_excel("../data/raw/entrenamiento.xlsx")
df = df.rename(columns={"C7.1": "C8", "des": "QUALITY"})
features = df.columns[:-1]
df.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,QUALITY
0,6.8,0.19062,0.307485,18.1,46.0,32.0,4.89784,1.0,3.27,0.392042,8.8,5
1,6.2,0.24686,,5.0,43.0,50.0,5.241747,0.99318,3.23,0.494696,10.8,6
2,6.7,0.350657,0.329304,12.1,0.04,61.0,5.517453,0.99794,3.31,0.457425,9.7,5
3,6.8,0.254642,0.405465,13.3,53.0,48.0,5.273,0.9974,3.09,0.371564,9.4,5
4,6.6,0.215111,0.239017,15.8,35.0,46.0,5.241747,0.9982,3.24,0.41211,9.2,5


## Ouliers

In [49]:
# Correct the values in column C8 that are above 900 by dividing them by 1000
high_values = df["C8"] > 900
df.loc[high_values, "C8"] = df.loc[high_values, "C8"] / 1000

# Display the basic statistics of the corrected column
df["C8"].describe()

count    3646.000000
mean        0.994044
std         0.003008
min         0.987110
25%         0.991760
50%         0.993800
75%         0.996100
max         1.038980
Name: C8, dtype: float64

In [50]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
C1,3684.0,6.857166,0.84643,3.8,6.3,6.8,7.3,14.2
C2,3664.0,0.42144,0.988207,0.076961,0.19062,0.231112,0.285179,6.913737
C3,3648.0,0.284054,0.086887,0.0,0.239017,0.277632,0.329304,0.802002
C4,3636.0,6.424642,5.084063,0.6,1.7,5.3,9.9,65.8
C5,3659.0,40.685294,25.106022,0.02,33.0,41.0,49.0,346.0
C6,3655.0,35.216142,17.167346,2.0,23.0,34.0,46.0,289.0
C7,3645.0,4.885591,0.341723,2.302585,4.691348,4.912655,5.129899,6.089045
C8,3646.0,0.994044,0.003008,0.98711,0.99176,0.9938,0.9961,1.03898
C9,3658.0,3.188291,0.152155,2.72,3.09,3.18,3.28,3.82
C10,3639.0,0.395421,0.074535,0.198851,0.34359,0.385262,0.438255,0.732368


## Tratamiento de outliers

In [51]:
# Function to impute outliers using IQR and column median
def impute_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = (data[column] < lower_bound) | (data[column] > upper_bound)

    # Impute outliers with the median of the column
    data.loc[outliers, column] = data[column].median()
    return data


# Impute outliers in the dataset using IQR method
df_imputed_outliers = df.copy()
for column in df.columns:
    if df[column].dtype != 'object':  # Apply only for numerical columns
        df_imputed_outliers = impute_outliers_iqr(df_imputed_outliers, column)

# Display basic statistics of the dataset after outlier imputation
df_imputed_outliers.describe()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,QUALITY
count,3684.0,3664.0,3648.0,3636.0,3659.0,3655.0,3645.0,3646.0,3658.0,3639.0,3658.0,3918.0
mean,6.810016,0.235175,0.280678,6.390058,42.117245,34.541176,4.90308,0.994025,3.182701,0.390783,10.491607,5.8073
std,0.744556,0.06122,0.066104,4.951515,9.325894,15.412465,0.300378,0.002898,0.140424,0.066773,1.218126,0.763862
min,4.8,0.076961,0.10436,0.6,9.0,2.0,4.043051,0.98711,2.82,0.207014,8.0,4.0
25%,6.3,0.19062,0.239017,1.7,36.0,23.0,4.70953,0.99176,3.09,0.34359,9.4,5.0
50%,6.8,0.231112,0.277632,5.3,41.0,34.0,4.912655,0.9938,3.18,0.385262,10.3,6.0
75%,7.3,0.270027,0.314811,9.9,47.0,45.0,5.129899,0.9961,3.27,0.431782,11.3,6.0
max,8.8,0.425268,0.463734,22.0,73.0,80.0,5.749393,1.0024,3.56,0.576613,14.0,7.0


In [52]:
df_imputed_outliers

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,QUALITY
0,6.8,0.190620,0.307485,18.1,46.0,32.0,4.897840,1.00000,3.27,0.392042,8.8,5
1,6.2,0.246860,,5.0,43.0,50.0,5.241747,0.99318,3.23,0.494696,10.8,6
2,6.7,0.350657,0.329304,12.1,41.0,61.0,5.517453,0.99794,3.31,0.457425,9.7,5
3,6.8,0.254642,0.405465,13.3,53.0,48.0,5.273000,0.99740,3.09,0.371564,9.4,5
4,6.6,0.215111,0.239017,15.8,35.0,46.0,5.241747,0.99820,3.24,0.412110,9.2,5
...,...,...,...,...,...,...,...,...,...,...,...,...
3913,7.7,,0.215111,11.2,31.0,41.0,5.123964,0.99480,3.12,0.357674,11.3,7
3914,8.1,0.378436,0.270027,1.7,52.0,50.0,5.214936,0.99230,3.03,0.350657,11.2,5
3915,6.1,0.148420,0.254642,6.0,41.0,29.0,4.976734,0.99474,3.18,0.378436,10.7,6
3916,8.2,0.239017,0.329304,7.8,,49.0,5.342334,0.99760,3.31,0.412110,9.5,6


## Imputacion de valores faltantes

In [53]:
# Calculate the number of missing values in each column
missing_values = df_imputed_outliers.isnull().sum()

# Display columns with missing values
missing_values[missing_values > 0]

C1     234
C2     254
C3     270
C4     282
C5     259
C6     263
C7     273
C8     272
C9     260
C10    279
C11    260
dtype: int64

In [54]:
# Impute missing values using median for predictor columns (excluding 'des' or 'QUALITY')
for column in df_imputed_outliers.columns:
    if column != "des" and df_imputed_outliers[
        column].dtype != 'object':  # Exclude target column and non-numerical columns
        median_value = df_imputed_outliers[column].median()
        df_imputed_outliers[column].fillna(median_value, inplace=True)

# Check if there are any missing values left
remaining_missing_values = df_imputed_outliers.isnull().sum()
remaining_missing_values[remaining_missing_values > 0]

Series([], dtype: int64)

In [55]:
df_imputed_outliers.isna().sum()

C1         0
C2         0
C3         0
C4         0
C5         0
C6         0
C7         0
C8         0
C9         0
C10        0
C11        0
QUALITY    0
dtype: int64

In [56]:
df_completed = df_imputed_outliers.copy()

In [57]:
df_completed.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,QUALITY
0,6.8,0.19062,0.307485,18.1,46.0,32.0,4.89784,1.0,3.27,0.392042,8.8,5
1,6.2,0.24686,0.277632,5.0,43.0,50.0,5.241747,0.99318,3.23,0.494696,10.8,6
2,6.7,0.350657,0.329304,12.1,41.0,61.0,5.517453,0.99794,3.31,0.457425,9.7,5
3,6.8,0.254642,0.405465,13.3,53.0,48.0,5.273,0.9974,3.09,0.371564,9.4,5
4,6.6,0.215111,0.239017,15.8,35.0,46.0,5.241747,0.9982,3.24,0.41211,9.2,5


## Standardize Dataframe

In [58]:
# Separar las características y la variable objetivo
X = df_completed.drop("QUALITY", axis=1)
y = df_completed["QUALITY"]

# Estandarizar las características
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

## Splitting the DataFrame

In [59]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_std, X_test_std, y_train, y_test = train_test_split(X_standardized, y, test_size=0.3, random_state=42)

## Linear Regression

In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Instanciar y entrenar el modelo de regresión lineal
linear_model_std = LinearRegression()
linear_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred = linear_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_linear_std = mean_absolute_error(y_train, y_train_pred)
print(f'MAE of Linear Regression = {mae_linear_std:.4f}')

MAE of Linear Regression = 0.5501


## Ridge Regression 

In [61]:
from sklearn.linear_model import Ridge

# Instanciar y entrenar el modelo Ridge
ridge_model_std = Ridge(
    alpha=1.0)  # Aquí, alpha es el parámetro de regularización; puedes ajustarlo según sea necesario
ridge_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_ridge = ridge_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_ridge_std = mean_absolute_error(y_train, y_train_pred_ridge)
print(f'MAE of Ridge Regression: {mae_ridge_std:.4f}')

MAE of Ridge Regression: 0.5501


## Lasso Regression

In [62]:
from sklearn.linear_model import Lasso

# Instanciar y entrenar el modelo Lasso
lasso_model_std = Lasso(
    alpha=0.01)  # Aquí, alpha es el parámetro de regularización; puedes ajustarlo según sea necesario
lasso_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_lasso = lasso_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_lasso_std = mean_absolute_error(y_train, y_train_pred_lasso)
print(f'MAE of Lasso Regression: {mae_lasso_std:.4f}')

MAE of Lasso Regression: 0.5529


## Decision Tree

In [80]:
from sklearn.tree import DecisionTreeRegressor

# Instanciar y entrenar el modelo Decision Tree
tree_model_std = DecisionTreeRegressor(max_depth=6,
                                       random_state=42)  # Puedes ajustar los hiperparámetros según sea necesario
tree_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_tree = tree_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_tree_std = mean_absolute_error(y_train, y_train_pred_tree)
print(f'MAE of Decision Tree Regression: {mae_tree_std:.4f}')

MAE of Decision Tree Regression: 0.4964


## Random Forest

### Busqueda de hyperparametros

In [81]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Definir los hiperparámetros y sus posibles valores
param_grid_forest = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4],
    'bootstrap': [True, False]
}

# Instanciar el modelo y el GridSearchCV
forest = RandomForestRegressor(random_state=42)
grid_search_forest = GridSearchCV(forest, param_grid_forest, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Ajustar el GridSearchCV al conjunto de datos estandarizado
grid_search_forest.fit(X_train_std, y_train)

# Obtener los mejores hiperparámetros y el mejor MAE
best_params_forest = grid_search_forest.best_params_
best_mae_forest = -grid_search_forest.best_score_

best_params_forest, best_mae_forest


({'bootstrap': True,
  'max_depth': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 150},
 0.47967540185872115)

### Sin hyperparametros optimizados

In [35]:
from sklearn.ensemble import RandomForestRegressor

# Instanciar y entrenar el modelo Random Forest
forest_model_std = RandomForestRegressor(n_estimators=100,
                                         random_state=42)  # Puedes ajustar los hiperparámetros según sea necesario
forest_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_forest = forest_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_forest_std = mean_absolute_error(y_train, y_train_pred_forest)
print(f'MAE of Random Forest Regression: {mae_forest_std:.4}')

MAE of Random Forest Regression: 0.1759


### Hyperparametros optimizados

In [82]:
from sklearn.ensemble import RandomForestRegressor

# Instanciar y entrenar el modelo Random Forest
forest_model_std = RandomForestRegressor(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=150,
    random_state=42)  # Puedes ajustar los hiperparámetros según sea necesario
forest_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_forest = forest_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_forest_std = mean_absolute_error(y_train, y_train_pred_forest)
print(f'MAE of Random Forest Regression: {mae_forest_std:.4}')

MAE of Random Forest Regression: 0.1747


## Gradiend Boosting

In [88]:
from sklearn.ensemble import GradientBoostingRegressor

# Instanciar y entrenar el modelo Gradient Boosting
gb_model_std = GradientBoostingRegressor(n_estimators=100,
                                         random_state=42)  # Puedes ajustar los hiperparámetros según sea necesario
gb_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_gb = gb_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_gb_std = mean_absolute_error(y_train, y_train_pred_gb)
print(f'MAE of Gradient Boosting Regression: {mae_gb_std:.4}')

MAE of Gradient Boosting Regression: 0.4603


In [85]:
# Cálculo de hiperparámetros
# Definir los hiperparámetros y sus posibles valores
param_grid_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': [None, 'sqrt', 'log2']
}

# Instanciar el modelo y el GridSearchCV
gb = GradientBoostingRegressor(random_state=42)
grid_search_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Ajustar el GridSearchCV al conjunto de datos estandarizado
grid_search_gb.fit(X_train_std, y_train)

# Obtener los mejores hiperparámetros y el mejor MAE
best_params_gb = grid_search_gb.best_params_
best_mae_gb = -grid_search_gb.best_score_

best_params_gb, best_mae_gb

({'learning_rate': 0.1,
  'max_depth': 7,
  'max_features': 'sqrt',
  'min_samples_leaf': 3,
  'min_samples_split': 2,
  'n_estimators': 150,
  'subsample': 1.0},
 0.4808757504226359)

In [89]:
# Hiperparámetros optimizados
gb_model_std = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=7,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=2,
    n_estimators=150,
    subsample=1.0,
    random_state=42)  # Puedes ajustar los hiperparámetros según sea necesario
gb_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_gb = gb_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_gb_std = mean_absolute_error(y_train, y_train_pred_gb)
print(f'MAE of Gradient Boosting Regression: {mae_gb_std:.4}')

MAE of Gradient Boosting Regression: 0.1764


## KNN

In [141]:
from sklearn.neighbors import KNeighborsRegressor

# Instanciar y entrenar el modelo KNN
knn_model_std = KNeighborsRegressor(n_neighbors=5)  # Puedes ajustar los hiperparámetros según sea necesario
knn_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_knn = knn_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_knn_std_a = mean_absolute_error(y_train, y_train_pred_knn)
print(f'MAE of KNN: {mae_knn_std_a:.4}')

MAE of KNN: 0.4311


In [104]:
from sklearn.model_selection import train_test_split

# Dividir los datos en subconjuntos de entrenamiento y validación
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(X_train_std, y_train, test_size=0.2, random_state=42)

# Instanciar y entrenar el modelo KNN con los hiperparámetros optimizados
knn_optimized = KNeighborsRegressor(metric='manhattan', n_neighbors=8, p=1, weights='distance')
knn_optimized.fit(X_train_sub, y_train_sub)

# Hacer predicciones en el subconjunto de validación
y_val_pred_knn = knn_optimized.predict(X_val_sub)

# Calcular el MAE para las predicciones en el subconjunto de validación
mae_knn_val = mean_absolute_error(y_val_sub, y_val_pred_knn)
print(f'MAE of KNN: {mae_knn_val:.4}')

MAE of KNN: 0.4892


In [92]:
# Búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV

# Definir los hiperparámetros y sus posibles valores
param_grid_knn = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2, 3]  # Solo se usa cuando metric='minkowski'
}

# Instanciar el modelo y el GridSearchCV
knn = KNeighborsRegressor()
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Ajustar el GridSearchCV al conjunto de datos estandarizado
grid_search_knn.fit(X_train_std, y_train)

# Obtener los mejores hiperparámetros y el mejor MAE
best_params_knn = grid_search_knn.best_params_
best_mae_knn = -grid_search_knn.best_score_

best_params_knn, best_mae_knn


({'metric': 'manhattan', 'n_neighbors': 8, 'p': 1, 'weights': 'distance'},
 0.4845735121521789)

## SVR

In [107]:
from sklearn.svm import SVR

# Instanciar y entrenar el modelo SVR
svr_model = SVR()
svr_model.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_svr = svr_model.predict(X_train_std)

# Calcular el MAE para SVR
mae_svr = mean_absolute_error(y_train, y_train_pred_svr)
print(f'MAE of SVR: {mae_svr:.4}')

MAE of SVR: 0.4181


In [108]:
# Búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV

# Definir el espacio de búsqueda para los hiperparámetros
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Instanciar GridSearchCV
grid_search_svr = GridSearchCV(SVR(), param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)

# Ajustar el modelo
grid_search_svr.fit(X_train_std, y_train)

# Obtener los mejores hiperparámetros
best_params_svr = grid_search_svr.best_params_
best_mae_svr = -grid_search_svr.best_score_

best_params_svr, best_mae_svr


Fitting 5 folds for each of 72 candidates, totalling 360 fits


({'C': 1, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'rbf'},
 0.5075046911340438)

In [110]:
# Con hiperparámetros optimizados
optimized_svr = SVR(C=1, epsilon=0.01, gamma='scale', kernel='rbf')
optimized_svr.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_opt_svr = optimized_svr.predict(X_train_std)

# Calcular el MAE
mae_opt_svr = mean_absolute_error(y_train, y_train_pred_opt_svr)
print(f'MAE of SVR: {mae_opt_svr:.4}')

MAE of SVR: 0.4062


## AdaBoost

In [112]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# Crear y entrenar un modelo AdaBoost
ada_model = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=4),
    n_estimators=100,
    learning_rate=1,
    random_state=42)

ada_model.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
ada_train_predictions = ada_model.predict(X_train_std)

# Calcular el MAE para AdaBoost
mae_ada = mean_absolute_error(y_train, ada_train_predictions)
print(f'MAE of AdaBoost: {mae_ada:.4}')



MAE of AdaBoost: 0.5227


In [113]:
from sklearn.model_selection import GridSearchCV

# Definir el modelo base
base = DecisionTreeRegressor(random_state=42)

# Definir el modelo AdaBoost con el modelo base
ada = AdaBoostRegressor(base_estimator=base, random_state=42)

# Definir los hiperparámetros para la búsqueda
param_grid = {
    'n_estimators': [30, 50, 70, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'base_estimator__max_depth': [2, 4, 6, 8],
    'base_estimator__min_samples_split': [2, 3, 4],
    'base_estimator__min_samples_leaf': [1, 2, 3]
}

# Configurar GridSearchCV
grid_search_ada = GridSearchCV(ada, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search_ada.fit(X_train_std, y_train)

# Mostrar los mejores hiperparámetros encontrados
grid_search_ada.best_params_




{'base_estimator__max_depth': 8,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__min_samples_split': 2,
 'learning_rate': 0.5,
 'n_estimators': 100}

In [115]:
# Hiperparámetros optimizados
# Crear y entrenar un modelo AdaBoost con los hiperparámetros óptimos
optimal_ada_model = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=8, min_samples_leaf=1, min_samples_split=2, random_state=42),
    n_estimators=100,
    learning_rate=0.5,
    random_state=42
)

optimal_ada_model.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
optimal_ada_train_predictions = optimal_ada_model.predict(X_train_std)

# Calcular el MAE para AdaBoost con hiperparámetros óptimos
mae_optimal_ada = mean_absolute_error(y_train, optimal_ada_train_predictions)
print(f'MAE of AdaBoost: {mae_optimal_ada:.4}')



MAE of AdaBoost: 0.3602


## XGBoost

In [131]:
import xgboost as xgb

# Entrenar un modelo XGBoost con hiperparámetros por defecto
xgb_model_std = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
xgb_train_predictions_std = xgb_model_std.predict(X_train_std)

# Calcular el MAE para XGBoost con datos estandarizados
mae_xgb_std = mean_absolute_error(y_train, xgb_train_predictions_std)
print(f'MAE of XGBoost: {mae_xgb_std:.4}')

  if is_sparse(data):


MAE of XGBoost: 0.1216


In [127]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Ampliar los hiperparámetros y sus rangos
param_grid_expanded = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_child_weight': [1, 2, 3, 4, 5],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# Inicializar XGBoost Regressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Inicializar RandomizedSearchCV
rand_search = RandomizedSearchCV(xgb_reg, param_distributions=param_grid_expanded, n_iter=100,
                                 cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1, random_state=42)

# Ajustar el modelo
rand_search.fit(X_train_std, y_train)

# Obtener los mejores hiperparámetros
best_params = rand_search.best_params_
best_score = -rand_search.best_score_

best_params, best_score

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sp

({'subsample': 1.0,
  'min_child_weight': 4,
  'max_depth': 9,
  'learning_rate': 0.1,
  'gamma': 0,
  'colsample_bytree': 0.6},
 0.49531071852107367)

In [130]:
# Hiperparámetros optimizados
# Aplicar los hiperparámetros óptimos al modelo XGBoost
optimized_xgb = xgb.XGBRegressor(objective ='reg:squarederror', 
                                 subsample=1.0,
                                 min_child_weight=4,
                                 max_depth=9,
                                 learning_rate=0.1,
                                 gamma=0,
                                 colsample_bytree=0.6,
                                 random_state=42)

# Entrenar el modelo con los datos estandarizados
optimized_xgb.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
xgb_optimized_predictions = optimized_xgb.predict(X_train_std)

# Calcular el MAE
mae_optimized_xgb = mean_absolute_error(y_train, xgb_optimized_predictions)
mae_optimized_xgb
print(f'MAE of XGBoost: {mae_optimized_xgb:.4}')

  if is_sparse(data):


MAE of XGBoost: 0.1578


## Stacking Regressor

In [143]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Define the base models
base_models = [
    ("xgb", xgb.XGBRegressor(objective='reg:squarederror', random_state=42)),
    ("rf", RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=150, random_state=42)),
    ("gb", GradientBoostingRegressor(learning_rate=0.1, max_depth=7, max_features='sqrt', min_samples_leaf=3, min_samples_split=2,n_estimators=150, subsample=1.0, random_state=42))
]

# Initialize the Stacking Regressor with the base models and a linear meta-model
stacked_model = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())

# Train the model
stacked_model.fit(X_train_std, y_train)

# Predict on the training set
stacked_train_predictions = stacked_model.predict(X_train_std)

# Calculate the MAE for the Stacking Regressor
mae_stacked = mean_absolute_error(y_train, stacked_train_predictions)
print(f'MAE of Stacking Regressor: {mae_stacked:.6f}')

MAE of Stacking Regressor: 0.164331


## Comparison

In [144]:
# Creating a dictionary with MAE values for each model
mae_values = {
    "Linear Regression": mae_linear_std,
    "Ridge Regression": mae_ridge_std,
    "Lasso Regression": mae_lasso_std,
    "Decision Tree Regression": mae_tree_std,
    "Random Forest": mae_forest_std,
    "Gradient Boosting": mae_gb_std,
    "K-Nearest Neighbors": mae_knn_std_a,
    "SVR": mae_opt_svr,
    "AdaBoost": mae_optimal_ada,
    "XGBoost": mae_xgb_std,
    "Stacking": mae_stacked
}

# Convert the dictionary to a pandas DataFrame for better visualization
mae_comparison = pd.DataFrame(list(mae_values.items()), columns=["Model", "MAE"]).sort_values(by="MAE")
mae_comparison

Unnamed: 0,Model,MAE
9,XGBoost,0.121594
10,Stacking,0.164331
4,Random Forest,0.174656
5,Gradient Boosting,0.176407
8,AdaBoost,0.360193
7,SVR,0.406164
6,K-Nearest Neighbors,0.431072
3,Decision Tree Regression,0.496378
0,Linear Regression,0.550097
1,Ridge Regression,0.550105
