In [41]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [42]:
df = pd.read_excel("../data/raw/entrenamiento.xlsx")
df = df.rename(columns={"C7.1": "C8", "des": "QUALITY"})
features = df.drop(columns=["QUALITY"])
target = df["QUALITY"]

In [43]:
# Correct the values in column C8 that are above 900 by dividing them by 1000
high_values = df["C8"] > 900
df.loc[high_values, "C8"] = df.loc[high_values, "C8"] / 1000

# Display the basic statistics of the corrected column
df["C8"].describe()

count    3646.000000
mean        0.994044
std         0.003008
min         0.987110
25%         0.991760
50%         0.993800
75%         0.996100
max         1.038980
Name: C8, dtype: float64

In [44]:
# Imputar datos faltantes con la mediana
for col in df.columns:
    median_value = df[col].median()
    df[col].fillna(median_value, inplace=True)

In [45]:
# Function to impute outliers using IQR and column median
def impute_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = (data[column] < lower_bound) | (data[column] > upper_bound)

    # Impute outliers with the median of the column
    data.loc[outliers, column] = data[column].median()
    return data


# Impute outliers in the dataset using IQR method
df_imputed_outliers = df.copy()
for column in df.columns:
    if df[column].dtype != 'object':  # Apply only for numerical columns
        df_imputed_outliers = impute_outliers_iqr(df_imputed_outliers, column)

# Display basic statistics of the dataset after outlier imputation
df_imputed_outliers.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
C1,3918.0,6.81169,0.68228,5.1,6.4,6.8,7.2,8.6
C2,3918.0,0.233108,0.05615,0.086178,0.198851,0.231112,0.262364,0.392042
C3,3918.0,0.279778,0.061094,0.122218,0.24686,0.277632,0.314811,0.444686
C4,3918.0,6.307338,4.771707,0.6,1.8,5.3,9.375,20.8
C5,3918.0,41.964523,8.859699,12.0,37.0,41.0,47.0,69.0
C6,3918.0,34.369321,14.687726,2.0,24.0,34.0,44.0,76.0
C7,3918.0,4.907424,0.283764,4.110874,4.727388,4.912655,5.117994,5.717028
C8,3918.0,0.994007,0.002793,0.98711,0.99188,0.9938,0.995857,1.00182
C9,3918.0,3.180084,0.130941,2.85,3.1,3.18,3.26,3.52
C10,3918.0,0.388687,0.061853,0.223144,0.350657,0.385262,0.425268,0.559616


In [46]:
X = df_imputed_outliers.drop(columns=['QUALITY'])
y = df_imputed_outliers['QUALITY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [47]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    max_depth=30,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    bootstrap=True,
    random_state=42)
rf.fit(X_train_std, y_train)
predictions_rf = rf.predict(X_train_std)
mae_rf = mean_absolute_error(y_train, predictions_rf)
print(f"MAE for Random Forest: {mae_rf:.4f}")

MAE for Random Forest: 0.1696


In [48]:
# Hiperparámetros optimizados
gb_model_std = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=7,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=2,
    n_estimators=100,
    subsample=0.9,
    random_state=42)  # Puedes ajustar los hiperparámetros según sea necesario
gb_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_gb = gb_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_gb_std = mean_absolute_error(y_train, y_train_pred_gb)
print(f'MAE of Gradient Boosting Regression: {mae_gb_std:.4}')

MAE of Gradient Boosting Regression: 0.2411


In [49]:
optimized_xgb = xgb.XGBRegressor(objective='reg:squarederror',
                                 subsample=0.8,
                                 min_child_weight=4,
                                 max_depth=9,
                                 learning_rate=0.1,
                                 gamma=0,
                                 colsample_bytree=0.8,
                                 random_state=42)

# Entrenar el modelo con los datos estandarizados
optimized_xgb.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
xgb_optimized_predictions = optimized_xgb.predict(X_train_std)

# Calcular el MAE
mae_optimized_xgb = mean_absolute_error(y_train, xgb_optimized_predictions)
mae_optimized_xgb
print(f'MAE of XGBoost: {mae_optimized_xgb:.4}')

MAE of XGBoost: 0.1422


In [50]:
base_learners = [
    ('rf', RandomForestRegressor(max_depth=30,
                                 min_samples_leaf=1,
                                 min_samples_split=2,
                                 n_estimators=100,
                                 bootstrap=True,
                                 random_state=42)),
    ('gboost', GradientBoostingRegressor(learning_rate=0.1,
                                         max_depth=7,
                                         max_features='sqrt',
                                         min_samples_leaf=3,
                                         min_samples_split=2,
                                         n_estimators=100,
                                         subsample=0.9,
                                         random_state=42)),
    ('xgb', XGBRegressor(objective='reg:squarederror',
                         subsample=0.8,
                         min_child_weight=4,
                         max_depth=9,
                         learning_rate=0.1,
                         gamma=0,
                         colsample_bytree=0.8,
                         random_state=42))
]

# Inicializa el modelo de Stacking
stack_reg = StackingRegressor(
    estimators=base_learners,
    final_estimator=LinearRegression(),
    cv=5  # Utiliza validación cruzada con 5 folds para entrenar los modelos base
)


# Train the model
stack_reg.fit(X_test_std, y_test)

# Predict on the training set
stacked_train_predictions = stack_reg.predict(X_test_std)

# Calculate the MAE for the Stacking Regressor
mae_stacked = mean_absolute_error(y_test, stacked_train_predictions)
print(f'MAE of Stacking Regressor: {mae_stacked:.6f}')

MAE of Stacking Regressor: 0.196939
