In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb

In [2]:
# Read the Excel file and rename two columns in a single expression:
# "C7.1" -> "C8" and "des" -> "QUALITY" (QUALITY is used as the target later on).
# NOTE(review): "C7.1" looks like a duplicated "C7" header that pandas suffixed
# on load -- confirm against the raw file.
df = pd.read_excel("../data/raw/entrenamiento.xlsx").rename(
    columns={"C7.1": "C8", "des": "QUALITY"}
)


In [3]:
# Boolean mask of rows whose C8 value is above 900; those values are divided by 1000.
# NOTE(review): presumably corrects readings recorded on a different scale
# (unit or decimal-separator slip) -- confirm with the data owner.
high_values_train = df["C8"].gt(900)
df.loc[high_values_train, "C8"] = df.loc[high_values_train, "C8"].div(1000)

In [4]:
def impute_outliers_iqr(data, column):
    """Replace the outliers of one column with that column's median, in place.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile (Tukey's fences). Missing
    values compare as False against both fences, so they are left untouched.

    Args:
        data: DataFrame holding the column; it is modified in place.
        column: Label of the column to clean.

    Returns:
        The same ``data`` object that was passed in.
    """
    series = data[column]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    # The median is taken before any replacement, so it still includes the outliers.
    fill_value = series.median()

    too_low = series.lt(q1 - whisker)
    too_high = series.gt(q3 + whisker)
    data.loc[too_low | too_high, column] = fill_value
    return data


# Replace IQR outliers column by column on a copy, so the loaded frame `df`
# itself is not modified by this step.
df_imputed_outliers = df.copy()
for column in df.columns:
    if df[column].dtype == 'object':
        continue  # only columns whose dtype is not 'object' are processed
    df_imputed_outliers = impute_outliers_iqr(df_imputed_outliers, column)

In [5]:
# Impute missing values with the column median for the numeric predictor columns.
# The target column was renamed from "des" to "QUALITY" when the data was loaded,
# so both names are excluded here; checking "des" alone never matched, which meant
# the target was median-imputed together with the predictors.
target_columns = {"des", "QUALITY"}
for column in df_imputed_outliers.columns:
    if column in target_columns or df_imputed_outliers[column].dtype == 'object':
        continue  # skip the target and non-numerical columns
    median_value = df_imputed_outliers[column].median()
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # column selection: that chained in-place call raises a warning on recent pandas
    # and leaves the frame unchanged when Copy-on-Write is enabled.
    df_imputed_outliers[column] = df_imputed_outliers[column].fillna(median_value)

# Check if there are any missing values left. Missing target values (if any) now
# show up here instead of being silently replaced by the median.
remaining_missing_values = df_imputed_outliers.isnull().sum()
remaining_missing_values[remaining_missing_values > 0]

Series([], dtype: int64)

In [8]:
df_completed = df_imputed_outliers.copy()
# Separate the features from the target variable
X = df_completed.drop("QUALITY", axis=1)
y = df_completed["QUALITY"]

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean/variance. Fitting the scaler on the full matrix first leaks test-set
# information into the training features. The same random_state on the same
# number of rows yields the same row assignment as splitting the scaled matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardise the features: fit on the training rows only, then apply to both splits
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Full standardized matrix (using the training-set statistics), kept under its
# original name in case later cells rely on it.
X_standardized = scaler.transform(X)

# NOTE(review): the outlier and median imputation steps earlier in the notebook are
# still computed on the full dataset; consider moving them after the split as well.

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Base learners with their tuned hyperparameters
base_models = [
    ("xgb", xgb.XGBRegressor(
        objective='reg:squarederror',
        subsample=0.8,
        n_estimators=200,
        min_child_weight=1,
        max_depth=8,
        learning_rate=0.1,
        gamma=0,
        colsample_bytree=0.8,
        random_state=42
    )),
    ("rf", RandomForestRegressor(
        n_estimators=150,
        min_samples_split=2,
        min_samples_leaf=1,
        max_depth=None,
        bootstrap=True,
        random_state=42
    )),
    ("gb", GradientBoostingRegressor(
        learning_rate=0.1,
        max_depth=5,
        max_features=None,
        n_estimators=200,
        subsample=0.9,
        random_state=42
    ))
]

# Stacking regressor: the base learners' predictions are combined by a linear regression
stacked_model = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())

# Fit the stacked model on the training split
stacked_model.fit(X_train_std, y_train)

# Predict on the training split
stacked_train_predictions = stacked_model.predict(X_train_std)

# MAE on the same rows the model was fitted on. This is a resubstitution error and
# is optimistic; compare it with the cross-validated and held-out figures below.
mae_stacked = mean_absolute_error(y_train, stacked_train_predictions)
print(f"Mean Absolute Error (MAE) del Stacking Regressor: {mae_stacked:.4f}")

# 5-fold cross-validation on the training split (scores are negated back to positive MAE)
scores = -cross_val_score(stacked_model, X_train_std, y_train, cv=5, scoring='neg_mean_absolute_error')
print(f"MAE scores from cross-validation: {scores}")
print(f"Mean MAE score: {scores.mean()}")
print(f"Standard Deviation of MAE scores: {scores.std()}")

# Evaluate on the held-out test split, which the model has not seen during fitting.
# Without this step the test split created earlier is never used.
stacked_test_predictions = stacked_model.predict(X_test_std)
mae_stacked_test = mean_absolute_error(y_test, stacked_test_predictions)
print(f"MAE on held-out test set: {mae_stacked_test:.4f}")


Mean Absolute Error (MAE) del Stacking Regressor: 0.1216
MAE scores from cross-validation: [0.47086998 0.47505819 0.47429711 0.45613098 0.48134349]
Mean MAE score: 0.47153994875048877
Standard Deviation of MAE scores: 0.008413972631977733
