In [62]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [63]:
df = pd.read_excel("../data/raw/entrenamiento.xlsx")
df = df.rename(columns={"C7.1": "C8", "des": "QUALITY"})
features = df.drop(columns=["QUALITY"])
target = df["QUALITY"]

In [64]:
# Correct the values in column C8 that are above 900 by dividing them by 1000
high_values = df["C8"] > 900
df.loc[high_values, "C8"] = df.loc[high_values, "C8"] / 1000

# Display the basic statistics of the corrected column
df["C8"].describe()

count    3646.000000
mean        0.994044
std         0.003008
min         0.987110
25%         0.991760
50%         0.993800
75%         0.996100
max         1.038980
Name: C8, dtype: float64

In [65]:
# Imputar datos faltantes con la mediana
for col in df.columns:
    median_value = df[col].median()
    df[col].fillna(median_value, inplace=True)

In [80]:
from scipy.spatial import distance

def impute_outliers_mahalanobis(data):
    # Calculate the inverse of the covariance matrix
    inv_covmat = np.linalg.inv(np.cov(data, rowvar=0))
    
    # Calculate the Mahalanobis distance for each observation
    mean_data = np.mean(data, axis=0).values.reshape(1, -1)
    mahal = distance.cdist(data, mean_data, 'mahalanobis', VI=inv_covmat)
    mahal = mahal.flatten()
    
    # Identify outliers using a threshold
    outliers = mahal > np.mean(mahal) + 3 * np.std(mahal)
    
    # Impute outliers with the median of the entire data
    for column in data.columns:
        data.loc[outliers, column] = data[column].median()
    return data

df_imputed_outliers = df.copy()
df_imputed_outliers = impute_outliers_mahalanobis(df_imputed_outliers)





In [81]:
X = df_imputed_outliers.drop(columns=['QUALITY'])
y = df_imputed_outliers['QUALITY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [82]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    max_depth=30,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    bootstrap=True,
    random_state=42)
rf.fit(X_train_std, y_train)
predictions_rf = rf.predict(X_train_std)
mae_rf = mean_absolute_error(y_train, predictions_rf)
print(f"MAE for Random Forest: {mae_rf:.4f}")

MAE for Random Forest: 0.1831


In [83]:
# Hiperparámetros optimizados
gb_model_std = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=7,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=2,
    n_estimators=100,
    subsample=0.9,
    random_state=42)  # Puedes ajustar los hiperparámetros según sea necesario
gb_model_std.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
y_train_pred_gb = gb_model_std.predict(X_train_std)

# Calcular el MAE para las predicciones
mae_gb_std = mean_absolute_error(y_train, y_train_pred_gb)
print(f'MAE of Gradient Boosting Regression: {mae_gb_std:.4}')

MAE of Gradient Boosting Regression: 0.2641


In [84]:
optimized_xgb = xgb.XGBRegressor(objective='reg:squarederror',
                                 subsample=0.8,
                                 min_child_weight=4,
                                 max_depth=9,
                                 learning_rate=0.1,
                                 gamma=0,
                                 colsample_bytree=0.8,
                                 random_state=42)

# Entrenar el modelo con los datos estandarizados
optimized_xgb.fit(X_train_std, y_train)

# Hacer predicciones en el conjunto de entrenamiento
xgb_optimized_predictions = optimized_xgb.predict(X_train_std)

# Calcular el MAE
mae_optimized_xgb = mean_absolute_error(y_train, xgb_optimized_predictions)
mae_optimized_xgb
print(f'MAE of XGBoost: {mae_optimized_xgb:.4}')

MAE of XGBoost: 0.1378


In [85]:
from xgboost import XGBRegressor

base_learners = [
    ('rf', RandomForestRegressor(max_depth=30,
                                 min_samples_leaf=1,
                                 min_samples_split=2,
                                 n_estimators=100,
                                 bootstrap=True,
                                 random_state=42)),
    ('gboost', GradientBoostingRegressor(learning_rate=0.1,
                                         max_depth=7,
                                         max_features='sqrt',
                                         min_samples_leaf=3,
                                         min_samples_split=2,
                                         n_estimators=100,
                                         subsample=0.9,
                                         random_state=42)),
    ('xgb', XGBRegressor(objective='reg:squarederror',
                         subsample=0.8,
                         min_child_weight=4,
                         max_depth=9,
                         learning_rate=0.1,
                         gamma=0,
                         colsample_bytree=0.8,
                         random_state=42))
]

# Inicializa el modelo de Stacking
stack_reg = StackingRegressor(
    estimators=base_learners,
    final_estimator=LinearRegression(),
    cv=5  # Utiliza validación cruzada con 5 folds para entrenar los modelos base
)

# Train the model
stack_reg.fit(X_test_std, y_test)

# Predict on the training set
stacked_train_predictions = stack_reg.predict(X_test_std)

# Calculate the MAE for the Stacking Regressor
mae_stacked = mean_absolute_error(y_test, stacked_train_predictions)
print(f'MAE of Stacking Regressor: {mae_stacked:.6f}')

MAE of Stacking Regressor: 0.168375
