In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
df= pd.read_excel("../data/raw/entrenamiento.xlsx")
df= df.rename(columns={"C7.1": "C8", "des": "QUALITY"})
features = df.drop(columns=["QUALITY"])
target = df["QUALITY"]

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
C1,3684.0,6.857166,0.84643,3.8,6.3,6.8,7.3,14.2
C2,3664.0,0.42144,0.988207,0.076961,0.19062,0.231112,0.285179,6.913737
C3,3648.0,0.284054,0.086887,0.0,0.239017,0.277632,0.329304,0.802002
C4,3636.0,6.424642,5.084063,0.6,1.7,5.3,9.9,65.8
C5,3659.0,40.685294,25.106022,0.02,33.0,41.0,49.0,346.0
C6,3655.0,35.216142,17.167346,2.0,23.0,34.0,46.0,289.0
C7,3645.0,4.885591,0.341723,2.302585,4.691348,4.912655,5.129899,6.089045
C8,3646.0,65.824651,245.359289,0.98711,0.9918,0.9941,0.996825,1001.0
C9,3658.0,3.188291,0.152155,2.72,3.09,3.18,3.28,3.82
C10,3639.0,0.395421,0.074535,0.198851,0.34359,0.385262,0.438255,0.732368


In [5]:
# Función para calcular límites inferior y superior para cada columna usando IQR
def compute_iqr_limits(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    return lower_limit, upper_limit

# Eliminar las observaciones atípicas
df_dropped_outliers = df.copy()

for column in df.columns:
    if column != 'QUALITY':  # No consideramos la variable objetivo
        lower_limit, upper_limit = compute_iqr_limits(df, column)
        df_dropped_outliers = df_dropped_outliers[(df_dropped_outliers[column] >= lower_limit) & 
                                                  (df_dropped_outliers[column] <= upper_limit)]

df_dropped_outliers.shape

(1209, 12)

In [6]:
# Imputar valores atípicos con la mediana
df_imputed_outliers = df.copy()

for column in df.columns:
    if column != 'QUALITY':  # No consideramos la variable objetivo
        lower_limit, upper_limit = compute_iqr_limits(df, column)
        median_value = df[column].median()
        
        df_imputed_outliers.loc[df_imputed_outliers[column] < lower_limit, column] = median_value
        df_imputed_outliers.loc[df_imputed_outliers[column] > upper_limit, column] = median_value

df_imputed_outliers.shape  # El tamaño del dataset debería seguir siendo el mismo.

(3918, 12)

In [7]:
# Cap and Floor (establecer límites para valores atípicos)
df_capped_outliers = df.copy()

for column in df.columns:
    if column != 'QUALITY':  # No consideramos la variable objetivo
        lower_limit, upper_limit = compute_iqr_limits(df, column)
        
        df_capped_outliers.loc[df_capped_outliers[column] < lower_limit, column] = lower_limit
        df_capped_outliers.loc[df_capped_outliers[column] > upper_limit, column] = upper_limit

df_capped_outliers.shape  # El tamaño del dataset debería seguir siendo el mismo.

(3918, 12)

In [8]:
from sklearn.impute import SimpleImputer

# Imputer para rellenar valores faltantes con la mediana
imputer = SimpleImputer(strategy='median')

# 1. Eliminar observaciones atípicas
df_outliers_removed = df.copy()
for col in df.columns:
    if col != 'QUALITY':  # Excluimos la variable objetivo
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        filter = (df[col] >= Q1 - 1.5 * IQR) & (df[col] <= Q3 + 1.5 * IQR)
        df_outliers_removed = df_outliers_removed[filter]

# 2. Reemplazar observaciones atípicas con la mediana
df_outliers_median = df.copy()
for col in df.columns:
    if col != 'QUALITY':  # Excluimos la variable objetivo
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        median_value = df[col].median()
        filter_lower = df[col] < Q1 - 1.5 * IQR
        filter_upper = df[col] > Q3 + 1.5 * IQR
        df_outliers_median.loc[filter_lower, col] = median_value
        df_outliers_median.loc[filter_upper, col] = median_value

# 3. Cap and Floor (Ajustar valores extremos al rango IQR)
df_outliers_capped = df.copy()
for col in df.columns:
    if col != 'QUALITY':  # Excluimos la variable objetivo
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        filter_lower = df[col] < Q1 - 1.5 * IQR
        filter_upper = df[col] > Q3 + 1.5 * IQR
        df_outliers_capped.loc[filter_lower, col] = Q1 - 1.5 * IQR
        df_outliers_capped.loc[filter_upper, col] = Q3 + 1.5 * IQR

# Imputar valores faltantes
df_outliers_removed_imputed = pd.DataFrame(imputer.fit_transform(df_outliers_removed), columns=df_outliers_removed.columns)
df_outliers_median_imputed = pd.DataFrame(imputer.fit_transform(df_outliers_median), columns=df_outliers_median.columns)
df_outliers_capped_imputed = pd.DataFrame(imputer.fit_transform(df_outliers_capped), columns=df_outliers_capped.columns)

# Verificar si aún existen valores faltantes
missing_removed = df_outliers_removed_imputed.isnull().sum().sum()
missing_median = df_outliers_median_imputed.isnull().sum().sum()
missing_capped = df_outliers_capped_imputed.isnull().sum().sum()

missing_removed, missing_median, missing_capped

  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]
  df_outliers_removed = df_outliers_removed[filter]


(0, 0, 0)

## Capped

In [9]:
X = df_outliers_capped_imputed.drop(columns=['QUALITY'])
y = df_outliers_capped_imputed['QUALITY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

lr = LinearRegression()
lr.fit(X_train_std, y_train)
predictions_lr = lr.predict(X_test_std)
mae_lr = mean_absolute_error(y_test, predictions_lr)
print(f"MAE for Linear Regression: {mae_lr:.4f}")

MAE for Linear Regression: 0.6083


In [11]:
from sklearn.linear_model import Ridge

ridge = Ridge(random_state=42)
ridge.fit(X_train_std, y_train)
predictions_ridge = ridge.predict(X_test_std)
mae_ridge = mean_absolute_error(y_test, predictions_ridge)
print(f"MAE for Ridge Regression: {mae_ridge:.4f}")

MAE for Ridge Regression: 0.6083


In [12]:
from sklearn.linear_model import Lasso

lasso = Lasso(random_state=42)
lasso.fit(X_train_std, y_train)
predictions_lasso = lasso.predict(X_test_std)
mae_lasso = mean_absolute_error(y_test, predictions_lasso)
print(f"MAE for Lasso Regression: {mae_lasso:.4f}")

MAE for Lasso Regression: 0.6500


In [13]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_std, y_train)
predictions_dt = dt.predict(X_test_std)
mae_dt = mean_absolute_error(y_test, predictions_dt)
print(f"MAE for Decision Tree: {mae_dt:.4f}")

MAE for Decision Tree: 0.6148


In [14]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_std, y_train)
predictions_rf = rf.predict(X_test_std)
mae_rf = mean_absolute_error(y_test, predictions_rf)
print(f"MAE for Random Forest: {mae_rf:.4f}")

MAE for Random Forest: 0.5100


In [15]:
from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train_std, y_train)
predictions_svr = svr.predict(X_test_std)
mae_svr = mean_absolute_error(y_test, predictions_svr)
print(f"MAE for SVR: {mae_svr:.4f}")

MAE for SVR: 0.5522


In [16]:
from sklearn.ensemble import AdaBoostRegressor

adaboost = AdaBoostRegressor(random_state=42)
adaboost.fit(X_train_std, y_train)
predictions_adaboost = adaboost.predict(X_test_std)
mae_adaboost = mean_absolute_error(y_test, predictions_adaboost)
print(f"MAE for AdaBoost: {mae_adaboost:.4f}")

MAE for AdaBoost: 0.5961


In [17]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train_std, y_train)
predictions_gb = gb.predict(X_test_std)
mae_gb = mean_absolute_error(y_test, predictions_gb)
print(f"MAE for Gradient Boosting: {mae_gb:.4f}")

MAE for Gradient Boosting: 0.5627


In [18]:
from sklearn.ensemble import StackingRegressor

base_learners = [
                 ('rf', RandomForestRegressor(random_state=42)),
                 ('gb', GradientBoostingRegressor(random_state=42)),
                 ('ridge', Ridge(random_state=42))
                ]

stacking_regressor = StackingRegressor(estimators=base_learners, final_estimator=LinearRegression())
stacking_regressor.fit(X_train_std, y_train)
predictions_stack = stacking_regressor.predict(X_test_std)
mae_stack = mean_absolute_error(y_test, predictions_stack)
print(f"MAE for Stacking Regressor: {mae_stack:.4f}")

MAE for Stacking Regressor: 0.5127


In [19]:
# Inicializar un DataFrame para registrar los resultados
results_df = pd.DataFrame(columns=['Model', 'MAE'])

# Función para añadir resultados al DataFrame
def add_result(model_name, mae_value):
    global results_df
    new_row = pd.DataFrame({'Model': [model_name], 'MAE': [mae_value]})
    results_df = pd.concat([results_df, new_row], ignore_index=True)

# Añadir los resultados (solo como ejemplo, reemplaza las cifras ficticias con tus valores reales)
add_result("Linear Regression", mae_lr)  # Sustituye 0.1234 con tu valor real
add_result("Ridge Regression", mae_ridge)  # Sustituye 0.1234 con tu valor real
add_result("Lasso Regression", mae_lasso)  # Sustituye 0.2345 con tu valor real
add_result("Decision Tree", mae_dt)  # Sustituye 0.2345 con tu valor real
add_result("Random Forest", mae_rf)  # Sustituye 0.2345 con tu valor real
add_result("SVR", mae_svr)  # Sustituye 0.2345 con tu valor real
add_result("AdaBoost", mae_adaboost)  # Sustituye 0.2345 con tu valor real
add_result("Gradient Boost", mae_gb)  # Sustituye 0.2345 con tu valor real
add_result("Stacking", mae_stack)  # Sustituye 0.2345 con tu valor real


# Muestra el DataFrame
sorted_results_df = results_df.sort_values(by="MAE", ascending=True)
print(sorted_results_df)

               Model       MAE
4      Random Forest  0.509974
8           Stacking  0.512700
5                SVR  0.552172
7     Gradient Boost  0.562723
6           AdaBoost  0.596111
1   Ridge Regression  0.608274
0  Linear Regression  0.608274
3      Decision Tree  0.614796
2   Lasso Regression  0.650031
