In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
df = pd.read_excel("../data/raw/entrenamiento.xlsx")
df = df.rename(columns={"C7.1": "C8", "des": "QUALITY"})
features = df.columns[:-1]
df.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,QUALITY
0,6.8,0.19062,0.307485,18.1,46.0,32.0,4.89784,1.0,3.27,0.392042,8.8,5
1,6.2,0.24686,,5.0,43.0,50.0,5.241747,0.99318,3.23,0.494696,10.8,6
2,6.7,0.350657,0.329304,12.1,0.04,61.0,5.517453,0.99794,3.31,0.457425,9.7,5
3,6.8,0.254642,0.405465,13.3,53.0,48.0,5.273,0.9974,3.09,0.371564,9.4,5
4,6.6,0.215111,0.239017,15.8,35.0,46.0,5.241747,0.9982,3.24,0.41211,9.2,5


In [8]:
high_values = df["C8"] > 900
df.loc[high_values, "C8"] = df.loc[high_values, "C8"] / 1000

df["C8"].describe()

count    3646.000000
mean        0.994044
std         0.003008
min         0.987110
25%         0.991760
50%         0.993800
75%         0.996100
max         1.038980
Name: C8, dtype: float64

In [9]:
# Function to impute outliers using IQR and column median
def impute_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = (data[column] < lower_bound) | (data[column] > upper_bound)

    # Impute outliers with the median of the column
    data.loc[outliers, column] = data[column].median()
    return data


# Impute outliers in the dataset using IQR method
df_imputed_outliers = df.copy()
for column in df.columns:
    if df[column].dtype != 'object':  # Apply only for numerical columns
        df_imputed_outliers = impute_outliers_iqr(df_imputed_outliers, column)

# Display basic statistics of the dataset after outlier imputation
df_imputed_outliers.describe()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,QUALITY
count,3684.0,3664.0,3648.0,3636.0,3659.0,3655.0,3645.0,3646.0,3658.0,3639.0,3658.0,3918.0
mean,6.810016,0.235175,0.280678,6.390058,42.117245,34.541176,4.90308,0.994025,3.182701,0.390783,10.491607,5.8073
std,0.744556,0.06122,0.066104,4.951515,9.325894,15.412465,0.300378,0.002898,0.140424,0.066773,1.218126,0.763862
min,4.8,0.076961,0.10436,0.6,9.0,2.0,4.043051,0.98711,2.82,0.207014,8.0,4.0
25%,6.3,0.19062,0.239017,1.7,36.0,23.0,4.70953,0.99176,3.09,0.34359,9.4,5.0
50%,6.8,0.231112,0.277632,5.3,41.0,34.0,4.912655,0.9938,3.18,0.385262,10.3,6.0
75%,7.3,0.270027,0.314811,9.9,47.0,45.0,5.129899,0.9961,3.27,0.431782,11.3,6.0
max,8.8,0.425268,0.463734,22.0,73.0,80.0,5.749393,1.0024,3.56,0.576613,14.0,7.0


In [10]:
# Impute missing values using median for predictor columns (excluding 'des' or 'QUALITY')
for column in df_imputed_outliers.columns:
    if column != "des" and df_imputed_outliers[
        column].dtype != 'object':  # Exclude target column and non-numerical columns
        median_value = df_imputed_outliers[column].median()
        df_imputed_outliers[column].fillna(median_value, inplace=True)

# Check if there are any missing values left
remaining_missing_values = df_imputed_outliers.isnull().sum()
remaining_missing_values[remaining_missing_values > 0]

Series([], dtype: int64)

In [11]:
df_completed = df_imputed_outliers.copy()

In [12]:
# Separar las características y la variable objetivo
X = df_completed.drop("QUALITY", axis=1)
y = df_completed["QUALITY"]

# Estandarizar las características
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
## Splitting the DataFrame
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_std, X_test_std, y_train, y_test = train_test_split(X_standardized, y, test_size=0.3, random_state=42)

In [13]:
# Definir el modelo
import numpy as np
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Definir los hiperparámetros y sus posibles valores
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'n_estimators': [50, 100, 150, 200, 300],
    'max_depth': [3, 4, 5, 6, 8, 10],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'min_child_weight': [1, 2, 3, 4, 5]
}

# Configurar la búsqueda aleatorizada con validación cruzada
random_search = RandomizedSearchCV(
    model, 
    param_distributions=param_dist, 
    n_iter=100, 
    scoring='neg_mean_absolute_error', 
    n_jobs=-1, 
    cv=5, 
    verbose=3, 
    random_state=42
)

# Ejecutar la búsqueda
random_search.fit(X_train_std, y_train)

# Mostrar los mejores hiperparámetros encontrados
print(random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sp

{'subsample': 0.8, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 8, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}


In [14]:
# Inicializar el modelo XGBoost con los hiperparámetros optimizados
best_params = {
    'subsample': 0.8, 
    'n_estimators': 200, 
    'min_child_weight': 1, 
    'max_depth': 8, 
    'learning_rate': 0.1, 
    'gamma': 0, 
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror', 
    'random_state': 42
}

optimized_xgb = xgb.XGBRegressor(**best_params)

# Entrenar el modelo con el conjunto de entrenamiento
optimized_xgb.fit(X_train_std, y_train)

# Realizar predicciones en el conjunto de entrenamiento
train_predictions = optimized_xgb.predict(X_train_std)

# Calcular el MAE en el conjunto de entrenamiento
mae = mean_absolute_error(y_train, train_predictions)
print(f"Mean Absolute Error (MAE) con hiperparámetros optimizados: {mae:.4f}")


  if is_sparse(data):


Mean Absolute Error (MAE) con hiperparámetros optimizados: 0.0467
