In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb

In [2]:
df= pd.read_excel("../data/raw/entrenamiento.xlsx")
df= df.rename(columns={"C7.1": "C8", "des": "QUALITY"})


In [3]:
high_values_train = df["C8"] > 900
df.loc[high_values_train, "C8"] = df.loc[high_values_train, "C8"] / 1000

In [4]:
def impute_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = (data[column] < lower_bound) | (data[column] > upper_bound)

    # Impute outliers with the median of the column
    data.loc[outliers, column] = data[column].median()
    return data


# Impute outliers in the dataset using IQR method
df_imputed_outliers = df.copy()
for column in df.columns:
    if df[column].dtype != 'object':  # Apply only for numerical columns
        df_imputed_outliers = impute_outliers_iqr(df_imputed_outliers, column)

In [5]:
# Impute missing values using median for predictor columns (excluding 'des' or 'QUALITY')
for column in df_imputed_outliers.columns:
    if column != "des" and df_imputed_outliers[
        column].dtype != 'object':  # Exclude target column and non-numerical columns
        median_value = df_imputed_outliers[column].median()
        df_imputed_outliers[column].fillna(median_value, inplace=True)

# Check if there are any missing values left
remaining_missing_values = df_imputed_outliers.isnull().sum()
remaining_missing_values[remaining_missing_values > 0]

Series([], dtype: int64)

In [6]:
df_completed = df_imputed_outliers.copy()
# Separar las características y la variable objetivo
X = df_completed.drop("QUALITY", axis=1)
y = df_completed["QUALITY"]

# Estandarizar las características
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
## Splitting the DataFrame
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_std, X_test_std, y_train, y_test = train_test_split(X_standardized, y, test_size=0.3, random_state=42)

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Definir el rango de hiperparámetros
param_dist = {
    'n_estimators': np.arange(50, 501, 50),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': np.arange(3, 21),
    'min_samples_split': np.arange(2, 21),
    'min_samples_leaf': np.arange(1, 21),
    'bootstrap': [True, False]
}

# Iniciar RandomizedSearchCV
rf = RandomForestRegressor(random_state=42)
rf_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1, scoring='neg_mean_absolute_error'
)

# Ajustar RandomizedSearchCV al conjunto de entrenamiento
rf_search.fit(X_train_std, y_train)

# Mostrar los mejores hiperparámetros encontrados
print("Mejores hiperparámetros:", rf_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bootstrap=True, max_depth=14, max_features=sqrt, min_samples_leaf=15, min_samples_split=9, n_estimators=150; total time=   0.4s
[CV] END bootstrap=False, max_depth=7, max_features=log2, min_samples_leaf=5, min_samples_split=3, n_estimators=450; total time=   1.3s
[CV] END bootstrap=True, max_depth=15, max_features=auto, min_samples_leaf=3, min_samples_split=17, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=15, max_features=auto, min_samples_leaf=3, min_samples_split=17, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=15, max_features=auto, min_samples_leaf=3, min_samples_split=17, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=15, max_features=auto, min_samples_leaf=3, min_samples_split=17, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=15, max_features=auto, min_samples_leaf=3, min_samples_split=17, n_estimators=400

170 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
66 fits failed with the following error:
Traceback (most recent call last):
  File "/home/alberte/Desktop/Fruit/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/alberte/Desktop/Fruit/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/alberte/Desktop/Fruit/.venv/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/alberte/Desktop/Fruit/.venv/lib/python3.10/site-packages/sklearn/utils/_param_val

Mejores hiperparámetros: {'n_estimators': 250, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 17, 'bootstrap': False}


In [10]:
# Evaluar el modelo con los hiperparámetros óptimos en el conjunto de entrenamiento
best_rf = rf_search.best_estimator_
train_predictions = best_rf.predict(X_train_std)
mae = mean_absolute_error(y_train, train_predictions)
print(f"MAE con hiperparámetros optimizados: {mae:.4f}")


MAE con hiperparámetros optimizados: 0.1842


In [11]:
from sklearn.model_selection import cross_val_score

# Definir el modelo Random Forest con hiperparámetros optimizados
optimized_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=9,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42
)

# Realizar validación cruzada con 5 particiones
mae_scores = -cross_val_score(optimized_rf, X_train_std, y_train, cv=5, scoring='neg_mean_absolute_error')

# Mostrar los resultados de cada iteración
print(f"MAE scores from cross-validation: {mae_scores}")

# Calcular y mostrar el MAE medio y la desviación estándar de los MAE
mean_mae = mae_scores.mean()
std_mae = mae_scores.std()
print(f"Mean MAE score: {mean_mae}")
print(f"Standard Deviation of MAE scores: {std_mae}")


MAE scores from cross-validation: [0.50999777 0.51509413 0.49633356 0.49567327 0.52110788]
Mean MAE score: 0.5076413226835612
Standard Deviation of MAE scores: 0.010134546807913956
