In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [4]:
df_modelo5 = pd.read_csv('df_eda_limpio.csv')

In [5]:
# Paso 2: Definimos características más importantes
features_numeric = ['year', 'kms', 'power', 'vehicle_age']
features_categorical = ['fuel', 'shift', 'make']
target = 'price'

# Paso 3: Preparamos los datos para RandomForest
X_rf = df_modelo5[features_numeric + features_categorical]
y_rf = np.log1p(df_modelo5[target])

In [6]:
# Paso 4: División de datos
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=42
)

In [7]:
# Paso 5: Preprocesamiento para RandomForest
preprocessor_rf = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features_numeric),
        ('cat', OneHotEncoder(handle_unknown='ignore'), features_categorical)
    ])

In [8]:
# Paso 6: Modelo RandomForest optimizado
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=4,
    n_jobs=-1,
    random_state=42
)


In [9]:
# Paso 7: Pipeline RandomForest
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor_rf),
    ('regressor', rf_model)
])

In [10]:
# Paso 8: Entrenamiento RandomForest
print("Entrenando RandomForest...")
pipeline_rf.fit(X_train_rf, y_train_rf)


Entrenando RandomForest...


In [11]:
# Preparación de datos para XGBoost
print("\nPreparando datos para XGBoost...")
X_xgb = df_modelo5[features_numeric].copy()


Preparando datos para XGBoost...


In [12]:
# Codificación de variables categóricas
encoders = {}
for cat_col in features_categorical:
    le = LabelEncoder()
    X_xgb[cat_col] = le.fit_transform(df_modelo5[cat_col])
    encoders[cat_col] = le

# Transformación logarítmica de la variable objetivo
y_xgb = np.log1p(df_modelo5[target])

In [13]:
# División de datos
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X_xgb, y_xgb, test_size=0.2, random_state=42
)

In [None]:
# Configuración e inicialización del modelo XGBoost
xgb_model = xgb.XGBRegressor(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
    enable_categorical=True,  # Para manejar variables categóricas
    early_stopping_rounds=10  # Movemos early_stopping_rounds aquí
)

In [15]:
# Entrenamiento del modelo
print("Entrenando XGBoost...")
xgb_model.fit(
    X_train_xgb, 
    y_train_xgb,
    eval_set=[(X_test_xgb, y_test_xgb)],  # Conjunto de validación
    verbose=True  # Mostrar progreso durante el entrenamiento
)

Entrenando XGBoost...
[0]	validation_0-rmse:0.78587
[1]	validation_0-rmse:0.75819
[2]	validation_0-rmse:0.73771
[3]	validation_0-rmse:0.71545
[4]	validation_0-rmse:0.69754
[5]	validation_0-rmse:0.68250
[6]	validation_0-rmse:0.67034
[7]	validation_0-rmse:0.66170
[8]	validation_0-rmse:0.65244
[9]	validation_0-rmse:0.64441
[10]	validation_0-rmse:0.63781
[11]	validation_0-rmse:0.62915
[12]	validation_0-rmse:0.62009
[13]	validation_0-rmse:0.61268
[14]	validation_0-rmse:0.60727
[15]	validation_0-rmse:0.60192
[16]	validation_0-rmse:0.59814
[17]	validation_0-rmse:0.59544
[18]	validation_0-rmse:0.59183
[19]	validation_0-rmse:0.58370
[20]	validation_0-rmse:0.57737
[21]	validation_0-rmse:0.57103
[22]	validation_0-rmse:0.56883
[23]	validation_0-rmse:0.56744
[24]	validation_0-rmse:0.56352
[25]	validation_0-rmse:0.56126
[26]	validation_0-rmse:0.55622
[27]	validation_0-rmse:0.55291
[28]	validation_0-rmse:0.55167
[29]	validation_0-rmse:0.54964
[30]	validation_0-rmse:0.54797
[31]	validation_0-rmse:0.54

In [16]:
# Realizar predicciones
y_pred_xgb = xgb_model.predict(X_test_xgb)

# Calcular métricas (recordando que estamos trabajando con valores logarítmicos)
from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test_xgb, y_pred_xgb))
r2 = r2_score(y_test_xgb, y_pred_xgb)

print(f"\nMétricas de evaluación:")
print(f"RMSE (en escala logarítmica): {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Si quieres ver las métricas en la escala original
y_pred_original = np.expm1(y_pred_xgb)
y_test_original = np.expm1(y_test_xgb)
rmse_original = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
print(f"RMSE (en escala original): {rmse_original:.4f}")


Métricas de evaluación:
RMSE (en escala logarítmica): 0.4830
R²: 0.6520
RMSE (en escala original): 3882.8670


In [17]:
# Calculando métricas adicionales para mejor interpretación
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Métricas en escala logarítmica
mae_log = mean_absolute_error(y_test_xgb, y_pred_xgb)

# Métricas en escala original
mae_original = mean_absolute_error(y_test_original, y_pred_original)
mape = mean_absolute_percentage_error(y_test_original, y_pred_original) * 100

print("\nMétricas detalladas de evaluación:")
print(f"En escala logarítmica:")
print(f"- RMSE: {rmse:.4f}")
print(f"- MAE: {mae_log:.4f}")
print(f"\nEn escala original:")
print(f"- RMSE: {rmse_original:.2f}")
print(f"- MAE: {mae_original:.2f}")
print(f"- MAPE: {mape:.2f}%")
print(f"\nBondad de ajuste:")
print(f"- R²: {r2:.4f} (Explica el {r2*100:.2f}% de la varianza)")


Métricas detalladas de evaluación:
En escala logarítmica:
- RMSE: 0.4830
- MAE: 0.2070

En escala original:
- RMSE: 3882.87
- MAE: 2536.74
- MAPE: 78.32%

Bondad de ajuste:
- R²: 0.6520 (Explica el 65.20% de la varianza)


In [18]:
# Paso 13: Función de evaluación
def evaluate_model(y_true, y_pred, model_name):
    # Convertimos a escala original
    y_true_original = np.expm1(y_true)
    y_pred_original = np.expm1(y_pred)
    
    # Calculamos métricas
    rmse = np.sqrt(mean_squared_error(y_true_original, y_pred_original))
    mae = mean_absolute_error(y_true_original, y_pred_original)
    mape = np.mean(np.abs((y_true_original - y_pred_original) / y_true_original)) * 100
    
    print(f"\nResultados del modelo {model_name}:")
    print(f"RMSE: {rmse:,.2f} €")
    print(f"MAE: {mae:,.2f} €")
    print(f"MAPE: {mape:.2f}%")

# Paso 14: Evaluación de ambos modelos
print("\nEvaluando modelos...")
# Evaluación RandomForest
y_pred_rf = pipeline_rf.predict(X_test_rf)
evaluate_model(y_test_rf, y_pred_rf, "RandomForest")

# Evaluación XGBoost
y_pred_xgb = xgb_model.predict(X_test_xgb)
evaluate_model(y_test_xgb, y_pred_xgb, "XGBoost")


Evaluando modelos...

Resultados del modelo RandomForest:
RMSE: 3,022.84 €
MAE: 1,587.30 €
MAPE: 53.26%

Resultados del modelo XGBoost:
RMSE: 3,882.87 €
MAE: 2,536.74 €
MAPE: 78.32%


In [None]:
# Paso 15: Guardar los modelos
#import joblib
#print("\nGuardando modelos...")
#joblib.dump(pipeline_rf, 'random_forest_model.joblib')
#joblib.dump(xgb_model, 'xgboost_model.joblib')
#joblib.dump(encoders, 'label_encoders.joblib')