In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the cleaned dataset exported by the EDA stage and preview its first rows.
df_modelo4 = pd.read_csv(filepath_or_buffer='df_eda_limpio.csv')
df_modelo4.head(n=5)

Unnamed: 0,make,model,version,fuel,year,kms,power,shift,price,dealer_name,...,province,vehicle_age,price_per_power,price_per_year,dealer_info,power_per_kms,make_popularity,model_popularity,big_city_dealer,price_range
0,Opel,Crossland,1.2 GAS 110 GS Line 5p S/S,Gasolina,2022,5.0,110.0,manual,22900,Sergio Y.,...,Barcelona,3,208.19,7633.34,"Sergio Y.\r\n2, Carrer de Jacint Benavente, Po...",22.0,4591,158,0,20-30k
1,Opel,Crossland,1.2 81kW (110CV) GS Line,Gasolina,2022,24847.0,110.0,manual,19990,Peugeot Alcala 534,...,Madrid,3,181.73,6663.34,"Peugeot Alcala 534\r\nAvenida de José Gárate, ...",0.004427,4591,158,0,10-20k
2,Opel,Crossland,1.5D 88kW (120CV) Business Elegance Auto,Diésel,2021,41356.0,120.0,automatic,18590,Clicars S.,...,Madrid,4,154.92,4647.5,"Clicars S.\r\nSan Cristóbal, Avenida de Andalu...",0.002902,4591,158,1,10-20k
3,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar S.,...,Barcelona,3,206.37,7566.67,"Vallescar S.\r\nParc de Bombers de Sabadell, 5...",10.0,4591,158,0,20-30k
4,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar Ocasion M.,...,Barcelona,3,206.37,7566.67,"Vallescar Ocasion M.\r\n27, Carrer de Fèlix Fe...",10.0,4591,158,0,20-30k


In [3]:
# Add extra derived ratio features that may help the price model.
df_modelo4['power_year_ratio'] = df_modelo4['power'] / df_modelo4['year']
# Annualised mileage. A `vehicle_age` of 0 would turn the ratio into inf (or NaN
# when kms is also 0), and the StandardScaler used downstream raises on infinite
# values, so ages below one year are treated as one full year.
# NOTE(review): presumably age 0 occurs for cars registered in the reference
# year — confirm against the EDA output.
df_modelo4['kms_year_ratio'] = df_modelo4['kms'] / df_modelo4['vehicle_age'].clip(lower=1)
# NOTE(review): assumes `power` is never 0 after the EDA cleaning — TODO confirm,
# otherwise this ratio has the same infinity problem.
df_modelo4['kms_power_ratio'] = df_modelo4['kms'] / df_modelo4['power']

In [None]:
# Feature selection, including the newly derived ratio columns.
target = 'price'
features = [
    'year',
    'kms',
    'power',
    'vehicle_age',
    'fuel',
    'shift',
    'make',
    'model',
    'power_year_ratio',
    'kms_year_ratio',
    'kms_power_ratio',
]

# Target vector and feature matrix, both taken from the same frame.
y = df_modelo4[target]
X = df_modelo4.loc[:, features]


In [5]:
# Split the model inputs into numeric and categorical columns; each group gets
# its own transformer inside the ColumnTransformer below.
numeric_features = ['year', 'kms', 'power', 'vehicle_age',
                   'power_year_ratio', 'kms_year_ratio', 'kms_power_ratio']
categorical_features = ['fuel', 'shift', 'make', 'model']

# Log-transform the target (prices) with log1p.
# The value is derived from the raw column instead of from `y` itself so that
# re-running this cell does not apply the logarithm a second time.
y = np.log1p(df_modelo4[target])

# Build the preprocessor: standardise numeric columns and one-hot encode the
# categorical ones. `max_categories=20` caps the number of output columns per
# categorical feature; `handle_unknown='ignore'` avoids errors on categories
# that were not seen during fit.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', max_categories=20)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [6]:
# Split the dataset: 20% of the rows are held out for testing, with a fixed
# seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build the pipeline: preprocessing -> model-based feature selection -> regressor.
rf_selector = RandomForestRegressor(n_estimators=100, random_state=42)
feature_selector = SelectFromModel(rf_selector, prefit=False)

# `memory` caches the fitted transformers on disk. Without it, each of the
# 324 candidates x 5 folds refits the preprocessor and the 100-tree selector
# forest even though only the final model's hyperparameters changed; with it,
# those steps are fitted once per (fold, threshold) combination and reused.
# The cache directory is created in the working directory and can be deleted
# safely at any time.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selector),
        ('model', RandomForestRegressor(random_state=42))
    ],
    memory='.sklearn_pipeline_cache',
)

# Hyperparameter grid: 3 * 4 * 3 * 3 * 3 = 324 candidate combinations.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, 30, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'feature_selection__threshold': ['mean', '0.5*mean', '1.5*mean']
}

# Cross-validated hyperparameter search. The score is the negated MSE on the
# log-transformed target, since `y` is in log1p scale at this point.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # use every available core
    verbose=1
)

In [8]:
# Train the model: run the cross-validated search on the training split.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


KeyboardInterrupt: 

In [None]:
# Predict on the held-out test set; the outputs are in log1p scale because the
# model was trained on the log-transformed target.
y_pred = grid_search.predict(X_test)

# Map both the predictions and the true values back to the original price scale.
y_pred_original = np.expm1(y_pred)
y_test_original = np.expm1(y_test)

In [None]:
# Compute the error metrics on the original (de-logged) price scale.
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
mae = mean_absolute_error(y_test_original, y_pred_original)

# Show the results.
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)
print(f'\nRMSE: {rmse}')
print(f'MAE: {mae}')

# Feature importances.
# The final model is trained on the one-hot-encoded columns that survived
# feature selection, not on the raw `features` list, so the names have to be
# rebuilt from the fitted pipeline steps; pairing the importances with
# `features` would fail (or mislabel) because the lengths differ.
best_model = grid_search.best_estimator_
encoded_feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
selected_mask = best_model.named_steps['feature_selection'].get_support()
feature_importance = pd.DataFrame({
    'feature': encoded_feature_names[selected_mask],
    'importance': best_model.named_steps['model'].feature_importances_
})
print("\nImportancia de características:")
print(feature_importance.sort_values('importance', ascending=False))