In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

In [11]:
# Load the cleaned dataset from disk and preview its first five rows.
df_modelo = pd.read_csv(filepath_or_buffer='df_eda_limpio.csv')
df_modelo.head(5)

Unnamed: 0,make,model,version,fuel,year,kms,power,shift,price,dealer_name,...,province,vehicle_age,price_per_power,price_per_year,dealer_info,power_per_kms,make_popularity,model_popularity,big_city_dealer,price_range
0,Opel,Crossland,1.2 GAS 110 GS Line 5p S/S,Gasolina,2022,5.0,110.0,manual,22900,Sergio Y.,...,Barcelona,3,208.19,7633.34,"Sergio Y.\n2, Carrer de Jacint Benavente, Pobl...",22.0,4591,158,0,20-30k
1,Opel,Crossland,1.2 81kW (110CV) GS Line,Gasolina,2022,24847.0,110.0,manual,19990,Peugeot Alcala 534,...,Madrid,3,181.73,6663.34,"Peugeot Alcala 534\nAvenida de José Gárate, Co...",0.004427,4591,158,0,10-20k
2,Opel,Crossland,1.5D 88kW (120CV) Business Elegance Auto,Diésel,2021,41356.0,120.0,automatic,18590,Clicars S.,...,Madrid,4,154.92,4647.5,"Clicars S.\nSan Cristóbal, Avenida de Andalucí...",0.002902,4591,158,1,10-20k
3,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar S.,...,Barcelona,3,206.37,7566.67,"Vallescar S.\nParc de Bombers de Sabadell, 52,...",10.0,4591,158,0,20-30k
4,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar Ocasion M.,...,Barcelona,3,206.37,7566.67,"Vallescar Ocasion M.\n27, Carrer de Fèlix Ferr...",10.0,4591,158,0,20-30k


In [12]:
# Print a structural summary of the frame (column names, non-null counts, dtypes).
df_modelo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91040 entries, 0 to 91039
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   make              91040 non-null  object 
 1   model             91040 non-null  object 
 2   version           91040 non-null  object 
 3   fuel              91040 non-null  object 
 4   year              91040 non-null  int64  
 5   kms               91040 non-null  float64
 6   power             91040 non-null  float64
 7   shift             91040 non-null  object 
 8   price             91040 non-null  int64  
 9   dealer_name       91040 non-null  object 
 10  dealer_address    91040 non-null  object 
 11  dealer_city       91040 non-null  object 
 12  dealer_zip_code   91040 non-null  int64  
 13  province          91040 non-null  object 
 14  vehicle_age       91040 non-null  int64  
 15  price_per_power   91040 non-null  float64
 16  price_per_year    91040 non-null  float6

In [15]:
# Choose the predictor columns and the regression target.
target = 'price'
features = [
    'year', 'kms', 'power', 'vehicle_age',   # treated as numeric downstream
    'fuel', 'shift', 'make', 'model',        # treated as categorical downstream
]

# Design matrix (predictors only) and target vector.
X = df_modelo.loc[:, features]
y = df_modelo.loc[:, target]

In [None]:
# Data preprocessing: standardise numeric columns, one-hot encode categorical ones.
categorical_features = ['fuel', 'shift', 'make', 'model']
numeric_features = ['year', 'kms', 'power', 'vehicle_age']

# Categories unseen during fit are ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

# Each entry is (step name, transformer, columns it is applied to).
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [17]:
# Split the dataset into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Train the KNN model: preprocessing followed by a k-nearest-neighbours regressor.
knn = KNeighborsRegressor()
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', knn)])

# Hyperparameter search over the number of neighbours (3, 5, 7, 9, 11),
# scored by negated MSE with 5-fold cross-validation.
param_grid = {'model__n_neighbors': list(range(3, 12, 2))}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search.
best_model = grid_search.best_estimator_

In [None]:
# Evaluate the tuned model on the held-out test set.
y_pred = best_model.predict(X_test)

# RMSE is the square root of the mean squared error; MAE is the mean absolute error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Report the winning hyperparameters together with both error metrics.
print(
    f'Best K: {grid_search.best_params_}',
    f'RMSE: {rmse}',
    f'MAE: {mae}',
    sep='\n',
)


Best K: {'model__n_neighbors': 3}
RMSE: 1819.8399254003727
MAE: 444.6970928529584


In [20]:
# Tune more KNN hyperparameters: neighbourhood size and Minkowski exponent
# (p=1 is Manhattan distance, p=2 is Euclidean distance).
neighbor_candidates = [3, 5, 7, 9, 11, 15]
minkowski_candidates = [1, 2]
param_grid = {
    'model__n_neighbors': neighbor_candidates,
    'model__p': minkowski_candidates,
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

In [22]:
# Evaluate the new model on the held-out test set.
y_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
mae = mean_absolute_error(y_test, y_pred)

for report_line in (
    f'Best K: {grid_search.best_params_}',
    f'RMSE: {rmse}',
    f'MAE: {mae}',
):
    print(report_line)

Best K: {'model__n_neighbors': 3, 'model__p': 1}
RMSE: 1800.1565970347915
MAE: 440.3260837727006


In [24]:
# RandomForest model: 100 trees with a fixed seed, behind the shared preprocessor.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', rf)])
pipeline.fit(X_train, y_train)

# Evaluate the model on the test split.
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f'RMSE: {rmse}', f'MAE: {mae}', sep='\n')

RMSE: 1363.1901891069524
MAE: 314.07350616751205


In [25]:
# GradientBoosting model: 100 boosting stages with a fixed seed, behind the shared preprocessor.
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', gb)])
pipeline.fit(X_train, y_train)

# Evaluate the model on the test split.
y_pred = pipeline.predict(X_test)
squared_error = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(squared_error)
mae = mean_absolute_error(y_test, y_pred)

for report_line in (f'RMSE: {rmse}', f'MAE: {mae}'):
    print(report_line)

RMSE: 4259.814616663449
MAE: 2819.4263899456146


In [None]:
# Extended experiment: add derived ratio features, model log(price), and tune a
# RandomForest (with model-based feature selection) through an exhaustive grid search.

# --- Feature engineering ------------------------------------------------------
# Add more derived features that might be useful.
df_modelo['power_year_ratio'] = df_modelo['power'] / df_modelo['year']
# A car with vehicle_age == 0 would make this ratio inf (or NaN when kms is also 0),
# and scikit-learn's input validation rejects infinite values. Treat such cars as
# being in their first year so the ratio stays finite.
df_modelo['kms_year_ratio'] = df_modelo['kms'] / df_modelo['vehicle_age'].clip(lower=1)
# NOTE(review): assumes power is never 0 in the cleaned data — the check below
# fails loudly if that assumption does not hold.
df_modelo['kms_power_ratio'] = df_modelo['kms'] / df_modelo['power']

# Feature selection including the new columns.
features = [
    'year', 'kms', 'power', 'vehicle_age', 'fuel', 'shift', 'make', 'model',
    'power_year_ratio', 'kms_year_ratio', 'kms_power_ratio'
]
target = 'price'

X = df_modelo[features]
y = df_modelo[target]

# Split the predictors into numeric and categorical groups.
numeric_features = ['year', 'kms', 'power', 'vehicle_age',
                   'power_year_ratio', 'kms_year_ratio', 'kms_power_ratio']
categorical_features = ['fuel', 'shift', 'make', 'model']

# Fail early, with a clear message, if any ratio feature overflowed to +/-inf.
assert not np.isinf(X[numeric_features].to_numpy(dtype=float)).any(), \
    "numeric features contain infinite values (division by zero in a ratio feature?)"

# Log-transform the target: prices are modelled on the log1p scale and mapped
# back with expm1 before computing the final metrics.
y = np.log1p(y)

# --- Preprocessing ------------------------------------------------------------
numeric_transformer = StandardScaler()
# max_categories caps the number of one-hot columns produced per categorical feature.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', max_categories=20)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Train/test split (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Pipeline with feature selection --------------------------------------------
# A forest ranks the encoded columns; SelectFromModel keeps those whose importance
# exceeds the configured threshold, and a second forest is trained on the survivors.
rf_selector = RandomForestRegressor(n_estimators=100, random_state=42)
feature_selector = SelectFromModel(rf_selector, prefit=False)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('model', RandomForestRegressor(random_state=42))
])

# More exhaustive hyperparameter grid.
# NOTE: 3 * 4 * 3 * 3 * 3 = 324 candidates, i.e. 1620 pipeline fits with cv=5,
# and every fit also refits the selector forest — expect a very long run.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, 30, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'feature_selection__threshold': ['mean', '0.5*mean', '1.5*mean']
}

# Cross-validated hyperparameter search (scored on the log-price scale).
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # use every available core
    verbose=1
)

# Train the model.
grid_search.fit(X_train, y_train)

# --- Evaluation -----------------------------------------------------------------
y_pred = grid_search.predict(X_test)

# Map targets and predictions back to the original price scale.
y_test_original = np.expm1(y_test)
y_pred_original = np.expm1(y_pred)

rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
mae = mean_absolute_error(y_test_original, y_pred_original)

print("Mejores parámetros encontrados:")
print(grid_search.best_params_)
print(f'\nRMSE: {rmse}')
print(f'MAE: {mae}')

# --- Feature importances ----------------------------------------------------------
# The final forest is trained on the one-hot-encoded columns that survived
# SelectFromModel, so its importances do NOT line up with the raw `features` list.
# Recover the matching names: encoded column names from the fitted preprocessor,
# filtered by the selector's boolean support mask.
best_model = grid_search.best_estimator_
encoded_feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
selected_mask = best_model.named_steps['feature_selection'].get_support()
feature_importance = pd.DataFrame({
    'feature': encoded_feature_names[selected_mask],
    'importance': best_model.named_steps['model'].feature_importances_
})
print("\nImportancia de características:")
print(feature_importance.sort_values('importance', ascending=False))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
