In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score, median_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import median_absolute_error

In [2]:
df_modelo = pd.read_csv('df_eda_limpio.csv')
df_modelo.head()

Unnamed: 0,make,model,version,fuel,year,kms,power,shift,price,dealer_name,...,province,vehicle_age,price_per_power,price_per_year,dealer_info,power_per_kms,make_popularity,model_popularity,big_city_dealer,price_range
0,Opel,Crossland,1.2 GAS 110 GS Line 5p S/S,Gasolina,2022,5.0,110.0,manual,22900,Sergio Y.,...,Barcelona,3,208.19,7633.34,"Sergio Y.\r\n2, Carrer de Jacint Benavente, Po...",22.0,4591,158,0,20-30k
1,Opel,Crossland,1.2 81kW (110CV) GS Line,Gasolina,2022,24847.0,110.0,manual,19990,Peugeot Alcala 534,...,Madrid,3,181.73,6663.34,"Peugeot Alcala 534\r\nAvenida de José Gárate, ...",0.004427,4591,158,0,10-20k
2,Opel,Crossland,1.5D 88kW (120CV) Business Elegance Auto,Diésel,2021,41356.0,120.0,automatic,18590,Clicars S.,...,Madrid,4,154.92,4647.5,"Clicars S.\r\nSan Cristóbal, Avenida de Andalu...",0.002902,4591,158,1,10-20k
3,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar S.,...,Barcelona,3,206.37,7566.67,"Vallescar S.\r\nParc de Bombers de Sabadell, 5...",10.0,4591,158,0,20-30k
4,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar Ocasion M.,...,Barcelona,3,206.37,7566.67,"Vallescar Ocasion M.\r\n27, Carrer de Fèlix Fe...",10.0,4591,158,0,20-30k


In [3]:
df_modelo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91040 entries, 0 to 91039
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   make              91040 non-null  object 
 1   model             91040 non-null  object 
 2   version           91040 non-null  object 
 3   fuel              91040 non-null  object 
 4   year              91040 non-null  int64  
 5   kms               91040 non-null  float64
 6   power             91040 non-null  float64
 7   shift             91040 non-null  object 
 8   price             91040 non-null  int64  
 9   dealer_name       91040 non-null  object 
 10  dealer_address    91040 non-null  object 
 11  dealer_city       91040 non-null  object 
 12  dealer_zip_code   91040 non-null  int64  
 13  province          91040 non-null  object 
 14  vehicle_age       91040 non-null  int64  
 15  price_per_power   91040 non-null  float64
 16  price_per_year    91040 non-null  float6

In [4]:
# Selección de características relevantes
features = ['year', 'kms', 'power', 'vehicle_age', 'fuel', 'shift', 'make', 'model']
target = 'price'

X = df_modelo[features]
y = df_modelo[target]

In [5]:
# Preprocesamiento de los datos
numeric_features = ['year', 'kms', 'power', 'vehicle_age']
categorical_features = ['fuel', 'shift', 'make', 'model']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [6]:
# Se divide el dataset en conjunto de entrenamiento y prueba

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Entrenamiento del modelo KNN
knn = KNeighborsRegressor()

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', knn)
])

# Búsqueda de hiperparámetros (ajuste del número de vecinos)
param_grid = {'model__n_neighbors': [3, 5, 7, 9, 11]}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Mejor modelo encontrado
best_model = grid_search.best_estimator_

In [None]:
y_pred = best_model.predict(X_test)

# Calcular RMSE y MAE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Calcular MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calcular R²
r2 = r2_score(y_test, y_pred)

# Calcular métricas adicionales
medae = median_absolute_error(y_test, y_pred)
r2_adj = 1 - (1 - r2_score(y_test, y_pred)) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)


# Mostrar los resultados
print(f'Best K: {grid_search.best_params_}')
print(f'RMSE: {rmse} €')
print(f'MAE: {mae} €')
print(f'MAPE: {mape} %')
print(f'R²: {r2} %')
print(f'R² Ajustado: {r2_adj:.4f} %')

Best K: {'model__n_neighbors': 3, 'model__p': 1}
RMSE: 1800.1565970347915 €
MAE: 440.3260837727006 €
MAPE: 0.48772600040724456%
R²: 0.9624163928256781 %
R² Ajustado: 0.9624 %


In [11]:
# Ajustar más hiperparámetros en el modelo KNN
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11, 15],
    'model__p': [1, 2]  # p=1 (Manhattan) o p=2 (Euclidiana)
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

In [15]:
# Evaluar el nuevo modelo
y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Calcular MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calcular R²
r2 = r2_score(y_test, y_pred)

# Mostrar los resultados
print(f'Best K: {grid_search.best_params_}')
print(f'RMSE: {rmse} €')
print(f'MAE: {mae} €')
print(f'MAPE: {mape}%')
print(f'R²: {r2}')

Best K: {'model__n_neighbors': 3, 'model__p': 1}
RMSE: 1800.1565970347915 €
MAE: 440.3260837727006 €
MAPE: 0.48772600040724456%
R²: 0.9624163928256781


In [16]:
# Modelo de RandomForest
rf = RandomForestRegressor(n_estimators=100, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf)
])

pipeline.fit(X_train, y_train)

# Evaluar el modelo
y_pred = pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Calcular MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calcular R²
r2 = r2_score(y_test, y_pred)

# Mostrar los resultados
print(f'Best K: {grid_search.best_params_}')
print(f'RMSE: {rmse} €')
print(f'MAE: {mae} €')
print(f'MAPE: {mape} %')
print(f'R²: {r2}')

Best K: {'model__n_neighbors': 3, 'model__p': 1}
RMSE: 1363.1901891069524
MAE: 314.07350616751205
MAPE: 0.405827554770408%
R²: 0.9784478405678062


In [17]:
# Modelo de GradientBoosting
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', gb)
])

pipeline.fit(X_train, y_train)

# Evaluar el modelo
y_pred = pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Calcular MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calcular R²
r2 = r2_score(y_test, y_pred)

# Mostrar los resultados
print(f'Best K: {grid_search.best_params_}')
print(f'RMSE: {rmse} €')
print(f'MAE: {mae} €')
print(f'MAPE: {mape}%')
print(f'R²: {r2}')

Best K: {'model__n_neighbors': 3, 'model__p': 1}
RMSE: 4259.814616663449 €
MAE: 2819.4263899456146 €
MAPE: 2.349951388403365%
R²: 0.7895449815487403
