In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [5]:
df_modelo_2 = pd.read_csv('df_eda_limpio.csv')
df_modelo_2.head()

Unnamed: 0,make,model,version,fuel,year,kms,power,shift,price,dealer_name,...,province,vehicle_age,price_per_power,price_per_year,dealer_info,power_per_kms,make_popularity,model_popularity,big_city_dealer,price_range
0,Opel,Crossland,1.2 GAS 110 GS Line 5p S/S,Gasolina,2022,5.0,110.0,manual,22900,Sergio Y.,...,Barcelona,3,208.19,7633.34,"Sergio Y.\n2, Carrer de Jacint Benavente, Pobl...",22.0,4591,158,0,20-30k
1,Opel,Crossland,1.2 81kW (110CV) GS Line,Gasolina,2022,24847.0,110.0,manual,19990,Peugeot Alcala 534,...,Madrid,3,181.73,6663.34,"Peugeot Alcala 534\nAvenida de José Gárate, Co...",0.004427,4591,158,0,10-20k
2,Opel,Crossland,1.5D 88kW (120CV) Business Elegance Auto,Diésel,2021,41356.0,120.0,automatic,18590,Clicars S.,...,Madrid,4,154.92,4647.5,"Clicars S.\nSan Cristóbal, Avenida de Andalucí...",0.002902,4591,158,1,10-20k
3,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar S.,...,Barcelona,3,206.37,7566.67,"Vallescar S.\nParc de Bombers de Sabadell, 52,...",10.0,4591,158,0,20-30k
4,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700,Vallescar Ocasion M.,...,Barcelona,3,206.37,7566.67,"Vallescar Ocasion M.\n27, Carrer de Fèlix Ferr...",10.0,4591,158,0,20-30k


In [6]:
# Seleccionamos características más específicas y limitadas
features = ['year', 'kms', 'power', 'vehicle_age', 'fuel', 'shift', 'make']  # Reducimos features
target = 'price'

X = df_modelo_2[features]
y = df_modelo_2[target]

# Separamos características
numeric_features = ['year', 'kms', 'power', 'vehicle_age']
categorical_features = ['fuel', 'shift', 'make']  # Reducimos categorical features

# Preprocesamiento más simple
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# División del dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modelo RandomForest con parámetros más específicos
rf = RandomForestRegressor(
    n_estimators=100,  # Reducimos número de árboles
    max_depth=15,      # Limitamos profundidad
    min_samples_leaf=4,
    n_jobs=-1,         # Paralelizamos el entrenamiento
    random_state=42
)

# Pipeline simplificado
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf)
])

# Entrenamiento
pipeline.fit(X_train, y_train)

# Guardamos el modelo entrenado para Streamlit
import joblib
joblib.dump(pipeline, 'car_prediction_model.joblib')

['car_prediction_model.joblib']

In [7]:
# Evaluación del modelo
y_pred = pipeline.predict(X_test)

# Calculamos las métricas principales
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Calculamos el R² (coeficiente de determinación)
r2 = pipeline.score(X_test, y_test)

# Calculamos el error porcentual medio absoluto (MAPE)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("Resultados de la evaluación del modelo:")
print(f"RMSE: {rmse:,.2f} €")
print(f"MAE: {mae:,.2f} €")
print(f"R²: {r2:.4f}")
print(f"MAPE: {mape:.2f}%")

# Análisis de errores por rango de precios
df_evaluacion = pd.DataFrame({
    'Precio_Real': y_test,
    'Precio_Predicho': y_pred,
    'Error_Absoluto': np.abs(y_test - y_pred)
})

# Creamos rangos de precios para analizar el rendimiento
df_evaluacion['Rango_Precio'] = pd.qcut(df_evaluacion['Precio_Real'], q=5)
analisis_por_rango = df_evaluacion.groupby('Rango_Precio').agg({
    'Error_Absoluto': ['mean', 'std'],
    'Precio_Real': 'count'
}).round(2)

print("\nAnálisis de errores por rango de precios:")
print(analisis_por_rango)


Resultados de la evaluación del modelo:
RMSE: 2,437.40 €
MAE: 1,197.65 €
R²: 0.9311
MAPE: 130.14%

Análisis de errores por rango de precios:
                   Error_Absoluto          Precio_Real
                             mean      std       count
Rango_Precio                                          
(0.999, 10990.0]          1708.35  3267.97        3651
(10990.0, 14500.0]         757.44  1084.61        3666
(14500.0, 18500.0]        1014.45  1337.76        3632
(18500.0, 24990.0]        1206.28  1630.33        3624
(24990.0, 45950.0]        1303.13  2393.00        3635


  analisis_por_rango = df_evaluacion.groupby('Rango_Precio').agg({


In [None]:
# Función para recomendar coches similares
def recommend_cars(input_features, df_modelo, pipeline, n_recommendations=5):
    """
    Recomienda coches similares basados en las características de entrada
    y devuelve también la información del vendedor.
    """
    # Predecimos el precio para las características de entrada
    predicted_price = pipeline.predict(pd.DataFrame([input_features]))[0]
    
    # Creamos una máscara para filtrar coches similares
    price_range = 0.2  # 20% de diferencia en precio
    min_price = predicted_price * (1 - price_range)
    max_price = predicted_price * (1 + price_range)
    
    # Filtramos coches similares
    mask = (
        (df_modelo['price'] >= min_price) &
        (df_modelo['price'] <= max_price) &
        (df_modelo['fuel'] == input_features['fuel']) &
        (df_modelo['shift'] == input_features['shift'])
    )
    
    similar_cars = df_modelo[mask].copy()
    
    # Calculamos una puntuación de similitud simple
    similar_cars['similarity_score'] = (
        (1 - abs(similar_cars['year'] - input_features['year']) / 10) +
        (1 - abs(similar_cars['kms'] - input_features['kms']) / similar_cars['kms'].max()) +
        (1 - abs(similar_cars['power'] - input_features['power']) / similar_cars['power'].max())
    )
    
    # Ordenamos por puntuación de similitud y seleccionamos los top N
    recommendations = similar_cars.nlargest(n_recommendations, 'similarity_score')
    
    # Seleccionamos las columnas relevantes para la recomendación
    return recommendations[['make', 'model', 'year', 'kms', 'power', 'price', 
                          'dealer_info', 'similarity_score']]

# Ejemplo de uso para Streamlit
example_features = {
    'year': 2018,
    'kms': 50000,
    'power': 150,
    'vehicle_age': 5,
    'fuel': 'Diesel',
    'shift': 'Manual',
    'make': 'BMW'
}

recommendations = recommend_cars(example_features, df_modelo_2, pipeline)
print("Recomendaciones de coches similares:")
print(recommendations)

In [9]:
# Creamos características derivadas que pueden ayudar con la predicción
df_modelo_2['age_power_ratio'] = df_modelo_2['vehicle_age'] / df_modelo_2['power']
df_modelo_2['kms_age_power'] = df_modelo_2['kms'] * df_modelo_2['age_power_ratio']

# Seleccionamos las características que vamos a usar
features = ['year', 'kms', 'power', 'vehicle_age', 'fuel', 'shift', 'make',
           'age_power_ratio', 'kms_age_power']
target = 'price'

# Preparamos X e y, aplicando transformación logarítmica al precio
X = df_modelo_2[features]
y = np.log1p(df_modelo_2[target])  # Transformación logarítmica para manejar mejor los precios

# Separamos características numéricas y categóricas
numeric_features = ['year', 'kms', 'power', 'vehicle_age', 'age_power_ratio', 'kms_age_power']
categorical_features = ['fuel', 'shift', 'make']

In [10]:
# Creamos los transformadores para las características
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Creamos el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# División del dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creamos el modelo RandomForest con parámetros optimizados
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=4,
    n_jobs=-1,
    random_state=42
)

# Creamos y entrenamos el pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf)
])

# Entrenamos el modelo
pipeline.fit(X_train, y_train)

In [13]:
# Seleccionar solo las columnas numéricas
numeric_X_test = X_test.select_dtypes(include=[np.number])

# Comprobar valores infinitos y grandes
print("¿Hay valores infinitos en las columnas numéricas?")
print(np.isinf(numeric_X_test.values).any())

print("\nValores máximos por columna:")
print(numeric_X_test.max())

print("\nValores mínimos por columna:")
print(numeric_X_test.min())

¿Hay valores infinitos en las columnas numéricas?
True

Valores máximos por columna:
year                 2023.0
kms                970000.0
power              360571.0
vehicle_age            49.0
age_power_ratio         inf
kms_age_power           inf
dtype: float64

Valores mínimos por columna:
year               1976.000000
kms                   0.000000
power                 0.000000
vehicle_age           2.000000
age_power_ratio       0.000069
kms_age_power         0.000000
dtype: float64


In [14]:
X_test['age_power_ratio'].replace(np.inf, X_test['age_power_ratio'][X_test['age_power_ratio'] != np.inf].max(), inplace=True)
X_test['kms_age_power'].replace(np.inf, X_test['kms_age_power'][X_test['kms_age_power'] != np.inf].max(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['age_power_ratio'].replace(np.inf, X_test['age_power_ratio'][X_test['age_power_ratio'] != np.inf].max(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['kms_age_power'].replace(np.inf, X_test['kms_age_power'][X_test['kms_age_power'] != np.inf].max(), 

In [15]:
# Hacemos predicciones
y_pred_log = pipeline.predict(X_test)

# Convertimos las predicciones de vuelta a la escala original
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test)

# Calculamos las métricas
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred))
mae = mean_absolute_error(y_test_original, y_pred)
r2 = pipeline.score(X_test, y_test)
mape = np.mean(np.abs((y_test_original - y_pred) / y_test_original)) * 100

print("Resultados de la evaluación del modelo:")
print(f"RMSE: {rmse:,.2f} €")
print(f"MAE: {mae:,.2f} €")
print(f"R²: {r2:.4f}")
print(f"MAPE: {mape:.2f}%")

# Análisis por rangos de precios
df_evaluacion = pd.DataFrame({
    'Precio_Real': y_test_original,
    'Precio_Predicho': y_pred,
    'Error_Absoluto': np.abs(y_test_original - y_pred)
})

# Análisis por rangos de precios
df_evaluacion['Rango_Precio'] = pd.qcut(df_evaluacion['Precio_Real'], q=5)
analisis_por_rango = df_evaluacion.groupby('Rango_Precio').agg({
    'Error_Absoluto': ['mean', 'std'],
    'Precio_Real': 'count'
}).round(2)

print("\nAnálisis de errores por rango de precios:")
print(analisis_por_rango)

Resultados de la evaluación del modelo:
RMSE: 3,071.91 €
MAE: 1,726.16 €
R²: 0.7699
MAPE: 49.20%

Análisis de errores por rango de precios:
                   Error_Absoluto          Precio_Real
                             mean      std       count
Rango_Precio                                          
(0.999, 10990.0]          1254.59  2109.66        3651
(10990.0, 14500.0]        1012.66  1268.67        3666
(14500.0, 18500.0]        1538.22  1926.15        3632
(18500.0, 24990.0]        2000.79  2385.24        3624
(24990.0, 45950.0]        2833.36  3847.90        3635


  analisis_por_rango = df_evaluacion.groupby('Rango_Precio').agg({


In [16]:
# Guardamos el modelo para uso futuro
import joblib
joblib.dump(pipeline, 'car_prediction_modelo.joblib')

['car_prediction_modelo.joblib']

In [17]:
# Función de recomendación
def recommend_cars(input_features, df_modelo, pipeline, n_recommendations=5):
    """
    Recomienda coches similares basados en las características de entrada.
    
    Parámetros:
    input_features: dict con las características del coche deseado
    df_modelo: DataFrame con todos los coches
    pipeline: modelo entrenado
    n_recommendations: número de recomendaciones a devolver
    """
    # Calculamos las características derivadas
    input_features['age_power_ratio'] = input_features['vehicle_age'] / input_features['power']
    input_features['kms_age_power'] = input_features['kms'] * input_features['age_power_ratio']
    
    # Predecimos el precio (en escala logarítmica)
    predicted_price_log = pipeline.predict(pd.DataFrame([input_features]))[0]
    predicted_price = np.expm1(predicted_price_log)
    
    # Definimos el rango de precios similar (±20%)
    price_range = 0.2
    min_price = predicted_price * (1 - price_range)
    max_price = predicted_price * (1 + price_range)
    
    # Filtramos coches similares
    mask = (
        (df_modelo['price'] >= min_price) &
        (df_modelo['price'] <= max_price) &
        (df_modelo['fuel'] == input_features['fuel']) &
        (df_modelo['shift'] == input_features['shift'])
    )
    
    similar_cars = df_modelo[mask].copy()
    
    # Calculamos puntuación de similitud
    similar_cars['similarity_score'] = (
        (1 - abs(similar_cars['year'] - input_features['year']) / 10) +
        (1 - abs(similar_cars['kms'] - input_features['kms']) / similar_cars['kms'].max()) +
        (1 - abs(similar_cars['power'] - input_features['power']) / similar_cars['power'].max())
    )
    
    # Seleccionamos las mejores recomendaciones
    recommendations = similar_cars.nlargest(n_recommendations, 'similarity_score')
    
    return recommendations[['make', 'model', 'year', 'kms', 'power', 'price', 
                          'dealer_info', 'similarity_score']]