In [1]:
# üì¶ Importamos las librer√≠as
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# üìÇ Cargamos los datos
df_modelo_3 = pd.read_csv('df_eda_limpio.csv')  

In [5]:
# ‚ú® Creamos caracter√≠sticas derivadas
df_modelo_3['power_year_ratio'] = df_modelo_3['power'] / df_modelo_3['year']
df_modelo_3['kms_year_ratio'] = df_modelo_3['kms'] / df_modelo_3['vehicle_age']
df_modelo_3['kms_power_ratio'] = df_modelo_3['kms'] / df_modelo_3['power']

# üìã Selecci√≥n de caracter√≠sticas
features = [
    'year', 'kms', 'power', 'vehicle_age', 'fuel', 'shift', 'make', 'model',
    'power_year_ratio', 'kms_year_ratio', 'kms_power_ratio'
]
target = 'price'

X = df_modelo_3[features]
y = df_modelo_3[target]

In [6]:
X.head(), y.head()

(   year      kms  power  vehicle_age      fuel      shift  make      model  \
 0  2022      5.0  110.0            3  Gasolina     manual  Opel  Crossland   
 1  2022  24847.0  110.0            3  Gasolina     manual  Opel  Crossland   
 2  2021  41356.0  120.0            4    Di√©sel  automatic  Opel  Crossland   
 3  2022     11.0  110.0            3  Gasolina     manual  Opel  Crossland   
 4  2022     11.0  110.0            3  Gasolina     manual  Opel  Crossland   
 
    power_year_ratio  kms_year_ratio  kms_power_ratio  
 0          0.054402        1.666667         0.045455  
 1          0.054402     8282.333333       225.881818  
 2          0.059377    10339.000000       344.633333  
 3          0.054402        3.666667         0.100000  
 4          0.054402        3.666667         0.100000  ,
 0    22900
 1    19990
 2    18590
 3    22700
 4    22700
 Name: price, dtype: int64)

In [7]:
# ‚úÇÔ∏è Divisi√≥n en train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# üî• Transformamos la variable objetivo a logar√≠tmica
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [9]:
# üî¢ Variables num√©ricas y categ√≥ricas
numeric_features = ['year', 'kms', 'power', 'vehicle_age', 
                    'power_year_ratio', 'kms_year_ratio', 'kms_power_ratio']
categorical_features = ['fuel', 'shift', 'make', 'model']

# ‚öôÔ∏è Transformadores
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', max_categories=20)

# üèóÔ∏è Preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

In [14]:
# üå≥ Random Forest
rf_selector = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
feature_selector = SelectFromModel(rf_selector, prefit=False)

# üèóÔ∏è Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1))
])

In [15]:
from sklearn.model_selection import RandomizedSearchCV


In [16]:
# üß© Grid de par√°metros (m√°s ligero para pruebas r√°pidas)
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'feature_selection__threshold': ['mean']
}

# üîç Randomized Search
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=30,      # üî• N√∫mero de combinaciones a probar
    cv=3,           # ‚ö° Menos pliegues para m√°s velocidad
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

In [17]:
# üöÄ Entrenamos el modelo
random_search.fit(X_train, y_train)



Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [18]:
# üíæ Guardamos el modelo
import joblib
joblib.dump(random_search.best_estimator_, 'best_model.pkl')

['best_model.pkl']

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [24]:
print(X_test.isnull().sum())
numeric_X_test = X_test.select_dtypes(include=[np.number])
print(np.isinf(numeric_X_test).sum())

year                0
kms                 0
power               0
vehicle_age         0
fuel                0
shift               0
make                0
model               0
power_year_ratio    0
kms_year_ratio      0
kms_power_ratio     0
dtype: int64
year                0
kms                 0
power               0
vehicle_age         0
power_year_ratio    0
kms_year_ratio      0
kms_power_ratio     1
dtype: int64


In [25]:
X_test['kms_power_ratio'] = np.where(np.isinf(X_test['kms_power_ratio']), 0, X_test['kms_power_ratio'])

In [26]:
# üî• Predicciones
y_pred = random_search.predict(X_test)

# üí∏ Volvemos a la escala original
y_test_original = np.expm1(y_test)
y_pred_original = np.expm1(y_pred)

# üìè M√©tricas
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
mae = mean_absolute_error(y_test_original, y_pred_original)
r2 = r2_score(y_test_original, y_pred_original)

print("\nResultados de la evaluaci√≥n del modelo:")
print(f"RMSE: {rmse:,.2f} ‚Ç¨")
print(f"MAE: {mae:,.2f} ‚Ç¨")
print(f"R¬≤: {r2:.4f}")

# üèÖ Mejores hiperpar√°metros
print("\nMejores par√°metros encontrados:")
print(random_search.best_params_)


Resultados de la evaluaci√≥n del modelo:
RMSE: 2,087.72 ‚Ç¨
MAE: 706.28 ‚Ç¨
R¬≤: 0.9494

Mejores par√°metros encontrados:
{'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_depth': 20, 'feature_selection__threshold': 'mean'}
