In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Load the dataset from a CSV file (path is relative to the working directory)
# and preview its first rows.
df_modelo = pd.read_csv('df_EDA_predicprecio.csv')
df_modelo.head()

Unnamed: 0,make,model,version,fuel,year,kms,power,shift,price
0,Opel,Crossland,1.2 GAS 110 GS Line 5p S/S,Gasolina,2022,5.0,110.0,manual,22900
1,Opel,Crossland,1.2 81kW (110CV) GS Line,Gasolina,2022,24847.0,110.0,manual,19990
2,Opel,Crossland,1.5D 88kW (120CV) Business Elegance Auto,Diésel,2021,41356.0,120.0,automatic,18590
3,Opel,Crossland,GS-Line 1.2 GAS MT6 S/S 110cv,Gasolina,2022,11.0,110.0,manual,22700
4,Opel,Crossland,1.2 GS LINE 110 CV 5P,Gasolina,2021,51390.0,110.0,manual,18200


In [4]:
# Dataset dimensions as (rows, columns).
df_modelo.shape

(12453, 9)

In [7]:
# Per-column dtype and non-null count, plus overall memory usage.
df_modelo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12453 entries, 0 to 12452
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   make     12453 non-null  object 
 1   model    12453 non-null  object 
 2   version  12453 non-null  object 
 3   fuel     12453 non-null  object 
 4   year     12453 non-null  int64  
 5   kms      12453 non-null  float64
 6   power    12453 non-null  float64
 7   shift    12453 non-null  object 
 8   price    12453 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 875.7+ KB


In [10]:
def preprocesar_datos(df):
    """Separate features from the target and build an unfitted preprocessor.

    Args:
        df: DataFrame holding a ``price`` column together with the feature
            columns ``year``, ``kms``, ``power`` (numeric) and ``make``,
            ``model``, ``version``, ``fuel``, ``shift`` (categorical).

    Returns:
        A tuple ``(X, y, preprocessor)``: ``X`` is ``df`` without the
        ``price`` column, ``y`` is the ``price`` column, and ``preprocessor``
        is a ``ColumnTransformer`` that has not been fitted yet.
    """
    # Features are everything except the target column.
    X = df.drop(columns='price')
    y = df['price']

    # Numeric columns go through a StandardScaler.
    paso_numerico = ('num', StandardScaler(), ['year', 'kms', 'power'])
    # Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
    # transform from failing on categories not seen during fit.
    paso_categorico = (
        'cat',
        OneHotEncoder(handle_unknown='ignore'),
        ['make', 'model', 'version', 'fuel', 'shift'],
    )

    preprocessor = ColumnTransformer(transformers=[paso_numerico, paso_categorico])
    return X, y, preprocessor

In [11]:
# Step 3: split the data into training and test sets
def dividir_datos(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Args:
        X: Feature data.
        y: Target values aligned with ``X``.
        test_size: Passed through to ``train_test_split``; the default of
            0.2 matches the previously hard-coded value.
        random_state: Seed passed through to ``train_test_split`` so the
            split is reproducible; defaults to the previous value, 42.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


In [12]:
# Step 4: build the model pipeline (fitting is left to the caller)
def crear_modelo(preprocessor, n_estimators=100, random_state=42):
    """Build an unfitted pipeline: preprocessing followed by a random forest.

    Args:
        preprocessor: Transformer used as the first pipeline step.
        n_estimators: Passed through to ``RandomForestRegressor``; the
            default of 100 matches the previously hard-coded value.
        random_state: Seed for the forest; defaults to the previous value, 42.

    Returns:
        A ``Pipeline`` with steps named ``'preprocessor'`` and ``'regressor'``.
    """
    regresor = RandomForestRegressor(
        n_estimators=n_estimators, random_state=random_state
    )
    modelo = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regresor)
    ])
    return modelo

In [13]:
# Step 5: evaluate the model
def evaluar_modelo(modelo, X_test, y_test):
    """Predict on the test set, print regression metrics and return the predictions.

    Args:
        modelo: Object exposing ``predict(X)``.
        X_test: Features to predict on.
        y_test: True target values compared against the predictions.

    Returns:
        The value returned by ``modelo.predict(X_test)``.
    """
    y_pred = modelo.predict(X_test)

    # Every metric takes (y_true, y_pred), so compute them in one pass.
    mse, mae, r2 = (
        metrica(y_test, y_pred)
        for metrica in (mean_squared_error, mean_absolute_error, r2_score)
    )
    rmse = np.sqrt(mse)

    print("Métricas de evaluación:")
    print(f"Error Cuadrático Medio (MSE): {mse:.2f}")
    print(f"Raíz del Error Cuadrático Medio (RMSE): {rmse:.2f}")
    print(f"Error Absoluto Medio (MAE): {mae:.2f}")
    print(f"Coeficiente de Determinación (R²): {r2:.2f}")

    return y_pred

In [None]:
# Entry point that runs the whole process end to end
def main(df_limpio):
    """Preprocess, split, train and evaluate in one call.

    Args:
        df_limpio: DataFrame handed to ``preprocesar_datos``.

    Returns:
        A tuple ``(modelo, predicciones)``: the fitted model and the value
        returned by ``evaluar_modelo`` for the test partition.
    """
    # Step 2: separate features/target and get the preprocessor
    features, objetivo, transformador = preprocesar_datos(df_limpio)

    # Step 3: train/test partition
    X_entreno, X_prueba, y_entreno, y_prueba = dividir_datos(features, objetivo)

    # Step 4: build the model and fit it on the training partition
    modelo = crear_modelo(transformador)
    modelo.fit(X_entreno, y_entreno)

    # Step 5: evaluate on the held-out partition
    return modelo, evaluar_modelo(modelo, X_prueba, y_prueba)