# Modelos

In [1]:
# Utilice este espacio para escribir el código.
import pandas as pd

# Cargar el archivo CSV 
data = pd.read_csv("datalimpio_modelo_final.csv")

# Ver las primeras filas
print(data.head())

   start_date  anio  mes  mes_cont zona  rooms  bedrooms  bathrooms  \
0  2020-12-08  2020   12        12  1_1    3.0       2.0        2.0   
1  2020-12-08  2020   12        12  1_0    2.0       1.0        1.0   
2  2020-12-08  2020   12        12  1_0    2.0       1.0        1.0   
3  2020-12-08  2020   12        12  2_1    3.0       2.0        1.0   
4  2020-12-08  2020   12        12  1_0    4.0       2.0        2.0   

   surface_total     price  price_per_m2  
0           65.0  160000.0   2461.538462  
1           68.0  130000.0   1911.764706  
2           33.0  130000.0   3939.393939  
3           50.0   99000.0   1980.000000  
4           99.0  220000.0   2222.222222  


In [2]:
data.head()

Unnamed: 0,start_date,anio,mes,mes_cont,zona,rooms,bedrooms,bathrooms,surface_total,price,price_per_m2
0,2020-12-08,2020,12,12,1_1,3.0,2.0,2.0,65.0,160000.0,2461.538462
1,2020-12-08,2020,12,12,1_0,2.0,1.0,1.0,68.0,130000.0,1911.764706
2,2020-12-08,2020,12,12,1_0,2.0,1.0,1.0,33.0,130000.0,3939.393939
3,2020-12-08,2020,12,12,2_1,3.0,2.0,1.0,50.0,99000.0,1980.0
4,2020-12-08,2020,12,12,1_0,4.0,2.0,2.0,99.0,220000.0,2222.222222


Random Forest

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Verificar si hay valores nulos
print(data.isnull().sum())

# Codificar zona como variables dummy
data_model = pd.get_dummies(data, columns=['zona'], drop_first=True)

# Definir variables predictoras y objetivo
features_base = ['rooms', 'bedrooms', 'bathrooms', 'surface_total', 'mes_cont']
features_zonas = [c for c in data_model.columns if c.startswith('zona_')]
X = data_model[features_base + features_zonas]
y = data_model['price_per_m2']

# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Entrenar modelo Random Forest
modelo = RandomForestRegressor(
    n_estimators=500,
    max_depth=15,
    min_samples_leaf=5,
    random_state=42
)
modelo.fit(X_train, y_train)

# Predicciones
y_pred = modelo.predict(X_test)

# Evaluación
mae = mean_absolute_error(y_test, y_pred)
print(f"Error absoluto medio: {mae:.2f}")


start_date       0
anio             0
mes              0
mes_cont         0
zona             0
rooms            0
bedrooms         0
bathrooms        0
surface_total    0
price            0
price_per_m2     0
dtype: int64
Error absoluto medio: 728.57


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Cargar CSV
data = pd.read_csv("datalimpio_modelo_final.csv")

# Filtrar outliers (1% inferior y superior)
q_low = data['price_per_m2'].quantile(0.01)
q_high = data['price_per_m2'].quantile(0.99)
data_filtered = data[(data['price_per_m2'] >= q_low) & (data['price_per_m2'] <= q_high)].copy()

# Log-transform de la variable objetivo
data_filtered['price_per_m2_log'] = np.log1p(data_filtered['price_per_m2'])

# Codificar zona como dummies
data_model = pd.get_dummies(data_filtered, columns=['zona'], drop_first=True)

# Variables predictoras
features_base = ['rooms', 'bedrooms', 'bathrooms', 'surface_total', 'mes_cont']
features_zonas = [c for c in data_model.columns if c.startswith('zona_')]
X = data_model[features_base + features_zonas]
y = data_model['price_per_m2_log']

# Dividir entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Definir grid de hiperparámetros
param_grid = {
    'n_estimators': [200, 500],
    'max_depth': [10, 15, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3]
}

# Crear Random Forest y GridSearchCV
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Mejor modelo
best_model = grid_search.best_estimator_
print("Mejores parámetros:", grid_search.best_params_)

# Predicciones (log-transform)
y_pred_log = best_model.predict(X_test)

# Transformar de vuelta
y_pred = np.expm1(y_pred_log)
y_test_real = np.expm1(y_test)

# Evaluación
mae = mean_absolute_error(y_test_real, y_pred)
print(f"Error absoluto medio calibrado: {mae:.2f} USD/m²")


Mejores parámetros: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
Error absoluto medio calibrado: 484.71 USD/m²
