## Functions and libraries

In [65]:
from fastai.tabular.all import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import (
    AdaBoostRegressor, GradientBoostingRegressor,
    BaggingRegressor, RandomForestRegressor
)
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import re

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

In [66]:
# Función para limpiar nombres de columnas
def clean_column_names(df):
    df = df.copy()
    df.columns = [
        re.sub(r'[^A-Za-z0-9_]+', '_', col)  # deja solo letras, números y "_"
        for col in df.columns
    ]
    return df


In [67]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_log_error, mean_squared_error
import numpy as np

# Funciones auxiliares
def mape_percent(y_true, y_pred):
    # MAPE en porcentaje
    return 100 * mean_absolute_percentage_error(y_true, y_pred)

def rmsle(y_true, y_pred):
    # MSLE requiere no-negativos
    y_true_safe = np.maximum(y_true, 0)
    y_pred_safe = np.maximum(y_pred, 0)
    return np.sqrt(mean_squared_log_error(y_true_safe, y_pred_safe))

def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Evaluación de modelos
def fit_transform_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    models = [
        LinearRegression(n_jobs=-1),
        KNeighborsRegressor(n_neighbors=5, n_jobs=-1),
        AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
        DecisionTreeRegressor(max_depth=10, random_state=42),
        GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        BaggingRegressor(n_estimators=50, n_jobs=-1, random_state=42),
        RandomForestRegressor(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42),
        LGBMRegressor(n_estimators=200, learning_rate=0.1, max_depth=-1, n_jobs=-1, random_state=42, verbose=-1),
        xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=6,
                         subsample=0.8, colsample_bytree=0.8, n_jobs=-1,
                         tree_method="hist", random_state=42, verbosity=0),
        CatBoostRegressor(iterations=200, depth=6, learning_rate=0.1, verbose=False, random_state=42)
    ]

    for model in models:
        model_name = model.__class__.__name__
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        mape_val  = mape_percent(y_test, predictions)
        rmsle_val = rmsle(y_test, predictions)
        rmse_val  = root_mean_squared_error(y_test, predictions)

        print(f"{model_name} MAPE: {mape_val:.2f}% | RMSLE: {rmsle_val:.4f} | RMSE: {rmse_val:.4f}")

## Model test with application_train.csv

### Import dataset

In [68]:
train_df = pd.read_parquet("train_2_Prework.parquet", engine='fastparquet')

### Quick prework with fastai

In [69]:
# La librería fastai tiene varias funcionalidad para preprocesar datasets. 
# En particular para poder usar un Random Forests de ScikitLearn necesitamos que todas las variables sean numéricas y no tener valores faltantas.
# En particular vamos a usar dos preprocesamientos: Categorify y FillMissing. Por ahora solo definimos los preprocesamientos y luego se aplican.
procs = [Categorify, FillMissing]

In [70]:
# Además del preprocesamiento, vamos a dividir nuestro dataset en train y validation. 
# En este caso no sería una buena elección hacer una partición aleatoria porque estamos trabajando con datos temporales. 
# Vamos a poner una fecha de corte para dividir el dataset.
# - dep_var='TARGET': excluye esta columna del análisis porque es la variable que se quiere predecir.
# - max_card=10: si una columna tiene menos de 10 valores únicos, se considera categórica.
# - preproc_names=procs: aplica transformaciones como Categorify, FillMissing, que están definidas en procs

cont,cat = cont_cat_split(train_df, 1, dep_var='TARGET')

In [71]:
# Índices del DataFrame completo
idxs = list(range(len(train_df)))

# Partición aleatoria 80/20
train_idx, valid_idx = train_test_split(idxs, test_size=0.3, random_state=42)

# Adaptación para TabularPandas
splits = (list(train_idx), list(valid_idx))


In [72]:
from fastai.tabular.all import *
import numpy as np
import pandas as pd


# ===== 1) Preparar df y target =====
df = train_df.copy()

# target precio/m2 con guardas
df["price_m2"] = df["price"] / df["surface_total"].replace(0, np.nan)
df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
df["lon"] = pd.to_numeric(df["lon"], errors="coerce")
df = df.dropna(subset=["price_m2"]).copy()   # sin target NaN

# ===== 2) Definir columnas =====
cat = df.select_dtypes(include=["object","category"]).columns.tolist()
cont = [c for c in df.select_dtypes(include=["number"]).columns if c != "price_m2"]

from fastai.tabular.all import *
import numpy as np
import pandas as pd


df = df.dropna(subset=["price_m2"]).copy()
all_nan_cols   = df.columns[df.isna().all()].tolist()                         # 100% NaN (ej. l6)
constant_cols  = [c for c in df.columns if df[c].nunique(dropna=True) <= 1]   # sin variabilidad
hi_null_cols   = df.columns[df.isna().mean() > 0.95].tolist()                 # >95% NaN
to_drop = sorted(set(all_nan_cols) | set(constant_cols) | set(hi_null_cols))

print("Dropping columns:", to_drop)  # debería incluir 'l6'
df = df.drop(columns=to_drop)

num_cols = df.select_dtypes(include=["number"]).columns.tolist()
cont = [c for c in num_cols if c != "price_m2"]
cat  = [c for c in df.columns if c not in cont + ["price_m2"]]

procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

Dropping columns: ['ad_type', 'currency', 'l5', 'l6', 'operation_type', 'price_period']


In [73]:
to = TabularPandas(df, procs, cat, cont, y_names='price', splits=splits)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [74]:
# === BLOQUE SEGURO: usar salidas de TabularPandas para evitar NaN ===
# Requiere que exista el objeto `to = TabularPandas(...)` con procs [Categorify, FillMissing, Normalize]
try:
    X_train, y_train = to.train.xs.copy(), to.train.y.values.ravel()
    X_valid, y_valid = to.valid.xs.copy(), to.valid.y.values.ravel()
except NameError as e:
    raise RuntimeError("No existe el objeto 'to'. Asegúrate de haber creado TabularPandas antes.") from e

all_nan_cols   = df.columns[df.isna().all()].tolist()                         # 100% NaN (ej. l6)
constant_cols  = [c for c in df.columns if df[c].nunique(dropna=True) <= 1]   # sin variabilidad
hi_null_cols   = df.columns[df.isna().mean() > 0.95].tolist()                 # >95% NaN
to_drop = sorted(set(all_nan_cols) | set(constant_cols) | set(hi_null_cols))

print("Dropping columns:", to_drop)  # debería incluir 'l6'

# Chequeos duros de NaN (deben dar 0)
print("NaN en X_train:", int(X_train.isna().sum().sum()))
print("NaN en X_valid:", int(X_valid.isna().sum().sum()))


Dropping columns: []
NaN en X_train: 0
NaN en X_valid: 0


In [75]:
len(to.train),len(to.valid)

(224100, 56025)

In [76]:
# Una vez hecho el preprocesamiento, se puede ver que los valores del dataframe son todos numéricos.
to.items.head(3)

Unnamed: 0,id,start_date,end_date,created_on,lat,lon,l1,l2,l3,l4,rooms,bedrooms,bathrooms,surface_total,surface_covered,title,description,property_type,price,covered_ratio,year_created,month_created,age_of_ad,price_m2,lat_na,lon_na,rooms_na,bedrooms_na,bathrooms_na,surface_covered_na,covered_ratio_na
976371,1.944583,27,41,27,0.308681,-0.063553,1,7,660,0,-0.027854,-0.159933,-0.58907,-0.108352,-0.063522,77799,207108,8,-0.299272,0.002644,-0.90205,0.326434,1.269102,2000.0,1,1,2,2,1,1,1
143386,-1.036305,56,72,56,0.664449,-1.197887,1,4,390,0,-0.027854,-0.159933,0.367002,-0.095983,-0.030901,96032,156078,8,-0.299272,0.002644,-0.90205,0.637303,1.000483,1072.727295,1,1,2,2,1,1,1
415316,-0.060889,138,203,138,0.29327,-0.054636,1,7,443,0,-0.027854,-1.160798,-0.58907,-0.105927,-0.060324,85026,23423,4,-0.127918,-0.004603,-0.90205,1.259042,0.237951,2971.014404,1,1,2,1,1,1,1


In [77]:
to.items.describe(include='all')  # Para ver estadísticas generales

Unnamed: 0,id,start_date,end_date,created_on,lat,lon,l1,l2,l3,l4,rooms,bedrooms,bathrooms,surface_total,surface_covered,title,description,property_type,price,covered_ratio,year_created,month_created,age_of_ad,price_m2,lat_na,lon_na,rooms_na,bedrooms_na,bathrooms_na,surface_covered_na,covered_ratio_na
count,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0,280125.0
mean,-3.1e-05,158.530181,272.967104,158.530181,-0.000655,-0.000195,1.069476,9.555084,397.602399,140.257721,-0.001546,0.002985,-0.000884,0.000821,0.001345,82714.240935,109406.606968,4.199404,-6.8e-05,-0.000439,0.000294,0.000679,-0.00091,1906.913,1.12724,1.12724,1.308237,1.45575,1.185296,1.161499,1.161499
std,1.000305,101.263015,145.345692,101.263015,0.999965,0.995251,0.447577,11.394756,219.302012,238.187339,1.000338,1.97953,0.999948,1.01213,1.107909,47253.169024,61769.154997,2.47268,0.998055,0.941927,1.000032,0.999563,1.000511,4851.974,0.333242,0.333242,0.461766,0.498039,0.388538,0.367992,0.367992
min,-1.530717,1.0,0.0,1.0,-22.314139,-6.851133,1.0,1.0,0.0,0.0,-1.43881,-4.163394,-0.58907,-0.17117,-0.209997,1.0,0.0,1.0,-0.521836,-0.097369,-0.90205,-1.849652,-1.85901,-57500.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,-0.893183,68.0,151.0,68.0,0.190198,-0.06961,1.0,2.0,214.0,0.0,-0.733332,-0.159933,-0.58907,-0.109565,-0.069278,47818.0,56414.0,3.0,-0.35836,-0.014804,-0.90205,-0.606175,-0.853855,875.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,-0.050208,145.0,287.0,145.0,0.273895,-0.059901,1.0,7.0,443.0,0.0,-0.027854,-0.159933,-0.58907,-0.097195,-0.052648,74391.0,114187.0,4.0,-0.230927,-0.005929,-0.90205,0.015564,0.159965,1739.475,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,0.747618,241.0,424.0,241.0,0.295299,-0.02434,1.0,7.0,572.0,233.0,-0.027854,-0.159933,0.367002,-0.052083,-0.018108,123086.0,161551.0,5.0,0.029649,0.002644,1.108586,0.948173,0.896501,2549.02,1.0,1.0,2.0,2.0,1.0,1.0,1.0
max,2.002324,346.0,448.0,346.0,10.231139,27.502083,4.0,42.0,752.0,784.0,26.074825,898.616975,17.576286,48.385573,247.435092,172602.0,213956.0,10.0,90.069385,339.949033,1.108586,1.569911,1.511726,1817667.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [78]:
to.train.xs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 224100 entries, 976371 to 450282
Data columns (total 30 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   start_date          224100 non-null  int16  
 1   end_date            224100 non-null  int16  
 2   created_on          224100 non-null  int16  
 3   l1                  224100 non-null  int8   
 4   l2                  224100 non-null  int8   
 5   l3                  224100 non-null  int16  
 6   l4                  224100 non-null  int16  
 7   title               224100 non-null  int32  
 8   description         224100 non-null  int32  
 9   property_type       224100 non-null  int8   
 10  lat_na              224100 non-null  int8   
 11  lon_na              224100 non-null  int8   
 12  rooms_na            224100 non-null  int8   
 13  bedrooms_na         224100 non-null  int8   
 14  bathrooms_na        224100 non-null  int8   
 15  surface_covered_na  224100 non-nul

In [79]:
# Dado que el preprocesamiento puede tardar, guardamos el objeto en memoria para futuro uso
save_pickle('./df_train-tabular-object.pkl',to)

### Fit

In [80]:
# Cargar el objeto TabularPandas desde el pickle
to = load_pickle('./df_train-tabular-object.pkl')

# Extraer features y target
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

# Muestreo
sample_frac = 0.20   # 100% de las filas
sampled_xs = xs.sample(frac=sample_frac, random_state=42)
sampled_y = y.loc[sampled_xs.index] 

# Copias para trabajar con scikit-learn
X = sampled_xs.copy()
target = sampled_y.copy()

# Ejecutar evaluación
fit_transform_model(X, target)

LinearRegression MAPE: 0.00% | RMSLE: 0.0000 | RMSE: 0.0000
KNeighborsRegressor MAPE: 2222.29% | RMSLE: 0.3113 | RMSE: 0.8557
AdaBoostRegressor MAPE: 867.22% | RMSLE: 0.1041 | RMSE: 0.2350
DecisionTreeRegressor MAPE: 1.27% | RMSLE: 0.0017 | RMSE: 0.0426
GradientBoostingRegressor MAPE: 109.16% | RMSLE: 0.0039 | RMSE: 0.0289
BaggingRegressor MAPE: 0.19% | RMSLE: 0.0017 | RMSE: 0.0452
RandomForestRegressor MAPE: 0.66% | RMSLE: 0.0018 | RMSE: 0.0468
LGBMRegressor MAPE: 4.19% | RMSLE: 0.0217 | RMSE: 0.1637
XGBRegressor MAPE: 127.80% | RMSLE: 0.0370 | RMSE: 0.2641
CatBoostRegressor MAPE: 310.35% | RMSLE: 0.0333 | RMSE: 0.4654
