## Functions and libraries

In [1]:
from fastai.tabular.all import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import (
    AdaBoostRegressor, GradientBoostingRegressor,
    BaggingRegressor, RandomForestRegressor
)
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import re

# Notebook display settings: never truncate rows or columns, allow wide
# lines, and keep frames on a single block instead of wrapping them.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

In [2]:
# Helper to sanitise column names
def clean_column_names(df):
    """Return a copy of ``df`` whose column labels only contain ``[A-Za-z0-9_]``.

    Every run of characters outside that set is collapsed into a single
    underscore. Labels are converted with ``str()`` before cleaning so that
    non-string labels (for example integers) are handled instead of raising
    ``TypeError`` inside ``re.sub``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the cleaned (string) column labels.
    """
    cleaned = df.copy()
    cleaned.columns = [
        re.sub(r'[^A-Za-z0-9_]+', '_', str(col))  # keep only letters, digits and "_"
        for col in cleaned.columns
    ]
    return cleaned


In [3]:
# Helper metric
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Model evaluation
def fit_transform_model(X, y, test_size=0.2, random_state=42):
    """Fit a fixed set of regressors on a hold-out split and print each RMSE.

    ``X`` and ``y`` are split once with ``train_test_split``; every model is
    fitted on the training part and scored on the test part with
    ``root_mean_squared_error``. One line per model is printed as
    ``"<ClassName> RMSE: <value>"``. Nothing is returned.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and the estimators.
    y : target values aligned with ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for scoring (forwarded to
        ``train_test_split``).
    random_state : int, default 42
        Seed used for the split and for every estimator that takes a
        ``random_state``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    models = [
        LinearRegression(n_jobs=-1),
        KNeighborsRegressor(n_neighbors=5, n_jobs=-1),
        AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=random_state),
        DecisionTreeRegressor(max_depth=10, random_state=random_state),
        GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=3, random_state=random_state
        ),
        BaggingRegressor(n_estimators=50, n_jobs=-1, random_state=random_state),
        RandomForestRegressor(
            n_estimators=100, max_depth=10, n_jobs=-1, random_state=random_state
        ),
        LGBMRegressor(
            n_estimators=200, learning_rate=0.1, max_depth=-1, n_jobs=-1,
            random_state=random_state, verbose=-1,
        ),
        xgb.XGBRegressor(
            n_estimators=200, learning_rate=0.1, max_depth=6, subsample=0.8,
            colsample_bytree=0.8, n_jobs=-1, tree_method="hist",
            random_state=random_state, verbosity=0,
        ),
        CatBoostRegressor(
            iterations=200, depth=6, learning_rate=0.1, verbose=False,
            random_state=random_state,
        ),
    ]

    for model in models:
        model_name = model.__class__.__name__
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, predictions)
        print(f"{model_name} RMSE: {rmse:.4f}")

## Model test with train_2_Prework.parquet

### Import dataset

In [4]:
# Read the training table from a local parquet file via the fastparquet engine.
train_df = pd.read_parquet(path="train_2_Prework.parquet", engine="fastparquet")

### Quick prework with fastai

In [5]:
# The fastai library offers several utilities for preprocessing datasets.
# To use a scikit-learn Random Forest, every variable must be numeric and there must be no missing values.
# Two preprocessing steps are used here: Categorify and FillMissing. They are only declared at this point and applied later.
procs = [Categorify, FillMissing]

In [6]:
# Split the columns of train_df into two lists of names: continuous (cont) and categorical (cat).
# - dep_var='TARGET': names the dependent variable, which is left out of both lists.
# - The second positional argument (1) is presumably fastai's `max_card` cardinality threshold — TODO confirm.
#   NOTE(review): the earlier note described max_card=10, which does not match the value passed here.
# NOTE(review): the earlier note also announced a date-based train/validation cutoff for temporal data;
# no such split happens in this cell — confirm which splitting strategy is intended.
# NOTE(review): `cont` and `cat` are reassigned later in the notebook, so these values may be unused.

cont,cat = cont_cat_split(train_df, 1, dep_var='TARGET')

In [7]:
# Positional row indices covering the whole DataFrame
idxs = [*range(len(train_df))]

# Seeded random 80/20 partition of those indices
train_idx, valid_idx = train_test_split(idxs, test_size=0.2, random_state=42)

# Shape the result as a (train, valid) pair of plain lists for TabularPandas
splits = list(train_idx), list(valid_idx)


In [19]:
from fastai.tabular.all import *
import numpy as np
import pandas as pd


# ===== 1) Build the working frame and the target =====
df = train_df.copy()

# Target: price per square metre. A zero surface is turned into NaN first so
# the division yields NaN (dropped below) instead of +/-inf.
df["price_m2"] = df["price"] / df["surface_total"].replace(0, np.nan)
df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
df["lon"] = pd.to_numeric(df["lon"], errors="coerce")
df = df.dropna(subset=["price_m2"]).copy()   # rows without a target cannot be used

# ===== 2) Drop uninformative columns =====
all_nan_cols   = df.columns[df.isna().all()].tolist()                         # 100% NaN (e.g. l6)
constant_cols  = [c for c in df.columns if df[c].nunique(dropna=True) <= 1]   # no variability
hi_null_cols   = df.columns[df.isna().mean() > 0.95].tolist()                 # >95% NaN
to_drop = sorted(set(all_nan_cols) | set(constant_cols) | set(hi_null_cols))

print("Dropping columns:", to_drop)  # expected to include 'l6'
df = df.drop(columns=to_drop)

# ===== 3) Feature lists =====
# Continuous = every numeric column except the target; categorical = the rest.
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
cont = [c for c in num_cols if c != "price_m2"]
non_categorical = set(cont) | {"price_m2"}   # built once instead of per column
cat  = [c for c in df.columns if c not in non_categorical]

# NOTE(review): "price" and "surface_total" are the inputs of the target and,
# if numeric, remain in `cont` as features — confirm this is intended.

# ===== 4) Preprocessing steps and train/validation split =====
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

Dropping columns: ['ad_type', 'l5', 'l6']


In [20]:
# Build the TabularPandas object: applies `procs` to `df` with the given
# categorical/continuous columns, target and train/validation splits.
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names='price_m2',
    splits=splits,
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [21]:
# === SAFE BLOCK: take the model matrices from TabularPandas to avoid NaN ===
# Requires the object `to = TabularPandas(...)` built with procs [Categorify, FillMissing, Normalize]
try:
    X_train, y_train = (
        to.train.xs.copy(),
        to.train.y.values.ravel(),
    )
    X_valid, y_valid = (
        to.valid.xs.copy(),
        to.valid.y.values.ravel(),
    )
except NameError as e:
    raise RuntimeError("No existe el objeto 'to'. Asegúrate de haber creado TabularPandas antes.") from e

# Re-evaluate the drop criteria on `df`.
# NOTE(review): nothing is dropped in this cell; the print only reports
# which columns still match the criteria.
missing_mask = df.isna()
all_nan_cols   = df.columns[missing_mask.all()].tolist()                  # 100% NaN (e.g. l6)
constant_cols  = df.columns[df.nunique(dropna=True) <= 1].tolist()        # no variability
hi_null_cols   = df.columns[missing_mask.mean() > 0.95].tolist()          # >95% NaN
to_drop = sorted({*all_nan_cols, *constant_cols, *hi_null_cols})

print("Dropping columns:", to_drop)  # expected to include 'l6'

# Hard NaN checks (both should be 0)
nan_count_train = X_train.isna().sum()
nan_count_valid = X_valid.isna().sum()
print("NaN en X_train:", int(nan_count_train.sum()))
print("NaN en X_valid:", int(nan_count_valid.sum()))

# Optional patch in case a stray column slipped through with NaN
if nan_count_train.any() or nan_count_valid.any():
    cols_train = X_train.columns[nan_count_train > 0].tolist()
    cols_valid = X_valid.columns[nan_count_valid > 0].tolist()
    print("Advertencia: se detectaron NaN en columnas:", set(cols_train) | set(cols_valid))
    print("Se rellenan con 0 como parche de emergencia. Revisa el pipeline de procs/columnas.")
    X_train = X_train.fillna(0)
    X_valid = X_valid.fillna(0)

Dropping columns: []
NaN en X_train: 0
NaN en X_valid: 0


In [22]:
# Number of rows in the training and validation subsets
tuple(len(subset) for subset in (to.train, to.valid))

(338209, 84552)

In [23]:
# After preprocessing, the values shown in the frame are all numeric.
to.items.iloc[:3]

Unnamed: 0,id,start_date,end_date,created_on,lat,lon,l1,l2,l3,l4,rooms,bedrooms,bathrooms,surface_total,surface_covered,currency,price_period,title,description,property_type,operation_type,price,price_m2,covered_ratio,year_created,month_created,age_of_ad,lat_na,lon_na,rooms_na,bedrooms_na,bathrooms_na,surface_covered_na,covered_ratio_na
623843,0.710348,204,450,204,0.297067,-0.071017,1,7,159,0,0.038124,-0.116073,6.090944,0.090864,0.347234,1,2,29991,171594,8,1,0.016949,487.5,0.004588,1.139006,-1.563577,-0.460101,1,1,2,2,1,1,1
89329,-1.213215,91,176,91,-0.627484,0.540297,1,38,609,0,0.757931,-1.14103,-0.571296,-0.101214,-0.058587,1,0,82620,162594,4,1,-0.051608,226.027405,-0.005949,-0.877958,0.920759,0.671152,1,1,1,1,1,1,1
519438,0.334937,31,99,31,0.290864,-0.067454,1,7,65,0,0.038124,-0.116073,-0.571296,-0.103592,-0.062463,3,0,226193,79231,4,3,-0.025451,2484.375,-0.003424,-0.877958,0.299675,1.219375,1,1,1,1,1,1,1


In [24]:
to.items.describe(include='all')  # Summary statistics for every column

Unnamed: 0,id,start_date,end_date,created_on,lat,lon,l1,l2,l3,l4,rooms,bedrooms,bathrooms,surface_total,surface_covered,currency,price_period,title,description,property_type,operation_type,price,price_m2,covered_ratio,year_created,month_created,age_of_ad,lat_na,lon_na,rooms_na,bedrooms_na,bathrooms_na,surface_covered_na,covered_ratio_na
count,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0,422761.0
mean,0.000381,156.394987,258.514045,156.394987,0.000174,-0.000487,1.080417,9.816549,457.029296,148.473809,-0.000209,0.001925,0.000817,-0.001057,-0.000854,2.431826,0.827276,124995.584264,165069.491613,4.250406,2.443695,-0.000183,2771.934,0.000757,-0.001004,0.000972,0.000768,1.125603,1.125603,1.318014,1.487737,1.168296,1.144909,1.144909
std,0.999985,100.82726,143.325026,100.82726,0.998536,0.998099,0.482112,11.475457,243.680766,251.117639,0.997949,1.731583,1.001513,0.996864,0.980706,0.918393,0.985214,69155.528626,93916.267075,2.384041,0.868725,0.900081,81882.19,1.035509,0.999869,0.999571,1.000039,0.331402,0.331402,0.465706,0.49985,0.374129,0.35201,0.35201
min,-1.514851,1.0,1.0,1.0,-21.739646,-7.592503,1.0,1.0,0.0,0.0,-1.40149,-4.215901,-0.571296,-0.173343,-0.189801,0.0,0.0,1.0,0.0,1.0,1.0,-0.054636,-57500.0,-0.599773,-0.877958,-1.874119,-1.887221,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,-0.89103,67.0,132.0,67.0,0.203323,-0.089804,1.0,2.0,256.0,0.0,-0.681683,-0.116073,-0.571296,-0.107291,-0.069107,1.0,0.0,72485.0,82875.0,4.0,1.0,-0.04924,291.6667,-0.015135,-0.877958,-0.631951,-0.825583,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,-0.048625,141.0,253.0,141.0,0.286414,-0.079316,1.0,7.0,508.0,0.0,0.038124,-0.116073,-0.571296,-0.096195,-0.055819,3.0,0.0,112304.0,171740.0,4.0,3.0,-0.035363,1065.574,-0.004356,-0.877958,-0.010867,0.192545,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,0.740259,235.0,392.0,235.0,0.307918,-0.037889,1.0,7.0,649.0,252.0,0.038124,-0.116073,0.380453,-0.051808,-0.023708,3.0,2.0,182142.0,244658.0,5.0,3.0,-0.011501,2223.214,0.004588,1.139006,0.920759,0.888701,1.0,1.0,2.0,2.0,1.0,1.0,1.0
max,2.030273,346.0,450.0,346.0,9.994368,30.406407,4.0,42.0,841.0,813.0,26.670991,920.295292,17.511925,52.72085,214.164921,4.0,3.0,255902.0,324162.0,10.0,3.0,568.959375,51666670.0,435.749295,1.139006,1.541843,1.497837,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [25]:
# Column names, non-null counts and dtypes of the training feature matrix
to.train.xs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 338209 entries, 623843 to 51772
Data columns (total 33 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   start_date          338209 non-null  int16  
 1   end_date            338209 non-null  int16  
 2   created_on          338209 non-null  int16  
 3   l1                  338209 non-null  int8   
 4   l2                  338209 non-null  int8   
 5   l3                  338209 non-null  int16  
 6   l4                  338209 non-null  int16  
 7   currency            338209 non-null  int8   
 8   price_period        338209 non-null  int8   
 9   title               338209 non-null  int32  
 10  description         338209 non-null  int32  
 11  property_type       338209 non-null  int8   
 12  operation_type      338209 non-null  int8   
 13  lat_na              338209 non-null  int8   
 14  lon_na              338209 non-null  int8   
 15  rooms_na            338209 non-null

In [27]:
# Preprocessing can be slow, so persist the TabularPandas object to a pickle file for later reuse
save_pickle('./df_train-tabular-object.pkl',to)

### Fit

In [29]:
# Restore the TabularPandas object written earlier by this notebook
to = load_pickle('./df_train-tabular-object.pkl')

# Features and target for both subsets
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

# Work on a seeded sample: 20% of the training rows
sample_frac = 0.20
sampled_xs = xs.sample(frac=sample_frac, random_state=42)
sampled_y = y.loc[sampled_xs.index]  # align the target on the sampled index

# Independent copies handed to scikit-learn
X, target = sampled_xs.copy(), sampled_y.copy()

# Run the evaluation (fit_transform_model makes its own hold-out split of the sample)
fit_transform_model(X, target)

LinearRegression RMSE: 71930.6485
KNeighborsRegressor RMSE: 82178.2656
AdaBoostRegressor RMSE: 79640.6540
DecisionTreeRegressor RMSE: 79106.2603
GradientBoostingRegressor RMSE: 77329.9166
BaggingRegressor RMSE: 79051.3787
RandomForestRegressor RMSE: 79118.4216
LGBMRegressor RMSE: 79692.0336
XGBRegressor RMSE: 80422.4844
CatBoostRegressor RMSE: 80183.1868
