## Functions and libraries

In [1]:
from fastai.tabular.all import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import (
    AdaBoostRegressor, GradientBoostingRegressor,
    BaggingRegressor, RandomForestRegressor
)
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import re

# Notebook-wide pandas display settings: show every row and column, and keep
# wide frames on a single (up to 1000-character) line instead of wrapping them.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = 1000
pd.options.display.expand_frame_repr = False

In [2]:
# Helper to sanitize column names
def clean_column_names(df):
    """Return a copy of ``df`` whose column names contain only ``[A-Za-z0-9_]``.

    Every run of other characters in a column label is replaced by a single
    underscore. Labels are converted to ``str`` first, so non-string labels
    (e.g. integers) are accepted instead of raising ``TypeError``.

    Two different labels can collapse to the same cleaned name (for example
    ``"a b"`` and ``"a-b"``). Duplicate labels make column selection ambiguous,
    so later collisions get a numeric suffix (``_1``, ``_2``, ...); the first
    occurrence keeps the plain cleaned name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with sanitized, unique column names, in the same order.
    """
    df = df.copy()
    seen = set()
    cleaned = []
    for col in df.columns:
        # keep only letters, digits and "_"
        base = re.sub(r'[^A-Za-z0-9_]+', '_', str(col))
        name = base
        suffix = 1
        # Also guards against a suffixed name clashing with a real column.
        while name in seen:
            name = f"{base}_{suffix}"
            suffix += 1
        seen.add(name)
        cleaned.append(name)
    df.columns = cleaned
    return df


In [3]:
# Helper metric
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of scikit-learn's ``mean_squared_error`` for the given targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Model evaluation
def _default_regressors(random_state):
    """Build the fresh, unfitted list of regressors compared by ``fit_transform_model``."""
    return [
        LinearRegression(n_jobs=-1),
        KNeighborsRegressor(n_neighbors=5, n_jobs=-1),
        AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=random_state),
        DecisionTreeRegressor(max_depth=10, random_state=random_state),
        GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=random_state),
        BaggingRegressor(n_estimators=50, n_jobs=-1, random_state=random_state),
        RandomForestRegressor(n_estimators=100, max_depth=10, n_jobs=-1, random_state=random_state),
        LGBMRegressor(n_estimators=200, learning_rate=0.1, max_depth=-1, n_jobs=-1, random_state=random_state, verbose=-1),
        xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8,
                         n_jobs=-1, tree_method="hist", random_state=random_state, verbosity=0),
        CatBoostRegressor(iterations=200, depth=6, learning_rate=0.1, verbose=False, random_state=random_state),
    ]


def fit_transform_model(X, y, test_size=0.2, random_state=42, models=None):
    """Fit several regressors on a hold-out split and print each one's test RMSE.

    ``X`` and ``y`` are split once with ``train_test_split``; every model is
    fitted on the training part and scored on the held-out part with
    ``root_mean_squared_error``. One line per model is printed in the form
    ``"<ClassName> RMSE: <value>"``. Nothing is returned.

    Parameters
    ----------
    X : features accepted by ``train_test_split`` and by the models' ``fit``.
    y : targets aligned with ``X``.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seed used for the split and for every default model that takes one.
    models : list of estimators, optional
        Estimators exposing ``fit`` and ``predict``. They are fitted in place.
        When omitted, a fresh default set of regressors is built for this call.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if models is None:
        models = _default_regressors(random_state)

    for model in models:
        model_name = model.__class__.__name__
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, predictions)
        print(f"{model_name} RMSE: {rmse:.4f}")

## Model test with application_train.csv

### Import dataset

In [4]:
# Load the application_train table from the local home-credit-default-risk folder
train_df = pd.read_csv("./home-credit-default-risk/application_train.csv")

### Quick prework with fastai

In [5]:
# fastai ships several utilities for preprocessing tabular datasets.
# To use scikit-learn models such as a Random Forest, every variable must be numeric
# and there must be no missing values, so two preprocessing steps are used.
# They are only declared here; they are applied later when the TabularPandas object is built.
procs = [
    Categorify,   # encodes categorical columns as numeric codes
    FillMissing,  # fills missing values
]

In [6]:
# Split the feature columns into continuous and categorical name lists.
# - dep_var='TARGET': the target column is kept out of both lists because it is the variable to predict.
# - max_card=1: cardinality threshold handed to cont_cat_split.
#   NOTE(review): the previous note here described max_card=10, but the call passes 1 — confirm the intended value.
# NOTE(review): the previous note also described a date-based cutoff for the train/validation split,
#   whereas the split built in the following cell is a random 80/20 partition — confirm which is intended.
cont, cat = cont_cat_split(train_df, max_card=1, dep_var='TARGET')

In [7]:
# Row positions of the full DataFrame
idxs = list(range(len(train_df)))

# Random 80/20 partition of those positions (fixed seed so the split is repeatable)
train_idx, valid_idx = train_test_split(
    idxs,
    test_size=0.2,
    random_state=42,
)

# TabularPandas receives the split as a (train, valid) pair of index lists
splits = tuple(list(part) for part in (train_idx, valid_idx))


In [8]:
# Build the preprocessed tabular object: applies `procs` to the categorical/continuous
# columns, with TARGET as the dependent variable and the train/validation split above.
to = TabularPandas(
    train_df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names='TARGET',
    splits=splits,
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
  to.loc[:,n+'_na'] = missing[n]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
  to.loc[:,n+'_na'] = missing[n]
The behavior will change in pandas 3.0. This inplace method will never work be

In [9]:
# Number of rows in the training and validation partitions
len(to.train), len(to.valid)

(246008, 61503)

In [10]:
# After preprocessing, every value in the dataframe is numeric.
to.items.head(3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_1
8,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AMT_ANNUITY_na,AMT_GOODS_PRICE_na,OWN_CAR_AGE_na,CNT_FAM_MEMBERS_na,EXT_SOURCE_1_na,EXT_SOURCE_2_na,EXT_SOURCE_3_na,APARTMENTS_AVG_na,BASEMENTAREA_AVG_na,YEARS_BEGINEXPLUATATION_AVG_na,YEARS_BUILD_AVG_na,COMMONAREA_AVG_na,ELEVATORS_AVG_na,ENTRANCES_AVG_na,FLOORSMAX_AVG_na,FLOORSMIN_AVG_na,LANDAREA_AVG_na,LIVINGAPARTMENTS_AVG_na,LIVINGAREA_AVG_na,NONLIVINGAPARTMENTS_AVG_na,NONLIVINGAREA_AVG_na,APARTMENTS_MODE_na,BASEMENTAREA_MODE_na,YEARS_BEGINEXPLUATATION_MODE_na,YEARS_BUILD_MODE_na,COMMONAREA_MODE_na,ELEVATORS_MODE_na,ENTRANCES_MODE_na,FLOORSMAX_MODE_na,FLOORSMIN_MODE_na,LANDAREA_MODE_na,LIVINGAPARTMENTS_MODE_na,LIVINGAREA_MODE_na,NONLIVINGAPARTMENTS_MODE_na,NONLIVINGAREA_MODE_na,APARTMENTS_MEDI_na,BASEMENTAREA_MEDI_na,YEARS_BEGINEXPLUATATION_MEDI_na,YEARS_BUILD_MEDI_na,COMMONAREA_MEDI_na,ELEVATORS_MEDI_na,ENTRANCES_MEDI_na,FLOORSMAX_MEDI_na,FLOORSMIN_MEDI_na,LANDAREA_MEDI_na,LIVINGAPARTMENTS_MEDI_na,LIVINGAREA_MEDI_na,NONLIVINGAPARTMENTS_MEDI_na,NONLIVINGAREA_MEDI_na,TOTALAREA_MODE_na,OBS_30_CNT_SOCIAL_CIRCLE_na,DEF_30_CNT_SOCIAL_CIRCLE_na,OBS_60_CNT_SOCIAL_CIRCLE_na,DEF_60_CNT_SOCIAL_CIRCLE_na,DAYS_LAST_PHONE_CHANGE_na,AMT_REQ_CREDIT_BUREAU_HOUR_na,AMT_REQ_CREDIT_BUREAU_DAY_na,AMT_REQ_CREDIT_BUREAU_WEEK_na,AMT_REQ_CREDIT_BUREAU_MON_na,AMT_REQ_CREDIT_BUREAU_QRT_na,AMT_REQ_CREDIT_BUREAU_YEAR_na
123473,243191,0,1,1,2,1,0,171000.0,555273.0,16366.5,463500.0,7,4,5,6,2,0.035792,-23349,365243,-3595.0,-4408,31.0,1,0,0,1,0,0,0,1.0,2,2,6,9,0,0,0,0,0,0,58,0.524685,0.358568,0.563835,0.0876,0.0763,0.9821,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0483,0.0756,0.0745,0.0,0.0035,0.084,0.0746,0.9816,0.7648,0.019,0.0,0.1379,0.1667,0.2083,0.046,0.0771,0.0731,0.0,0.0011,0.0864,0.0758,0.9816,0.7585,0.0208,0.0,0.1379,0.1667,0.2083,0.0488,0.0761,0.0749,0.0,0.003,0,0,0.0687,0,0,0.0,0.0,0.0,0.0,-2058.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1
10118,111778,0,1,2,1,2,1,157500.0,198085.5,23638.5,171000.0,7,8,5,2,2,0.010032,-10921,-117,-4281.0,-3399,9.0,1,1,1,1,1,0,9,3.0,2,2,3,7,0,0,0,0,0,0,5,0.244926,0.490305,0.595456,0.0784,0.0633,0.9742,0.6464,0.0266,0.0,0.1379,0.1667,0.2083,0.0409,0.063,0.0594,0.0039,0.0149,0.0798,0.0657,0.9742,0.6602,0.0269,0.0,0.1379,0.1667,0.2083,0.0418,0.0689,0.0619,0.0039,0.0158,0.0791,0.0633,0.9742,0.6511,0.0268,0.0,0.1379,0.1667,0.2083,0.0416,0.0641,0.0605,0.0039,0.0153,3,1,0.0645,6,1,1.0,0.0,1.0,0.0,-73.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
64716,175057,1,1,2,2,2,0,135000.0,776304.0,25173.0,648000.0,7,8,4,1,2,0.035792,-23213,-2157,-5680.0,-5009,8.0,1,1,0,1,0,0,5,2.0,2,2,1,13,0,0,0,0,0,0,43,0.506427,0.643404,0.706205,0.0876,0.0763,0.9821,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0483,0.0756,0.0745,0.0,0.0035,0.084,0.0746,0.9816,0.7648,0.019,0.0,0.1379,0.1667,0.2083,0.046,0.0771,0.0731,0.0,0.0011,0.0864,0.0758,0.9816,0.7585,0.0208,0.0,0.1379,0.1667,0.2083,0.0488,0.0761,0.0749,0.0,0.003,0,0,0.0687,0,0,2.0,0.0,2.0,0.0,-1959.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0,1,1,1,1,2,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1


In [11]:
to.items.describe(include='all')  # Summary statistics for every column

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_1
8,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AMT_ANNUITY_na,AMT_GOODS_PRICE_na,OWN_CAR_AGE_na,CNT_FAM_MEMBERS_na,EXT_SOURCE_1_na,EXT_SOURCE_2_na,EXT_SOURCE_3_na,APARTMENTS_AVG_na,BASEMENTAREA_AVG_na,YEARS_BEGINEXPLUATATION_AVG_na,YEARS_BUILD_AVG_na,COMMONAREA_AVG_na,ELEVATORS_AVG_na,ENTRANCES_AVG_na,FLOORSMAX_AVG_na,FLOORSMIN_AVG_na,LANDAREA_AVG_na,LIVINGAPARTMENTS_AVG_na,LIVINGAREA_AVG_na,NONLIVINGAPARTMENTS_AVG_na,NONLIVINGAREA_AVG_na,APARTMENTS_MODE_na,BASEMENTAREA_MODE_na,YEARS_BEGINEXPLUATATION_MODE_na,YEARS_BUILD_MODE_na,COMMONAREA_MODE_na,ELEVATORS_MODE_na,ENTRANCES_MODE_na,FLOORSMAX_MODE_na,FLOORSMIN_MODE_na,LANDAREA_MODE_na,LIVINGAPARTMENTS_MODE_na,LIVINGAREA_MODE_na,NONLIVINGAPARTMENTS_MODE_na,NONLIVINGAREA_MODE_na,APARTMENTS_MEDI_na,BASEMENTAREA_MEDI_na,YEARS_BEGINEXPLUATATION_MEDI_na,YEARS_BUILD_MEDI_na,COMMONAREA_MEDI_na,ELEVATORS_MEDI_na,ENTRANCES_MEDI_na,FLOORSMAX_MEDI_na,FLOORSMIN_MEDI_na,LANDAREA_MEDI_na,LIVINGAPARTMENTS_MEDI_na,LIVINGAREA_MEDI_na,NONLIVINGAPARTMENTS_MEDI_na,NONLIVINGAREA_MEDI_na,TOTALAREA_MODE_na,OBS_30_CNT_SOCIAL_CIRCLE_na,DEF_30_CNT_SOCIAL_CIRCLE_na,OBS_60_CNT_SOCIAL_CIRCLE_na,DEF_60_CNT_SOCIAL_CIRCLE_na,DAYS_LAST_PHONE_CHANGE_na,AMT_REQ_CREDIT_BUREAU_HOUR_na,AMT_REQ_CREDIT_BUREAU_DAY_na,AMT_REQ_CREDIT_BUREAU_WEEK_na,AMT_REQ_CREDIT_BUREAU_MON_na,AMT_REQ_CREDIT_BUREAU_QRT_na,AMT_REQ_CREDIT_BUREAU_YEAR_na
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,278180.518577,0.080729,1.095213,1.341669,1.340108,1.693673,0.417052,168797.9,599026.0,27108.490234,538316.2,6.19358,5.670288,4.188273,2.472312,2.29039,0.020868,-16036.995067,63815.045904,-4986.120605,-2994.202373,10.041052,0.999997,0.819889,0.199368,0.998133,0.281066,0.05672,6.076361,2.152664,2.052463,2.031521,4.090732,12.063419,0.015144,0.050769,0.040659,0.078173,0.230454,0.179555,30.450429,0.504552,0.5145032,0.516051,0.102297,0.081337,0.979864,0.754286,0.028186,0.036869,0.143771,0.196633,0.215886,0.055626,0.083567,0.090886,0.002693,0.014641,0.098889,0.079969,0.979277,0.76307,0.026096,0.03479,0.141521,0.194641,0.214653,0.053701,0.086133,0.089474,0.002469,0.012719,0.101889,0.080842,0.979629,0.757577,0.027969,0.036465,0.143517,0.19644,0.215799,0.056262,0.084282,0.091688,0.002644,0.014311,0.932438,0.510996,0.086209,2.543994,0.533587,1.417523,0.142944,1.400626,0.099717,-962.858154,4.2e-05,0.710023,8.1e-05,0.015115,0.088055,0.000192,0.081376,0.003896,2.3e-05,0.003912,7e-06,0.003525,0.002936,0.00121,0.009928,0.000267,0.00813,0.000595,0.000507,0.000335,0.005538,0.006055,0.029723,0.231293,0.229631,1.778463,1.000039,1.000904,1.659908,1.000007,1.563811,1.002146,1.198253,1.507497,1.58516,1.48781,1.664978,1.698723,1.53296,1.503488,1.497608,1.678486,1.593767,1.68355,1.501933,1.69433,1.551792,1.507497,1.58516,1.48781,1.664978,1.698723,1.53296,1.503488,1.497608,1.678486,1.593767,1.68355,1.501933,1.69433,1.551792,1.507497,1.58516,1.48781,1.664978,1.698723,1.53296,1.503488,1.497608,1.678486,1.593767,1.68355,1.501933,1.69433,1.551792,1.482685,1.00332,1.00332,1.00332,1.00332,1.000003,1.135016,1.135016,1.135016,1.135016,1.135016,1.135016
std,102790.175348,0.272419,0.293509,0.474297,0.473746,0.460968,0.722121,237123.1,402490.8,14493.460938,369289.0,1.817005,2.544525,1.298753,1.168884,0.951168,0.013831,4363.988632,141275.766519,3522.88623,1509.450419,7.115229,0.001803,0.38428,0.399526,0.043164,0.449521,0.231307,5.491022,0.910679,0.509034,0.502737,2.149512,3.265832,0.122126,0.219526,0.197499,0.268444,0.421124,0.383817,20.463558,0.139411,0.1908699,0.174777,0.077412,0.053433,0.042441,0.06558,0.043108,0.100048,0.070746,0.106761,0.092167,0.052496,0.053378,0.079745,0.0267,0.048158,0.077241,0.054674,0.04627,0.06378,0.042266,0.097726,0.071245,0.105588,0.091846,0.05293,0.056639,0.080627,0.025854,0.048768,0.078146,0.053267,0.04291,0.064878,0.043197,0.099811,0.070949,0.106998,0.092464,0.053142,0.054032,0.080999,0.026516,0.048623,1.418267,0.532238,0.079121,2.745148,0.513823,2.398396,0.446033,2.377224,0.361735,826.807251,0.006502,0.453752,0.009016,0.12201,0.283376,0.01385,0.273412,0.062295,0.004771,0.062424,0.00255,0.059268,0.05411,0.03476,0.099144,0.016327,0.089798,0.024387,0.022518,0.018299,0.078014,0.103037,0.190728,0.85681,0.744059,1.765523,0.006247,0.030054,0.473741,0.00255,0.495912,0.046278,0.398684,0.499945,0.492695,0.499852,0.471999,0.458814,0.498913,0.499989,0.499995,0.467058,0.49113,0.465092,0.499997,0.460692,0.497311,0.499945,0.492695,0.499852,0.471999,0.458814,0.498913,0.499989,0.499995,0.467058,0.49113,0.465092,0.499997,0.460692,0.497311,0.499945,0.492695,0.499852,0.471999,0.458814,0.498913,0.499989,0.499995,0.467058,0.49113,0.465092,0.499997,0.460692,0.497311,0.499701,0.057526,0.057526,0.057526,0.057526,0.001803,0.341742,0.341742,0.341742,0.341742,0.341742,0.341742
min,100002.0,0.0,1.0,1.0,1.0,1.0,0.0,25650.0,45000.0,1615.5,40500.0,0.0,1.0,1.0,1.0,1.0,0.00029,-25229.0,-17912.0,-24672.0,-7197.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.014568,8.173617e-08,0.000527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,189145.5,0.0,1.0,1.0,1.0,1.0,0.0,112500.0,270000.0,16524.0,238500.0,7.0,4.0,3.0,2.0,2.0,0.010006,-19682.0,-2760.0,-7479.5,-4299.0,9.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.506427,0.3929737,0.4171,0.0876,0.0763,0.9816,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0483,0.0756,0.0745,0.0,0.0035,0.084,0.0746,0.9811,0.7648,0.019,0.0,0.1379,0.1667,0.2083,0.046,0.0771,0.0731,0.0,0.0011,0.0864,0.0758,0.9816,0.7585,0.0208,0.0,0.1379,0.1667,0.2083,0.0488,0.0761,0.0749,0.0,0.003,0.0,0.0,0.067,0.0,0.0,0.0,0.0,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,278202.0,0.0,1.0,1.0,1.0,2.0,0.0,147150.0,513531.0,24903.0,450000.0,7.0,8.0,5.0,2.0,2.0,0.01885,-15750.0,-1213.0,-4504.0,-3254.0,9.0,1.0,1.0,0.0,1.0,0.0,0.0,5.0,2.0,2.0,2.0,5.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0.506427,0.5658916,0.53707,0.0876,0.0763,0.9821,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0483,0.0756,0.0745,0.0,0.0035,0.084,0.0746,0.9816,0.7648,0.019,0.0,0.1379,0.1667,0.2083,0.046,0.0771,0.0731,0.0,0.0011,0.0864,0.0758,0.9816,0.7585,0.0208,0.0,0.1379,0.1667,0.2083,0.0488,0.0761,0.0749,0.0,0.003,0.0,0.0,0.0687,0.0,1.0,0.0,0.0,0.0,0.0,-757.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,367142.5,0.0,1.0,2.0,2.0,2.0,1.0,202500.0,808650.0,34596.0,679500.0,7.0,8.0,5.0,3.0,2.0,0.028663,-12413.0,-289.0,-2010.0,-1720.0,9.0,1.0,1.0,0.0,1.0,1.0,0.0,10.0,3.0,2.0,2.0,6.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,0.506427,0.6634218,0.636376,0.0876,0.0763,0.9821,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0483,0.0756,0.0745,0.0,0.0035,0.084,0.0746,0.9816,0.7648,0.019,0.0,0.1379,0.1667,0.2083,0.046,0.0771,0.0731,0.0,0.0011,0.0864,0.0758,0.9821,0.7585,0.0208,0.0,0.1379,0.1667,0.2083,0.0488,0.0761,0.0749,0.0,0.003,3.0,1.0,0.0703,5.0,1.0,2.0,0.0,2.0,0.0,-274.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,456255.0,1.0,2.0,3.0,2.0,2.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,7.0,8.0,5.0,6.0,6.0,0.072508,-7489.0,365243.0,0.0,0.0,91.0,1.0,1.0,1.0,1.0,1.0,1.0,18.0,20.0,3.0,3.0,7.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0,58.0,0.962693,0.8549997,0.89601,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,3.0,1.0,7.0,2.0,348.0,34.0,344.0,24.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [12]:
# Column count, dtypes and memory footprint of the training features
to.train.xs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 246008 entries, 123473 to 121958
Columns: 182 entries, NAME_CONTRACT_TYPE to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float32(65), int16(2), int32(2), int8(113)
memory usage: 92.2 MB


In [13]:
# Preprocessing can be slow, so the resulting object is pickled to a file for later reuse
save_pickle('./application_train-tabular-object.pkl',to)

### Fit

In [14]:
# Reload the preprocessed TabularPandas object from its pickle file
to = load_pickle('./application_train-tabular-object.pkl')

# Features and target for the training and validation partitions
xs = to.train.xs
y = to.train.y
valid_xs = to.valid.xs
valid_y = to.valid.y

# Subsample 5% of the training rows (fixed seed) to keep the comparison fast;
# the target is aligned to the sampled rows through their index labels
sample_frac = 0.05
sampled_xs = xs.sample(frac=sample_frac, random_state=42)
sampled_y = y.loc[sampled_xs.index]

# Independent copies handed to the scikit-learn style models
X, target = sampled_xs.copy(), sampled_y.copy()

# Fit every model and print its hold-out RMSE
fit_transform_model(X, target)

LinearRegression RMSE: 0.2748
KNeighborsRegressor RMSE: 0.3018
AdaBoostRegressor RMSE: 0.2905
DecisionTreeRegressor RMSE: 0.3221
GradientBoostingRegressor RMSE: 0.2741
BaggingRegressor RMSE: 0.2837
RandomForestRegressor RMSE: 0.2751
LGBMRegressor RMSE: 0.2828
XGBRegressor RMSE: 0.2826
CatBoostRegressor RMSE: 0.2744


## Model test with all data joined

In [15]:
# Load the TabularPandas object for the joined dataset from its pickle file
# NOTE(review): this pickle is not written anywhere in the visible cells — presumably produced by a separate preprocessing step; confirm.
to = load_pickle('./prework-tabular-object.pkl')

# Features and target for the training and validation partitions
xs = to.train.xs
y = to.train.y
valid_xs = to.valid.xs
valid_y = to.valid.y

# Subsample 5% of the training rows (fixed seed) to keep the comparison fast;
# the target is aligned to the sampled rows through their index labels
sample_frac = 0.05
sampled_xs = xs.sample(frac=sample_frac, random_state=42)
sampled_y = y.loc[sampled_xs.index]

# clean_column_names returns a renamed copy, so the sampled frame itself is left untouched
X = clean_column_names(sampled_xs)
target = sampled_y.copy()

# Fit every model and print its hold-out RMSE
fit_transform_model(X, target)

LinearRegression RMSE: 0.2751
KNeighborsRegressor RMSE: 0.3032
AdaBoostRegressor RMSE: 0.2850
DecisionTreeRegressor RMSE: 0.3045
GradientBoostingRegressor RMSE: 0.2722
BaggingRegressor RMSE: 0.2773
RandomForestRegressor RMSE: 0.2715
LGBMRegressor RMSE: 0.2763
XGBRegressor RMSE: 0.2791
CatBoostRegressor RMSE: 0.2726
