# Feature Selection & Scaling

@roman

11 Mar, 2024

In [1]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.linear_model import LassoCV, lasso_path
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
import shap



In [2]:
# Settings
# show 100 columns in pandas
pd.set_option('display.max_columns', 500)
TODAY = pd.to_datetime('today')

---
# Lasso

## Read

In [3]:
# important columns
cols_to_stay = [
    'observation_id',
    'price',
    'half_bathrooms',
    'full_bathrooms',
    'age_in_months',
    'parking_lots',
    'valuation_date',
    'sidewalk_id',
    'conservation_status_id',
    'elevator_service_id',
    'road_materials_id',
    'property_type_id',
    'level',
    'total_levels',
    'bedrooms',
    'built_area',
    'saleable_area',
    'land_area',
    'remaining_useful_life',
    # 'ppsm_comps',
    # 'ppsm_lower_comps',
    # 'ppsm_upper_comps',
    'count_supermarkets_at_1km',
    'count_hospitals_at_5km',
    'count_metro_at_1km',
    'count_schools_at_1km',
    'count_restaurants_at_1km',
    'tam_loc',
    'est_socio',
    'clase_hog',
    'sexo_jefe',
    'edad_jefe',
    'educa_jefe',
    'tot_integ',
    'mayores',
    'menores',
    'p12_64',
    'p65mas',
    'percep_ing',
    'ing_cor',
    'estim_alqu',
    'otros_ing',
    'gasto_mon',
    'vivienda',
    'alquiler',
    'pred_cons',
    'cuidados',
    'tipo_viv',
    'antiguedad',
    'cuart_dorm',
    'tenencia',
    'renta',
    'estim_pago',
    'pago_viv',
    'pago_mesp',
    'tipo_adqui',
    'viv_usada',
    'tipo_finan',
    'escrituras',
    'tot_resid',
    'tot_hom',
    'tot_muj',
    'tot_hog',
    'total_viviendas',
    'ppsm_terrain',
    # 'ppsm_terrain_lower',
    # 'ppsm_terrain_upper'
]

# read database
df_properties = pd.read_parquet("../../data/clean/properties_shif.parquet").loc[:, cols_to_stay]

# set observation_id as index
df_properties = df_properties.set_index('observation_id')

# see
print(df_properties.shape)
df_properties.head(2)

(1908483, 61)


Unnamed: 0_level_0,price,half_bathrooms,full_bathrooms,age_in_months,parking_lots,valuation_date,sidewalk_id,conservation_status_id,elevator_service_id,road_materials_id,property_type_id,level,total_levels,bedrooms,built_area,saleable_area,land_area,remaining_useful_life,count_supermarkets_at_1km,count_hospitals_at_5km,count_metro_at_1km,count_schools_at_1km,count_restaurants_at_1km,tam_loc,est_socio,clase_hog,sexo_jefe,edad_jefe,educa_jefe,tot_integ,mayores,menores,p12_64,p65mas,percep_ing,ing_cor,estim_alqu,otros_ing,gasto_mon,vivienda,alquiler,pred_cons,cuidados,tipo_viv,antiguedad,cuart_dorm,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,escrituras,tot_resid,tot_hom,tot_muj,tot_hog,total_viviendas,ppsm_terrain
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
43c375b2-75fd-4fcc-b311-55b396e7cbbf,2349000,1,2,204,1,2022-11-15,0,4,0,2,2,3,2,3,141,154,198,53,2,3,0,0,19,1.210938,2.724609,2,1,52.34375,7.0625,3.197266,2.740234,0.456055,2.367188,0.373291,2.210938,81674.453125,10253.206055,31.565142,47140.269531,3390.347168,1129.408691,295.455017,2439.210693,1.0,24.3125,2.015625,4,389.65509,3491.472412,512.813477,0.279052,1.0,0.174438,1,1,3.255859,1.561523,1.694336,1.019531,294870,3740
f95c14fe-9acc-4b5c-872b-bd15d412a65d,424000,0,1,192,1,2022-11-01,0,4,2,3,2,3,1,2,52,54,133,44,0,0,0,0,0,2.117188,1.929688,2,1,46.3125,5.65625,4.042969,3.228516,0.81543,2.964844,0.262451,2.564453,54447.476562,4833.395996,3.735561,37908.296875,1927.791382,391.177155,157.040634,1460.74646,1.0,17.65625,1.93457,4,133.639404,1646.661743,567.759888,0.311107,1.0,0.101379,1,1,4.058594,2.070312,1.990234,1.00293,34959,886


## Wrangling

### NaNs

In [4]:
# nan values
df_properties.isna().mean()[df_properties.isna().mean().gt(0)]

Series([], dtype: float64)

### Casting

In [5]:
# cast to string
cols_categorical = list(set(
    df_properties.select_dtypes(include='string').columns.tolist()
    + df_properties.select_dtypes(include='category').columns.tolist() 
    + df_properties.select_dtypes(include='object').columns.tolist()
    + df_properties.filter(like='_id').columns.tolist()
    ))

# to categorical
df_properties[cols_categorical] = df_properties[cols_categorical].astype('category')

## Feature Engineering

In [6]:
# params
first_date_obs = df_properties['valuation_date'].min()
last_date_obs = df_properties['valuation_date'].max()

# create columns
df_properties = (
    df_properties
    .assign(
        # objective variable
        ppsm=lambda x: x['price'] / x['saleable_area'],
        # quarters since appraisal
        quarters_since_first_appraisal=lambda x: (x['valuation_date'] - first_date_obs).dt.days / (30.4 * 3),
        # recategorizations
        has_elevator=lambda x: np.where(x['elevator_service_id'].eq(1), 1, 0),
        is_new=lambda x: x['age_in_months'].le(1).astype(int),
        conservacion_recat=lambda x: x['conservation_status_id'].replace({7: 3.5}),
    )
)

  conservacion_recat=lambda x: x['conservation_status_id'].replace({7: 3.5}),


In [7]:
# count of unique values 
def count_unique_values(df, col, max_values=10):
    """
    Count the number of unique values in a column and return the result as a DataFrame.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        col (str): The column to analyze.
        max_values (int): The maximum number of unique values to display. If there are more unique values, they will be grouped in an 'Others' category.
    
    Returns:
        pd.DataFrame: A DataFrame with the count of unique values.
    """
    # counts
    table_counts = (
        df[col]
        .value_counts(dropna=False)
        .reset_index()
        .assign(
            perc=lambda x: x['count'] / x['count'].sum()
        )
        .sort_values(by='count', ascending=False)
        .head(max_values)
    )
    
    num_others = df.shape[0] - table_counts['count'].sum()

    # append others if necessary
    if num_others > 0:
        table_counts = pd.concat([
            table_counts,
            pd.DataFrame({
                col: 'others',
                'count': [num_others],
                'perc': [num_others / df.shape[0]]
            })
        ],
        axis=0,
        ignore_index=True
    )

    return table_counts

# counts of categorical columns
cols_to_count = df_properties.select_dtypes(include='category').columns.tolist()
for col in cols_to_count:
    print(f"\n{col} {'=' * 50}")
    print(count_unique_values(df_properties, col, 10))


  sidewalk_id    count      perc
0           1  1228728  0.643824
1           0   634363  0.332391
2           5    24912  0.013053
3           4    11234  0.005886
4           3     8081  0.004234
5           2     1165  0.000610

  conservation_status_id   count      perc
0                      6  910747  0.477210
1                      4  870514  0.456129
2                      5   87763  0.045986
3                      7   35335  0.018515
4                      3    4124  0.002161

  elevator_service_id    count      perc
0                   2  1407355  0.737421
1                   0   460853  0.241476
2                   1    40275  0.021103

  road_materials_id    count      perc
0                 2  1092972  0.572692
1                 3   518715  0.271794
2                 6   187461  0.098225
3                 5    40394  0.021166
4                 1    35888  0.018804
5                 4    30425  0.015942
6                 8     1801  0.000944
7                 7      788  0

In [8]:
# see new var
df_properties.describe()

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.t

Unnamed: 0,price,half_bathrooms,full_bathrooms,age_in_months,parking_lots,valuation_date,level,total_levels,bedrooms,built_area,saleable_area,land_area,remaining_useful_life,count_supermarkets_at_1km,count_hospitals_at_5km,count_metro_at_1km,count_schools_at_1km,count_restaurants_at_1km,tam_loc,est_socio,clase_hog,sexo_jefe,edad_jefe,educa_jefe,tot_integ,mayores,menores,p12_64,p65mas,percep_ing,ing_cor,estim_alqu,otros_ing,gasto_mon,vivienda,alquiler,pred_cons,cuidados,tipo_viv,antiguedad,cuart_dorm,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,escrituras,tot_resid,tot_hom,tot_muj,tot_hog,total_viviendas,ppsm_terrain,ppsm,quarters_since_first_appraisal,has_elevator,is_new
count,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0,1908483.0
mean,1483767.0,0.34917,1.488231,52.91538,1.135525,2021-06-26 07:39:58.221204736,3.443452,1.59271,2.356046,85.59633,91.01616,111.6373,58.70005,0.7675478,2.31332,0.0360501,0.06813841,1.583942,,,2.001783,1.007037,,,,,,,,,63970.08,7525.788,55.01279,38621.98,4140.751,1615.227,282.0286,1754.71,,,,3.855138,553.1334,2563.89,504.2402,0.3304807,1.311779,,3.43983,1.060667,,,,,174550.7,4893.699,14790.2,9.948678,0.02110315,0.4691349
min,200000.0,0.0,1.0,0.0,0.0,2019-01-01 00:00:00,0.0,0.0,1.0,20.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,37.34375,2.949219,2.0,1.666992,0.0,1.333008,0.0,1.307617,18464.88,2322.577,0.0,12989.09,541.0,0.0,0.0,369.6533,1.0,3.800781,1.200195,1.0,0.0,800.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.8334961,0.5,1.0,880.0,159.0,4000.0,0.0,0.0,0.0
25%,545000.0,0.0,1.0,0.0,1.0,2020-03-17 00:00:00,3.0,1.0,2.0,49.0,50.0,64.0,52.0,0.0,0.0,0.0,0.0,0.0,1.042969,2.318359,2.0,1.0,47.875,6.066406,3.257812,2.695312,0.5200195,2.398438,0.208252,2.101562,51906.88,5304.737,4.028208,31705.67,2947.57,751.6017,118.4212,1403.711,1.0,15.54688,1.970703,4.0,252.2204,1809.421,255.1812,0.2445493,1.0,0.08123779,1.0,1.0,3.289062,1.574219,1.68457,1.001953,62468.0,1973.0,9883.721,4.835526,0.0,0.0
50%,825000.0,0.0,1.0,8.0,1.0,2021-05-27 00:00:00,3.0,1.0,2.0,65.0,66.0,94.0,60.0,0.0,1.0,0.0,0.0,0.0,1.325195,2.595703,2.0,1.0,50.09375,6.480469,3.429688,2.794922,0.6113281,2.542969,0.2849121,2.226562,60912.5,6798.937,33.89166,36953.86,3741.147,1204.49,207.8829,1633.519,1.0,18.59375,2.099609,4.0,408.4537,2315.394,477.4912,0.3241348,1.0,0.125,5.0,1.0,3.466797,1.682617,1.785156,1.010742,155533.0,2708.0,12142.86,9.616228,0.0,0.0
75%,1626000.0,1.0,2.0,96.0,1.0,2022-10-24 00:00:00,3.0,2.0,3.0,101.0,106.0,124.0,70.0,1.0,2.0,0.0,0.0,0.0,2.0,2.832031,2.0,1.0,51.6875,6.835938,3.601562,2.919922,0.7182617,2.681641,0.342041,2.322266,72428.6,8810.545,69.20169,43984.74,4658.048,1766.107,344.709,1957.372,1.0,20.5,2.228516,4.0,605.4603,3000.441,700.8428,0.40397,1.0,0.1647949,5.0,1.0,3.671875,1.796875,1.882812,1.020508,260991.0,4222.0,15925.93,15.26316,0.0,1.0
max,89589000.0,6.0,8.0,255.0,8.0,2023-12-31 00:00:00,33.0,57.0,6.0,1556.0,1973.0,9944.0,80.0,10.0,41.0,10.0,13.0,230.0,4.0,3.6875,3.0,2.0,65.0,9.34375,5.914062,4.671875,2.5,4.445312,1.0,3.552734,572613.8,40926.94,2211.719,198564.8,31567.73,15105.46,11225.1,11005.48,2.0,47.90625,3.111328,5.0,5180.798,13853.75,2982.022,1.0,3.0,0.380127,5.0,3.0,6.0,3.115234,3.5,1.5,586990.0,153674.0,270597.0,20.01096,1.0,1.0
std,2201403.0,0.5124819,0.8050008,73.46578,0.6593062,,1.77386,1.09129,0.6847813,61.15447,70.37096,107.1589,11.30119,1.151416,4.991394,0.2879814,0.3513367,8.04908,0.0,0.0,0.04913171,0.08359119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21030.03,3470.179,86.56723,11538.73,2218.931,1759.485,343.1685,709.8364,0.0,0.0,0.0,0.5378373,605.8383,1179.893,338.809,0.139096,0.6001011,0.0,1.947954,0.3406261,0.0,0.0,0.0,0.0,130373.9,8222.196,9322.498,5.884181,0.1437283,0.4990466


## Scaling

In [24]:
# scaling
df_properties = (
    df_properties
    .assign(
        # surfaces
        saleable_area=lambda x: np.log(x['saleable_area']),
        land_area=lambda x: np.log1p(x['land_area']),
        # counts
        count_supermarkets_at_1km=lambda x: np.sqrt(x['count_supermarkets_at_1km']),
        count_hospitals_at_5km=lambda x: np.sqrt(x['count_hospitals_at_5km']),
        count_metro_at_1km=lambda x: np.sqrt(x['count_metro_at_1km']),
        count_schools_at_1km=lambda x: np.sqrt(x['count_schools_at_1km']),
        count_restaurants_at_1km=lambda x: np.sqrt(x['count_restaurants_at_1km']),
        # ing
        ing_cor=lambda x: np.log(x['ing_cor']),
        # terrain
        ppsm_terrain=lambda x: np.log(x['ppsm_terrain']),
        # objective variable
        log_price_per_sqm=lambda x: np.log(x['ppsm']),
    )
)

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else 

Unnamed: 0,mean,min,max
price,1483767.13341,200000.0,89589000.0
half_bathrooms,0.34917,0.0,6.0
full_bathrooms,1.488231,1.0,8.0
age_in_months,52.915376,0.0,255.0
parking_lots,1.135525,0.0,8.0
...,...,...,...
ppsm,14790.199894,4000.0,270597.014925
quarters_since_first_appraisal,9.948678,0.0,20.010965
has_elevator,0.021103,0.0,1.0
is_new,0.469135,0.0,1.0


## One hot encoding (stood here)

todo: subset only the most important features and elimnate the rest (redundant for the recats)

In [None]:
# categories
columns_categorical = list(set([
    'city_cluster',
    'id_entidad_f',
    'id_tipo_inmueble',
    'ocean_label',
    'tipo_vialidad'
    ]))

# convert to category
df_properties[columns_categorical] = df_properties[columns_categorical].astype('category')


In [None]:
# one hot encoding
columns_to_one_hot = [
    'id_entidad_f',
    'id_tipo_inmueble'
]

# one hot encoding
df_properties = pd.get_dummies(df_properties, columns=columns_to_one_hot, drop_first=True, dtype='int')

# see columns
df_properties.columns

## Split Data

In [None]:
# get important columns
cols_for_model = [
    'id_clase_inmueble',
    'edad_anios',
    'cve_vigilancia', 
    'superficie_accesoria',
    'saleable_area',
    'count_supermarkets_at_1km',
    'count_hospitals_at_5km',
    'count_metro_at_1km',
    'count_schools_at_1km',
    'count_restaurants_at_1km',
    'ing_cor',
    'competitors_weighted_mean_log_price_per_sqm',
    'competitors_weighted_mean_log_price_per_sqm_lower',
    'competitors_weighted_mean_log_price_per_sqm_upper',
    'mean_log_valor_fisico_terreno_m2',
    'mean_log_valor_fisico_terreno_m2_lower',
    'mean_log_valor_fisico_terreno_m2_upper', 
    'quarters_since_first_appraisal',
    'has_elevator',
    'is_new',
    'recamaras_recat',
    'banos_recat',
    'medio_banos_recat',
    'estacionamiento_recat',
    'conservacion_recat',
    'cve_vigilancia_recat',
    'regimen_propiedad_colectiva',
    'distance_to_ocean_recat',
    'superficie_terreno_usable',
    'id_entidad_f_02',
    'id_entidad_f_03',
    'id_entidad_f_04',
    'id_entidad_f_05',
    'id_entidad_f_06',
    'id_entidad_f_07',
    'id_entidad_f_08',
    'id_entidad_f_09',
    'id_entidad_f_10',
    'id_entidad_f_11',
    'id_entidad_f_12',
    'id_entidad_f_13',
    'id_entidad_f_14',
    'id_entidad_f_15',
    'id_entidad_f_16',
    'id_entidad_f_17',
    'id_entidad_f_18',
    'id_entidad_f_19',
    'id_entidad_f_20',
    'id_entidad_f_21',
    'id_entidad_f_22',
    'id_entidad_f_23',
    'id_entidad_f_24',
    'id_entidad_f_25',
    'id_entidad_f_26',
    'id_entidad_f_27',
    'id_entidad_f_28',
    'id_entidad_f_29',
    'id_entidad_f_30',
    'id_entidad_f_31',
    'id_entidad_f_32',
    'id_tipo_inmueble_3',
    'id_tipo_inmueble_4'
]

X = df_properties[cols_for_model].copy()
y = df_properties['log_price_per_sqm'].copy()

In [None]:
# split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=df_properties['id_tipo_inmueble_4']  # same proportion of apartments
    )

In [None]:
# see
X_train

## Fit Lasso

In [None]:
# # train lasso
# lm_lasso = LassoCV(
#     cv=5,
#     random_state=42,
#     n_jobs=-1,
#     verbose=True,
#     alphas=np.logspace(-6, 2, 100)
# )

# # fit
# lm_lasso.fit(X_train, y_train)

# # see best alpha
# idx_min_mse = np.argmin(lm_lasso.mse_path_.mean(axis=1))
# min_mse = lm_lasso.mse_path_.mean(axis=1)[idx_min_mse]
# min_mse

In [None]:
def visualize_beta_decay(X, y, n_alphas=100, n_highlight=5, scale=True):
    """
    Visualize how betas decay as alpha increases using lasso_path.
    Highlights top performing features by their index.
    Uses gray for non-highlighted features and adds a black line at y=0.
    """ 
    if scale:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        alphas, coefs, _ = lasso_path(X_scaled, y, n_alphas=n_alphas)
    else:
        alphas, coefs, _ = lasso_path(X, y, n_alphas=n_alphas)
    
    # Convert coefficients to a DataFrame for easier manipulation
    coef_df = pd.DataFrame(coefs.T, columns=[f"Feature {i}" for i in range(coefs.shape[0])])
    coef_df.index = alphas

    # Plot
    _, ax = plt.subplots(figsize=(8, 6))
    
    alpha_min = coef_df.index.min()
    top_coefs = coef_df.loc[alpha_min].map(abs).sort_values().tail(n_highlight)
    
    # Plot non-highlighted features in gray
    for feature in coef_df.columns:
        if feature not in top_coefs.index:
            ax.semilogx(coef_df.index, coef_df[feature], "-", color='gray', alpha=0.5)
    
    # Plot highlighted features in color
    for feature in top_coefs.index:
        ax.semilogx(coef_df.index, coef_df[feature], "-", label=feature)
        feature_index = int(feature.split()[-1])  # Extract feature index
        plt.text(
            alpha_min, 
            coef_df.loc[alpha_min, feature],
            f"f.{feature_index}", 
            horizontalalignment="right", 
            verticalalignment="center"
        )

    # Add black horizontal line at y=0
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

    # Format plot
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.grid(True)
    ax.set_xlabel("Alpha")
    ax.set_ylabel("Coefficient")
    ax.set_title("Beta Decay with Increasing Alpha (using lasso_path)")
    plt.tight_layout()
    plt.show()

    # print the name of the features (number, name)
    for feature in top_coefs.index:
        feature_index = int(feature.split()[-1])  # Extract feature index
        print(f"f.{feature_index}: {X.columns[feature_index]}")


In [None]:
# visualize scaled
visualize_beta_decay(X_train, y_train, n_highlight=10)

In [None]:
# visualize not scaled
visualize_beta_decay(
    X_train,
    y_train,
    n_highlight=10,
    scale=False
    )

In [None]:
X_train['ing_cor'].describe()

---
# Catboost

## Read

In [None]:
# important columns
cols_to_stay = [
    'observation_id', 'valuation_date', 
    'id_clase_inmueble', 'id_tipo_inmueble', 'conservacion', 'elevador',
    'cve_ref_proximidad_urbana', 'price', 'edad_anios',
    'cve_vigilancia', 'regimen_propiedad', 'tipo_vialidad', 'id_entidad_f',
    'recamaras', 'banos', 'medio_banos',
    'estacionamiento', 'superficie_terreno', 'superficie_construida',
    'superficie_accesoria', 'saleable_area',
    'city_cluster', 'distance_to_ocean', 'ocean_label', 'longitud', 'latitud',
    'count_supermarkets_at_1km', 'count_hospitals_at_5km',
    'count_metro_at_1km', 'count_schools_at_1km', 'count_restaurants_at_1km',
    'ing_cor', 'competitors_weighted_mean_log_price_per_sqm',
    'competitors_weighted_mean_log_price_per_sqm_lower',
    'competitors_weighted_mean_log_price_per_sqm_upper',
    'mean_log_valor_fisico_terreno_m2',
    'mean_log_valor_fisico_terreno_m2_lower',
    'mean_log_valor_fisico_terreno_m2_upper'
]

# read database
df_properties = pd.read_parquet("../../data/clean/properties_shif.parquet").loc[:, cols_to_stay]

# set observation_id as index
df_properties = df_properties.set_index('observation_id')

# see
print(df_properties.shape)
df_properties.head(2)

## Wrangling

### NaNs

In [None]:
# nan values
df_properties.isna().mean()[df_properties.isna().mean().gt(0)]

In [None]:
# fill with 0's
df_properties['elevador'] = df_properties['elevador'].fillna(0)
df_properties['cve_vigilancia'] = df_properties['cve_vigilancia'].fillna(0)
df_properties['tipo_vialidad'] = df_properties['tipo_vialidad'].fillna(0)

In [None]:
# competitors with terrain value
df_properties['competitors_weighted_mean_log_price_per_sqm'] = df_properties['competitors_weighted_mean_log_price_per_sqm'].combine_first(df_properties['mean_log_valor_fisico_terreno_m2'])
df_properties['competitors_weighted_mean_log_price_per_sqm_lower'] = df_properties['competitors_weighted_mean_log_price_per_sqm_lower'].combine_first(df_properties['mean_log_valor_fisico_terreno_m2_lower'])
df_properties['competitors_weighted_mean_log_price_per_sqm_upper'] = df_properties['competitors_weighted_mean_log_price_per_sqm_upper'].combine_first(df_properties['mean_log_valor_fisico_terreno_m2_upper'])

In [None]:
# nan values
df_properties.isna().mean()[df_properties.isna().mean().gt(0)]

### Casting

In [None]:
# integer columns
columns_to_integer = [
    'cve_vigilancia', 'tipo_vialidad', 
]

# to int (downcast)
df_properties[columns_to_integer] = df_properties[columns_to_integer].astype('float').round().astype('Int64')

## Feature Engineering

In [None]:
# params
first_date_obs = df_properties['valuation_date'].min()
last_date_obs = df_properties['valuation_date'].max()

# create columns
df_properties = (
    df_properties
    .assign(
        # objective variable
        ppsm=lambda x: x['price'] / x['saleable_area'],
        log_price_per_sqm=lambda x: np.log(x['ppsm']),
        # quarters since appraisal
        quarters_since_first_appraisal=lambda x: (x['valuation_date'] - first_date_obs).dt.days / (30.4 * 3),
        # recategorizations
        conservacion_recat=lambda x: x['conservacion'].replace({7: 3.5}) - x['conservacion'].min(),
        cve_vigilancia_recat=lambda x: np.where(x['cve_vigilancia'].eq(2), 1, 0),
        superficie_terreno_usable=lambda x: np.where(
            x['id_tipo_inmueble'].eq(4),
            x['superficie_accesoria'],
            x['superficie_terreno'] + x['superficie_accesoria']
        ),
    )
)

In [None]:
# see new var
df_properties['quarters_since_first_appraisal'].describe()

## Categorical Variables

In [None]:
# get categorical columns
categorical_columns = [
    'id_tipo_inmueble', 'cve_vigilancia_recat', 'regimen_propiedad', 'id_entidad_f', 
    'city_cluster', 'ocean_label', 'tipo_vialidad'
]

# to category
df_properties[categorical_columns] = df_properties[categorical_columns].astype('category')

## Split Data

In [None]:
# get important columns
cols_for_model = list(set([
    'id_clase_inmueble',
    'id_tipo_inmueble',
    'elevador',
    'cve_ref_proximidad_urbana',
    'edad_anios',
    'regimen_propiedad',
    'tipo_vialidad',
    'id_entidad_f',
    'recamaras',
    'banos',
    'medio_banos',
    'estacionamiento',
    'superficie_accesoria',
    'saleable_area',
    'city_cluster',
    'distance_to_ocean',
    'ocean_label',
    'longitud',
    'latitud',
    'count_supermarkets_at_1km',
    'count_hospitals_at_5km',
    'count_metro_at_1km',
    'count_schools_at_1km',
    'count_restaurants_at_1km',
    'ing_cor',
    'competitors_weighted_mean_log_price_per_sqm',
    'competitors_weighted_mean_log_price_per_sqm_lower',
    'competitors_weighted_mean_log_price_per_sqm_upper',
    'mean_log_valor_fisico_terreno_m2',
    'mean_log_valor_fisico_terreno_m2_lower',
    'mean_log_valor_fisico_terreno_m2_upper',
    'quarters_since_first_appraisal',
    'conservacion_recat',
    'cve_vigilancia_recat',
    'superficie_terreno_usable'
]))

X = df_properties[cols_for_model].copy()
y = df_properties['ppsm'].copy()

In [None]:
# split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=df_properties['id_tipo_inmueble'].eq(4)  # same proportion of apartments
    )

# pool
train_pool = Pool(X_train, y_train, cat_features=categorical_columns)
test_pool = Pool(X_test, y_test, cat_features=categorical_columns)

## Fit Catboost

In [None]:
# fit catboost
cb_model = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAPE',
    verbose=100,
    random_seed=42
)

# fit
cb_model.fit(train_pool)

In [None]:
# explainer
explainer = shap.TreeExplainer(cb_model)

# shap values
shap_values = explainer(X_train)

In [None]:
# plot feature importance
shap.plots.bar(shap_values, max_display=20)

In [None]:
# Plot a beeswarm plot of SHAP values
shap.plots.beeswarm(shap_values, max_display=20)

---
# Sandbox

In [20]:
df_properties.query("property_type_id.eq(4) & land_area.le(10)").describe()

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Unnamed: 0,price,half_bathrooms,full_bathrooms,age_in_months,parking_lots,valuation_date,level,total_levels,bedrooms,built_area,saleable_area,land_area,remaining_useful_life,count_supermarkets_at_1km,count_hospitals_at_5km,count_metro_at_1km,count_schools_at_1km,count_restaurants_at_1km,tam_loc,est_socio,clase_hog,sexo_jefe,edad_jefe,educa_jefe,tot_integ,mayores,menores,p12_64,p65mas,percep_ing,ing_cor,estim_alqu,otros_ing,gasto_mon,vivienda,alquiler,pred_cons,cuidados,tipo_viv,antiguedad,cuart_dorm,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,escrituras,tot_resid,tot_hom,tot_muj,tot_hog,total_viviendas,ppsm_terrain,ppsm,quarters_since_first_appraisal,has_elevator,is_new
count,11382.0,11382.0,11382.0,11382.0,11382.0,11382,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0,11382.0
mean,2562701.0,0.040503,1.407046,19.292216,0.897645,2021-06-26 22:28:16.573537024,9.268406,3.022052,1.838166,59.635829,60.552803,7.612546,68.464769,2.21411,13.062643,0.507556,0.314971,14.928659,1.166992,2.75,2.000351,1.073625,inf,inf,3.154297,2.712891,0.442627,2.361328,0.350586,2.230469,82955.1875,12726.172852,73.065292,49957.480469,7575.182617,4586.655273,713.917664,2425.216064,1.280273,inf,2.072266,3.369179,1580.262573,4338.057617,385.64444,0.364417,1.156475,0.13501,4.164997,1.105605,3.205078,1.513672,1.69043,1.015625,177659.553593,27424.179845,39426.959856,9.955442,0.255315,0.697417
min,270000.0,0.0,1.0,0.0,0.0,2019-01-02 00:00:00,0.0,0.0,1.0,26.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,38.125,2.949219,2.441406,1.858398,0.180054,1.358398,0.0,1.307617,22246.158203,2876.16333,0.0,15795.15332,1295.720581,0.0,0.0,606.029846,1.0,6.351562,1.428711,1.0,0.0,978.804138,0.0,0.0,1.0,0.0,1.0,1.0,2.464844,1.057617,1.057617,1.0,1404.0,1343.0,4912.5,0.010965,0.0,0.0
25%,674000.0,0.0,1.0,0.0,0.0,2020-02-06 00:00:00,3.0,1.0,2.0,44.0,46.0,6.0,60.0,0.0,1.0,0.0,0.0,0.0,1.0,2.482422,2.0,1.0,49.15625,6.605469,3.017578,2.554688,0.312744,2.207031,0.232666,2.132812,59297.6875,7233.132324,4.605845,38126.960938,4776.178223,2082.84375,212.812424,1530.793091,1.0,14.92188,1.885986,4.0,767.280029,2457.317139,171.13942,0.237992,1.0,0.088074,5.0,1.0,3.025391,1.363281,1.549805,1.0,96192.0,4323.0,14270.833333,4.39693,0.0,0.0
50%,2271500.0,0.0,1.0,0.0,1.0,2021-06-24 00:00:00,7.0,1.0,2.0,56.0,56.0,8.0,70.0,2.0,14.0,0.0,0.0,4.0,1.0,2.845703,2.0,1.0,53.75,6.917969,3.214844,2.705078,0.423584,2.373047,0.363037,2.244141,76742.710938,11910.004883,42.652428,47140.269531,6524.25293,3037.980225,521.791138,1975.968384,1.0,24.92188,2.169922,4.0,1100.134521,4065.352539,303.669769,0.336579,1.0,0.121277,5.0,1.0,3.255859,1.548828,1.749023,1.008789,178716.0,20369.0,38745.12987,9.923246,0.0,1.0
75%,3659800.0,0.0,2.0,12.0,1.0,2022-10-19 00:00:00,13.0,2.0,2.0,69.0,70.0,10.0,80.0,4.0,20.0,0.0,0.0,24.0,1.03125,3.019531,2.0,1.0,55.75,7.601562,3.316406,2.882812,0.570312,2.498047,0.470215,2.316406,100765.898438,17885.675781,65.036064,53865.664062,8983.433594,5783.566406,1055.456177,3120.334473,2.0,28.53125,2.261719,4.0,2002.350586,6088.377441,590.260742,0.462553,1.0,0.171143,5.0,1.0,3.416016,1.682617,1.800781,1.021484,255594.0,37644.0,55714.738176,15.208333,1.0,1.0
max,33642000.0,1.0,5.0,252.0,4.0,2023-12-29 00:00:00,33.0,57.0,4.0,256.0,256.0,10.0,80.0,8.0,39.0,8.0,7.0,220.0,4.0,3.630859,3.0,2.0,60.65625,9.34375,5.0,3.662109,1.549805,3.554688,0.743164,3.150391,225755.0625,40926.941406,1255.467163,143908.5,20024.144531,15105.459961,3306.426514,10586.366211,2.0,40.34375,3.044922,4.0,5180.79834,13853.746094,2982.021973,1.0,3.0,0.380127,5.0,3.0,5.0,2.597656,2.599609,1.166992,586990.0,132314.0,270597.014925,19.989035,1.0,1.0
std,1993185.0,0.197143,0.517267,43.450724,0.745767,,7.070776,6.032885,0.475006,19.473596,19.749819,2.548076,10.397965,2.016652,11.574343,1.068962,0.736286,24.29746,0.494141,0.433594,0.018744,0.261171,5.457031,0.8579102,0.314209,0.225586,0.164429,0.230347,0.151123,0.203003,34033.074219,5784.541992,133.4711,19899.521484,4168.709473,3557.193115,655.299377,1350.50415,0.449707,8.984375,0.251709,1.179695,1223.562866,1972.116577,319.335632,0.160548,0.395536,0.077454,1.604518,0.446512,0.330322,0.220337,0.167969,0.017639,98031.59882,27256.58351,24362.7358,6.083903,0.436058,0.459396
