# Algorithm Selection

@roman

21 July, 2024

In [None]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import shap
import h3

from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, RegressorMixin
from scipy.stats import hmean
from INEGIpy import MarcoGeoestadistico

In [None]:
# Settings
# show up to 500 columns when displaying DataFrames
pd.set_option('display.max_columns', 500)
# timestamp of the current run
TODAY = pd.to_datetime('today')
# INEGIpy MarcoGeoestadistico client (used below through `Entidades()`)
inegi_api = MarcoGeoestadistico()

---
# Data

## Mexico Shape

In [None]:
# read country shape
gdf_mexico = inegi_api.Entidades()

# change crs to 6372
gdf_mexico = gdf_mexico.to_crs(epsg=6372)
gdf_mexico.plot()

## Read

In [None]:
def get_properties_data(file_path, cols_to_stay, cols_as_categories):
    """Read the properties parquet file and add the engineered features.

    Parameters
    ----------
    file_path : str or path-like
        Parquet file to read. It must contain a ``property_id`` column, which
        becomes the index, and a datetime ``valuation_date`` column.
    cols_to_stay : list of str
        Currently ignored: column sub-setting is disabled and every column is
        returned. The parameter is kept so existing calls keep working.
    cols_as_categories : list of str
        Columns (original or engineered) to cast to ``category`` dtype.

    Returns
    -------
    pandas.DataFrame
        The input data indexed by ``property_id`` with missing values filled
        and the engineered columns appended.
    """
    # Approximate length of a quarter in days (3 months of 30.4 days)
    days_per_quarter = 30.4 * 3

    # Read database and use property_id as index
    df = pd.read_parquet(file_path).set_index('property_id')

    # Handling NaNs: a missing value is treated as 0 for these columns
    for col in ('elevador', 'cve_vigilancia', 'tipo_vialidad'):
        df[col] = df[col].fillna(0)

    # Fill missing competitors values with the matching terrain values
    for suffix in ('', '_lower', '_upper'):
        competitors_col = f'competitors_weighted_mean_log_price_per_sqm{suffix}'
        terrain_col = f'mean_log_valor_fisico_terreno_m2{suffix}'
        df[competitors_col] = df[competitors_col].combine_first(df[terrain_col])

    # Casting integer columns (nullable Int64, after rounding through float)
    columns_to_integer = ['cve_vigilancia', 'tipo_vialidad']
    df[columns_to_integer] = df[columns_to_integer].astype('float').round().astype('Int64')

    # Feature Engineering
    first_date_obs = df['valuation_date'].min()

    df = (
        df
        .assign(
            year_appraised=lambda x: x['valuation_date'].dt.year,
            price_per_sqm=lambda x: x['valor_mercado'] / x['saleable_area'],
            quarters_since_first_appraisal=lambda x: (x['valuation_date'] - first_date_obs).dt.days / days_per_quarter,
            # code 7 is remapped to 3.5, then the scale is shifted by the minimum observed code
            conservacion_recat=lambda x: x['conservacion'].replace({7: 3.5}) - x['conservacion'].min(),
            # binary flag: 1 only when cve_vigilancia equals 2
            cve_vigilancia_recat=lambda x: np.where(x['cve_vigilancia'].eq(2), 1, 0),
            # for id_tipo_inmueble == 4 only the accessory surface counts as usable
            superficie_terreno_usable=lambda x: np.where(
                x['id_tipo_inmueble'].eq(4),
                x['superficie_accesoria'],
                x['superficie_terreno'] + x['superficie_accesoria']
            ),
            elevador=lambda x: x['elevador'].eq(1).astype('int'),
            log_superficie_vendible=lambda x: np.log(x['saleable_area']),
            log_superficie_terreno=lambda x: np.log(x['superficie_terreno']),
            log_superficie_construida=lambda x: np.log(x['superficie_construida']),
            log_ing_cor=lambda x: np.log(x['ing_cor']),
            # clipped counts
            recamaras_cat=lambda x: x['recamaras'].clip(0, 5),
            banos_cat=lambda x: x['banos'].clip(0, 5),
            medios_banos_cat=lambda x: x['medio_banos'].clip(0, 5),
            pisos_cat=lambda x: x['niveles'].clip(1, 7),
            estacionamiento_cat=lambda x: x['estacionamiento'].clip(0, 1).astype('category'),
        )
    )

    # Cast columns as categories
    df[cols_as_categories] = df[cols_as_categories].astype('category')

    # NOTE: the frame is intentionally not restricted to `cols_to_stay`
    return df

cols_to_categories = [
    'property_type', 'cve_vigilancia_recat', 'regimen_propiedad', 'state_id',
    'id_tipo_inmueble'
]

# `cols_to_stay_with` was never defined, so the call below raised NameError.
# `get_properties_data` does not use this argument (it returns every column),
# so an explicit placeholder is enough.
cols_to_stay_with = None

df_properties = get_properties_data(
    "../../data/clean/properties_shif.parquet", cols_to_stay_with, cols_to_categories
    )

# see
print(df_properties.shape)
df_properties.head()

In [None]:
# count dtypes
df_properties.dtypes.value_counts()

In [None]:
# see nans
df_properties.isna().sum()[df_properties.isna().sum() > 0]

## Split

In [None]:
# split data (index)
index_train, index_test = train_test_split(
    df_properties.index, test_size=0.1, random_state=42, stratify=df_properties['property_type']
    )

# sizes
print(f"Train size: {len(index_train)}")
print(f"Test size: {len(index_test)}")

In [None]:
# count of property types
df_properties.loc[index_train, 'property_type'].value_counts(normalize=True)

In [None]:
# count of property types
df_properties.loc[index_test, 'property_type'].value_counts(normalize=True)

---
# Models

In [None]:
def calculate_metrics(y, y_pred, best_percent=1.0):
    """Compute RMSE, MAPE, MAE and R2 on the best-predicted share of the data.

    The absolute percentage error ``|y - y_pred| / |y|`` is computed per row.
    Only rows whose error is at or below its ``best_percent`` quantile are kept
    (rows with a NaN error never pass the comparison), and the metrics are
    calculated on that subset.

    Parameters
    ----------
    y : array-like
        Observed values.
    y_pred : array-like
        Predicted values.
    best_percent : float, default 1.0
        Quantile of the absolute percentage error used as cut-off.

    Returns
    -------
    pandas.Series
        Entries ``rmse``, ``mape``, ``mae`` and ``r2``.
    """
    frame = pd.DataFrame({'y': y, 'y_pred': y_pred})

    # Absolute percentage error of every observation
    frame['mape'] = ((frame['y'] - frame['y_pred']) / frame['y']).abs()

    # Keep the observations at or below the requested error quantile
    cutoff = frame['mape'].quantile(best_percent)
    kept = frame.loc[frame['mape'] <= cutoff]
    observed, predicted = kept['y'], kept['y_pred']

    return pd.Series({
        "rmse": np.sqrt(mean_squared_error(observed, predicted)),
        "mape": kept['mape'].mean(),
        "mae": mean_absolute_error(observed, predicted),
        "r2": r2_score(observed, predicted)
    })

## XGBoost (abandoned: building the DMatrix was too time-consuming)

### Fit

In [None]:
# # cols to use
# cols_x = [
#     'id_clase_inmueble', 
#     # 'property_type',
#     'elevador', 'edad_anios',
#     # 'regimen_propiedad', 'state_id', 'banos',
#     # 'medio_banos', 'estacionamiento', 'saleable_area',
#     # 'superficie_terreno_usable', 'distance_to_ocean', 'longitude', 'latitude',
#     # 'count_supermarkets_at_1km', 'count_hospitals_at_5km',
#     # 'count_metro_at_1km', 'count_schools_at_1km',
#     # 'count_restaurants_at_1km',
#     # 'competitors_weighted_mean_log_price_per_sqm',
#     # 'mean_log_valor_fisico_terreno_m2',
#     # 'mean_log_valor_fisico_terreno_m2_lower',
#     # 'mean_log_valor_fisico_terreno_m2_upper',
#     # 'quarters_since_first_appraisal', 'conservacion_recat',
#     # 'cve_vigilancia_recat'
# ]

# # categorical_cols = [
# #     'property_type', 'cve_vigilancia_recat', 'regimen_propiedad', 'state_id'
# # ]

# # x_train, y_train
# X_train = df_properties.loc[index_train, cols_x].copy()
# y_train = df_properties['price_per_sqm'].loc[index_train].copy()

# # x_test, y_test
# X_test = df_properties.loc[index_test, cols_x].copy()
# y_test = df_properties['price_per_sqm'].loc[index_test].copy()

# # one hot categorical cols
# # X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True, dtype='int')
# # X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True, dtype='int')

# # # distance to ocean: from inf to 100_000
# # X_train['distance_to_ocean'] = X_train['distance_to_ocean'].replace(np.inf, 100_000)
# # X_test['distance_to_ocean'] = X_test['distance_to_ocean'].replace(np.inf, 100_000)

# # set all columns as float
# X_train = X_train.astype('float')
# X_test = X_test.astype('float')

# # see num cols
# print(X_train.shape)
# print(X_test.shape)

In [None]:
# # generate dmatrix
# # dtrain = xgb.DMatrix(X_train, label=y_train)
# # dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# params = {
#     'objective': 'reg:squarederror',
#     'eval_metric': 'rmse',
#     'max_depth': 6,
#     'eta': 0.01,
#     'subsample': 0.8,
#     'colsample_bytree': 0.8,
#     'enable_categorical': True,
#     'seed': 42,
#     'nthread': 4,
#     'early_stopping_rounds': 50,
#     'n_estimators': 1000,
#     'verbose': 1
# }


# # Train the model
# xgb_model = xgb.XGBRegressor(**params)
# xgb_model.fit(
#     X_train, y_train,
#     eval_set=[(X_train, y_train), (X_test, y_test)],
# )

## Baseline Model

### Fit

In [None]:
class BaselineModel(BaseEstimator, RegressorMixin):
    """Baseline regressor: harmonic mean of the target per pair of key values.

    ``fit`` stores the harmonic mean of ``y`` for every observed combination
    of the two columns of ``X``. ``predict`` looks every row up in that table
    and falls back to ``default_mean_`` when the combination was not seen.

    Attributes
    ----------
    harmonic_means_ : dict
        Maps ``(first column value, second column value)`` to the harmonic
        mean of ``y`` for that combination.
    default_mean_ : float or None
        Fallback prediction. It is computed from the rows that share the first
        column's value with the first training row: the harmonic mean of ``y``
        is taken per value of the second column, and then the harmonic mean of
        those means. NaN when it cannot be computed; None before ``fit``.
    first_category_ : object
        Name of the first column of ``X`` seen in ``fit``.
    """

    def __init__(self):
        # Fitted state, populated by ``fit``
        self.harmonic_means_ = {}
        self.default_mean_ = None
        self.first_category_ = None

    @staticmethod
    def _check_two_column_frame(X):
        """Raise ValueError unless X is a DataFrame with exactly 2 columns."""
        if not isinstance(X, pd.DataFrame) or X.shape[1] != 2:
            raise ValueError("X must be a DataFrame with exactly 2 columns.")

    def fit(self, X, y):
        """Learn the lookup table of harmonic means and the fallback value.

        Raises
        ------
        ValueError
            If ``X`` is not a two-column DataFrame or ``y`` has a different
            length than ``X``.
        """
        self._check_two_column_frame(X)
        if len(y) != len(X):
            raise ValueError("Length of y must be equal to the number of rows in X.")

        df = X.copy()
        df['y'] = y
        first_col, second_col = X.columns

        # observed=True: with a categorical key, pandas versions that default
        # to observed=False add every unobserved combination with a NaN value.
        # Those NaN entries would be returned by ``predict`` instead of the
        # fallback ``default_mean_``.
        self.harmonic_means_ = (
            df.groupby([first_col, second_col], observed=True)['y']
            .apply(hmean)
            .to_dict()
        )

        self.first_category_ = first_col
        self.default_mean_ = np.nan
        if len(df) > 0:
            reference_value = df[first_col].iloc[0]
            # Test for missingness, not truthiness: a valid key such as 0 or ''
            # must not disable the fallback.
            if pd.notna(reference_value):
                reference_rows = df[df[first_col] == reference_value]
                category_means = (
                    reference_rows.groupby(second_col, observed=True)['y'].apply(hmean)
                )
                if not category_means.empty:
                    self.default_mean_ = hmean(category_means.values)

        return self

    def predict(self, X):
        """Return the stored harmonic mean for every row of ``X``.

        Rows whose pair of values was not seen in ``fit`` get ``default_mean_``.

        Raises
        ------
        ValueError
            If ``X`` is not a two-column DataFrame.
        """
        self._check_two_column_frame(X)
        # Plain tuples per row: same lookup keys as a row-wise apply, without
        # building a Series for every row.
        row_keys = X.itertuples(index=False, name=None)
        predictions = [
            self.harmonic_means_.get(key, self.default_mean_) for key in row_keys
        ]
        return np.asarray(predictions, dtype=float)


In [None]:
# cols to use
cols_x = ['state_id', 'year_appraised']

# Features are selected by name, so the target column can never leak into X
# x_train, y_train
X_train = df_properties.loc[index_train, cols_x].copy()
y_train = df_properties.loc[index_train, 'price_per_sqm'].copy()

# x_test, y_test
X_test = df_properties.loc[index_test, cols_x].copy()
y_test = df_properties.loc[index_test, 'price_per_sqm'].copy()

In [None]:
# fit baseline model
baseline_model = BaselineModel()
baseline_model.fit(X_train, y_train)

### Metrics

In [None]:
# train 
y_train_pred = baseline_model.predict(X_train)
calculate_metrics(y_train, y_train_pred, best_percent=0.9)

In [None]:
# test
y_test_pred = baseline_model.predict(X_test)
calculate_metrics(y_test, y_test_pred, best_percent=0.9)

## SHF Linear Model

### Fit

In [None]:
df_properties.filter(like='super')

In [None]:
# cols to use
cols_x = [
    'id_tipo_inmueble',
    'log_superficie_vendible',
    'log_superficie_construida',
    'log_ing_cor',
    'banos_cat',
    'medios_banos_cat',
    'pisos_cat',
    'recamaras_cat',
    'estacionamiento_cat'
]

# x_train, y_train
X_train = df_properties.loc[index_train, cols_x].copy()
y_train = np.log(df_properties['price_per_sqm'].loc[index_train]).copy()

# x_test, y_test
X_test = df_properties.loc[index_test, cols_x].copy()
y_test = np.log(df_properties['price_per_sqm'].loc[index_test]).copy()

# one hot encode id_tipo_inmueble
X_train = pd.get_dummies(X_train, columns=['id_tipo_inmueble'], drop_first=True, dtype='int')
X_test = pd.get_dummies(X_test, columns=['id_tipo_inmueble'], drop_first=True, dtype='int')

# set all columns as float
X_train = X_train.astype('float')
X_test = X_test.astype('float')

# see num cols
print(X_train.shape)
print(X_test.shape)

In [None]:
# see cols
X_train.columns

In [None]:
# fit linear regression
shf_linear_model = LinearRegression()
shf_linear_model.fit(X_train, y_train)

In [None]:
# add X_train the price_per_sqm col
X_train_bis = X_train.copy()
X_train_bis['price_per_sqm'] = np.exp(y_train)

cols_to_use = X_train_bis.columns.tolist()
cols_to_use.remove('price_per_sqm')

# fit linear regression using smf
shf_linear_model_smf = smf.ols(
    formula='np.log(price_per_sqm) ~ ' + ' + '.join(cols_to_use),
    data=X_train_bis
    )
shf_linear_model_smf = shf_linear_model_smf.fit()

# summary
shf_linear_model_smf.summary()

In [None]:
df_properties['id_tipo_inmueble'].value_counts()

### Metrics

In [None]:
# train
y_train_pred = shf_linear_model.predict(X_train)
calculate_metrics(np.exp(y_train), np.exp(y_train_pred), best_percent=0.9)


In [None]:
# test
y_test_pred = shf_linear_model.predict(X_test)
calculate_metrics(np.exp(y_test), np.exp(y_test_pred), best_percent=0.9)

## Linear Regression

### Fit

In [None]:
# cols to use
cols_x = [
    'mean_log_valor_fisico_terreno_m2', 'quarters_since_first_appraisal', 'conservacion_recat', 'id_clase_inmueble',
    'competitors_weighted_mean_log_price_per_sqm', 'saleable_area', 'elevador', 'banos',
    'medio_banos', 'estacionamiento', 'superficie_terreno_usable', 'property_type', 'state_id'
]

# x_train, y_train
X_train = df_properties.loc[index_train, cols_x].copy()
y_train = np.log(df_properties['price_per_sqm'].loc[index_train].copy())

# x_test, y_test
X_test = df_properties.loc[index_test, cols_x].copy()
y_test = np.log(df_properties['price_per_sqm'].loc[index_test].copy())

# one hot encode property_type, state_id
X_train = pd.get_dummies(X_train, columns=['property_type', 'state_id'], drop_first=True, dtype='int')
X_test = pd.get_dummies(X_test, columns=['property_type', 'state_id'], drop_first=True, dtype='int')

# see num cols
print(X_train.shape)
print(X_test.shape)

In [None]:
# see cols
X_train.columns

In [None]:
# fit linear regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

### Metrics

In [None]:
# train
y_train_pred = linear_model.predict(X_train)
calculate_metrics(np.exp(y_train), np.exp(y_train_pred), best_percent=0.9)


In [None]:
# test
y_test_pred = linear_model.predict(X_test)
calculate_metrics(np.exp(y_test), np.exp(y_test_pred), best_percent=0.9)

## Decision Tree

### Fit

In [None]:
# cols to use
cols_x = [
    'mean_log_valor_fisico_terreno_m2', 'quarters_since_first_appraisal', 'conservacion_recat', 'id_clase_inmueble',
    'competitors_weighted_mean_log_price_per_sqm', 'saleable_area', 'elevador', 'banos',
    'medio_banos', 'estacionamiento', 'superficie_terreno_usable', 'property_type', 'state_id'
]

# x_train, y_train
X_train = df_properties.loc[index_train, cols_x].copy()
y_train = df_properties['price_per_sqm'].loc[index_train].copy()

# x_test, y_test
X_test = df_properties.loc[index_test, cols_x].copy()
y_test = df_properties['price_per_sqm'].loc[index_test].copy()

# one hot encode property_type, state_id
X_train = pd.get_dummies(X_train, columns=['property_type', 'state_id'], drop_first=True, dtype='int')
X_test = pd.get_dummies(X_test, columns=['property_type', 'state_id'], drop_first=True, dtype='int')

# see num cols
print(X_train.shape)
print(X_test.shape)

In [None]:
# fit decision tree
tree_model = DecisionTreeRegressor(
    random_state=42,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5
    )
tree_model.fit(X_train, y_train)


### Metrics

In [None]:
# train
y_train_pred = tree_model.predict(X_train)
calculate_metrics(y_train, y_train_pred, best_percent=0.9)

In [None]:
# test
y_test_pred = tree_model.predict(X_test)
calculate_metrics(y_test, y_test_pred, best_percent=0.9)

## Catboost

### Fit

In [None]:
# cols to use
cols_x = [
    'id_clase_inmueble', 
    'property_type',
    'elevador', 'edad_anios',
    'regimen_propiedad', 'state_id', 'banos',
    'medio_banos', 'estacionamiento', 'saleable_area',
    'superficie_terreno_usable', 'distance_to_ocean', 'longitude', 'latitude',
    'count_supermarkets_at_1km', 'count_hospitals_at_5km',
    'count_metro_at_1km', 'count_schools_at_1km',
    'count_restaurants_at_1km',
    'competitors_weighted_mean_log_price_per_sqm',
    'mean_log_valor_fisico_terreno_m2',
    'mean_log_valor_fisico_terreno_m2_lower',
    'mean_log_valor_fisico_terreno_m2_upper',
    'quarters_since_first_appraisal', 'conservacion_recat',
    'cve_vigilancia_recat'
]

categorical_cols = [
    'property_type', 'cve_vigilancia_recat', 'regimen_propiedad', 'state_id'
]

# x_train, y_train
X_train = df_properties.loc[index_train, cols_x].copy()
y_train = df_properties['price_per_sqm'].loc[index_train].copy()

# x_test, y_test
X_test = df_properties.loc[index_test, cols_x].copy()
y_test = df_properties['price_per_sqm'].loc[index_test].copy()

# # distance to ocean: from inf to 100_000
X_train['distance_to_ocean'] = X_train['distance_to_ocean'].replace(np.inf, 100_000)
X_test['distance_to_ocean'] = X_test['distance_to_ocean'].replace(np.inf, 100_000)


# see num cols
print(X_train.shape)
print(X_test.shape)

In [None]:
# see distance to ocean categories
X_train['distance_to_ocean'].value_counts()

In [None]:
# subset train data to get validation data
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [None]:
# fit catboost
params = {
    'max_depth': 10,
    'learning_rate': 0.05,
    'l2_leaf_reg': 3,
    'loss_function': 'MAE',
    'eval_metric': 'MAPE',
    'iterations': 1000,
    'early_stopping_rounds': 50,
    'random_seed': 42,
    'verbose': 100,
    'use_best_model': True
}

# create pool
pool_train = Pool(X_train, y_train, cat_features=categorical_cols)
pool_val = Pool(X_val, y_val, cat_features=categorical_cols)
pool_test = Pool(X_test, y_test, cat_features=categorical_cols)

# train
catboost_model = CatBoostRegressor(**params)
catboost_model.fit(
    pool_train,
    eval_set=pool_val,
    plot=True
)

### Metrics

In [None]:
# train
y_train_pred = catboost_model.predict(pool_train)
calculate_metrics(y_train, y_train_pred, best_percent=0.9)

In [None]:
# test
y_test_pred = catboost_model.predict(pool_test)
calculate_metrics(y_test, y_test_pred, best_percent=0.9)

---
# Regression Analysis

## General Analysis

### Univariate

In [None]:
# cols to use
cols_x = [
    'id_clase_inmueble', 
    'property_type',
    'elevador', 'edad_anios',
    'regimen_propiedad', 'state_id', 'banos',
    'medio_banos', 'estacionamiento', 'saleable_area',
    'superficie_terreno_usable', 'distance_to_ocean', 'longitude', 'latitude',
    'count_supermarkets_at_1km', 'count_hospitals_at_5km',
    'count_metro_at_1km', 'count_schools_at_1km',
    'count_restaurants_at_1km',
    'competitors_weighted_mean_log_price_per_sqm',
    'mean_log_valor_fisico_terreno_m2',
    'mean_log_valor_fisico_terreno_m2_lower',
    'mean_log_valor_fisico_terreno_m2_upper',
    'quarters_since_first_appraisal', 'conservacion_recat',
    'cve_vigilancia_recat'
]

categorical_cols = [
    'property_type', 'cve_vigilancia_recat', 'regimen_propiedad', 'state_id'
]

# x_train, y_train
X_train = df_properties.loc[index_train, cols_x].copy()
y_train = df_properties['price_per_sqm'].loc[index_train].copy()

# x_test, y_test
X_test = df_properties.loc[index_test, cols_x].copy()
y_test = df_properties['price_per_sqm'].loc[index_test].copy()

# # distance to ocean: from inf to 100_000
X_train['distance_to_ocean'] = X_train['distance_to_ocean'].replace(np.inf, 100_000)
X_test['distance_to_ocean'] = X_test['distance_to_ocean'].replace(np.inf, 100_000)


# see num cols
print(X_train.shape)
print(X_test.shape)

In [None]:
# set best model
best_model = catboost_model

In [None]:
# predict all data in  test
y_pred = best_model.predict(pool_test)

# save in X_test
X_test['y_pred'] = y_pred
X_test['y_true'] = y_test


In [None]:
# errors
X_test['error'] = X_test['y_true'] - X_test['y_pred']
X_test['error_perc'] = X_test['error'] / X_test['y_true']
X_test['error_perc_abs'] = X_test['error_perc'].abs()


In [None]:
# calculate metrics
metrics_all = calculate_metrics(y_test, y_pred, best_percent=1)
metrics_best_90 = calculate_metrics(y_test, y_pred, best_percent=0.9) 

In [None]:
# print metrics
pd.DataFrame([metrics_all, metrics_best_90], index=['All', 'Best 90%'])

In [None]:
# histogram of errors
sns.histplot(
    X_test,
    x='error_perc',
    bins=100,
    kde=True
)
# addorn
plt.title("Error Percentage Histogram")
plt.xlabel("Error Percentage")
plt.ylabel("Count")
plt.show()

In [None]:
# best 90% histogram
sns.histplot(
    X_test[X_test['error_perc_abs'] <= X_test['error_perc_abs'].quantile(0.9)],
    x='error_perc',
    bins=100,
    kde=True
)
# addorn
plt.title("Error Percentage Histogram (Best 90%)")
plt.xlabel("Error Percentage")
plt.ylabel("Count")
plt.show()

### Categories

In [None]:
# errors for propertytype
(
    X_test
    .groupby('property_type', observed=True)
    .apply(lambda x: calculate_metrics(x['y_true'], x['y_pred'], best_percent=0.9), include_groups=False)
)

In [None]:
# errors by cve_vigilancia_recat (metrics on the best 90% of each group)
(
    X_test
    .groupby('cve_vigilancia_recat', observed=True)
    .apply(lambda x: calculate_metrics(x['y_true'], x['y_pred'], best_percent=0.9), include_groups=False)
)

In [None]:
# errors by regimen_propiedad (metrics on the best 90% of each group)
(
    X_test
    .groupby('regimen_propiedad', observed=True)
    .apply(lambda x: calculate_metrics(x['y_true'], x['y_pred'], best_percent=0.9), include_groups=False)
)

In [None]:
# errors for state_id (only get mape)
table_errors_state = (
    X_test
    .groupby('state_id', observed=True)
    .apply(lambda x: calculate_metrics(x['y_true'], x['y_pred'], best_percent=0.9), include_groups=False)
)
table_errors_state

In [None]:
# errors by distance_to_ocean value (to compare beachfront vs. non-beachfront properties)
(
    X_test
    .groupby('distance_to_ocean', observed=True)
    .apply(lambda x: calculate_metrics(x['y_true'], x['y_pred'], best_percent=0.9), include_groups=False)
)

## Temporal Error

In [None]:
# visualize errors over time
fig, ax = plt.subplots(figsize=(12, 6))

sns.regplot(
    x='quarters_since_first_appraisal',
    y='error_perc',
    data=X_test,
    scatter_kws={'alpha': 0.1},
    line_kws={'color': 'red'},
    ax=ax,
    lowess=True
)

# addorn
plt.title("Error Percentage Over Time")
plt.xlabel("Quarters Since First Appraisal")
plt.ylabel("Error Percentage")

# add legend
plt.legend(['Data', 'Lowess'])

plt.show()

In [None]:
# adjust linear regression to error_perc
lm_time_error = smf.ols('error_perc ~ quarters_since_first_appraisal', data=X_test).fit()

# see summary
lm_time_error.summary()

## Geospatial Error

In [None]:
# plot MAPE by state
fig, ax = plt.subplots(1, 1, figsize=(12, 8))

(
    gdf_mexico
    .merge(
        table_errors_state.reset_index(),
        left_on='cvegeo',
        right_on='state_id',
        how='left'
    )
    .plot('mape', legend=True, ax=ax, cmap='copper', edgecolor='gray')
)

# dont show axis
plt.axis('off')

# title
plt.title('MAPE by state')

## Covariates


In [None]:
def visualize_errors(df, col):
    """Plot ``error_perc`` against ``col`` as a scatter with a LOWESS trend.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``col`` and an ``error_perc`` column.
    col : str
        Column placed on the x axis.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # faint points plus a red LOWESS curve
    regplot_options = {
        'scatter_kws': {'alpha': 0.1},
        'line_kws': {'color': 'red'},
        'lowess': True,
    }
    sns.regplot(data=df, x=col, y='error_perc', ax=ax, **regplot_options)

    # adorn
    plt.title(f"Error Percentage by {col}")
    plt.xlabel(col)
    plt.ylabel("Error Percentage")
    plt.legend(['Data', 'Lowess'])

    plt.show()

In [None]:
# visualize errors by categories
cols_x = [
    'elevador',
    'edad_anios',
    'banos',
    'medio_banos',
    'estacionamiento',
    'saleable_area',
    'superficie_terreno_usable',
    'distance_to_ocean',
    'count_supermarkets_at_1km',
    'count_hospitals_at_5km',
    'count_metro_at_1km',
    'count_schools_at_1km',
    'count_restaurants_at_1km',
    # 'competitors_weighted_mean_log_price_per_sqm',
    # 'mean_log_valor_fisico_terreno_m2',
    # 'mean_log_valor_fisico_terreno_m2_lower',
    # 'mean_log_valor_fisico_terreno_m2_upper',
    # 'quarters_since_first_appraisal', 
    # 'cve_vigilancia_recat'
]

for col in cols_x:
    visualize_errors(X_test, col)

## Time Series in special zones

Zones of interest:
- Andares, Guadalajara 8849ab4b45fffff
- Ruben Darío, Mexico City 884995bae9fffff (currently commented out in the code below)
- Doctores, Mexico City 884995ba27fffff
- Roma Norte, Mexico City 884995ba3dfffff
- Centrito Valle, Monterrey 8848a20667fffff
- Puerto Cancún, Cancún 884519b491fffff

The counterfactual will be the same property in different zones during the same period.


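The property will be (note: the values hard-coded in `generate_counterfactual_property` below currently differ for some fields: id_clase_inmueble 6, banos 1, estacionamiento 1, saleable_area 60, superficie_terreno_usable 60, count_metro_at_1km 0):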
- id_clase_inmueble: 4 (media residencial)
- property_type: apartment
- elevador: 1
- edad_anios: 0 (new)
- regimen_propiedad: PRIVADA COLECTIVA
- state_id: (depends on the hex)
- banos: 2
- medio_banos: 1
- estacionamiento: 2
- saleable_area: 100
- superficie_terreno_usable: 100
- distance_to_ocean: 100000
- longitude: (depends on the hex)
- latitude: (depends on the hex)
- count_supermarkets_at_1km: 1
- count_hospitals_at_5km: 1
- count_metro_at_1km: 1
- count_schools_at_1km: 1
- count_restaurants_at_1km: 1
- competitors_weighted_mean_log_price_per_sqm: (depends on the hex)
- mean_log_valor_fisico_terreno_m2: (depends on the hex)
- mean_log_valor_fisico_terreno_m2_lower: (depends on the hex)
- mean_log_valor_fisico_terreno_m2_upper: (depends on the hex)
- quarters_since_first_appraisal: (time series)
- conservacion_recat: 3
- cve_vigilancia_recat: 1

In [None]:
def generate_counterfactual_property(h_id, df):
    """
    Generate the counterfactual property for one hex id, one row per quarter.

    The same synthetic property is repeated for
    ``quarters_since_first_appraisal`` = 0, 1, ..., 20 (21 rows).

    Fixed values:

    - id_clase_inmueble: 6
    - property_type: 'apartment'
    - elevador: 1
    - edad_anios: 0
    - regimen_propiedad: 'PRIVADA COLECTIVA'
    - banos: 1
    - medio_banos: 1
    - estacionamiento: 1
    - saleable_area: 60
    - superficie_terreno_usable: 60
    - distance_to_ocean: 100_000
    - count_supermarkets_at_1km: 1
    - count_hospitals_at_5km: 1
    - count_metro_at_1km: 0
    - count_schools_at_1km: 1
    - count_restaurants_at_1km: 1
    - conservacion_recat: 3
    - cve_vigilancia_recat: 1

    Values that depend on the hexagon:

    - longitude, latitude: centre of the hexagon
    - state_id: taken from the first property of the hexagon
    - mean_log_valor_fisico_terreno_m2 (and _lower / _upper): harmonic mean
      over the properties of the hexagon
    - competitors_weighted_mean_log_price_per_sqm: prediction of a linear
      regression of log(price_per_sqm) on saleable_area and
      quarters_since_first_appraisal, fitted on the properties of the hexagon

    Parameters
    ----------
    h_id : str
        H3 index of the hexagon.
    df : pandas.DataFrame
        Properties data; must contain ``hex_id``, ``state_id``,
        ``price_per_sqm``, ``saleable_area``,
        ``quarters_since_first_appraisal`` and the three
        ``mean_log_valor_fisico_terreno_m2*`` columns.

    Returns
    -------
    pandas.DataFrame
        21 rows, one per quarter.

    Raises
    ------
    ValueError
        If ``df`` has no property in the hexagon.
    """
    # Step 0: fixed values
    fixed_values = {
        'id_clase_inmueble': 6,
        'property_type': 'apartment',
        'elevador': 1,
        'edad_anios': 0,
        'regimen_propiedad': 'PRIVADA COLECTIVA',
        'banos': 1,
        'medio_banos': 1,
        'estacionamiento': 1,
        'saleable_area': 60,
        'superficie_terreno_usable': 60,
        'distance_to_ocean': 100_000,
        'count_supermarkets_at_1km': 1,
        'count_hospitals_at_5km': 1,
        'count_metro_at_1km': 0,
        'count_schools_at_1km': 1,
        'count_restaurants_at_1km': 1,
        'conservacion_recat': 3,
        'cve_vigilancia_recat': 1
    }

    # Step 1: hexagon-dependent values
    hex_rows = df.query('hex_id == @h_id')
    if hex_rows.empty:
        raise ValueError(f"No properties found for hex_id {h_id!r}.")

    # h3 returns (latitude, longitude), in that order. `cell_to_latlng` is the
    # h3-py v4 name of `h3_to_geo`.
    hex_to_latlng = getattr(h3, 'cell_to_latlng', None) or h3.h3_to_geo
    latitude, longitude = hex_to_latlng(h_id)

    hex_values = {
        'longitude': longitude,
        'latitude': latitude,
        'state_id': hex_rows['state_id'].values[0],
    }
    # Harmonic mean over the whole hexagon (NaNs ignored). The previous
    # element-wise `.apply(hmean).values[0]` just returned the first row's value.
    for terrain_col in (
        'mean_log_valor_fisico_terreno_m2',
        'mean_log_valor_fisico_terreno_m2_lower',
        'mean_log_valor_fisico_terreno_m2_upper',
    ):
        hex_values[terrain_col] = hmean(hex_rows[terrain_col].dropna())

    # Step 2: one counterfactual per quarter, from 0 to 20 inclusive (21 rows)
    base_property = {**fixed_values, **hex_values}
    df_counterfactuals = pd.DataFrame(
        [{**base_property, 'quarters_since_first_appraisal': quarter} for quarter in range(21)]
    )

    # Step 3: competitors_weighted_mean_log_price_per_sqm from a linear
    # regression fitted on the properties of the hexagon
    x_variables = [
        'saleable_area',
        'quarters_since_first_appraisal'
    ]
    lm = LinearRegression()
    lm.fit(hex_rows[x_variables], np.log(hex_rows['price_per_sqm']))

    x_counterfactuals = df_counterfactuals[x_variables].copy()
    # If the time coefficient is negative, do not extrapolate a downward
    # trend: predict every row as if it were quarter 10
    if lm.coef_[-1] < 0:
        x_counterfactuals['quarters_since_first_appraisal'] = 10
    df_counterfactuals['competitors_weighted_mean_log_price_per_sqm'] = lm.predict(x_counterfactuals)

    # see betas
    print(dict(zip(x_variables, lm.coef_)))

    return df_counterfactuals

# generate counterfactuals for 5 hex_ids
hex_ids_counterfactuals = [
    '8849ab4b45fffff',
    # '884995bae9fffff',
    '884995ba27fffff',
    '884995ba3dfffff',
    '8848a20667fffff',
    '884519b491fffff'
]

# counterfactuals, append the hex_id as index
X_counterfactuals = pd.concat(
    [generate_counterfactual_property(h_id, df_properties) for h_id in hex_ids_counterfactuals],
    keys=hex_ids_counterfactuals
)

# predict price per sqm
X_counterfactuals['price_per_sqm_pred'] = best_model.predict(Pool(X_counterfactuals, cat_features=categorical_cols))

# plot price per sqm pred vs quarters_since_first_appraisal for each hex_id
X_counterfactuals = X_counterfactuals.reset_index().rename(columns={'level_0': 'hex_id'})

# map hex_id to name
dict_hex_id_to_name = {
    '8849ab4b45fffff': 'andares-gdl',
    # '884995bae9fffff': 'ruben-dario-cdmx',
    '884995ba3dfffff': 'roma-norte-cdmx',
    '884995ba27fffff': 'doctores-cdmx',
    '8848a20667fffff': 'centrito-valle-mty',
    '884519b491fffff': 'pto-cancun-cancun'
}
X_counterfactuals['zone'] = X_counterfactuals['hex_id'].map(dict_hex_id_to_name)


In [None]:
from pandas.tseries.offsets import QuarterEnd

# first day of the quarter that contains the minimum valuation date
first_date_obs = df_properties['valuation_date'].min().to_period('Q').to_timestamp()

# date of each counterfactual: start of the quarter N quarters after first_date_obs.
# `first_date_obs + QuarterEnd(n)` is avoided here: first_date_obs is a quarter
# *start*, so QuarterEnd(0) and QuarterEnd(1) both roll forward to the same
# quarter end and quarters 0 and 1 would share one date.
X_counterfactuals['date'] = X_counterfactuals['quarters_since_first_appraisal'].apply(
    lambda x: first_date_obs + pd.DateOffset(months=3 * int(x))
)

# see
X_counterfactuals.head()

In [None]:
from matplotlib.ticker import FuncFormatter


def money_fmt(x, pos):
    """Format a tick value as a `$`-prefixed integer with thousands separators.

    `pos` is the tick position required by the FuncFormatter callback
    signature; it is not used.
    """
    return f"${x:,.0f}"


# predicted price per sqm over time, one line per zone
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    x='date',
    y='price_per_sqm_pred',
    hue='zone',
    data=X_counterfactuals,
    ax=ax
)

# y ticks as money
formatter = FuncFormatter(money_fmt)
ax.yaxis.set_major_formatter(formatter)

# adorn
plt.title("Price Per sqm Prediction Over Time")
# the x axis shows the `date` column, not the raw quarter counter
plt.xlabel("Appraisal Date")
plt.ylabel("Price per sqm")
plt.legend(title='Zone')

plt.show()



In [None]:
# Fit a linear time trend per zone: `a*b` in the formula expands to both main
# effects plus their interaction, so each zone gets its own intercept and slope.
appraisal_time_model = smf.ols(
    formula="price_per_sqm_pred ~ quarters_since_first_appraisal*zone",
    data=X_counterfactuals
)
lm_appraisal_time = appraisal_time_model.fit()

# see summary
lm_appraisal_time.summary()

In [None]:
def _zone_slope_table(results, slope_term, reference_zone):
    """Return per-zone slopes with confidence intervals from a fitted OLS model.

    The bare `slope_term` coefficient is treated as the slope of
    `reference_zone`, and every `slope_term:zone[T.<zone>]` coefficient as the
    difference from that reference slope (the same reading the parameter names
    are given when the zone labels are parsed below).

    A zone's slope is therefore the linear combination
    `reference slope + interaction`. Its confidence interval is obtained from
    `results.t_test` on that combination, which accounts for the covariance
    between the two coefficients. Adding the two coefficients' interval bounds
    together does not yield a valid interval.

    Parameters
    ----------
    results : fitted statsmodels regression results
    slope_term : str
        Name of the time variable in the formula.
    reference_zone : str
        Label assigned to the bare `slope_term` coefficient.

    Returns
    -------
    pandas.DataFrame indexed by `zone` with columns `slope`, `ci_lower`,
    `ci_upper`, sorted by `slope` ascending.
    """
    param_names = list(results.params.index)
    reference_position = param_names.index(slope_term)

    rows = []
    for position, param_name in enumerate(param_names):
        if slope_term not in param_name:
            continue

        # strip the formula decoration to recover the zone label; the bare
        # slope term leaves an empty string, i.e. the reference zone
        zone = (
            param_name
            .replace(slope_term, '')
            .replace('zone[T.', '')
            .replace(']', '')
            .replace(':', '')
        ) or reference_zone

        # contrast selecting reference slope (+ this zone's interaction)
        contrast = np.zeros(len(param_names))
        contrast[reference_position] = 1.0
        contrast[position] = 1.0

        combined = results.t_test(contrast)
        ci_lower, ci_upper = np.asarray(combined.conf_int()).ravel()
        rows.append({
            'zone': zone,
            'slope': float(np.asarray(combined.effect).ravel()[0]),
            'ci_lower': float(ci_lower),
            'ci_upper': float(ci_upper),
        })

    return pd.DataFrame(rows).set_index('zone').sort_values('slope')


# slopes (price change per quarter) per zone, used by the forest plot
# NOTE(review): 'andares-gdl' is assumed to be the reference level of `zone`
# in the fitted model — confirm against the model summary.
df_slopes = _zone_slope_table(
    lm_appraisal_time,
    slope_term='quarters_since_first_appraisal',
    reference_zone='andares-gdl',
)

# see
df_slopes


In [None]:
from matplotlib.ticker import FuncFormatter


def money_fmt(x, pos):
    """Format a tick value as a `$`-prefixed integer with thousands separators.

    `pos` is the tick position required by the FuncFormatter callback
    signature; it is not used.
    """
    return f"${x:,.0f}"


# forest plot
fig, ax = plt.subplots(figsize=(12, 6))

# `zone` is the index of df_slopes; expose it as a regular column so the
# plot does not depend on seaborn resolving index names
df_slopes_plot = df_slopes.reset_index()

# plot
sns.barplot(
    x='slope',
    y='zone',
    data=df_slopes_plot,
    ax=ax,
    orient='h'
)

# add confidence intervals: bars are drawn at positions 0..n-1 in row order,
# so draw each interval at its row position rather than at a string label
ax.hlines(
    y=range(len(df_slopes_plot)),
    xmin=df_slopes_plot['ci_lower'],
    xmax=df_slopes_plot['ci_upper'],
    color='black'
)

# adorn
plt.title("Velocity of Price Increase Over Time")
# the plotted value is a slope, i.e. a change per quarter, not a price level
plt.xlabel("Price per sqm per quarter")
plt.ylabel("Zone")

# x ticks as money
formatter = FuncFormatter(money_fmt)
ax.xaxis.set_major_formatter(formatter)

# show
plt.show()


---
# Feature Importance

## General Importance

In [None]:
# Columns passed to the explainer (order kept exactly as listed)
cols_x = [
    'id_clase_inmueble',
    'property_type',
    'elevador',
    'edad_anios',
    'regimen_propiedad',
    'state_id',
    'banos',
    'medio_banos',
    'estacionamiento',
    'saleable_area',
    'superficie_terreno_usable',
    'distance_to_ocean',
    'longitude',
    'latitude',
    'count_supermarkets_at_1km',
    'count_hospitals_at_5km',
    'count_metro_at_1km',
    'count_schools_at_1km',
    'count_restaurants_at_1km',
    'competitors_weighted_mean_log_price_per_sqm',
    'mean_log_valor_fisico_terreno_m2',
    'mean_log_valor_fisico_terreno_m2_lower',
    'mean_log_valor_fisico_terreno_m2_upper',
    'quarters_since_first_appraisal',
    'conservacion_recat',
    'cve_vigilancia_recat',
]

# Tree explainer built on the selected model
explainer = shap.TreeExplainer(best_model)

# SHAP values for the test set, restricted to the columns above
shap_values = explainer(X_test[cols_x])

In [None]:
# SHAP summary plot over the test-set features used by the explainer
shap.summary_plot(
    shap_values,
    X_test[cols_x],
)

In [None]:
# SHAP bar plot of feature importance, limited to 10 displayed rows
shap.plots.bar(shap_values, max_display=10)

In [None]:
# Beeswarm plot of the SHAP values, limited to 10 displayed rows
shap.plots.beeswarm(shap_values, max_display=10)

## Interactions

In [None]:
# SHAP dependence plot for the mean log terrain value feature
shap.dependence_plot(
    'mean_log_valor_fisico_terreno_m2',
    shap_values.values,
    X_test[cols_x],
    cmap=plt.get_cmap("winter"),
)

In [None]:
# SHAP dependence plot for the appraisal quarter, colored by id_clase_inmueble
shap.dependence_plot(
    'quarters_since_first_appraisal',
    shap_values.values,
    X_test[cols_x],
    interaction_index='id_clase_inmueble',
    cmap=plt.get_cmap("viridis"),
)

In [None]:
# SHAP dependence plot for the appraisal quarter (interaction feature left at the library default)
shap.dependence_plot(
    'quarters_since_first_appraisal',
    shap_values.values,
    X_test[cols_x],
)

In [None]:
# SHAP dependence plot for id_clase_inmueble (interaction feature left at the library default)
shap.dependence_plot(
    'id_clase_inmueble',
    shap_values.values,
    X_test[cols_x],
)

In [None]:
# SHAP dependence plot for saleable_area, colored by property_type
shap.dependence_plot(
    'saleable_area',
    shap_values.values,
    X_test[cols_x],
    interaction_index='property_type',
    cmap=plt.get_cmap("winter"),
)

In [None]:
# SHAP dependence plot for conservacion_recat (interaction feature left at the library default)
shap.dependence_plot(
    'conservacion_recat',
    shap_values.values,
    X_test[cols_x],
)

^^^ Maybe my categorization of "remodeled" is not good (move it 1 position further).

In [None]:
# SHAP dependence plot for edad_anios (interaction feature left at the library default)
shap.dependence_plot(
    'edad_anios',
    shap_values.values,
    X_test[cols_x],
)

---
# Sandbox

In [None]:
# number of rows per distinct property_type value
df_properties['property_type'].value_counts()

In [None]:
# number of rows per distinct cve_vigilancia_recat value
df_properties['cve_vigilancia_recat'].value_counts()

In [None]:
# harmonic mean of price_per_sqm within each cve_vigilancia_recat group,
# shown from the highest group key to the lowest
grouped_by_vigilancia = df_properties.groupby('cve_vigilancia_recat')
hmean_price_by_vigilancia = grouped_by_vigilancia.agg({'price_per_sqm': hmean})
hmean_price_by_vigilancia.sort_values('cve_vigilancia_recat', ascending=False)

In [None]:
# summary statistics of the quarters_since_first_appraisal column
df_properties['quarters_since_first_appraisal'].describe()

In [None]:
# summary statistics of every column whose name contains 'fecha'
df_properties.filter(like='fecha').describe()

In [None]:
# column labels of the test feature matrix
X_test.columns

In [None]:
# column labels of the properties table
df_properties.columns