In [1]:
import os
import sys
import pandas as pd
import numpy as np

import pickle

sys.path.insert(1, '..')
import src.constants as cst
import src.pipeline as pipe
import src.evaluation as eval

from sklearn.model_selection import RandomizedSearchCV

import plotly.express as px

# Load data

In [2]:
# Read the preprocessed train and test tables (train first, then test);
# the first CSV column is used as the DataFrame index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (cst.PREPROCESSED_NB_TRAIN_PATH, cst.PREPROCESSED_NB_TEST_PATH)
)

# Target engineering

Housing price variations are usually driven by the price per m². We therefore use `valeur_m2` as the new target for our models; it is computed by dividing `valeur` by `surface_reelle_bati`.

In [3]:
# Price per square metre: raw property value divided by the built surface area.
train[cst.target_col] = train[cst.raw_target_col] / train['surface_reelle_bati']

# Every target must be a finite number: np.isfinite rejects NaN as well as +/-inf
# (e.g. a division by a zero surface).
# Gotcha: do not write `a.notnull() & a != np.inf` here. `&` binds tighter than `!=`,
# so that expression is evaluated as `(a.notnull() & a) != np.inf` and checks neither
# condition.
assert np.isfinite(train[cst.target_col]).all(), 'non-finite price per m2 in train'

In [4]:
# Distribution of the raw target (total property value).
fig = px.histogram(
    train[cst.raw_target_col],
    title='Distribution des valeurs foncières',
).update_layout(xaxis_title='Valeur foncière', showlegend=False)
fig.show()

# Distribution of the engineered target (value per square metre).
fig = px.histogram(
    train[cst.target_col],
    title='Distribution des valeurs foncières au m2',
).update_layout(xaxis_title='Valeur foncière m2', showlegend=False)
fig.show()

# Outlier detection

The box plot of the target below shows no obvious outliers.

In [5]:
# Box plot of the engineered target column, used to eyeball potential outliers.
fig = px.box(train[cst.target_col])
fig.show()

# Split train test

In [6]:
# Split the loaded training table into six pieces via the project helper.
# `eval` is the project module src.evaluation (imported under that alias), not the
# builtin eval().
# NOTE(review): judging by the names, y_train/y_test hold the per-m2 target and
# y_*_valeur the raw property value — confirm against split_train_test.
X_train, X_test, y_train, y_test, y_train_valeur, y_test_valeur = eval.split_train_test(train)

# Test models

In [7]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
#from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Ensemble methods (Random Forest)

In [8]:
# Baseline: untuned Random Forest regressor.
# features_dict assigns each input column to a group (the key); the groups are
# presumably the preprocessing applied by the pipeline built in src.evaluation.
features_dict = {
    'min_max_scaled': [
        'lat',
        'lon',
        'n_metros_within_0.5km',
        'n_trains_within_0.5km',
    ],
    'standard_scaled': [
        'surface_carrez_1er_lot',
        'surface_carrez_2e_lot',
        'surface_reelle_bati',
    ],
    'one_hot_encoded': ['commune'],
    'target_encoded': [],
    'count_freq_encoded': [],
    'unprocessed': [
        'nb_lots',
        'nb_pieces',
        'date_mutation',
        'med_revenue_iris_2018',
    ],
}

# 300 trees, seeded from the project constants for reproducibility.
estimator = RandomForestRegressor(
    random_state=cst.random_seed,
    n_estimators=300,
)
# Last expression of the cell: the returned metrics are displayed as the cell output.
eval.fit_and_evaluate_pipeline(estimator, features_dict, X_train, y_train, X_test, y_test)

{'mape': 0.11301462442874802, 'rmse': 1246.8272594827506}

In [9]:
# Random Forest variant: 'district' is added under 'count_freq_encoded' and
# 'med_revenue_iris_2018' is no longer among the 'unprocessed' columns.
features_dict = {
    'min_max_scaled': [
        'lat',
        'lon',
        'n_metros_within_0.5km',
        'n_trains_within_0.5km',
    ],
    'standard_scaled': [
        'surface_carrez_1er_lot',
        'surface_carrez_2e_lot',
        'surface_reelle_bati',
    ],
    'one_hot_encoded': ['commune'],
    'target_encoded': [],
    'count_freq_encoded': ['district'],
    'unprocessed': [
        'nb_lots',
        'nb_pieces',
        'date_mutation',
    ],
}

# Same estimator settings as the baseline: 300 trees, project-wide seed.
estimator = RandomForestRegressor(
    random_state=cst.random_seed,
    n_estimators=300,
)
# Last expression of the cell: the returned metrics are displayed as the cell output.
eval.fit_and_evaluate_pipeline(estimator, features_dict, X_train, y_train, X_test, y_test)

{'mape': 0.1130967781578814, 'rmse': 1242.9783776413167}

### Tree-based methods (XGBoost)

In [10]:
# Untuned XGBoost regressor. Here 'date_mutation' is listed under 'min_max_scaled'
# rather than 'unprocessed'.
features_dict = {
    'min_max_scaled': [
        'date_mutation',
        'lat',
        'lon',
        'n_metros_within_0.5km',
        'n_trains_within_0.5km',
    ],
    'standard_scaled': [
        'surface_carrez_1er_lot',
        'surface_carrez_2e_lot',
        'surface_reelle_bati',
    ],
    'one_hot_encoded': ['commune'],
    'target_encoded': [],
    'count_freq_encoded': ['district'],
    'unprocessed': ['nb_lots', 'nb_pieces'],
}

# Library-default hyperparameters; only the seed is set.
estimator = XGBRegressor(random_state=cst.random_seed)
# Last expression of the cell: the returned metrics are displayed as the cell output.
eval.fit_and_evaluate_pipeline(estimator, features_dict, X_train, y_train, X_test, y_test)

{'mape': 0.1121488796304243, 'rmse': 1235.981609508465}

#### Fine-tuned XGBRegressor

In [11]:
# XGBoost regressor with hand-set (tuned) hyperparameters.
features_dict = {
    'min_max_scaled': [
        'date_mutation',
        'lat',
        'lon',
        'n_metros_within_0.5km',
        'n_trains_within_0.5km',
    ],
    'standard_scaled': [
        'surface_carrez_1er_lot',
        'surface_carrez_2e_lot',
        'surface_reelle_bati',
    ],
    'one_hot_encoded': ['commune'],
    'target_encoded': [],
    'count_freq_encoded': ['district'],
    'unprocessed': ['nb_lots', 'nb_pieces'],  # 'med_revenue_iris_2018' is not included
}

# NOTE(review): no random_state is passed here, unlike the untuned XGBRegressor in this
# notebook — confirm whether cst.random_seed should be used for consistency. It is left
# as-is so the recorded metrics stay valid.
estimator = XGBRegressor(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=0.2,
    colsample_bytree=0.9,
    reg_alpha=20,
    reg_lambda=60,
)
# Last expression of the cell: the returned metrics are displayed as the cell output.
eval.fit_and_evaluate_pipeline(estimator, features_dict, X_train, y_train, X_test, y_test)

{'mape': 0.1103010424393738, 'rmse': 1215.369106461245}

In [12]:
# Build a full pipeline from the `estimator` and `features_dict` currently in scope
# (the tuned XGBoost settings when the notebook is run in order), fit it on the
# training split and persist it under ../models.
features = pipe.get_features_from_dict(features_dict)
pipeline = pipe.build_pipeline(estimator, features_dict)
pipeline.fit(X_train[features], y_train)

filename = 'tuned_xgbr2.sav'
# Use a context manager so the file is flushed and closed deterministically; passing a
# bare open(...) to pickle.dump leaves the handle open until garbage collection.
with open(os.path.join('..', 'models', filename), 'wb') as model_file:
    pickle.dump(pipeline, model_file)

#### Hyperparameter tuning

In [13]:
# Feature groups used for the XGBoost hyperparameter search.
features_dict = {
    'min_max_scaled': ['date_mutation', 'lat', 'lon', 'n_metros_within_0.5km', 'n_trains_within_0.5km'],
    'standard_scaled': ['surface_carrez_1er_lot', 'surface_carrez_2e_lot', 'surface_reelle_bati'],
    'one_hot_encoded': ['commune'],
    'target_encoded': [],
    'count_freq_encoded': ['district'],
    'unprocessed': ['nb_lots', 'nb_pieces'],
}

features = pipe.get_features_from_dict(features_dict)

# Candidate values sampled by the randomized search. The 'estimator__' prefix routes
# each parameter to a pipeline step.
# NOTE(review): assumes build_pipeline names the final step 'estimator' — confirm.
xgb_param_grid = {
    'estimator__max_depth': [3, 6, 10, 15, 20],
    'estimator__learning_rate': [0.001, 0.01, 0.05, 0.1],
    # Fractions are spelled out instead of using np.arange with a float step:
    # np.arange(0.7, 1, 0.05) is subject to rounding and ends up with an extra
    # candidate just above 1.0, outside the intended [0.7, 0.95] range.
    'estimator__subsample': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
    'estimator__colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'estimator__min_child_weight': [0.1, 0.2, 0.5, 1.0, 3.0],
    'estimator__gamma': [0, 0.25, 0.5, 1.0],
    'estimator__reg_alpha': [10, 20, 30],
    'estimator__reg_lambda': [10.0, 50.0, 70.0],
    # Integer step, so np.arange is exact here: 200, 300, ..., 700.
    'estimator__n_estimators': np.arange(200, 800, 100),
}

estimator = XGBRegressor(random_state=cst.random_seed)
pipeline = pipe.build_pipeline(estimator, features_dict)

# 50 random draws from the grid, each scored with 3-fold cross-validation on
# negative MSE (scikit-learn maximises scores, hence the sign).
rsearch_xgbr = RandomizedSearchCV(
    pipeline,
    xgb_param_grid,
    cv=3,
    n_iter=50,
    scoring='neg_mean_squared_error',
    random_state=cst.random_seed,
    verbose=1,
)

In [14]:
# rsearch_xgbr.fit(X_train[features], y_train)

In [15]:
# best_xgbr_model = rsearch_xgbr.best_estimator_
# print('Best RMSE score:', np.sqrt(-rsearch_xgbr.best_score_))
# print('Best params:', rsearch_xgbr.best_params_)

### Tree-based methods (LightGBM)

In [16]:
# Untuned LightGBM regressor.
model_name = 'lgb_untuned'

features_dict = {
    'min_max_scaled': [
        'date_mutation',
        'lat',
        'lon',
        'n_metros_within_0.5km',
        'n_trains_within_0.5km',
    ],
    'standard_scaled': [
        'surface_carrez_1er_lot',
        'surface_carrez_2e_lot',
        'surface_reelle_bati',
    ],
    'one_hot_encoded': ['commune'],
    'target_encoded': [],
    'count_freq_encoded': ['district'],
    'unprocessed': ['nb_lots', 'nb_pieces'],
}

# Library-default hyperparameters; only the seed is set.
estimator = LGBMRegressor(random_state=cst.random_seed)
# Last expression of the cell: the returned metrics are displayed as the cell output.
eval.fit_and_evaluate_pipeline(estimator, features_dict, X_train, y_train, X_test, y_test)

{'mape': 0.11215248469453538, 'rmse': 1222.372030570458}

In [17]:
# LightGBM regressor with hand-set (tuned) hyperparameters.
model_name = 'lgb_tuned'

features_dict = {
    'min_max_scaled': [
        'date_mutation',
        'lat',
        'lon',
        'n_metros_within_0.5km',
        'n_trains_within_0.5km',
    ],
    'standard_scaled': [
        'surface_carrez_1er_lot',
        'surface_carrez_2e_lot',
        'surface_reelle_bati',
    ],
    'one_hot_encoded': ['commune'],
    'target_encoded': [],
    'count_freq_encoded': ['district'],
    'unprocessed': ['nb_lots', 'nb_pieces'],  # 'med_revenue_iris_2018' is not included
}

# NOTE(review): per the LightGBM docs, row subsampling only takes effect when
# subsample_freq is > 0; it is not set here, so subsample=0.9 may be a no-op — confirm.
estimator = LGBMRegressor(
    random_state=cst.random_seed,
    n_estimators=300,
    learning_rate=0.1,
    min_child_samples=120,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1,
)

# Last expression of the cell: the returned metrics are displayed as the cell output.
eval.fit_and_evaluate_pipeline(estimator, features_dict, X_train, y_train, X_test, y_test)

{'mape': 0.11050492407858624, 'rmse': 1216.5006940235605}

In [18]:
# Build a full pipeline from the `estimator` and `features_dict` currently in scope
# (the tuned LightGBM settings when the notebook is run in order), fit it on the
# training split and persist it under ../models.
features = pipe.get_features_from_dict(features_dict)

pipeline = pipe.build_pipeline(estimator, features_dict)
pipeline.fit(X_train[features], y_train)

filename = 'tuned_lgbr.sav'
# Use a context manager so the file is flushed and closed deterministically; passing a
# bare open(...) to pickle.dump leaves the handle open until garbage collection.
with open(os.path.join('..', 'models', filename), 'wb') as model_file:
    pickle.dump(pipeline, model_file)

### Tree-based methods (CatBoost)

In [19]:
# Feature groups for CatBoost: nothing is min-max scaled or one-hot encoded here;
# 'commune' and the location/date columns are listed under 'unprocessed'.
features_dict = {
    'min_max_scaled': [],
    'standard_scaled': [
        'surface_carrez_1er_lot',
        'surface_carrez_2e_lot',
        'surface_reelle_bati',
    ],
    'one_hot_encoded': [],
    'target_encoded': [],
    'count_freq_encoded': ['district'],
    'unprocessed': [
        'lat',
        'lon',
        'date_mutation',
        'n_metros_within_0.5km',
        'n_trains_within_0.5km',
        'nb_lots',
        'nb_pieces',
        'commune',
    ],
}

In [20]:
# CatBoost is told to treat 'commune' as a categorical feature by name; training
# output is disabled with verbose=False.
estimator = CatBoostRegressor(
    cat_features=['commune'],
    random_state=cst.random_seed,
    verbose=False,
)
pipeline = pipe.build_pipeline(estimator, features_dict)

In [21]:
# Take the first two steps of the pipeline as a standalone sub-pipeline.
# NOTE(review): assumes those two steps are the preprocessing stages that precede the
# estimator in build_pipeline — confirm.
encoder = pipeline[:2]

features = pipe.get_features_from_dict(features_dict)

# Fit the preprocessing on the training split only, then apply it to the test split.
X_train_encoded = encoder.fit_transform(X_train[features])
# Re-wrap the transformed output as a DataFrame labelled with the input column names
# (presumably so CatBoost can locate 'commune' by name).
# NOTE(review): this relies on the preprocessing keeping the number and order of
# columns unchanged; if the output is a plain array, the original row index is not
# carried over — confirm both are acceptable.
X_train_encoded = pd.DataFrame(columns=X_train[features].columns, data=X_train_encoded)
X_test_encoded = encoder.transform(X_test[features])
X_test_encoded = pd.DataFrame(columns=X_train[features].columns, data=X_test_encoded)

# Fit the estimator directly on the preprocessed frame (outside the pipeline) and
# evaluate it on the preprocessed test split.
estimator.fit(X_train_encoded, y_train)
metrics = eval.evaluate_model(estimator, X_test_encoded, y_test)
# Bare last expression so the metrics are displayed as the cell output.
metrics

{'mape': 0.11016871263307161, 'rmse': 1210.418036538429}

In [22]:
# Persist the estimator under ../models.
# NOTE(review): only `estimator` is pickled here, not the preprocessing sub-pipeline
# (`encoder`) used before fitting it; unlike the other saved models, inputs must be
# preprocessed the same way before calling predict on the reloaded object — confirm
# this is intended.
filename = 'untuned_catboost.sav'
# Use a context manager so the file is flushed and closed deterministically; passing a
# bare open(...) to pickle.dump leaves the handle open until garbage collection.
with open(os.path.join('..', 'models', filename), 'wb') as model_file:
    pickle.dump(estimator, model_file)