In [1]:
import os
import sys
import pandas as pd
import numpy as np

import pickle

sys.path.insert(1, '..')
import src.constants as cst
import src.pipeline as pipe

from sklearn.base import clone
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

import plotly.express as px

# Load data

In [2]:
train = pd.read_csv(cst.PREPROCESSED_NB_TRAIN_PATH, index_col=0)
test = pd.read_csv(cst.PREPROCESSED_NB_TEST_PATH, index_col=0)

In [3]:
# Recreate the target column
train[cst.target_col] = train[cst.raw_target_col]/train['surface_reelle_bati']

# Load models

In [4]:
lgbr_model = pickle.load(open('../models/tuned_lgbr.sav', 'rb'))
xgb_model = pickle.load(open('../models/tuned_xgbr2.sav', 'rb'))
catboost_model = pickle.load(open('../models/untuned_catboost.sav', 'rb'))

# Cross-evaluate models

In [5]:
features_dict_xgb = {
    'min_max_scaled': ['date_mutation', 'lat', 'lon', 'n_metros_within_0.5km', 'n_trains_within_0.5km'], 
    'standard_scaled': ['surface_carrez_1er_lot', 'surface_carrez_2e_lot', 'surface_reelle_bati'], 
    'one_hot_encoded': ['commune'], 
    'target_encoded': [], 
    'count_freq_encoded': ['district'],
    'unprocessed': ['nb_lots', 'nb_pieces'],
}

features_xgb = pipe.get_features_from_dict(features_dict_xgb)

# We ignore the RareLabelEncoder warnings because adding this step in the preprocessing affects the score 
# Plus the issue doesn't arise when using the full training set
scores = cross_val_score(xgb_model, train[features_xgb], train[cst.target_col], cv=3, scoring='neg_mean_squared_error')



In [6]:
print('Cross-val RMSE for the XBGRegressor model:', np.sqrt(-scores))
print('Average RMSE for the XBGRegressor model:', np.mean(np.sqrt(-scores)))

Cross-val RMSE for the XBGRegressor model: [1391.69458912 1308.06238861 1289.88916125]
Average RMSE for the XBGRegressor model: 1329.8820463270154


In [7]:
features_dict_lgb = {
    'min_max_scaled': ['date_mutation', 'lat', 'lon', 'n_metros_within_0.5km', 'n_trains_within_0.5km'], 
    'standard_scaled': ['surface_carrez_1er_lot', 'surface_carrez_2e_lot', 'surface_reelle_bati'], 
    'one_hot_encoded': ['commune'], 
    'target_encoded': [], 
    'count_freq_encoded': ['district'],
    'unprocessed': ['nb_lots', 'nb_pieces'],
}

features_lgb = pipe.get_features_from_dict(features_dict_lgb)

#scores = cross_val_score(lgbr_model, train[features_lgb], train[cst.target_col], cv=3, scoring='neg_mean_squared_error')

In [8]:
print('Cross-val RMSE for the LGBMRegressor model:', np.sqrt(-scores))
print('Average RMSE for the LGBMRegressor model:', np.mean(np.sqrt(-scores)))

Cross-val RMSE for the LGBMRegressor model: [1391.69458912 1308.06238861 1289.88916125]
Average RMSE for the LGBMRegressor model: 1329.8820463270154


In [11]:
# I build a custom KFold generator to perform cross-validation with the Catboost model
# Since we need X_train and X_test as datasets with column names to encode 'commune' using Catboost categorical feature handling method

features_dict_cb = {
    'min_max_scaled': [], 
    'standard_scaled': ['surface_carrez_1er_lot', 'surface_carrez_2e_lot', 'surface_reelle_bati'], 
    'one_hot_encoded': [], 
    'target_encoded': [], 
    'count_freq_encoded': ['district'],
    'unprocessed': ['lat', 'lon', 'date_mutation', 'n_metros_within_0.5km', 'n_trains_within_0.5km', 'nb_lots', 'nb_pieces', 'commune'],
}

features_cb = pipe.get_features_from_dict(features_dict_cb)
encoder_cb = pipe.build_pipeline(catboost_model, features_dict_cb)[:2]

In [12]:
X = train.reset_index(drop=True)[features_cb]
y = train.reset_index(drop=True)[cst.target_col]
scores = []

kf = KFold(n_splits=3)
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]

    X_train_encoded = encoder_cb.fit_transform(X_train, y_train)
    X_train_encoded = pd.DataFrame(columns=features_cb, data=X_train_encoded)
    X_test_encoded = encoder_cb.transform(X_test)
    X_test_encoded = pd.DataFrame(columns=features_cb, data=X_test_encoded)

    catboost_model.fit(X_train_encoded, y_train)
    y_pred = catboost_model.predict(X_test_encoded)
    scores.append(mean_squared_error(y_test, y_pred))



In [13]:
print('Cross-val RMSE for the Catboost regressor model:', np.sqrt(scores))
print('Average RMSE for the Catboost regressor:', np.mean(np.sqrt(scores)))

Cross-val RMSE for the Catboost regressor model: [1672.17723367 1497.17838261 1309.755746  ]
Average RMSE for the Catboost regressor: 1493.037120759305


# Train the selected model on the full dataset

In [14]:
selected_model = clone(lgbr_model)

In [15]:
X, y = train[features_lgb], train[cst.target_col]
selected_model.fit(X, y)

Pipeline(steps=[('feature_engine',
                 CountFrequencyEncoder(encoding_method='frequency',
                                       variables=['district'])),
                ('encoder',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['date_mutation', 'lat',
                                                   'lon',
                                                   'n_metros_within_0.5km',
                                                   'n_trains_within_0.5km']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['surface_carrez_1er_lot',
                                                   'surface_carrez_2e_lot',
                                    

In [16]:
train_predictions_m2 = selected_model.predict(X)
train_predictions_full_price = train_predictions_m2 * train['surface_reelle_bati']
train_mape_score = mean_absolute_percentage_error(train_predictions_m2, y)
print(train_mape_score)

0.08859827311484908


In [17]:
fig = px.histogram(train_predictions_m2, title='Distribution of train set predictions (m2)')
fig.update_layout(xaxis_title='Predicted price per m2', yaxis_title='Count', showlegend=False)
fig.show()

fig = px.histogram(train_predictions_full_price, title='Distribution of train set predictions (full price)')
fig.update_layout(xaxis_title='Predicted price', yaxis_title='Count', showlegend=False)
fig.show()

# Predict on the test set

In [18]:
assert test[features_lgb].isnull().sum().sum()==0

In [19]:
test_predictions_m2 = selected_model.predict(test[features_lgb])
test_predictions_full_price = test_predictions_m2 * test['surface_reelle_bati']

In [20]:
fig = px.histogram(test_predictions_m2, title='Distribution of test set predictions (m2)')
fig.update_layout(xaxis_title='Predicted price per m2', yaxis_title='Count', showlegend=False)
fig.show()

fig = px.histogram(test_predictions_full_price, title='Distribution of test set predictions (full price)')
fig.update_layout(xaxis_title='Predicted price', yaxis_title='Count', showlegend=False)
fig.show()

In [21]:
test['Valeur fonciere'] = test_predictions_full_price
test.head()

Unnamed: 0_level_0,date_mutation,nature_mutation,type_voie,commune,surface_carrez_1er_lot,surface_carrez_2e_lot,nb_lots,type_local,surface_reelle_bati,nb_pieces,lon,lat,district,coordinates,n_metros_within_0.5km,n_trains_within_0.5km,med_revenue_iris_2018,Valeur fonciere
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
249349,1530,Vente,AV,BOULOGNE-BILLANCOURT,79.4,0.0,1,Appartement,78.0,4.0,2.252551,48.837822,Auteuil Sud,"(48.837822, 2.252551)",1,0.0,21300.0,708077.647602
249400,1532,Vente,RUE,ISSY-LES-MOULINEAUX,32.88,0.0,1,Appartement,34.0,1.0,2.282171,48.831289,Vaugirard-Parc des Expositions,"(48.831289, 2.282171)",1,0.0,29960.0,278153.042451
249800,1528,Vente,RUE,PARIS 08,33.66,0.0,1,Appartement,34.0,2.0,2.301586,48.873205,Hoche-Friedland,"(48.873205, 2.301586)",4,1.0,30380.0,357779.484702
249943,1528,Vente,RUE,PARIS 03,50.67,0.0,1,Appartement,59.0,2.0,2.364598,48.857983,Archives,"(48.857983, 2.364598)",4,0.0,40470.0,568243.719151
250150,1529,Vente,RUE,PARIS 03,19.7,0.0,1,Appartement,20.0,1.0,2.365183,48.861299,Enfants Rouges,"(48.861299, 2.365183)",5,0.0,36220.0,175263.629417


In [22]:
test_results = test.reset_index()[['index', 'Valeur fonciere']]
test_results.head()

Unnamed: 0,index,Valeur fonciere
0,249349,708077.647602
1,249400,278153.042451
2,249800,357779.484702
3,249943,568243.719151
4,250150,175263.629417


In [23]:
test_results.to_csv(cst.PREDICTIONS_PATH)