# Model trained for each city

This notebook demonstrates the process of building and evaluating regression models for predicting the `realsum_cut` variable for different cities (Barcelona, Paris, Rome).

### Loading Dependencies
Importing necessary libraries for data manipulation, feature selection, model building, and evaluation. We will also import the custom functions created for this project.

In [169]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    make_scorer
)
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    GridSearchCV
)
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet
)
from scipy.stats import uniform
from sklearn.exceptions import ConvergenceWarning
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from additional_functions import (
    CV_wrapper, CV_rmse_wrapper, shuffle_CV, shuffle_CV_rmse, ols_model_wrapper, scoring_wrapper, lasso_model_wrapper, ridge_model_wrapper, EN_model_wrapper, cv_proc
)
import warnings
import pickle

warnings.simplefilter("ignore", category=ConvergenceWarning)

pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 150)

### Loading Data
Loading training datasets for Barcelona, Paris, and Rome, as well as precomputed general rankings of features for each city.

In [7]:
df_barcelona = pd.read_csv("data/output/data_barcelona_train.csv")
df_paris = pd.read_csv("data/output/data_paris_train.csv")
df_rome = pd.read_csv("data/output/data_rome_train.csv")

fr_barcelona = pd.read_csv("data/output/general_ranking_barcelona.csv", index_col=0)
fr_paris = pd.read_csv("data/output/general_ranking_paris.csv", index_col=0)
fr_rome = pd.read_csv("data/output/general_ranking_rome.csv", index_col=0)

### Feature Selection
We will select a few features dataset, based on different metrics, i.e. correlation, mutual information, and F-score. We will also select features based on the results of the Elastic Net model, Recursive Feature Elimination, Sequential Feature Selector, and Boruta.

1. Correlation

In [8]:
corr_features_bar = fr_barcelona['corr'].sort_values(ascending=False).index[0:5]
corr_features_par = fr_paris['corr'].sort_values(ascending=False).index[0:5]
corr_features_rom = fr_rome['corr'].sort_values(ascending=False).index[0:5]

2. Elastic Net

In [9]:
features_selection_en_bar = fr_barcelona["EN_coef"].dropna().index.tolist()
features_selection_en_par = fr_paris["EN_coef"].dropna().index.tolist()
features_selection_en_rom = fr_rome["EN_coef"].dropna().index.tolist()

In [10]:
features_selection_en_bar.remove("guest_satisfaction_overall")
features_selection_en_bar.remove("room_private")
features_selection_en_bar.remove("multi")

features_selection_en_par.remove("guest_satisfaction_overall")
features_selection_en_par.remove("room_private")
features_selection_en_par.remove("multi")

features_selection_en_rom.remove("guest_satisfaction_overall")
features_selection_en_rom.remove("room_private")
features_selection_en_rom.remove("multi")

3. Recursive Feature Elimination

In [11]:
features_candidates_rec_bar = df_barcelona.columns.tolist()
features_candidates_rec_bar.remove("realsum_cut")

features_candidates_rec_par = df_paris.columns.tolist()
features_candidates_rec_par.remove("realsum_cut")

features_candidates_rec_rom = df_rome.columns.tolist()
features_candidates_rec_rom.remove("realsum_cut")

In [12]:
estimator = LinearRegression()
selector = RFECV(estimator, step=1, cv=5, min_features_to_select=10)

selector = selector.fit(
    df_barcelona.loc[:, features_candidates_rec_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel()
)
features_candidates_rec_bar = df_barcelona.loc[:, features_candidates_rec_bar].iloc[:, selector.support_].columns.tolist()

selector = selector.fit(
    df_paris.loc[:, features_candidates_rec_par].values, df_paris.loc[:, "realsum_cut"].values.ravel()
)
features_candidates_rec_par = df_paris.loc[:, features_candidates_rec_par].iloc[:, selector.support_].columns.tolist()

selector = selector.fit(
    df_rome.loc[:, features_candidates_rec_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel()
)
features_candidates_rec_rom = df_rome.loc[:, features_candidates_rec_rom].iloc[:, selector.support_].columns.tolist()

4. Sequential Feature Selector

In [13]:
features_selection_back_bar = df_barcelona.columns.tolist()
features_selection_back_bar.remove("realsum_cut")

features_selection_back_par = df_paris.columns.tolist()
features_selection_back_par.remove("realsum_cut")

features_selection_back_rom = df_rome.columns.tolist()
features_selection_back_rom.remove("realsum_cut")

In [14]:
model= LinearRegression()

sf = SFS(
    model,
    k_features=(1, 10),
    forward=True,
    floating=False,
    verbose=0,
    scoring="neg_mean_squared_error",
    cv=5,
)

sffit = sf.fit(df_barcelona.loc[:, features_selection_back_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())
features_selection_back_bar = df_barcelona.loc[:, features_selection_back_bar].columns[list(sffit.k_feature_idx_)]

sffit = sf.fit(df_paris.loc[:, features_selection_back_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())
features_selection_back_par = df_paris.loc[:, features_selection_back_par].columns[list(sffit.k_feature_idx_)]

sffit = sf.fit(df_rome.loc[:, features_selection_back_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())
features_selection_back_rom = df_rome.loc[:, features_selection_back_rom].columns[list(sffit.k_feature_idx_)]

#### Model evaluation with cross-validation approach
We will evaluate the performance of the models using the cross-validation approach. We will use the following metrics: mean absolute percentage error (MAPE), mean absolute error (MAE), and root mean squared error (RMSE). We will also shuffle the data and evaluate the models using the same metrics. We will be using custom functions to evaluate the models.

In [15]:
model_ols = LinearRegression()
(CV_wrapper(df_barcelona, corr_features_bar, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, corr_features_bar, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_barcelona, corr_features_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, corr_features_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_barcelona, corr_features_bar, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_barcelona, corr_features_bar, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.324073
 cv_val      0.325246
 dtype: float64,
 cv_train    121.250344
 cv_val      121.311725
 dtype: float64,
 cv_train    0.324080
 cv_val      0.325145
 dtype: float64,
 cv_train    121.252342
 cv_val      121.693285
 dtype: float64,
 cv_train    0.322960
 cv_val      0.330138
 dtype: float64,
 cv_train    120.685517
 cv_val      123.062210
 dtype: float64)

In [16]:
(CV_wrapper(df_barcelona, features_selection_en_bar, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, features_selection_en_bar, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_barcelona, features_selection_en_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, features_selection_en_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_barcelona, features_selection_en_bar, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_barcelona, features_selection_en_bar, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.298691
 cv_val      0.300762
 dtype: float64,
 cv_train    115.721985
 cv_val      116.151767
 dtype: float64,
 cv_train    0.298721
 cv_val      0.301339
 dtype: float64,
 cv_train    115.720214
 cv_val      116.709301
 dtype: float64,
 cv_train    0.298645
 cv_val      0.305653
 dtype: float64,
 cv_train    115.071281
 cv_val      118.120806
 dtype: float64)

In [17]:
(CV_wrapper(df_barcelona, features_selection_back_bar, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, features_selection_back_bar, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_barcelona, features_selection_back_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, features_selection_back_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_barcelona, features_selection_back_bar, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_barcelona, features_selection_back_bar, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.298506
 cv_val      0.300055
 dtype: float64,
 cv_train    115.758574
 cv_val      116.060216
 dtype: float64,
 cv_train    0.298531
 cv_val      0.300305
 dtype: float64,
 cv_train    115.761178
 cv_val      116.526678
 dtype: float64,
 cv_train    0.298405
 cv_val      0.304339
 dtype: float64,
 cv_train    115.135030
 cv_val      117.949626
 dtype: float64)

In [18]:
(CV_wrapper(df_barcelona, features_candidates_rec_bar, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, features_candidates_rec_bar, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_barcelona, features_candidates_rec_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_barcelona, features_candidates_rec_bar, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_barcelona, features_candidates_rec_bar, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_barcelona, features_candidates_rec_bar, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.298456
 cv_val      0.300369
 dtype: float64,
 cv_train    115.727480
 cv_val      116.109915
 dtype: float64,
 cv_train    0.298488
 cv_val      0.300739
 dtype: float64,
 cv_train    115.728158
 cv_val      116.619966
 dtype: float64,
 cv_train    0.298369
 cv_val      0.305064
 dtype: float64,
 cv_train    115.083457
 cv_val      118.066039
 dtype: float64)

In [19]:
(CV_wrapper(df_paris, corr_features_par, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, corr_features_par, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_paris, corr_features_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, corr_features_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_paris, corr_features_par, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_paris, corr_features_par, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.368739
 cv_val      0.369420
 dtype: float64,
 cv_train    175.994451
 cv_val      176.096363
 dtype: float64,
 cv_train    0.368761
 cv_val      0.369101
 dtype: float64,
 cv_train    175.996297
 cv_val      175.982847
 dtype: float64,
 cv_train    0.369285
 cv_val      0.369439
 dtype: float64,
 cv_train    176.376613
 cv_val      174.956156
 dtype: float64)

In [20]:
(CV_wrapper(df_paris, features_selection_en_par, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, features_selection_en_par, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_paris, features_selection_en_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, features_selection_en_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_paris, features_selection_en_par, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_paris, features_selection_en_par, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.32878
 cv_val      0.33002
 dtype: float64,
 cv_train    159.857185
 cv_val      160.317761
 dtype: float64,
 cv_train    0.328760
 cv_val      0.329646
 dtype: float64,
 cv_train    159.861901
 cv_val      160.069747
 dtype: float64,
 cv_train    0.329664
 cv_val      0.329176
 dtype: float64,
 cv_train    160.244623
 cv_val      159.052378
 dtype: float64)

In [21]:
(CV_wrapper(df_paris, features_selection_back_par, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, features_selection_back_par, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_paris, features_selection_back_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, features_selection_back_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_paris, features_selection_back_par, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_paris, features_selection_back_par, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.327131
 cv_val      0.327992
 dtype: float64,
 cv_train    159.227814
 cv_val      159.497709
 dtype: float64,
 cv_train    0.327117
 cv_val      0.327861
 dtype: float64,
 cv_train    159.230815
 cv_val      159.331195
 dtype: float64,
 cv_train    0.327737
 cv_val      0.328030
 dtype: float64,
 cv_train    159.536121
 cv_val      158.554046
 dtype: float64)

In [22]:
(CV_wrapper(df_paris, features_candidates_rec_par, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, features_candidates_rec_par, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_paris, features_candidates_rec_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_paris, features_candidates_rec_par, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_paris, features_candidates_rec_par, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_paris, features_candidates_rec_par, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.326919
 cv_val      0.328376
 dtype: float64,
 cv_train    158.956937
 cv_val      159.462674
 dtype: float64,
 cv_train    0.326924
 cv_val      0.327799
 dtype: float64,
 cv_train    158.962325
 cv_val      159.199517
 dtype: float64,
 cv_train    0.327597
 cv_val      0.328042
 dtype: float64,
 cv_train    159.286094
 cv_val      158.364783
 dtype: float64)

In [23]:
(CV_wrapper(df_rome, corr_features_rom, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, corr_features_rom, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_rome, corr_features_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, corr_features_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_rome, corr_features_rom, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_rome, corr_features_rom, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.262915
 cv_val      0.263078
 dtype: float64,
 cv_train    70.766413
 cv_val      70.744913
 dtype: float64,
 cv_train    0.26291
 cv_val      0.26316
 dtype: float64,
 cv_train    70.764806
 cv_val      70.730000
 dtype: float64,
 cv_train    0.262993
 cv_val      0.262828
 dtype: float64,
 cv_train    70.850707
 cv_val      70.530659
 dtype: float64)

In [24]:
(CV_wrapper(df_rome, features_selection_en_rom, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, features_selection_en_rom, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_rome, features_selection_en_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, features_selection_en_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_rome, features_selection_en_rom, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_rome, features_selection_en_rom, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.242658
 cv_val      0.243200
 dtype: float64,
 cv_train    66.273342
 cv_val      66.349094
 dtype: float64,
 cv_train    0.242656
 cv_val      0.243230
 dtype: float64,
 cv_train    66.271127
 cv_val      66.293823
 dtype: float64,
 cv_train    0.242747
 cv_val      0.244003
 dtype: float64,
 cv_train    66.297068
 cv_val      66.274527
 dtype: float64)

In [25]:
(CV_wrapper(df_rome, features_selection_back_rom, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, features_selection_back_rom, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_rome, features_selection_back_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, features_selection_back_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_rome, features_selection_back_rom, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_rome, features_selection_back_rom, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.243348
 cv_val      0.243684
 dtype: float64,
 cv_train    66.316611
 cv_val      66.356529
 dtype: float64,
 cv_train    0.24334
 cv_val      0.24382
 dtype: float64,
 cv_train    66.313880
 cv_val      66.308558
 dtype: float64,
 cv_train    0.243319
 cv_val      0.244818
 dtype: float64,
 cv_train    66.318376
 cv_val      66.362151
 dtype: float64)

In [26]:
(CV_wrapper(df_rome, features_candidates_rec_rom, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, features_candidates_rec_rom, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df_rome, features_candidates_rec_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df_rome, features_candidates_rec_rom, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df_rome, features_candidates_rec_rom, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df_rome, features_candidates_rec_rom, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.241865
 cv_val      0.242456
 dtype: float64,
 cv_train    66.116590
 cv_val      66.197232
 dtype: float64,
 cv_train    0.241864
 cv_val      0.242495
 dtype: float64,
 cv_train    66.114313
 cv_val      66.147092
 dtype: float64,
 cv_train    0.241976
 cv_val      0.243325
 dtype: float64,
 cv_train    66.120678
 cv_val      66.186610
 dtype: float64)

We have that the best OLS model performance is achieved with the features obtained with Forward Sequential Feature Selector method for Barcelona and Paris, and Recursive Feature Elimination method for Rome. Let's save the models.

In [170]:
model_bar = LinearRegression()
model_bar.fit(df_barcelona.loc[:, features_selection_back_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_bar, open("models/model_barcelona.sav", "wb"))

model_par = LinearRegression()
model_par.fit(df_paris.loc[:, features_selection_back_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_par, open("models/model_paris.sav", "wb"))

model_rom = LinearRegression()
model_rom.fit(df_rome.loc[:, features_candidates_rec_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_rom, open("models/model_rome.sav", "wb"))

#### Regularization


1. Lasso

In [27]:
(lasso_model_wrapper(df_barcelona, corr_features_bar, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_barcelona, features_selection_en_bar, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_barcelona, features_selection_back_bar, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_barcelona, features_candidates_rec_bar, 10, 123, display_res=True).mean())

(cv_train    0.325544
 cv_val      0.326529
 dtype: float64,
 cv_train    0.298697
 cv_val      0.300256
 dtype: float64,
 cv_train    0.298696
 cv_val      0.300247
 dtype: float64,
 cv_train    0.298696
 cv_val      0.300247
 dtype: float64)

In [28]:
(lasso_model_wrapper(df_paris, corr_features_par, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_paris, features_selection_en_par, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_paris, features_selection_back_par, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_paris, features_candidates_rec_par, 10, 123, display_res=True).mean())

(cv_train    0.369263
 cv_val      0.369600
 dtype: float64,
 cv_train    0.330401
 cv_val      0.331052
 dtype: float64,
 cv_train    0.335160
 cv_val      0.335809
 dtype: float64,
 cv_train    0.327687
 cv_val      0.328406
 dtype: float64)

In [29]:
(lasso_model_wrapper(df_rome, corr_features_rom, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_rome, features_selection_en_rom, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_rome, features_selection_back_rom, 10, 123, display_res=True).mean(),
    lasso_model_wrapper(df_rome, features_candidates_rec_rom, 10, 123, display_res=True).mean())

(cv_train    0.264011
 cv_val      0.264274
 dtype: float64,
 cv_train    0.245345
 cv_val      0.245875
 dtype: float64,
 cv_train    0.245779
 cv_val      0.246180
 dtype: float64,
 cv_train    0.245342
 cv_val      0.245873
 dtype: float64)

Lasso did not improved any of the models. Let's try Ridge.

2. Ridge

In [30]:
(ridge_model_wrapper(df_barcelona, corr_features_bar, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_barcelona, features_selection_en_bar, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_barcelona, features_selection_back_bar, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_barcelona, features_candidates_rec_bar, 10, 123, display_res=True).mean())

(cv_train    0.324206
 cv_val      0.325268
 dtype: float64,
 cv_train    0.298773
 cv_val      0.300819
 dtype: float64,
 cv_train    0.298716
 cv_val      0.300376
 dtype: float64,
 cv_train    0.298661
 cv_val      0.300482
 dtype: float64)

In [31]:
(ridge_model_wrapper(df_paris, corr_features_par, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_paris, features_selection_en_par, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_paris, features_selection_back_par, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_paris, features_candidates_rec_par, 10, 123, display_res=True).mean())

(cv_train    0.368757
 cv_val      0.369097
 dtype: float64,
 cv_train    0.326628
 cv_val      0.327403
 dtype: float64,
 cv_train    0.324629
 cv_val      0.325325
 dtype: float64,
 cv_train    0.324968
 cv_val      0.325809
 dtype: float64)

In [32]:
(ridge_model_wrapper(df_rome, corr_features_rom, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_rome, features_selection_en_rom, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_rome, features_selection_back_rom, 10, 123, display_res=True).mean(),
    ridge_model_wrapper(df_rome, features_candidates_rec_rom, 10, 123, display_res=True).mean())

(cv_train    0.262916
 cv_val      0.263166
 dtype: float64,
 cv_train    0.242499
 cv_val      0.243061
 dtype: float64,
 cv_train    0.242948
 cv_val      0.243433
 dtype: float64,
 cv_train    0.241733
 cv_val      0.242351
 dtype: float64)

Same with Ridge. Finally, let's try Elastic Net.

3. Elastic Net

In [33]:
(EN_model_wrapper(df_barcelona, corr_features_bar, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_barcelona, features_selection_en_bar, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_barcelona, features_selection_back_bar, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_barcelona, features_candidates_rec_bar, 10, 123, display_res=True).mean())

(cv_train    0.324745
 cv_val      0.325785
 dtype: float64,
 cv_train    0.299149
 cv_val      0.301084
 dtype: float64,
 cv_train    0.299100
 cv_val      0.300728
 dtype: float64,
 cv_train    0.298826
 cv_val      0.300541
 dtype: float64)

In [34]:
(EN_model_wrapper(df_paris, corr_features_par, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_paris, features_selection_en_par, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_paris, features_selection_back_par, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_paris, features_candidates_rec_par, 10, 123, display_res=True).mean())

(cv_train    0.368735
 cv_val      0.369078
 dtype: float64,
 cv_train    0.330134
 cv_val      0.330901
 dtype: float64,
 cv_train    0.332274
 cv_val      0.332932
 dtype: float64,
 cv_train    0.327884
 cv_val      0.328619
 dtype: float64)

In [35]:
(EN_model_wrapper(df_rome, corr_features_rom, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_rome, features_selection_en_rom, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_rome, features_selection_back_rom, 10, 123, display_res=True).mean(),
    EN_model_wrapper(df_rome, features_candidates_rec_rom, 10, 123, display_res=True).mean())

(cv_train    0.263028
 cv_val      0.263280
 dtype: float64,
 cv_train    0.243673
 cv_val      0.244186
 dtype: float64,
 cv_train    0.243985
 cv_val      0.244458
 dtype: float64,
 cv_train    0.242910
 cv_val      0.243469
 dtype: float64)

Still no significant improvements. Let tune hyperparemeters for the Lasso model.

In [36]:
estimator = Lasso()

parameter_space = dict(
   alpha=np.logspace(-3, 3, 20),
    #l1_ratio=uniform(0,1),
   fit_intercept=[True, False],
)

cross_validation_schema = KFold(n_splits=10, shuffle=True)
score_function = {'mape': 'neg_mean_absolute_percentage_error',
           'mae': make_scorer(mean_absolute_error, greater_is_better=False)}

n_iterations = 100

In [37]:
random_search_cv = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=parameter_space,
    n_iter=n_iterations,
    n_jobs=-1,
    cv=cross_validation_schema,
    scoring=score_function,
    refit="mape",
    random_state=123,
    return_train_score=False,
    verbose=3 #Controls the verbosity: the higher, the more messages.

)
search_bar = random_search_cv.fit(df_barcelona[features_selection_back_bar], df_barcelona["realsum_cut"])

Fitting 10 folds for each of 40 candidates, totalling 400 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [38]:
search_bar.best_score_

np.float64(-0.2998620755504134)

In [39]:
search_par = random_search_cv.fit(df_paris[features_selection_back_par], df_paris["realsum_cut"])

Fitting 10 folds for each of 40 candidates, totalling 400 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [40]:
search_par.best_score_

np.float64(-0.3250357895548496)

In [41]:
search_rom = random_search_cv.fit(df_rome[features_selection_back_rom], df_rome["realsum_cut"])

search_rom.best_score_

Fitting 10 folds for each of 40 candidates, totalling 400 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

[CV 7/10] END alpha=0.001, fit_intercept=True; mae: (test=-75.777) mape: (test=-0.279) total time=   0.0s
[CV 5/10] END alpha=0.00206913808111479, fit_intercept=False; mae: (test=-73.798) mape: (test=-0.295) total time=   0.1s
[CV 6/10] END alpha=0.004281332398719396, fit_intercept=True; mae: (test=-69.607) mape: (test=-0.282) total time=   0.0s
[CV 6/10] END alpha=0.004281332398719396, fit_intercept=False; mae: (test=-69.711) mape: (test=-0.282) total time=   0.1s
[CV 3/10] END alpha=0.008858667904100823, fit_intercept=True; mae: (test=-74.214) mape: (test=-0.280) total time=   0.0s
[CV 4/10] END alpha=0.008858667904100823, fit_intercept=True; mae: (test=-81.835) mape: (test=-0.333) total time=   0.0s
[CV 1/10] END alpha=0.018329807108324356, fit_intercept=True; mae: (test=-76.356) mape: (test=-0.309) total time=   0.0s
[CV 2/10] END alpha=0.018329807108324356, fit_intercept=True; mae: (test=-81.242) mape: (test=-0.304) total time=   0.0s
[CV 3/10] END alpha=0.018329807108324356, fit_

np.float64(-0.24326220513060334)

Still no significant improvements. Let's stay with the already saved OLS models.

### KNN Regression
Now, we will build KNN regression models for each city. We will use the same feature selection methods as before.
#### Feature Selection

1. Mutual Information

In [42]:
fr_barcelona.sort_values("mi_score", ascending=False, inplace=True)
fr_paris.sort_values("mi_score", ascending=False, inplace=True)
fr_rome.sort_values("mi_score", ascending=False, inplace=True)

mi_features_bar = fr_barcelona.iloc[0:8].index.tolist()
mi_features_10_bar = fr_barcelona.iloc[0:10].index.tolist()
mi_features_14_bar = fr_barcelona.iloc[0:14].index.tolist()
mi_features_17_bar = fr_barcelona.iloc[0:17].index.tolist()
mi_features_20_bar = fr_barcelona.iloc[0:20].index.tolist()

mi_features_par = fr_paris.iloc[0:8].index.tolist()
mi_features_10_par = fr_paris.iloc[0:10].index.tolist()
mi_features_14_par = fr_paris.iloc[0:14].index.tolist()
mi_features_17_par = fr_paris.iloc[0:17].index.tolist()
mi_features_20_par = fr_paris.iloc[0:20].index.tolist()

mi_features_rom = fr_rome.iloc[0:8].index.tolist()
mi_features_10_rom = fr_rome.iloc[0:10].index.tolist()
mi_features_14_rom = fr_rome.iloc[0:14].index.tolist()
mi_features_17_rom = fr_rome.iloc[0:17].index.tolist()
mi_features_20_rom = fr_rome.iloc[0:20].index.tolist()

2. Boruta

In [171]:
br_features_bar = fr_barcelona[fr_barcelona.boruta_rank.isin([1,2,3])].index.tolist()
br_features_par = fr_paris[fr_paris.boruta_rank.isin([1,2,3])].index.tolist()
br_features_rom = fr_rome[fr_rome.boruta_rank.isin([1,2,3])].index.tolist()

3. Correlation

In [172]:
fr_barcelona["corr_abs"] = np.abs(fr_barcelona["corr"])
fr_barcelona.sort_values("corr_abs", ascending=False, inplace=True)
corr_features_bar = fr_barcelona.iloc[0:12].index.tolist()

fr_paris["corr_abs"] = np.abs(fr_paris["corr"])
fr_paris.sort_values("corr_abs", ascending=False, inplace=True)
corr_features_par = fr_paris.iloc[0:12].index.tolist()

fr_rome["corr_abs"] = np.abs(fr_rome["corr"])
fr_rome.sort_values("corr_abs", ascending=False, inplace=True)
corr_features_rom = fr_rome.iloc[0:12].index.tolist()

4. Sequential Feature Selector

In [43]:
param_knn = {
    "n_neighbors": [3, 5, 7, 10, 12, 15, 20, 25, 40, 50, 100],
    "weights": ["uniform", "distance"],
    "metric": ["chebyshev", "manhattan", "minkowski"],
    "p": [1, 2],
}

In [44]:
mse = make_scorer(mean_squared_error, greater_is_better=False)
model = KNeighborsRegressor()

grid_CV_bar = GridSearchCV(
    model, param_knn, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_bar.fit(df_barcelona.loc[:, mi_features_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())


In [45]:
grid_CV_bar.best_score_

np.float64(-9970.148741217514)

In [46]:
grid_CV_par = GridSearchCV(
    model, param_knn, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_par.fit(df_paris.loc[:, mi_features_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())


In [47]:
grid_CV_par.best_score_

np.float64(-15590.778137057107)

In [48]:
grid_CV_rom = GridSearchCV(
    model, param_knn, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_rom.fit(df_rome.loc[:, mi_features_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())

In [51]:
sf_features_bar = df_barcelona.columns.tolist()
sf_features_bar.remove("realsum_cut")

sf_features_par = df_paris.columns.tolist()
sf_features_par.remove("realsum_cut")

sf_features_rom = df_rome.columns.tolist()
sf_features_rom.remove("realsum_cut")

In [52]:
model_bar = KNeighborsRegressor(**grid_CV_bar.best_params_)

sf = SFS(
    model_bar,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_barcelona.loc[:, sf_features_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel()
)
sf_features_bar = df_barcelona.loc[:, sf_features_bar].columns[list(sffit.k_feature_idx_)]


In [53]:
model_par = KNeighborsRegressor(**grid_CV_par.best_params_)

sf = SFS(
    model_par,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_paris.loc[:, sf_features_par].values, df_paris.loc[:, "realsum_cut"].values.ravel()
)
sf_features_par = df_paris.loc[:, sf_features_par].columns[list(sffit.k_feature_idx_)]

In [54]:
model_rom = KNeighborsRegressor(**grid_CV_rom.best_params_)

sf = SFS(
    model_rom,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_rome.loc[:, sf_features_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel()
)
sf_features_rom = df_rome.loc[:, sf_features_rom].columns[list(sffit.k_feature_idx_)]

In [55]:
mse = make_scorer(mean_squared_error, greater_is_better=False)

#### Hyperparameters Tuning

In [57]:
(cv_proc(df_barcelona, br_features_bar, model_bar, param_knn, mse),
cv_proc(df_barcelona, corr_features_bar, model_bar, param_knn, mse),
cv_proc(df_barcelona, mi_features_bar, model_bar, param_knn, mse),
cv_proc(df_barcelona, mi_features_10_bar, model_bar, param_knn, mse),
cv_proc(df_barcelona, mi_features_14_bar, model_bar, param_knn, mse),
cv_proc(df_barcelona, mi_features_17_bar, model_bar, param_knn, mse),
cv_proc(df_barcelona, mi_features_20_bar, model_bar, param_knn, mse),
cv_proc(df_barcelona, sf_features_bar, model_bar, param_knn, mse))


{'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
-13311.948959618865
{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-9483.626804105468
{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-9970.148741217514
{'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'}
-9963.975823032093
{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-9879.514533754633
{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-10034.761614962485
{'metric': 'manhattan', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
-12495.07663447646
{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-8652.146073511345


(None, None, None, None, None, None, None, None)

In [174]:
hp_barcelona = [
    {'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
]

In [59]:
(cv_proc(df_paris, br_features_par, model_par, param_knn, mse),
cv_proc(df_paris, corr_features_par, model_par, param_knn, mse),
cv_proc(df_paris, mi_features_par, model_par, param_knn, mse),
cv_proc(df_paris, mi_features_10_par, model_par, param_knn, mse),
cv_proc(df_paris, mi_features_14_par, model_par, param_knn, mse),
cv_proc(df_paris, mi_features_17_par, model_par, param_knn, mse),
cv_proc(df_paris, mi_features_20_par, model_par, param_knn, mse),
cv_proc(df_paris, sf_features_par, model_par, param_knn, mse))

{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-15536.569954497065
{'metric': 'manhattan', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}
-13521.225199450466
{'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'}
-15590.778137057107
{'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'}
-14720.821974412243
{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-14815.859479748324
{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-14690.381934334853
{'metric': 'manhattan', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
-22238.72174837941
{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-12969.203484618562


(None, None, None, None, None, None, None, None)

In [175]:
hp_paris = [
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'},
{'metric': 'manhattan', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'},
{'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'},
{'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'},
{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'},
{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'},
{'metric': 'manhattan', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'},
{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
]

In [61]:
(cv_proc(df_rome, br_features_rom, model_rom, param_knn, mse),
cv_proc(df_rome, corr_features_rom, model_rom, param_knn, mse),
cv_proc(df_rome, mi_features_rom, model_rom, param_knn, mse),
cv_proc(df_rome, mi_features_10_rom, model_rom, param_knn, mse),
cv_proc(df_rome, mi_features_14_rom, model_rom, param_knn, mse),
cv_proc(df_rome, mi_features_17_rom, model_rom, param_knn, mse),
cv_proc(df_rome, mi_features_20_rom, model_rom, param_knn, mse),
cv_proc(df_rome, sf_features_rom, model_rom, param_knn, mse))

{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-2307.554576451804
{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-2276.210396681318
{'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'}
-4491.922318406288
{'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-3989.024486028245
{'metric': 'manhattan', 'n_neighbors': 12, 'p': 1, 'weights': 'distance'}
-3658.97590815891
{'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
-3471.803967153086
{'metric': 'manhattan', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
-3405.6393308226848
{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-2040.5564569982143


(None, None, None, None, None, None, None, None)

In [176]:
hp_rome = [
    {'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 12, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
]

#### Model Evaluation

In [63]:
model = KNeighborsRegressor(**hp_barcelona[0])
cv_output0_rmse = CV_rmse_wrapper(df_barcelona, br_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_barcelona, br_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_barcelona[1])
cv_output1_rmse = CV_rmse_wrapper(df_barcelona, corr_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_barcelona, corr_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_barcelona[2])
cv_output2_rmse = CV_rmse_wrapper(df_barcelona, mi_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_barcelona, mi_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_barcelona[3])
cv_output3_rmse = CV_rmse_wrapper(df_barcelona, mi_features_10_bar, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_barcelona, mi_features_10_bar, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_barcelona[4])
cv_output4_rmse = CV_rmse_wrapper(df_barcelona, mi_features_14_bar, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_barcelona, mi_features_14_bar, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_barcelona[5])
cv_output5_rmse = CV_rmse_wrapper(df_barcelona, mi_features_17_bar, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_barcelona, mi_features_17_bar, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_barcelona[6])
cv_output6_rmse = CV_rmse_wrapper(df_barcelona, mi_features_20_bar, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_barcelona, mi_features_20_bar, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_barcelona[7])
cv_output7_rmse = CV_rmse_wrapper(df_barcelona, sf_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_barcelona, sf_features_bar, model, 10, 123, shuff=True,  display_res=True)

In [64]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.000000
 cv_val      0.275371
 dtype: float64,
 cv_train      0.000000
 cv_val      113.228865
 dtype: float64)

In [65]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.00000
 cv_val      0.18537
 dtype: float64,
 cv_train     0.000000
 cv_val      91.042965
 dtype: float64)

In [66]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.000000
 cv_val      0.194837
 dtype: float64,
 cv_train     0.000000
 cv_val      94.760397
 dtype: float64)

In [67]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.000000
 cv_val      0.194931
 dtype: float64,
 cv_train     0.000000
 cv_val      95.445891
 dtype: float64)

In [68]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.000000
 cv_val      0.196512
 dtype: float64,
 cv_train     0.000000
 cv_val      95.221672
 dtype: float64)

In [69]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.000000
 cv_val      0.193848
 dtype: float64,
 cv_train     0.000000
 cv_val      94.809398
 dtype: float64)

In [70]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.000000
 cv_val      0.261025
 dtype: float64,
 cv_train      0.000000
 cv_val      109.745001
 dtype: float64)

In [71]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.000000
 cv_val      0.179339
 dtype: float64,
 cv_train     0.000000
 cv_val      87.642538
 dtype: float64)

In [72]:
model = KNeighborsRegressor(**hp_paris[0])
cv_output0_rmse = CV_rmse_wrapper(df_paris, br_features_par, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_paris, br_features_par, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_paris[1])
cv_output1_rmse = CV_rmse_wrapper(df_paris, corr_features_par, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_paris, corr_features_par, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_paris[2])
cv_output2_rmse = CV_rmse_wrapper(df_paris, mi_features_par, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_paris, mi_features_par, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_paris[3])
cv_output3_rmse = CV_rmse_wrapper(df_paris, mi_features_10_par, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_paris, mi_features_10_par, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_paris[4])
cv_output4_rmse = CV_rmse_wrapper(df_paris, mi_features_14_par, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_paris, mi_features_14_par, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_paris[5])
cv_output5_rmse = CV_rmse_wrapper(df_paris, mi_features_17_par, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_paris, mi_features_17_par, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_paris[6])
cv_output6_rmse = CV_rmse_wrapper(df_paris, mi_features_20_par, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_paris, mi_features_20_par, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_paris[7])
cv_output7_rmse = CV_rmse_wrapper(df_paris, sf_features_par, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_paris, sf_features_par, model, 10, 123, shuff=True,  display_res=True)

In [73]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.000000
 cv_val      0.160924
 dtype: float64,
 cv_train      0.00000
 cv_val      114.41824
 dtype: float64)

In [74]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.000193
 cv_val      0.147539
 dtype: float64,
 cv_train      1.357209
 cv_val      108.370198
 dtype: float64)

In [75]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.000000
 cv_val      0.162459
 dtype: float64,
 cv_train      0.000000
 cv_val      114.562302
 dtype: float64)

In [76]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.000000
 cv_val      0.156307
 dtype: float64,
 cv_train      0.000000
 cv_val      110.806605
 dtype: float64)

In [77]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.000000
 cv_val      0.155265
 dtype: float64,
 cv_train      0.000000
 cv_val      111.009581
 dtype: float64)

In [78]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.000000
 cv_val      0.152518
 dtype: float64,
 cv_train      0.000000
 cv_val      110.535613
 dtype: float64)

In [79]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.000000
 cv_val      0.265132
 dtype: float64,
 cv_train      0.000000
 cv_val      144.761273
 dtype: float64)

In [80]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.000000
 cv_val      0.148419
 dtype: float64,
 cv_train      0.00000
 cv_val      104.88746
 dtype: float64)

In [81]:
model = KNeighborsRegressor(**hp_rome[0])
cv_output0_rmse = CV_rmse_wrapper(df_rome, br_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_rome, br_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_rome[1])
cv_output1_rmse = CV_rmse_wrapper(df_rome, corr_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_rome, corr_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_rome[2])
cv_output2_rmse = CV_rmse_wrapper(df_rome, mi_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_rome, mi_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_rome[3])
cv_output3_rmse = CV_rmse_wrapper(df_rome, mi_features_10_rom, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_rome, mi_features_10_rom, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_rome[4])
cv_output4_rmse = CV_rmse_wrapper(df_rome, mi_features_14_rom, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_rome, mi_features_14_rom, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_rome[5])
cv_output5_rmse = CV_rmse_wrapper(df_rome, mi_features_17_rom, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_rome, mi_features_17_rom, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_rome[6])
cv_output6_rmse = CV_rmse_wrapper(df_rome, mi_features_20_rom, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_rome, mi_features_20_rom, model, 10, 123, shuff=True,  display_res=True)

model = KNeighborsRegressor(**hp_rome[7])
cv_output7_rmse = CV_rmse_wrapper(df_rome, sf_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_rome, sf_features_rom, model, 10, 123, shuff=True,  display_res=True)

In [82]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.000000
 cv_val      0.132346
 dtype: float64,
 cv_train     0.000000
 cv_val      46.416077
 dtype: float64)

In [83]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.000057
 cv_val      0.129900
 dtype: float64,
 cv_train     0.271964
 cv_val      45.665003
 dtype: float64)

In [84]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.000000
 cv_val      0.239307
 dtype: float64,
 cv_train     0.000000
 cv_val      66.274585
 dtype: float64)

In [85]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.000000
 cv_val      0.227599
 dtype: float64,
 cv_train     0.000000
 cv_val      62.905584
 dtype: float64)

In [86]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.000000
 cv_val      0.213985
 dtype: float64,
 cv_train     0.00000
 cv_val      60.28393
 dtype: float64)

In [87]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.000000
 cv_val      0.207379
 dtype: float64,
 cv_train     0.000000
 cv_val      58.442811
 dtype: float64)

In [88]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.000000
 cv_val      0.204743
 dtype: float64,
 cv_train     0.000000
 cv_val      57.835544
 dtype: float64)

In [89]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.000000
 cv_val      0.122388
 dtype: float64,
 cv_train     0.000000
 cv_val      43.704434
 dtype: float64)

The best KNN model performance is achieved with the features obtained with Forward Sequential Feature Elimination method for Barcelona and Rome, and top 10 features based on the correlation with the target variable for Rome. Let's save the models.

In [177]:
model_bar = KNeighborsRegressor(**hp_barcelona[7])
model_bar.fit(df_barcelona.loc[:, sf_features_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_bar, open("models/model_barcelona_knn.sav", "wb"))

model_par = KNeighborsRegressor(**hp_paris[1])
model_par.fit(df_paris.loc[:, sf_features_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_par, open("models/model_paris_knn.sav", "wb"))

model_rom = KNeighborsRegressor(**hp_rome[7])
model_rom.fit(df_rome.loc[:, br_features_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_rom, open("models/model_rome_knn.sav", "wb"))

### Decision Tree Regression

#### Feature Selection

1. Sequential Feature Selector

In [90]:
param_dt = {
    "max_depth": [3, 5, 7, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [91]:
model_dt = DecisionTreeRegressor()
grid_CV_bar = GridSearchCV(
    model_dt, param_dt, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_bar.fit(df_barcelona.loc[:, mi_features_10_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())

grid_CV_par = GridSearchCV(
    model_dt, param_dt, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_par.fit(df_paris.loc[:, mi_features_10_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())

grid_CV_rom = GridSearchCV(
    model_dt, param_dt, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_rom.fit(df_rome.loc[:, mi_features_10_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())

In [92]:
dt_features_bar = df_barcelona.columns.tolist()
dt_features_bar.remove("realsum_cut")

dt_features_par = df_paris.columns.tolist()
dt_features_par.remove("realsum_cut")

dt_features_rom = df_rome.columns.tolist()
dt_features_rom.remove("realsum_cut")

In [93]:
model_bar = DecisionTreeRegressor(**grid_CV_bar.best_params_)

sf = SFS(
    model_bar,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_barcelona.loc[:, dt_features_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel()
)
dt_features_bar = df_barcelona.loc[:, dt_features_bar].columns[list(sffit.k_feature_idx_)]

In [94]:
model_par = DecisionTreeRegressor(**grid_CV_par.best_params_)

sf = SFS(
    model_par,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_paris.loc[:, dt_features_par].values, df_paris.loc[:, "realsum_cut"].values.ravel()
)
dt_features_par = df_paris.loc[:, dt_features_par].columns[list(sffit.k_feature_idx_)]

In [95]:
model_rom = DecisionTreeRegressor(**grid_CV_rom.best_params_)
sf = SFS(
    model_rom,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_rome.loc[:, dt_features_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel()
)
dt_features_rom = df_rome.loc[:, dt_features_rom].columns[list(sffit.k_feature_idx_)]

#### Hyperparameters Tuning

In [96]:
(cv_proc(df_barcelona, br_features_bar, model_bar, param_dt, mse),
cv_proc(df_barcelona, corr_features_bar, model_bar, param_dt, mse),
cv_proc(df_barcelona, mi_features_bar, model_bar, param_dt, mse),
cv_proc(df_barcelona, mi_features_10_bar, model_bar, param_dt, mse),
cv_proc(df_barcelona, mi_features_14_bar, model_bar, param_dt, mse),
cv_proc(df_barcelona, mi_features_17_bar, model_bar, param_dt, mse),
cv_proc(df_barcelona, mi_features_20_bar, model_bar, param_dt, mse),
cv_proc(df_barcelona, dt_features_bar, model_bar, param_dt, mse))

{'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
-13896.398215113903
{'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2}
-14687.026842172176
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
-14558.215337137104
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5}
-13760.641214845324
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5}
-13807.092044853413
{'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
-13563.117003340067
{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2}
-14096.066494111565
{'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
-13171.733885821197


(None, None, None, None, None, None, None, None)

In [178]:
hp_barcelona = [
    {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 5},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5},
{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2},
{'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2},
{'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
]

In [98]:
(cv_proc(df_paris, br_features_par, model_par, param_dt, mse),
cv_proc(df_paris, corr_features_par, model_par, param_dt, mse),
cv_proc(df_paris, mi_features_par, model_par, param_dt, mse),
cv_proc(df_paris, mi_features_10_par, model_par, param_dt, mse),
cv_proc(df_paris, mi_features_14_par, model_par, param_dt, mse),
cv_proc(df_paris, mi_features_17_par, model_par, param_dt, mse),
cv_proc(df_paris, mi_features_20_par, model_par, param_dt, mse),
cv_proc(df_paris, dt_features_par, model_par, param_dt, mse))

{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
-24477.87899567706
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5}
-23457.740400532202
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5}
-23427.456078451833
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
-23271.848531347496
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
-23972.85009478048
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
-23819.656355935756
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5}
-24367.558280602632
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
-21939.622529027005


(None, None, None, None, None, None, None, None)

In [179]:
hp_paris = [
    {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5},
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2},
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5},
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
]

In [100]:
(cv_proc(df_rome, br_features_rom, model_rom, param_dt, mse),
cv_proc(df_rome, corr_features_rom, model_rom, param_dt, mse),
cv_proc(df_rome, mi_features_rom, model_rom, param_dt, mse),
cv_proc(df_rome, mi_features_10_rom, model_rom, param_dt, mse),
cv_proc(df_rome, mi_features_14_rom, model_rom, param_dt, mse),
cv_proc(df_rome, mi_features_17_rom, model_rom, param_dt, mse),
cv_proc(df_rome, mi_features_20_rom, model_rom, param_dt, mse),
cv_proc(df_rome, dt_features_rom, model_rom, param_dt, mse))

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
-3370.3841982702857
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
-3354.5707700038815
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
-3684.4464054686273
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
-3611.4309297823793
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
-3414.903972528556
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
-3457.281045049565
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
-3449.864529602254
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
-3312.7034399243967


(None, None, None, None, None, None, None, None)

In [180]:
hp_rome = [
    {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10},
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
]

#### Model Evaluation

In [102]:
model = DecisionTreeRegressor(**hp_barcelona[0])
cv_output0_rmse = CV_rmse_wrapper(df_barcelona, br_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_barcelona, br_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_barcelona[1])
cv_output1_rmse = CV_rmse_wrapper(df_barcelona, corr_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_barcelona, corr_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_barcelona[2])
cv_output2_rmse = CV_rmse_wrapper(df_barcelona, mi_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_barcelona, mi_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_barcelona[3])
cv_output3_rmse = CV_rmse_wrapper(df_barcelona, mi_features_10_bar, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_barcelona, mi_features_10_bar, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_barcelona[4])
cv_output4_rmse = CV_rmse_wrapper(df_barcelona, mi_features_14_bar, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_barcelona, mi_features_14_bar, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_barcelona[5])
cv_output5_rmse = CV_rmse_wrapper(df_barcelona, mi_features_17_bar, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_barcelona, mi_features_17_bar, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_barcelona[6])
cv_output6_rmse = CV_rmse_wrapper(df_barcelona, mi_features_20_bar, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_barcelona, mi_features_20_bar, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_barcelona[7])
cv_output7_rmse = CV_rmse_wrapper(df_barcelona, dt_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_barcelona, dt_features_bar, model, 10, 123, shuff=True,  display_res=True)

In [103]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.233935
 cv_val      0.288560
 dtype: float64,
 cv_train     90.416896
 cv_val      118.489039
 dtype: float64)

In [104]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.217454
 cv_val      0.284355
 dtype: float64,
 cv_train     87.962438
 cv_val      117.885337
 dtype: float64)

In [105]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.157216
 cv_val      0.253679
 dtype: float64,
 cv_train     60.213514
 cv_val      120.524624
 dtype: float64)

In [106]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.186727
 cv_val      0.277487
 dtype: float64,
 cv_train     76.215502
 cv_val      120.222100
 dtype: float64)

In [107]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.186858
 cv_val      0.279166
 dtype: float64,
 cv_train     75.907371
 cv_val      120.764245
 dtype: float64)

In [108]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.229581
 cv_val      0.282516
 dtype: float64,
 cv_train     87.704476
 cv_val      119.147878
 dtype: float64)

In [109]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.234408
 cv_val      0.286735
 dtype: float64,
 cv_train     90.372936
 cv_val      118.186288
 dtype: float64)

In [110]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.247278
 cv_val      0.281582
 dtype: float64,
 cv_train     95.207198
 cv_val      115.719865
 dtype: float64)

In [111]:
model = DecisionTreeRegressor(**hp_paris[0])
cv_output0_rmse = CV_rmse_wrapper(df_paris, br_features_par, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_paris, br_features_par, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_paris[1])
cv_output1_rmse = CV_rmse_wrapper(df_paris, corr_features_par, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_paris, corr_features_par, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_paris[2])
cv_output2_rmse = CV_rmse_wrapper(df_paris, mi_features_par, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_paris, mi_features_par, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_paris[3])
cv_output3_rmse = CV_rmse_wrapper(df_paris, mi_features_10_par, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_paris, mi_features_10_par, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_paris[4])
cv_output4_rmse = CV_rmse_wrapper(df_paris, mi_features_14_par, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_paris, mi_features_14_par, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_paris[5])
cv_output5_rmse = CV_rmse_wrapper(df_paris, mi_features_17_par, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_paris, mi_features_17_par, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_paris[6])
cv_output6_rmse = CV_rmse_wrapper(df_paris, mi_features_20_par, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_paris, mi_features_20_par, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_paris[7])
cv_output7_rmse = CV_rmse_wrapper(df_paris, dt_features_par, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_paris, dt_features_par, model, 10, 123, shuff=True,  display_res=True)

In [112]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.207581
 cv_val      0.268171
 dtype: float64,
 cv_train    112.984609
 cv_val      147.673027
 dtype: float64)

In [113]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.200928
 cv_val      0.261639
 dtype: float64,
 cv_train    108.395615
 cv_val      147.452949
 dtype: float64)

In [114]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.209516
 cv_val      0.271194
 dtype: float64,
 cv_train    114.644996
 cv_val      148.548929
 dtype: float64)

In [115]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.205765
 cv_val      0.265548
 dtype: float64,
 cv_train    114.029539
 cv_val      147.628932
 dtype: float64)

In [116]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.203707
 cv_val      0.263440
 dtype: float64,
 cv_train    112.487426
 cv_val      147.406876
 dtype: float64)

In [117]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.189653
 cv_val      0.258515
 dtype: float64,
 cv_train    105.423331
 cv_val      146.888904
 dtype: float64)

In [118]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.202331
 cv_val      0.263091
 dtype: float64,
 cv_train    111.405264
 cv_val      147.375078
 dtype: float64)

In [119]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.207788
 cv_val      0.265608
 dtype: float64,
 cv_train    116.581702
 cv_val      146.390569
 dtype: float64)

In [120]:
model = DecisionTreeRegressor(**hp_rome[0])
cv_output0_rmse = CV_rmse_wrapper(df_rome, br_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_rome, br_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_rome[1])
cv_output1_rmse = CV_rmse_wrapper(df_rome, corr_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_rome, corr_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_rome[2])
cv_output2_rmse = CV_rmse_wrapper(df_rome, mi_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_rome, mi_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_rome[3])
cv_output3_rmse = CV_rmse_wrapper(df_rome, mi_features_10_rom, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_rome, mi_features_10_rom, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_rome[4])
cv_output4_rmse = CV_rmse_wrapper(df_rome, mi_features_14_rom, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_rome, mi_features_14_rom, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_rome[5])
cv_output5_rmse = CV_rmse_wrapper(df_rome, mi_features_17_rom, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_rome, mi_features_17_rom, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_rome[6])
cv_output6_rmse = CV_rmse_wrapper(df_rome, mi_features_20_rom, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_rome, mi_features_20_rom, model, 10, 123, shuff=True,  display_res=True)

model = DecisionTreeRegressor(**hp_rome[7])
cv_output7_rmse = CV_rmse_wrapper(df_rome, dt_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_rome, dt_features_rom, model, 10, 123, shuff=True,  display_res=True)

In [121]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.145430
 cv_val      0.192615
 dtype: float64,
 cv_train    41.654885
 cv_val      58.121207
 dtype: float64)

In [122]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.156965
 cv_val      0.198944
 dtype: float64,
 cv_train    44.365603
 cv_val      57.872158
 dtype: float64)

In [123]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.165086
 cv_val      0.205358
 dtype: float64,
 cv_train    47.437986
 cv_val      59.450308
 dtype: float64)

In [124]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.154635
 cv_val      0.196901
 dtype: float64,
 cv_train    44.905058
 cv_val      58.738532
 dtype: float64)

In [125]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.156601
 cv_val      0.198718
 dtype: float64,
 cv_train    44.101283
 cv_val      57.993151
 dtype: float64)

In [126]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.155635
 cv_val      0.199877
 dtype: float64,
 cv_train    43.967856
 cv_val      58.277635
 dtype: float64)

In [127]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.155286
 cv_val      0.200249
 dtype: float64,
 cv_train    43.879334
 cv_val      58.551955
 dtype: float64)

In [128]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.144496
 cv_val      0.193005
 dtype: float64,
 cv_train    41.584281
 cv_val      57.160435
 dtype: float64)

The best Decision Tree model performance is achieved with the features obtained with Forward Sequential Feature Elimination method for Rome, top 8 features based on the Mutual Information Score for Barcelona, and top 17 features based on the MI score. Let's save the models.

In [182]:
model_bar = DecisionTreeRegressor(**hp_barcelona[2])
model_bar.fit(df_barcelona.loc[:, dt_features_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_bar, open("models/model_barcelona_dt.sav", "wb"))

model_par = DecisionTreeRegressor(**hp_paris[5])
model_par.fit(df_paris.loc[:, dt_features_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_par, open("models/model_paris_dt.sav", "wb"))

model_rom = DecisionTreeRegressor(**hp_rome[7])
model_rom.fit(df_rome.loc[:, br_features_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_rom, open("models/model_rome_dt.sav", "wb"))

### Random Forest Regression

#### Feature Selection

1. Sequential Feature Selector

In [129]:
param_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [130]:
model_rf = RandomForestRegressor()
grid_CV_bar = GridSearchCV(
    model_rf, param_rf, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_bar.fit(df_barcelona.loc[:, mi_features_10_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())

grid_CV_par = GridSearchCV(
    model_rf, param_rf, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_par.fit(df_paris.loc[:, mi_features_10_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())

grid_CV_rom = GridSearchCV(
    model_rf, param_rf, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV_rom.fit(df_rome.loc[:, mi_features_10_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())

In [131]:
rf_features_bar = df_barcelona.columns.tolist()
rf_features_bar.remove("realsum_cut")

rf_features_par = df_paris.columns.tolist()
rf_features_par.remove("realsum_cut")

rf_features_rom = df_rome.columns.tolist()
rf_features_rom.remove("realsum_cut")

In [132]:
model_bar = RandomForestRegressor(**grid_CV_bar.best_params_)
sf = SFS(
    model_bar,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_barcelona.loc[:, rf_features_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel()
)
rf_features_bar = df_barcelona.loc[:, rf_features_bar].columns[list(sffit.k_feature_idx_)]

In [133]:
model_par = RandomForestRegressor(**grid_CV_par.best_params_)
sf = SFS(
    model_par,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_paris.loc[:, rf_features_par].values, df_paris.loc[:, "realsum_cut"].values.ravel()
)
rf_features_par = df_paris.loc[:, rf_features_par].columns[list(sffit.k_feature_idx_)]

In [134]:
model_rom = RandomForestRegressor(**grid_CV_rom.best_params_)
sf = SFS(
    model_rom,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df_rome.loc[:, rf_features_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel()
)
rf_features_rom = df_rome.loc[:, rf_features_rom].columns[list(sffit.k_feature_idx_)]

#### Hyperparameters Tuning

In [135]:
(cv_proc(df_barcelona, br_features_bar, model_bar, param_rf, mse),
cv_proc(df_barcelona, corr_features_bar, model_bar, param_rf, mse),
cv_proc(df_barcelona, mi_features_bar, model_bar, param_rf, mse),
cv_proc(df_barcelona, mi_features_10_bar, model_bar, param_rf, mse),
cv_proc(df_barcelona, mi_features_14_bar, model_bar, param_rf, mse),
cv_proc(df_barcelona, mi_features_17_bar, model_bar, param_rf, mse),
cv_proc(df_barcelona, mi_features_20_bar, model_bar, param_rf, mse),
cv_proc(df_barcelona, rf_features_bar, model_bar, param_rf, mse))

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-9545.426797022377
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-11430.558507089418
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-10089.24265175917
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
-9798.496132181548
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-9759.905086401224
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-9769.96371527288
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
-9425.255877038358
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
-9406.79271806711


(None, None, None, None, None, None, None, None)

In [183]:
hp_barcelona = [
    {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
]

In [138]:
(cv_proc(df_paris, br_features_par, model_par, param_rf, mse),
cv_proc(df_paris, corr_features_par, model_par, param_rf, mse),
cv_proc(df_paris, mi_features_par, model_par, param_rf, mse),
cv_proc(df_paris, mi_features_10_par, model_par, param_rf, mse),
cv_proc(df_paris, mi_features_14_par, model_par, param_rf, mse),
cv_proc(df_paris, mi_features_17_par, model_par, param_rf, mse),
cv_proc(df_paris, mi_features_20_par, model_par, param_rf, mse),
cv_proc(df_paris, rf_features_par, model_par, param_rf, mse))

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-16973.225384753307
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-18135.86703445085
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-17214.983531826736
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-16967.377807969035
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
-16938.771989296565


KeyboardInterrupt: 

In [184]:
hp_paris = [
    {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators':100},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
]

In [156]:
(cv_proc(df_rome, br_features_rom, model_rom, param_rf, mse),
cv_proc(df_rome, corr_features_rom, model_rom, param_rf, mse),
cv_proc(df_rome, mi_features_rom, model_rom, param_rf, mse),
cv_proc(df_rome, mi_features_10_rom, model_rom, param_rf, mse),
cv_proc(df_rome, mi_features_14_rom, model_rom, param_rf, mse),
cv_proc(df_rome, mi_features_17_rom, model_rom, param_rf, mse),
cv_proc(df_rome, mi_features_20_rom, model_rom, param_rf, mse),
cv_proc(df_rome, rf_features_rom, model_rom, param_rf, mse))

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-2536.2917976454
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-2568.0301706979835
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-2941.932825245658
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
-2751.053382162012
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-2586.6875963193897
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-2552.3854851164456
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-2552.5564326013528
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
-2502.1863581786442


(None, None, None, None, None, None, None, None)

In [185]:
hp_rome = [
    {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100},
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
]

#### Model Evaluation

In [141]:
model = RandomForestRegressor(**hp_barcelona[0])
cv_output0_rmse = CV_rmse_wrapper(df_barcelona, br_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_barcelona, br_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_barcelona[1])
cv_output1_rmse = CV_rmse_wrapper(df_barcelona, corr_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_barcelona, corr_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_barcelona[2])
cv_output2_rmse = CV_rmse_wrapper(df_barcelona, mi_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_barcelona, mi_features_bar, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_barcelona[3])
cv_output3_rmse = CV_rmse_wrapper(df_barcelona, mi_features_10_bar, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_barcelona, mi_features_10_bar, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_barcelona[4])
cv_output4_rmse = CV_rmse_wrapper(df_barcelona, mi_features_14_bar, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_barcelona, mi_features_14_bar, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_barcelona[5])
cv_output5_rmse = CV_rmse_wrapper(df_barcelona, mi_features_17_bar, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_barcelona, mi_features_17_bar, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_barcelona[6])
cv_output6_rmse = CV_rmse_wrapper(df_barcelona, mi_features_20_bar, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_barcelona, mi_features_20_bar, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_barcelona[7])
cv_output7_rmse = CV_rmse_wrapper(df_barcelona, dt_features_bar, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_barcelona, dt_features_bar, model, 10, 123, shuff=True,  display_res=True)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [142]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.162420
 cv_val      0.242524
 dtype: float64,
 cv_train    54.344919
 cv_val      95.772812
 dtype: float64)

In [143]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.181933
 cv_val      0.254275
 dtype: float64,
 cv_train     63.992637
 cv_val      102.064363
 dtype: float64)

In [144]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.167461
 cv_val      0.243294
 dtype: float64,
 cv_train    58.074425
 cv_val      97.985964
 dtype: float64)

In [145]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.164685
 cv_val      0.241679
 dtype: float64,
 cv_train    56.524287
 cv_val      97.216905
 dtype: float64)

In [146]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.163606
 cv_val      0.240320
 dtype: float64,
 cv_train    55.446257
 cv_val      96.233629
 dtype: float64)

In [147]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.163212
 cv_val      0.240433
 dtype: float64,
 cv_train    55.888755
 cv_val      95.689547
 dtype: float64)

In [148]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.161061
 cv_val      0.241441
 dtype: float64,
 cv_train    53.820987
 cv_val      95.475428
 dtype: float64)

In [149]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.191214
 cv_val      0.258295
 dtype: float64,
 cv_train     67.602743
 cv_val      103.685523
 dtype: float64)

In [150]:
model = RandomForestRegressor(**hp_paris[0])
cv_output0_rmse = CV_rmse_wrapper(df_paris, br_features_par, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_paris, br_features_par, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_paris[1])
cv_output1_rmse = CV_rmse_wrapper(df_paris, corr_features_par, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_paris, corr_features_par, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_paris[2])
cv_output2_rmse = CV_rmse_wrapper(df_paris, mi_features_par, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_paris, mi_features_par, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_paris[3])
cv_output3_rmse = CV_rmse_wrapper(df_paris, mi_features_10_par, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_paris, mi_features_10_par, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_paris[4])
cv_output4_rmse = CV_rmse_wrapper(df_paris, mi_features_14_par, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_paris, mi_features_14_par, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_paris[5])
cv_output5_rmse = CV_rmse_wrapper(df_paris, mi_features_17_par, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_paris, mi_features_17_par, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_paris[6])
cv_output6_rmse = CV_rmse_wrapper(df_paris, mi_features_20_par, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_paris, mi_features_20_par, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_paris[7])
cv_output7_rmse = CV_rmse_wrapper(df_paris, dt_features_par, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_paris, dt_features_par, model, 10, 123, shuff=True,  display_res=True)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [151]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.181987
 cv_val      0.240702
 dtype: float64,
 cv_train     89.854647
 cv_val      126.587121
 dtype: float64)

In [152]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.192022
 cv_val      0.242874
 dtype: float64,
 cv_train     95.445270
 cv_val      130.527673
 dtype: float64)

In [153]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.182835
 cv_val      0.242138
 dtype: float64,
 cv_train     90.441791
 cv_val      126.914221
 dtype: float64)

In [154]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.179097
 cv_val      0.238058
 dtype: float64,
 cv_train     90.085298
 cv_val      126.549390
 dtype: float64)

In [155]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.178597
 cv_val      0.237927
 dtype: float64,
 cv_train     89.236873
 cv_val      125.817038
 dtype: float64)

In [156]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.178508
 cv_val      0.237216
 dtype: float64,
 cv_train     89.465909
 cv_val      126.092780
 dtype: float64)

In [157]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.176671
 cv_val      0.236315
 dtype: float64,
 cv_train     87.715333
 cv_val      125.371818
 dtype: float64)

In [158]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.184878
 cv_val      0.237533
 dtype: float64,
 cv_train     97.348960
 cv_val      127.027526
 dtype: float64)

In [159]:
model = RandomForestRegressor(**hp_rome[0])
cv_output0_rmse = CV_rmse_wrapper(df_rome, br_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df_rome, br_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_rome[1])
cv_output1_rmse = CV_rmse_wrapper(df_rome, corr_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df_rome, corr_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_rome[2])
cv_output2_rmse = CV_rmse_wrapper(df_rome, mi_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df_rome, mi_features_rom, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_rome[3])
cv_output3_rmse = CV_rmse_wrapper(df_rome, mi_features_10_rom, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df_rome, mi_features_10_rom, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_rome[4])
cv_output4_rmse = CV_rmse_wrapper(df_rome, mi_features_14_rom, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df_rome, mi_features_14_rom, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_rome[5])
cv_output5_rmse = CV_rmse_wrapper(df_rome, mi_features_17_rom, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df_rome, mi_features_17_rom, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_rome[6])
cv_output6_rmse = CV_rmse_wrapper(df_rome, mi_features_20_rom, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df_rome, mi_features_20_rom, model, 10, 123, shuff=True,  display_res=True)

model = RandomForestRegressor(**hp_rome[7])
cv_output7_rmse = CV_rmse_wrapper(df_rome, dt_features_rom, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df_rome, dt_features_rom, model, 10, 123, shuff=True,  display_res=True)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [160]:
(cv_output0_mape.mean(), cv_output0_rmse.mean())

(cv_train    0.139520
 cv_val      0.177695
 dtype: float64,
 cv_train    37.116976
 cv_val      49.599009
 dtype: float64)

In [161]:
(cv_output1_mape.mean(), cv_output1_rmse.mean())

(cv_train    0.140160
 cv_val      0.178029
 dtype: float64,
 cv_train    37.715853
 cv_val      49.990092
 dtype: float64)

In [162]:
(cv_output2_mape.mean(), cv_output2_rmse.mean())

(cv_train    0.149375
 cv_val      0.187942
 dtype: float64,
 cv_train    40.755431
 cv_val      52.893711
 dtype: float64)

In [163]:
(cv_output3_mape.mean(), cv_output3_rmse.mean())

(cv_train    0.144281
 cv_val      0.182703
 dtype: float64,
 cv_train    39.153835
 cv_val      51.507346
 dtype: float64)

In [164]:
(cv_output4_mape.mean(), cv_output4_rmse.mean())

(cv_train    0.140573
 cv_val      0.179490
 dtype: float64,
 cv_train    37.369172
 cv_val      50.113599
 dtype: float64)

In [165]:
(cv_output5_mape.mean(), cv_output5_rmse.mean())

(cv_train    0.139029
 cv_val      0.178565
 dtype: float64,
 cv_train    36.998979
 cv_val      49.859040
 dtype: float64)

In [166]:
(cv_output6_mape.mean(), cv_output6_rmse.mean())

(cv_train    0.138663
 cv_val      0.177984
 dtype: float64,
 cv_train    36.719352
 cv_val      49.603289
 dtype: float64)

In [167]:
(cv_output7_mape.mean(), cv_output7_rmse.mean())

(cv_train    0.138219
 cv_val      0.177236
 dtype: float64,
 cv_train    37.318259
 cv_val      49.813602
 dtype: float64)

The best Random Forest model performance is achieved with the features obtained with:
- top 14 MI score for Barcelona;
- top 20 MI score for Paris;
- top 15 Boruta Ranking for Rome.
Let's save the models.

In [186]:
model_bar = RandomForestRegressor(**hp_barcelona[4])
model_bar.fit(df_barcelona.loc[:, mi_features_14_bar].values, df_barcelona.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_bar, open("models/model_barcelona_rf.sav", "wb"))

model_par = RandomForestRegressor(**hp_paris[6])
model_par.fit(df_paris.loc[:, mi_features_20_par].values, df_paris.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_par, open("models/model_paris_rf.sav", "wb"))

model_rom = RandomForestRegressor(**hp_rome[0])
model_rom.fit(df_rome.loc[:, br_features_rom].values, df_rome.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model_rom, open("models/model_rome_rf.sav", "wb"))