# Global Model

This notebook demonstrates the process of building and evaluating various regression models for predicting the `realsum_cut` variable using different feature selection techniques and hyperparameter tuning methods.

### Importing Libraries

In [2]:
import pickle

import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    make_scorer
)
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    GridSearchCV
)
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet
)

from sklearn.exceptions import ConvergenceWarning
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.simplefilter("ignore", category=ConvergenceWarning)

pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 150)

### Loading Data

In [3]:
df = pd.read_csv("data/output/data_train.csv")
df_test = pd.read_csv("data/output/data_test.csv")
fr = pd.read_csv("data/output/general_ranking.csv", index_col=0)

### Linear Regression

#### Feature Selection

We will select a few features dataset, based on different metrics, i.e. correlation, mutual information, and F-score.

1. Correlation

First, we will base our selection on the correlation between the features and the target variable.

In [4]:
feature_selection_with_corr = fr['corr'].sort_values(ascending=False)

In [5]:
feature_selection_with_corr.index[0:5]


Index(['latitude_disc_48_837_48_901', 'room_type', 'lat', 'person_capacity',
       'longitude_disc_2_093_2_347'],
      dtype='object')

Let's define wrapper functions for the OLS model and scoring metrics.

In [6]:
def ols_model_wrapper(
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    X_test: pd.DataFrame,
    fit_intercept: bool=True,
) -> np.array :
    reg = LinearRegression(fit_intercept=fit_intercept)
    reg.fit(X=X_train, y=y_train)
    pred = reg.predict(X_test)
    return pred.ravel()  # revel() - Return a contiguous flattened array.


def scoring_wrapper(y_true: np.array, y_pred: np.array) -> dict:
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"mse": mse, "mae": mae, "medae": medae, "mape": mape, "r2": r2}

Let's evaluate the model using the top 5 features selected based on the correlation. We will test the performance both on the training and test set.

In [7]:
scores_on_train = dict()

for i in tqdm(range(1, 6)):
    feautres_i = feature_selection_with_corr.index[0 : i]
    pred = ols_model_wrapper(
        df[feautres_i], df[["realsum_cut"]], df[feautres_i]
    )
    score = scoring_wrapper(df["realsum_cut"].to_numpy().ravel(), pred)
    scores_on_train.update({f"top_{i}": score})


100%|██████████| 5/5 [00:00<00:00, 121.40it/s]


In [8]:
display(pd.DataFrame(scores_on_train).T.sort_values("mape").head(1))
display(pd.DataFrame(scores_on_train).T.sort_values("mse").head(1))
display(pd.DataFrame(scores_on_train).T.sort_values("mae").head(1))
display(pd.DataFrame(scores_on_train).T.sort_values("medae").head(1))

Unnamed: 0,mse,mae,medae,mape,r2
top_5,18746.21076,92.977781,66.640021,0.36209,0.426104


Unnamed: 0,mse,mae,medae,mape,r2
top_5,18746.21076,92.977781,66.640021,0.36209,0.426104


Unnamed: 0,mse,mae,medae,mape,r2
top_5,18746.21076,92.977781,66.640021,0.36209,0.426104


Unnamed: 0,mse,mae,medae,mape,r2
top_5,18746.21076,92.977781,66.640021,0.36209,0.426104


In [9]:
scores_on_test = dict()

for i in tqdm(range(1, 6)):
    feautres_i = feature_selection_with_corr.index[0 : i]
    pred = ols_model_wrapper(
        df[feautres_i], df[["realsum_cut"]], df_test[feautres_i]
    )
    score = scoring_wrapper(df_test["realsum_cut"].to_numpy().ravel(), pred)
    scores_on_test.update({f"top_{i}": score})


100%|██████████| 5/5 [00:00<00:00, 173.42it/s]


In [10]:
display(pd.DataFrame(scores_on_test).T.sort_values("mape").head(1))
display(pd.DataFrame(scores_on_test).T.sort_values("mse").head(1))
display(pd.DataFrame(scores_on_test).T.sort_values("mae").head(1))
display(pd.DataFrame(scores_on_test).T.sort_values("medae").head(1))

Unnamed: 0,mse,mae,medae,mape,r2
top_5,21627.936479,95.980542,67.580178,0.362782,0.39028


Unnamed: 0,mse,mae,medae,mape,r2
top_5,21627.936479,95.980542,67.580178,0.362782,0.39028


Unnamed: 0,mse,mae,medae,mape,r2
top_5,21627.936479,95.980542,67.580178,0.362782,0.39028


Unnamed: 0,mse,mae,medae,mape,r2
top_5,21627.936479,95.980542,67.580178,0.362782,0.39028


We have, that the best model is the one with the top 5 features, based on the correlation.

2. ElasticNet

Next, we will use ElasticNet coefficient calculated in previous notebook to select the features.

In [11]:
fr["EN_coef"].sort_values(ascending=False)

city                           118.697960
longitude_disc_2_093_2_347      92.785460
bedrooms                        69.666650
room_type                       66.605290
latitude_disc_48_837_48_901     65.790436
biz                             61.137344
person_capacity                 33.363330
multi                           31.634403
metro_dist                      19.963486
cleanliness_rating              15.145918
is_weekend                       4.033917
guest_satisfaction_overall       0.100369
room_private                    -0.013505
latitude_disc_41_893_48_837     -2.632091
longitude_disc_2_347_12_47      -3.129059
lat                             -7.342648
longitude_disc_12_47_12_583     -8.543576
dist                           -21.108894
latitude_disc_41_35_41_893     -24.304527
lng                            -26.697237
host_is_superhost                     NaN
Name: EN_coef, dtype: float64

In [12]:
features_selection_en = fr["EN_coef"].dropna().index.tolist()
features_selection_en

['room_type',
 'room_private',
 'person_capacity',
 'multi',
 'biz',
 'cleanliness_rating',
 'guest_satisfaction_overall',
 'bedrooms',
 'dist',
 'metro_dist',
 'lng',
 'lat',
 'is_weekend',
 'city',
 'longitude_disc_2_093_2_347',
 'longitude_disc_2_347_12_47',
 'longitude_disc_12_47_12_583',
 'latitude_disc_41_35_41_893',
 'latitude_disc_41_893_48_837',
 'latitude_disc_48_837_48_901']

In [13]:
features_selection_en.remove("guest_satisfaction_overall")
features_selection_en.remove("room_private")
features_selection_en.remove("is_weekend")

In [14]:
features_selection_en

['room_type',
 'person_capacity',
 'multi',
 'biz',
 'cleanliness_rating',
 'bedrooms',
 'dist',
 'metro_dist',
 'lng',
 'lat',
 'city',
 'longitude_disc_2_093_2_347',
 'longitude_disc_2_347_12_47',
 'longitude_disc_12_47_12_583',
 'latitude_disc_41_35_41_893',
 'latitude_disc_41_893_48_837',
 'latitude_disc_48_837_48_901']

3. Recursive Feature Elimination

We will use Recursive Feature Elimination with Cross-Validation to select the features.

In [15]:
features_candidates_rec = df.columns.tolist()
features_candidates_rec.remove("realsum_cut")

In [16]:
estimator = LinearRegression()
selector = RFECV(estimator, step=1, cv=5, min_features_to_select=10)
selector = selector.fit(
    df.loc[:, features_candidates_rec].values, df.loc[:, "realsum_cut"].values.ravel()
)

features_candidates_rec = df.loc[:, features_candidates_rec].iloc[:, selector.support_].columns.tolist()

In [17]:
features_candidates_rec

['room_type',
 'room_private',
 'person_capacity',
 'host_is_superhost',
 'multi',
 'biz',
 'cleanliness_rating',
 'bedrooms',
 'dist',
 'metro_dist',
 'lng',
 'lat',
 'is_weekend',
 'city',
 'longitude_disc_2_093_2_347',
 'longitude_disc_2_347_12_47',
 'longitude_disc_12_47_12_583',
 'latitude_disc_41_35_41_893',
 'latitude_disc_41_893_48_837',
 'latitude_disc_48_837_48_901']

4. Sequential Feature Selection

We will use Sequential Feature Selection to select the features.

In [18]:
features_selection_back = df.columns.tolist()
features_selection_back.remove("realsum_cut")

In [19]:
model= LinearRegression()

sf = SFS(
    model,
    k_features=(1, 10),
    forward=True,
    floating=False,
    verbose=0,
    scoring="neg_mean_squared_error",
    cv=5,
)

sffit = sf.fit(df.loc[:, features_selection_back].values, df.loc[:, "realsum_cut"].values.ravel())

features_selection_back = df.loc[:, features_selection_back].columns[list(sffit.k_feature_idx_)]

In [20]:
features_selection_back

Index(['room_type', 'person_capacity', 'multi', 'biz', 'cleanliness_rating',
       'bedrooms', 'dist', 'lat', 'longitude_disc_2_093_2_347',
       'latitude_disc_48_837_48_901'],
      dtype='object')

#### Model evaluation with cross-validation approach

We will evaluate the model using the cross-validation approach.

In [21]:
features = feature_selection_with_corr.index[0:5]
features

Index(['latitude_disc_48_837_48_901', 'room_type', 'lat', 'person_capacity',
       'longitude_disc_2_093_2_347'],
      dtype='object')

Now we will define wrapper functions for our Cross Validation. We created two functions, one optimizing MAPE metric, and the other RMSE. We will also create a function for the shuffle cross-validation.

In [22]:
def CV_wrapper(df_train, feature, model, num_split, random=None, shuff=False, display_res = False):
    train_mape_list = list()
    val_mape_list = list()

    kf = KFold(n_splits=num_split, shuffle=shuff, random_state=random)
    for train_index, val_index in kf.split(df_train.index.values):
        reg = model
        reg.fit(
        X=df_train[feature].iloc[train_index],
        y=df_train[["realsum_cut"]].iloc[train_index],
        )
        pred_train = reg.predict(df_train[feature].iloc[train_index])   .ravel()
        pred_val = reg.predict(df_train[feature].iloc[val_index])       .ravel()
        train_mape = scoring_wrapper(
            df_train[["realsum_cut"]].iloc[train_index], pred_train
            ).get("mape")
        val_mape = scoring_wrapper(df_train[["realsum_cut"]]            .iloc[val_index], pred_val).get("mape")
        train_mape_list.append(train_mape)
        val_mape_list.append(val_mape)

    if display_res == True:
        view = pd.DataFrame([train_mape_list,val_mape_list]).T.rename(columns={0:"cv_train", 1:"cv_val"})
        return view
    else:

        return train_mape_list, val_mape_list

In [23]:
def CV_rmse_wrapper(df_train, feature, model, num_split, random=None, shuff=False,  display_res = False):
    train_rmse_list = list()
    val_rmse_list = list()

    kf = KFold(n_splits=num_split, shuffle=shuff, random_state=random)
    for train_index, val_index in kf.split(df_train.index.values):
        reg = model
        reg.fit(
        X=df_train[feature].iloc[train_index],
        y=df_train[["realsum_cut"]].iloc[train_index],
        )
        pred_train = reg.predict(df_train[feature].iloc[train_index])   .ravel()
        pred_val = reg.predict(df_train[feature].iloc[val_index])       .ravel()
        train_rmse = np.sqrt(scoring_wrapper(
            df_train[["realsum_cut"]].iloc[train_index], pred_train
            ).get("mse"))
        val_rmse = np.sqrt((scoring_wrapper(df_train[["realsum_cut"]]            .iloc[val_index], pred_val).get("mse")))
        train_rmse_list.append(train_rmse)
        val_rmse_list.append(val_rmse)

    if display_res == True:
        view = pd.DataFrame([train_rmse_list,val_rmse_list]).T.rename(columns={0:"cv_train", 1:"cv_val"})
        return view
    else:
        return train_rmse_list, val_rmse_list

In [24]:
def shuffle_CV(df_train, feature, model, num_split, random, display_res = False):
    train_mape_list = list()
    val_mape_list = list()

    kf = ShuffleSplit(n_splits=num_split, test_size=0.25, random_state=random)
    for train_index, val_index in kf.split(df_train.index.values):
        reg = model
        reg.fit(
        X=df_train[feature].iloc[train_index],
        y=df_train[["realsum_cut"]].iloc[train_index],
        )
        pred_train = reg.predict(df_train[feature].iloc[train_index]).ravel()
        pred_val = reg.predict(df_train[feature].iloc[val_index]).ravel()
        train_mape = scoring_wrapper(
        df_train[["realsum_cut"]].iloc[train_index], pred_train
        ).get("mape")
        val_mape = scoring_wrapper(df_train[["realsum_cut"]].iloc[val_index], pred_val).get(
        "mape"
        )
        train_mape_list.append(train_mape)
        val_mape_list.append(val_mape)

    if display_res == True:
        view = pd.DataFrame([train_mape_list, val_mape_list]).T.rename(columns={0: "cv_train", 1: "cv_val"})
        return view
    else:
        return train_mape_list, val_mape_list


In [25]:
def shuffle_CV_rmse(df_train, feature, model,  num_split, random,  display_res = False):
    train_rmse_list = list()
    val_rmse_list = list()

    kf = ShuffleSplit(n_splits=num_split, test_size=0.25, random_state=random)
    for train_index, val_index in kf.split(df_train.index.values):
        reg = model
        reg.fit(
        X=df_train[feature].iloc[train_index],
        y=df_train[["realsum_cut"]].iloc[train_index],
        )
        pred_train = reg.predict(df_train[feature].iloc[train_index]).ravel()
        pred_val = reg.predict(df_train[feature].iloc[val_index]).ravel()
        train_rmse = np.sqrt(scoring_wrapper(
        df_train[["realsum_cut"]].iloc[train_index], pred_train
        ).get("mse"))
        val_rmse = np.sqrt(scoring_wrapper(df_train[["realsum_cut"]].iloc[val_index], pred_val).get(
        "mse"))

        train_rmse_list.append(train_rmse)
        val_rmse_list.append(val_rmse)

    if display_res == True:
        view = pd.DataFrame([train_rmse_list, val_rmse_list]).T.rename(columns={0: "cv_train", 1: "cv_val"})
        return view
    else:
        return train_rmse_list, val_rmse_list


In [26]:
model_ols = LinearRegression()
(CV_wrapper(df, features, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df, features, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df, features, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df, features, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df, features, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df, features, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.362091
 cv_val      0.362218
 dtype: float64,
 cv_train    136.911950
 cv_val      136.884043
 dtype: float64,
 cv_train    0.362084
 cv_val      0.362265
 dtype: float64,
 cv_train    136.911440
 cv_val      136.844225
 dtype: float64,
 cv_train    0.363524
 cv_val      0.361665
 dtype: float64,
 cv_train    137.570685
 cv_val      134.941773
 dtype: float64)

In [27]:
(CV_wrapper(df, features_selection_en, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df, features_selection_en, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df, features_selection_en, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df, features_selection_en, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df, features_selection_en, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df, features_selection_en, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.340574
 cv_val      0.341279
 dtype: float64,
 cv_train    124.179533
 cv_val      124.299739
 dtype: float64,
 cv_train    0.340570
 cv_val      0.341166
 dtype: float64,
 cv_train    124.180565
 cv_val      124.249869
 dtype: float64,
 cv_train    0.341890
 cv_val      0.342076
 dtype: float64,
 cv_train    124.719898
 cv_val      122.663828
 dtype: float64)

In [28]:
(CV_wrapper(df, features_candidates_rec, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df, features_candidates_rec, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df, features_candidates_rec, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df, features_candidates_rec, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df, features_candidates_rec, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df, features_candidates_rec, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.340056
 cv_val      0.340769
 dtype: float64,
 cv_train    124.100152
 cv_val      124.236409
 dtype: float64,
 cv_train    0.340050
 cv_val      0.340751
 dtype: float64,
 cv_train    124.100652
 cv_val      124.195857
 dtype: float64,
 cv_train    0.341321
 cv_val      0.341802
 dtype: float64,
 cv_train    124.631268
 cv_val      122.626229
 dtype: float64)

In [29]:
(CV_wrapper(df, features_selection_back, model_ols, 10, display_res=True).mean(),
 CV_rmse_wrapper(df, features_selection_back, model_ols, 10, display_res=True).mean(),
 CV_wrapper(df, features_selection_back, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df, features_selection_back, model_ols, 10, 123, shuff=True, display_res=True).mean(),
 shuffle_CV(df, features_selection_back, model_ols, 10, 123, display_res=True).mean(),
 shuffle_CV_rmse(df, features_selection_back, model_ols, 10, 123, display_res=True).mean())

(cv_train    0.347679
 cv_val      0.348010
 dtype: float64,
 cv_train    126.288880
 cv_val      126.327969
 dtype: float64,
 cv_train    0.347673
 cv_val      0.347940
 dtype: float64,
 cv_train    126.289125
 cv_val      126.301988
 dtype: float64,
 cv_train    0.349162
 cv_val      0.348191
 dtype: float64,
 cv_train    126.841726
 cv_val      124.697595
 dtype: float64)

We see that, the best score obtained the features dataset selected through Recursive Feature Elimination process.

In [31]:
model = LinearRegression()
model.fit(df.loc[:, features_candidates_rec], df.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model, open("models/model.sav", "wb"))

#### Regularization
Let's move on to the topic of regularization for a few moments. As we know from the lecture, it is a set of techniques that allows us to modify the cost function to reduce the risk of overfitting our model. Three the most popular regularization types are: Lasso (l1), Ridge (l2), Elastic Net (linear combination of l1 and l2). Each of these models has hyperparameters (parameters that are not estimable by the model and we have to "guess"/choose them ourselves based on our expert knowledge or cross-validation): Lasso and Ridge (alpha - controlling regularization strength), Elastic Net (alpha - controlling regularization strength and l1_ratio - the ElasticNet mixing parameter). Let's jump into documentation and read more how we should interpret our HP.

Generally, the alpha hyperparameter controls the strength of regularization in the model. A higher value of alpha will result in stronger regularization and simpler models with smaller coefficients. Conversely, a lower value of alpha will result in weaker regularization and more complex models with larger coefficients.

For each regularization we created a wrapper function to easily apply the technique to our dataset.

##### Lasso
For Lasso regression, a reasonable range of alpha values is typically between 0.0001 and 1. Smaller values of alpha result in less regularization, while larger values result in more regularization. Similarly to Ridge regression, a logarithmic scale can be used for alpha values, such as np.logspace(-4, 0, 5), which generates a sequence of alpha values between 0.0001 and 1.

In [32]:
def lasso_model_wrapper(df_train, feature_selection, num_split, random, shuff=True, display_res = False):
    kf = KFold(n_splits=num_split, shuffle=shuff, random_state=random)

# for alpha in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.025]:
    for alpha in np.logspace(-4, 0, 10): #start, stop, num
        train_mape_list = list()
        val_mape_list = list()

        for train_index, val_index in kf.split(df_train.index.values):
            reg = Lasso(alpha=alpha, fit_intercept=True)
            reg.fit(
            X=df_train[feature_selection].iloc[train_index],
            y=df_train[["realsum_cut"]].iloc[train_index],
            )
            pred_train = reg.predict(df_train[feature_selection].iloc[train_index]).ravel()
            pred_val = reg.predict(df_train[feature_selection].iloc[val_index]).ravel()
            train_mape = scoring_wrapper(
                df_train[["realsum_cut"]].iloc[train_index], pred_train
            ).get("mape")
            val_mape = scoring_wrapper(
                df_train[["realsum_cut"]].iloc[val_index], pred_val
            ).get("mape")
            train_mape_list.append(train_mape)
            val_mape_list.append(val_mape)

    if display_res == True:
        view = pd.DataFrame([train_mape_list, val_mape_list]).T.rename(columns={0: "cv_train", 1: "cv_val"})
        return view
    else:
        return train_mape_list, val_mape_list

In [33]:
lasso_model_wrapper(df, features, 10, 123, display_res=True).mean()

cv_train    0.361137
cv_val      0.361316
dtype: float64

In [34]:
lasso_model_wrapper(df, features_selection_en, 10, 123, display_res=True).mean()

cv_train    0.344010
cv_val      0.344418
dtype: float64

In [35]:
lasso_model_wrapper(df, features_candidates_rec, 10, 123, display_res=True).mean()

cv_train    0.343618
cv_val      0.344130
dtype: float64

In [36]:
lasso_model_wrapper(df, features_selection_back, 10, 123, display_res=True).mean()

cv_train    0.345068
cv_val      0.345363
dtype: float64

There is no significant impact with lasso regularization.

##### Ridge
For Ridge regression, a reasonable range of alpha values is typically between 0.01 and 100. Smaller values of alpha result in less regularization, while larger values result in more regularization. It's common to use a logarithmic scale for alpha values, such as np.logspace(-3, 3, 7), which generates a sequence of alpha values between 0.001 and 1000.

In [37]:
def ridge_model_wrapper(df_train, feature_selection, num_split, random, shuff=True, display_res = False):
    kf = KFold(n_splits=num_split, shuffle=shuff, random_state=random)

# for alpha in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.025]:
    for alpha in np.logspace(-4, 0, 10): #start, stop, num
        train_mape_list = list()
        val_mape_list = list()

        for train_index, val_index in kf.split(df_train.index.values):
            reg = Ridge(alpha=alpha, fit_intercept=True)
            reg.fit(
            X=df_train[feature_selection].iloc[train_index],
            y=df_train[["realsum_cut"]].iloc[train_index],
            )
            pred_train = reg.predict(df_train[feature_selection].iloc[train_index]).ravel()
            pred_val = reg.predict(df_train[feature_selection].iloc[val_index]).ravel()
            train_mape = scoring_wrapper(
                df_train[["realsum_cut"]].iloc[train_index], pred_train
            ).get("mape")
            val_mape = scoring_wrapper(
                df_train[["realsum_cut"]].iloc[val_index], pred_val
            ).get("mape")
            train_mape_list.append(train_mape)
            val_mape_list.append(val_mape)

    if display_res == True:
        view = pd.DataFrame([train_mape_list, val_mape_list]).T.rename(columns={0: "cv_train", 1: "cv_val"})
        return view
    else:
        return train_mape_list, val_mape_list

In [38]:
ridge_model_wrapper(df, features, 10, 123, display_res=True).mean()

cv_train    0.362069
cv_val      0.362251
dtype: float64

In [39]:
ridge_model_wrapper(df, features_selection_en, 10, 123, display_res=True).mean()

cv_train    0.342882
cv_val      0.343330
dtype: float64

In [40]:
ridge_model_wrapper(df, features_candidates_rec, 10, 123, display_res=True).mean()

cv_train    0.342251
cv_val      0.342831
dtype: float64

In [41]:
ridge_model_wrapper(df, features_selection_back, 10, 123, display_res=True).mean()

cv_train    0.347653
cv_val      0.347919
dtype: float64

No improvement with ridge regularization either. Let's move on to the ElasticNet regularization.

##### ElasticNet
Now we have to check two HP: l1_ratio and alpha in CV.

The hyperparameter l1_ratio controls the balance between L1 and L2 regularization. It is a value between 0 and 1 that determines the proportion of the penalty that comes from L1 regularization. When l1_ratio=0, the ElasticNet model reduces to Ridge regression, and when l1_ratio=1, it reduces to Lasso regression. For values of l1_ratio between 0 and 1, the model uses a combination of L1 and L2 regularization.

In [42]:
def EN_model_wrapper(df_train, feature_selection, num_split, random, shuff=True, display_res = False):
    kf = KFold(n_splits=num_split, shuffle=shuff, random_state=random)

    for l1_ratio in [0.1, 0.25, 0.5, 0.75, 0.9]:
        for alpha in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.025]: #start, stop, num
            train_mape_list = list()
            val_mape_list = list()

            for train_index, val_index in kf.split(df_train.index.values):
                reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True)
                reg.fit(
                    X=df_train[feature_selection].iloc[train_index],
                    y=df_train[["realsum_cut"]].iloc[train_index],
                )
                pred_train = reg.predict(df_train[feature_selection].iloc[train_index]).ravel()
                pred_val = reg.predict(df_train[feature_selection].iloc[val_index]).ravel()
                train_mape = scoring_wrapper(
                    df_train[["realsum_cut"]].iloc[train_index], pred_train
                ).get("mape")
                val_mape = scoring_wrapper(
                    df_train[["realsum_cut"]].iloc[val_index], pred_val
                ).get("mape")
                train_mape_list.append(train_mape)
                val_mape_list.append(val_mape)

    if display_res == True:
        view = pd.DataFrame([train_mape_list, val_mape_list]).T.rename(columns={0: "cv_train", 1: "cv_val"})
        return view
    else:
        return train_mape_list, val_mape_list

In [43]:
EN_model_wrapper(df, features, 10, 123, display_res=True).mean()

cv_train    0.361618
cv_val      0.361799
dtype: float64

In [44]:
EN_model_wrapper(df, features_selection_en, 10, 123, display_res=True).mean()

cv_train    0.344721
cv_val      0.345146
dtype: float64

In [45]:
EN_model_wrapper(df, features_candidates_rec, 10, 123, display_res=True).mean()

cv_train    0.344078
cv_val      0.344600
dtype: float64

In [46]:
EN_model_wrapper(df, features_selection_back, 10, 123, display_res=True).mean()

cv_train    0.346957
cv_val      0.347222
dtype: float64

Unfortunately, ElasticNet did not improve the results on MAPE either.

#### Hyperparameters tuning
Ok, we are now ready for the last step of our OLS model analysis, i.e. testing different hyperparameter search strategies in CV. We will focus on three strategies:
- Random search
- Grid search

##### Random Search
We use random search as a method for searching or sampling candidates.

In [47]:
estimator = LinearRegression()

parameter_space = dict(
   # alpha=np.logspace(-3, 3, 20),
    #l1_ratio=uniform(0,1),
   fit_intercept=[True, False],
)

cross_validation_schema = KFold(n_splits=10, shuffle=True)
score_function = {'mape': 'neg_mean_absolute_percentage_error',
           'mae': make_scorer(mean_absolute_error, greater_is_better=False)}

n_iterations = 100

In [48]:
random_search_cv = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=parameter_space,
    n_iter=n_iterations,
    n_jobs=-1,
    cv=cross_validation_schema,
    scoring=score_function,
    refit="mape",
    random_state=123,
    return_train_score=False,
    verbose=3 #Controls the verbosity: the higher, the more messages.

)
search = random_search_cv.fit(df[features_candidates_rec], df["realsum_cut"])

Fitting 10 folds for each of 2 candidates, totalling 20 fits




In [49]:
search.best_params_

{'fit_intercept': True}

In [50]:
search.best_estimator_

In [51]:
search.best_score_

np.float64(-0.3406749217568844)

In [52]:
yhat = search.predict(df_test[features_candidates_rec])

In [53]:
scoring_wrapper(df_test[["realsum_cut"]].to_numpy().ravel(), yhat)

{'mse': 18047.46750058156,
 'mae': 87.43100383382195,
 'medae': np.float64(62.6939027660018),
 'mape': 0.3416328851753703,
 'r2': 0.4912180329296497}

Grid Search

In [54]:
estimator = LinearRegression()

parameter_space = dict(
    #alpha=np.logspace(-3, 3, 5),
    #l1_ratio=np.random.uniform(0,1,5), #please note that now this is deterministic list
    fit_intercept=[True, False],
)

cross_validation_schema = KFold(n_splits=10, shuffle=True)
score_function = {'mape': 'neg_mean_absolute_percentage_error',
           'mae': make_scorer(mean_absolute_error, greater_is_better=False)}

In [55]:
grind_search_cv = GridSearchCV(
    estimator=estimator,
    param_grid=parameter_space,
    scoring=score_function,
    n_jobs=-1,
    cv=cross_validation_schema,
    refit="mape",
    return_train_score=False,
    verbose=3 #Controls the verbosity: the higher, the more messages.

)

search = grind_search_cv.fit(df[features_selection_en], df["realsum_cut"])

Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [56]:
search.best_params_

{'fit_intercept': True}

In [57]:
search.best_score_

np.float64(-0.3412348520929619)

In [58]:
yhat = search.predict(df_test[features_selection_en])
scoring_wrapper(df_test[["realsum_cut"]].to_numpy().ravel(), yhat)

{'mse': 18067.76982601333,
 'mae': 87.67896224518782,
 'medae': np.float64(63.38702441164979),
 'mape': 0.3428945303756582,
 'r2': 0.4906456835366423}

### KNN Regression

#### Searching for "good enough" model to feature selection

In [59]:
var = fr.mi_score.sort_values(ascending=False).index.tolist()[0:10]

In [60]:
print(var)

['lng', 'lat', 'city', 'latitude_disc_48_837_48_901', 'longitude_disc_2_093_2_347', 'dist', 'metro_dist', 'room_type', 'person_capacity', 'room_private']


In [61]:
param = {
    "n_neighbors": [3, 5, 7, 10, 12, 15, 20, 25, 40, 50, 100],
    "weights": ["uniform", "distance"],
    "metric": ["chebyshev", "manhattan", "minkowski"],
    "p": [1, 2],
}

In [62]:
mse = make_scorer(mean_squared_error, greater_is_better=False)

In [63]:
model = KNeighborsRegressor()
grid_CV = GridSearchCV(
    model, param, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV.fit(df.loc[:, var].values, df.loc[:, "realsum_cut"].values.ravel())

In [64]:
grid_CV.best_params_

{'metric': 'manhattan', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}

#### Feature selection

1. Mutual Information Score

In [65]:
fr.sort_values("mi_score", ascending=False, inplace=True)

In [66]:
fr.head()

Unnamed: 0,mi_score,sign_fscore,sign_fscore_0_1,corr,EN_coef,boruta_rank
lng,0.859389,0.0,1,-0.391676,-26.697237,1
lat,0.814053,0.0,1,0.376978,-7.342648,1
city,0.730737,9.967065e-273,1,-0.36487,118.69796,2
latitude_disc_48_837_48_901,0.437252,0.0,1,0.479799,65.790436,7
longitude_disc_2_093_2_347,0.381556,1.495659e-284,1,0.29582,92.78546,4


In [67]:
mi_features = fr.iloc[0:8].index.tolist()
mi_features_10 = fr.iloc[0:10].index.tolist()
mi_features_14 = fr.iloc[0:14].index.tolist()
mi_features_17 = fr.iloc[0:17].index.tolist()
mi_features_20 = fr.iloc[0:20].index.tolist()

2. Boruta Ranking

In [68]:
br_features = fr[fr.boruta_rank.isin([1,2,3])].index.tolist()

3. Correlation

In [69]:
fr["corr_abs"] = np.abs(fr["corr"])
fr.sort_values("corr_abs", ascending=False, inplace=True)
corr_features = fr.iloc[0:12].index.tolist()

4. Sequential Feature Selection

In [70]:
sf_features = df.columns.tolist()
sf_features.remove("realsum_cut")

In [71]:
model = KNeighborsRegressor(**grid_CV.best_params_)

sf = SFS(
    model,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df.loc[:, sf_features].values, df.loc[:, "realsum_cut"].values.ravel()
)

sf_features = df.loc[:, sf_features].columns[list(sffit.k_feature_idx_)]


In [72]:
sf_features

Index(['room_type', 'room_private', 'person_capacity', 'biz', 'bedrooms',
       'lng', 'lat', 'longitude_disc_12_47_12_583',
       'latitude_disc_41_893_48_837', 'latitude_disc_48_837_48_901'],
      dtype='object')

#### Hyperparameters tuning for each set of features

Let's define function that will tune hyperparameters of the KNN function for each set of features.

In [73]:
def cv_proc(var, model, param):
    model = model
    grid_CV = GridSearchCV(
        model, param, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
    )
    grid_CV.fit(df.loc[:, var].values, df.loc[:, "realsum_cut"].values.ravel())
    print(grid_CV.best_params_)
    print(grid_CV.best_score_)

In [74]:
model = KNeighborsRegressor(**grid_CV.best_params_)

In [76]:
cv_proc(br_features, model, param)

{'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
-12816.500763610082


In [77]:
cv_proc(mi_features, model, param)

{'metric': 'manhattan', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}
-10645.891020206327


In [78]:
cv_proc(mi_features_10, model, param)

{'metric': 'manhattan', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}
-7658.3700972693


In [79]:
cv_proc(mi_features_14, model, param)

{'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'}
-7894.134587006585


In [80]:
cv_proc(mi_features_17, model, param)

{'metric': 'manhattan', 'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
-12532.827763989437


In [81]:
cv_proc(mi_features_20, model, param)

{'metric': 'manhattan', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
-11680.583528270065


In [82]:
cv_proc(corr_features, model, param)

{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-7733.323817174483


In [83]:
cv_proc(sf_features, model, param)

{'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
-6653.962242727318


In [84]:
hp_knn = [
    {'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 15, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'},
    {'metric': 'manhattan', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'},
]

Now we will evaluate the model using the cross-validation approach. We will check both MAPE and RMSE metrics for each feature dataset.

In [85]:
model = KNeighborsRegressor(**hp_knn[0])
var = br_features
cv_output0_rmse = CV_rmse_wrapper(df, br_features, model,10, 123, shuff=True, display_res=True)
cv_output0_mape = CV_wrapper(df, br_features, model, 10, 123, shuff=True,  display_res=True)

In [86]:
cv_output0_rmse.mean()

cv_train      0.000000
cv_val      111.476953
dtype: float64

In [87]:
cv_output0_mape.mean()

cv_train    0.00000
cv_val      0.26079
dtype: float64

In [88]:
model = KNeighborsRegressor(**hp_knn[1])
cv_output1_rmse = CV_rmse_wrapper(df, mi_features, model,10, 123, shuff=True, display_res=True)
cv_output1_mape = CV_wrapper(df, mi_features, model, 10, 123, shuff=True,  display_res=True)

In [89]:
cv_output1_mape.mean()

cv_train    0.00000
cv_val      0.17459
dtype: float64

In [90]:
cv_output1_rmse.mean()

cv_train     0.000000
cv_val      97.919683
dtype: float64

In [91]:
model = KNeighborsRegressor(**hp_knn[2])
cv_output2_rmse = CV_rmse_wrapper(df, mi_features_10, model,10, 123, shuff=True, display_res=True)
cv_output2_mape = CV_wrapper(df, mi_features_10, model, 10, 123, shuff=True,  display_res=True)

In [92]:
cv_output2_mape.mean()

cv_train    0.000000
cv_val      0.151257
dtype: float64

In [93]:
cv_output2_rmse.mean()

cv_train     0.000000
cv_val      82.737532
dtype: float64

In [94]:
model = KNeighborsRegressor(**hp_knn[3])
cv_output3_rmse = CV_rmse_wrapper(df, mi_features_14, model,10, 123, shuff=True, display_res=True)
cv_output3_mape = CV_wrapper(df, mi_features_14, model, 10, 123, shuff=True,  display_res=True)

In [95]:
cv_output3_mape.mean()

cv_train    0.00000
cv_val      0.15227
dtype: float64

In [96]:
cv_output3_rmse.mean()

cv_train     0.000000
cv_val      84.032507
dtype: float64

In [97]:
model = KNeighborsRegressor(**hp_knn[4])
cv_output4_rmse = CV_rmse_wrapper(df, mi_features_17, model,10, 123, shuff=True, display_res=True)
cv_output4_mape = CV_wrapper(df, mi_features_17, model, 10, 123, shuff=True,  display_res=True)

In [98]:
cv_output4_mape.mean()

cv_train    0.000000
cv_val      0.262314
dtype: float64

In [99]:
cv_output4_rmse.mean()

cv_train      0.000000
cv_val      110.725694
dtype: float64

In [100]:
model = KNeighborsRegressor(**hp_knn[5])
cv_output5rmse = CV_rmse_wrapper(df, mi_features_20, model,10, 123, shuff=True, display_res=True)
cv_output5_mape = CV_wrapper(df, mi_features_20, model, 10, 123, shuff=True,  display_res=True)

In [101]:
cv_output5_mape.mean()

cv_train    0.000000
cv_val      0.245508
dtype: float64

In [102]:
cv_output5rmse.mean()

cv_train      0.000000
cv_val      106.210569
dtype: float64

In [103]:
model = KNeighborsRegressor(**hp_knn[6])
cv_output6_rmse = CV_rmse_wrapper(df, corr_features, model,10, 123, shuff=True, display_res=True)
cv_output6_mape = CV_wrapper(df, corr_features, model, 10, 123, shuff=True,  display_res=True)

In [104]:
cv_output6_mape.mean()

cv_train    0.000090
cv_val      0.159348
dtype: float64

In [105]:
cv_output6_rmse.mean()

cv_train     0.979875
cv_val      83.304007
dtype: float64

In [106]:
model = KNeighborsRegressor(**hp_knn[7])
cv_output7_rmse = CV_rmse_wrapper(df, sf_features, model,10, 123, shuff=True, display_res=True)
cv_output7_mape = CV_wrapper(df, sf_features, model, 10, 123, shuff=True,  display_res=True)

In [107]:
cv_output7_mape.mean()

cv_train    0.012815
cv_val      0.140850
dtype: float64

In [108]:
cv_output7_rmse.mean()

cv_train    15.056186
cv_val      77.633269
dtype: float64

We significantly improved our score. The best result was obtained for the Sequential Feature Selection dataset.

In [109]:
model = KNeighborsRegressor(**hp_knn[7])
model.fit(df.loc[:, sf_features].values, df.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model, open("models/model_knn.sav", "wb"))

### Decision Tree
Now we will move on to the Decision Tree model. We will start with feature selection. We will use GridSearchCV to find the best hyperparameters for the model.

#### Feature selection

In [110]:
param_dt = {
    "max_depth": [3, 5, 7, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [111]:
model = DecisionTreeRegressor()
grid_CV = GridSearchCV(
    model, param_dt, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV.fit(df.loc[:, var].values, df.loc[:, "realsum_cut"].values.ravel())

In [112]:
grid_CV.best_score_

np.float64(-12722.537351894076)

In [113]:
sf_features_dt = df.columns.tolist()
sf_features_dt.remove("realsum_cut")
model = DecisionTreeRegressor(**grid_CV.best_params_)

sf = SFS(
    model,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df.loc[:, sf_features_dt].values, df.loc[:, "realsum_cut"].values.ravel()
)

sf_features_dt = df.loc[:, sf_features_dt].columns[list(sffit.k_feature_idx_)]


In [114]:
sf_features_dt

Index(['room_type', 'person_capacity', 'host_is_superhost', 'multi', 'biz',
       'bedrooms', 'lng', 'lat', 'longitude_disc_2_093_2_347',
       'longitude_disc_12_47_12_583', 'latitude_disc_41_35_41_893',
       'latitude_disc_41_893_48_837'],
      dtype='object')

#### Hyperparameters tuning

In [115]:
cv_proc(sf_features_dt, model, param_dt)

{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
-11681.14512907412


In [116]:
cv_proc(mi_features, model, param_dt)

{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
-17937.97212961888


In [117]:
cv_proc(mi_features_10, model, param_dt)

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
-13087.749890003417


In [118]:
cv_proc(mi_features_14, model, param_dt)

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
-12692.538970580921


In [119]:
cv_proc(mi_features_17, model, param_dt)

{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
-12824.52396135405


In [120]:
cv_proc(mi_features_20, model, param_dt)

{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
-12677.781180596257


In [121]:
hp_dt = [
    {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5},
    {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10},
    {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2},
    {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5},
    {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2},
    {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5}
]

#### Model evaluation with cross-validation approach

In [122]:
model = DecisionTreeRegressor(**hp_dt[0])
(CV_wrapper(df, sf_features_dt, model, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df, sf_features_dt, model, 10, 123, shuff=True, display_res=True).mean())

(cv_train    0.210574
 cv_val      0.245909
 dtype: float64,
 cv_train     90.235306
 cv_val      107.586513
 dtype: float64)

In [123]:
model = DecisionTreeRegressor(**hp_dt[1])
(CV_wrapper(df, mi_features, model, 10, 123, shuff=True, display_res=True).mean(),
 CV_rmse_wrapper(df, mi_features, model, 10, 123, shuff=True, display_res=True).mean())

(cv_train    0.249071
 cv_val      0.288543
 dtype: float64,
 cv_train    114.638546
 cv_val      131.230501
 dtype: float64)

In [124]:
model = DecisionTreeRegressor(**hp_dt[2])
(CV_wrapper(df, mi_features_10, model, 10, 123, shuff=True, display_res=True).mean(),
    CV_rmse_wrapper(df, mi_features_10, model, 10, 123, shuff=True, display_res=True).mean())

(cv_train    0.207671
 cv_val      0.248270
 dtype: float64,
 cv_train     89.678899
 cv_val      112.661205
 dtype: float64)

In [125]:
model = DecisionTreeRegressor(**hp_dt[3])
(CV_wrapper(df, mi_features_14, model, 10, 123, shuff=True, display_res=True).mean(),
    CV_rmse_wrapper(df, mi_features_14, model, 10, 123, shuff=True, display_res=True).mean())

(cv_train    0.207678
 cv_val      0.246787
 dtype: float64,
 cv_train     89.447232
 cv_val      112.573604
 dtype: float64)

In [126]:
model = DecisionTreeRegressor(**hp_dt[4])
(CV_wrapper(df, mi_features_17, model, 10, 123, shuff=True, display_res=True).mean(),
    CV_rmse_wrapper(df, mi_features_17, model, 10, 123, shuff=True, display_res=True).mean())

(cv_train    0.213039
 cv_val      0.250748
 dtype: float64,
 cv_train     89.763219
 cv_val      110.433729
 dtype: float64)

In [127]:
model = DecisionTreeRegressor(**hp_dt[5])
(CV_wrapper(df, mi_features_20, model, 10, 123, shuff=True, display_res=True).mean(),
    CV_rmse_wrapper(df, mi_features_20, model, 10, 123, shuff=True, display_res=True).mean())

[CV 8/10] END fit_intercept=True; mae: (test=-86.805) mape: (test=-0.342) total time=   0.1s
[CV 1/10] END fit_intercept=False; mae: (test=-84.215) mape: (test=-0.323) total time=   0.0s
[CV 7/10] END fit_intercept=False; mae: (test=-86.311) mape: (test=-0.354) total time=   0.0s
[CV 3/10] END fit_intercept=False; mae: (test=-88.907) mape: (test=-0.342) total time=   0.0s
[CV 9/10] END fit_intercept=False; mae: (test=-85.738) mape: (test=-0.354) total time=   0.0s
[CV 4/10] END fit_intercept=True; mae: (test=-84.659) mape: (test=-0.345) total time=   0.1s
[CV 4/10] END fit_intercept=True; mae: (test=-83.807) mape: (test=-0.343) total time=   0.0s
[CV 1/10] END fit_intercept=False; mae: (test=-83.644) mape: (test=-0.353) total time=   0.0s
[CV 6/10] END fit_intercept=False; mae: (test=-98.406) mape: (test=-0.413) total time=   0.0s
[CV 5/10] END fit_intercept=True; mae: (test=-85.819) mape: (test=-0.331) total time=   0.1s
[CV 9/10] END fit_intercept=True; mae: (test=-85.427) mape: (tes

(cv_train    0.210418
 cv_val      0.247831
 dtype: float64,
 cv_train     88.831536
 cv_val      110.115168
 dtype: float64)

In [128]:
model = DecisionTreeRegressor(**hp_dt[0])
model.fit(df.loc[:, sf_features_dt].values, df.loc[:, "realsum_cut"].values.ravel())
pickle.dump(model, open("models/model_dt.sav", "wb"))

### Random Forest

#### Feature selection

In [129]:
param_rf = {
    "n_estimators": [20, 50, 100, 500],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

In [130]:
model = RandomForestRegressor(random_state=123, n_estimators=10, max_depth=15)
rf_grid_CV = GridSearchCV(
    model, param_rf, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)
rf_grid_CV.fit(df.loc[:, var].values, df.loc[:, "realsum_cut"].values.ravel())

In [131]:
rf_grid_CV.best_score_

np.float64(-7508.174623396648)

In [132]:
rf_features  = df.columns.tolist()
rf_features.remove("realsum_cut")

In [None]:
model = RandomForestRegressor(**rf_grid_CV.best_params_)

sf = SFS(
    model,
    k_features=(5, 15),
    forward=True,
    floating=False,
    verbose=0,
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

sffit = sf.fit(
    df.loc[:, rf_features].values, df.loc[:, "realsum_cut"].values.ravel()
)

rf_features = df.loc[:, rf_features].columns[list(sffit.k_feature_idx_)]

#### Hyperparameters tuning

In [None]:
cv_proc(mi_features, model, param_rf)

In [None]:
cv_proc(mi_features_10, model, param_rf)

In [None]:
cv_proc(mi_features_14, model, param_rf)

In [None]:
cv_proc(mi_features_17, model, param_rf)

In [None]:
cv_proc(mi_features_20, model, param_rf)

In [None]:
cv_proc(corr_features, model, param_rf)

In [None]:
cv_proc(rf_features, model, param_rf)

#### Model evaluation with cross-validation approach

In [None]:
CV_wrapper(df, mi_features, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_rmse_wrapper(df, mi_features, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_wrapper(df, mi_features_10, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_rmse_wrapper(df, mi_features_10, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_wrapper(df, mi_features_14, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_rmse_wrapper(df, mi_features_14, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_wrapper(df, mi_features_17, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_rmse_wrapper(df, mi_features_17, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_wrapper(df, mi_features_20, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_rmse_wrapper(df, mi_features_20, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_wrapper(df, corr_features, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_rmse_wrapper(df, corr_features, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_wrapper(df, rf_features, model, 10, 123, shuff=True, display_res=True).mean()

In [None]:
CV_rmse_wrapper(df, rf_features, model, 10, 123, shuff=True, display_res=True).mean()