In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from src.utils import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Datasets
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
BASELINE_TEST = "data/test/baseline_test.csv"

BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "data/test/baseline-w-feature-eng_test.csv"

TRUNCATED_BASELINE_TRAIN = "data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "data/test/baseline-truncated_test.csv"

TRUNCATED_FEAT_ENG_TRAIN = "data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "data/test/truncated-feat-eng_test.csv"

### Train/Test

In [3]:
TRAIN_DATA = TRUNCATED_FEAT_ENG_TRAIN
VAL_DATA = TRUNCATED_FEAT_ENG_VAL

train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

X_train, y_train = split_features_and_monthly_rent_label(train_set)
X_test, y_test = split_features_and_monthly_rent_label(val_set)

In [4]:
model = BaggingRegressor()
pipeline = Pipeline(steps=[('model', model)])

# define grid search for hyperparameters
grid = {
    'model__max_features': [0.5, 0.7, 0.9, 1.0],
    'model__n_estimators': [50, 100, 150, 250, 500],
}
cv = KFold(n_splits=10)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, cv=cv, scoring='neg_root_mean_squared_error', verbose=3)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 6000
max_resources_: 54000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 20
n_resources: 6000
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10] END model__max_features=0.5, model__n_estimators=50;, score=(train=-231.407, test=-559.717) total time=   0.8s
[CV 2/10] END model__max_features=0.5, model__n_estimators=50;, score=(train=-210.882, test=-562.789) total time=   0.8s
[CV 3/10] END model__max_features=0.5, model__n_estimators=50;, score=(train=-217.071, test=-515.454) total time=   0.8s
[CV 4/10] END model__max_features=0.5, model__n_estimators=50;, score=(train=-221.706, test=-563.125) total time=   0.7s
[CV 5/10] END model__max_features=0.5, model__n_estimators=50;, score=(train=-230.098, test=-481.323) total time=   0.7s
[CV 6/10] END model__max_features=0.5, model__n_estimators=50;, score=(train=-216.951, test=-550.591) total time=   0.8s
[CV 7/10

[CV 7/10] END model__max_features=0.7, model__n_estimators=100;, score=(train=-198.611, test=-503.412) total time=   2.0s
[CV 8/10] END model__max_features=0.7, model__n_estimators=100;, score=(train=-194.140, test=-487.019) total time=   2.1s
[CV 9/10] END model__max_features=0.7, model__n_estimators=100;, score=(train=-195.544, test=-532.016) total time=   2.1s
[CV 10/10] END model__max_features=0.7, model__n_estimators=100;, score=(train=-191.761, test=-517.066) total time=   2.1s
[CV 1/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-197.962, test=-535.456) total time=   3.1s
[CV 2/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-191.688, test=-563.096) total time=   3.1s
[CV 3/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-194.680, test=-514.806) total time=   3.1s
[CV 4/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-198.078, test=-557.496) total time=   3.1s
[CV 5/10] END model__ma

[CV 5/10] END model__max_features=0.9, model__n_estimators=250;, score=(train=-193.349, test=-475.855) total time=   6.4s
[CV 6/10] END model__max_features=0.9, model__n_estimators=250;, score=(train=-193.417, test=-533.107) total time=   6.5s
[CV 7/10] END model__max_features=0.9, model__n_estimators=250;, score=(train=-192.917, test=-504.951) total time=   6.4s
[CV 8/10] END model__max_features=0.9, model__n_estimators=250;, score=(train=-189.900, test=-489.501) total time=   6.4s
[CV 9/10] END model__max_features=0.9, model__n_estimators=250;, score=(train=-193.442, test=-529.627) total time=   6.5s
[CV 10/10] END model__max_features=0.9, model__n_estimators=250;, score=(train=-189.086, test=-516.032) total time=   6.4s
[CV 1/10] END model__max_features=0.9, model__n_estimators=500;, score=(train=-194.360, test=-533.472) total time=  12.9s
[CV 2/10] END model__max_features=0.9, model__n_estimators=500;, score=(train=-188.028, test=-567.919) total time=  13.0s
[CV 3/10] END model__ma

[CV 2/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-191.750, test=-525.352) total time=  12.2s
[CV 3/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-194.190, test=-516.538) total time=  12.4s
[CV 4/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-194.309, test=-529.227) total time=  12.5s
[CV 5/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-195.002, test=-510.272) total time=  12.2s
[CV 6/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-193.781, test=-506.785) total time=  12.3s
[CV 7/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-196.033, test=-529.861) total time=  12.3s
[CV 8/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-191.317, test=-521.029) total time=  12.5s
[CV 9/10] END model__max_features=0.9, model__n_estimators=150;, score=(train=-192.242, test=-494.467) total time=  12.4s
[CV 10/10] END model__ma

[CV 10/10] END model__max_features=0.7, model__n_estimators=250;, score=(train=-193.238, test=-513.029) total time=  16.2s
----------
iter: 2
n_candidates: 3
n_resources: 54000
Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV 1/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-209.069, test=-510.080) total time=  30.2s
[CV 2/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-206.593, test=-520.742) total time=  30.0s
[CV 3/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-211.804, test=-510.763) total time=  29.3s
[CV 4/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-209.437, test=-507.453) total time=  29.9s
[CV 5/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-205.897, test=-499.949) total time=  30.8s
[CV 6/10] END model__max_features=0.7, model__n_estimators=150;, score=(train=-208.032, test=-503.763) total time=  30.3s
[CV 7/10] END model__max_featu

In [5]:
grid_results_df = pd.DataFrame(grid_results.cv_results_)
grid_results_df.describe()

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.433333,13600.0,13.643888,0.133738,0.135427,0.007018,-528.858864,-552.289421,-516.406787,-547.686026,...,-199.161766,-201.476605,-201.339579,-200.031102,-201.567956,-197.651373,-200.315862,-197.097258,-199.774361,2.547339
std,0.678911,14606.564419,20.367906,0.202168,0.211515,0.012173,13.77192,23.203368,4.202683,20.672441,...,9.12747,9.157387,10.103554,8.366184,10.113013,9.332023,9.522322,9.643118,9.154287,1.282992
min,0.0,6000.0,0.743427,0.010451,0.012266,4.9e-05,-559.716755,-582.490408,-528.287848,-570.579098,...,-218.942739,-224.967775,-230.097981,-217.96463,-233.123476,-221.464828,-223.295256,-215.35274,-221.011489,1.242516
25%,0.0,6000.0,2.62201,0.031624,0.026634,0.000291,-536.026525,-570.343907,-519.254282,-561.834943,...,-203.522195,-205.436962,-205.108069,-204.398855,-204.433798,-202.57299,-206.899941,-204.325844,-204.798534,1.932292
50%,0.0,6000.0,6.375646,0.062025,0.057772,0.002289,-533.717868,-562.359042,-516.294599,-557.790972,...,-195.425798,-197.958363,-197.156605,-196.85066,-198.370543,-193.623248,-195.928549,-192.521005,-195.911576,2.233582
75%,1.0,18000.0,13.764525,0.111106,0.116252,0.005721,-513.525512,-522.452633,-514.137911,-527.92949,...,-192.701811,-194.315109,-194.634867,-193.827445,-194.819918,-190.822596,-193.591369,-190.371447,-192.975827,2.624319
max,2.0,54000.0,99.542242,0.828454,1.062305,0.045473,-508.156732,-516.877078,-508.5204,-505.708258,...,-190.899541,-192.20321,-192.769378,-191.803619,-192.525391,-188.769206,-190.735011,-187.581317,-191.278382,7.841092


In [6]:
print(grid_results.best_params_)
print(grid_results.best_score_)

{'model__max_features': 0.7, 'model__n_estimators': 500}
-507.5608720207116


| Approach | Best RMSE | Best Params |
|:---------|:----------|:------------|
|1|523.35|'model__max_features': 0.7, 'model__n_estimators': 500|
|2|509.86|'model__max_features': 0.7, 'model__n_estimators': 500|
|3|521.34|'model__max_features': 0.7, 'model__n_estimators': 500|
|4|507.56|'model__max_features': 0.7, 'model__n_estimators': 500|