In [5]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from utils import split_features_and_monthly_rent_label

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Datasets
BASELINE_TRAIN = "../data/train/baseline_train.csv"
BASELINE_VAL = "../data/train/baseline_val.csv"
BASELINE_TEST = "../data/test/baseline_test.csv"

BASELINE_W_FEAT_ENG_TRAIN = "../data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "../data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "../data/test/baseline-w-feature-eng_test.csv"

TRUNCATED_BASELINE_TRAIN = "../data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "../data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "../data/test/baseline-truncated_test.csv"

TRUNCATED_FEAT_ENG_TRAIN = "../data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "../data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "../data/test/truncated-feat-eng_test.csv"

### Train/Test

In [7]:
TRAIN_DATA = TRUNCATED_FEAT_ENG_TRAIN
VAL_DATA = TRUNCATED_FEAT_ENG_VAL

train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

X_train, y_train = split_features_and_monthly_rent_label(train_set)
X_test, y_test = split_features_and_monthly_rent_label(val_set)

In [22]:
model = AdaBoostRegressor(estimator=DecisionTreeRegressor())
pipeline = Pipeline(steps=[('model', model)])

# define grid search for hyperparameters
grid = {
    'model__estimator__max_depth': [i for i in range(3, 9, 2)],
    'model__n_estimators': [50, 100, 200, 300, 400, 500],
    'model__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'model__loss': ['linear', 'square', 'exponential']
}
cv = KFold(n_splits=10)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, cv=cv, scoring='neg_root_mean_squared_error', verbose=3)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 666
max_resources_: 54000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 216
n_resources: 666
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
[CV 1/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-491.662, test=-414.988) total time=   0.1s
[CV 2/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-482.085, test=-459.151) total time=   0.1s
[CV 3/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-490.717, test=-515.470) total time=   0.1s
[CV 4/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-507.054, test=-642.992) total time=   0.1s
[CV 5/10] END model__estimator__max_depth

In [23]:
grid_results_df = pd.DataFrame(grid_results.cv_results_)
grid_results_df.describe()

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
count,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,...,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0
mean,0.482972,2282.544892,0.594302,0.019727,0.011435,0.000805,-468.938597,-494.574739,-507.356468,-595.821054,...,-397.811552,-401.27981,-415.082672,-396.479839,-400.10041,-412.449108,-415.777771,-391.959997,-401.845446,12.149101
std,0.812664,5812.815937,1.190868,0.038208,0.007346,0.000938,36.025864,70.047336,18.853196,65.630042,...,102.992132,98.250959,90.712073,96.109985,95.134066,94.21246,89.204475,98.143383,96.805266,4.057372
min,0.0,666.0,0.047033,0.000806,0.002737,6.5e-05,-547.300475,-640.69153,-556.448142,-676.829266,...,-525.637181,-515.818859,-534.301457,-527.331777,-508.94558,-525.673946,-526.621564,-515.069197,-521.545088,0.844472
25%,0.0,666.0,0.172512,0.003797,0.005024,0.000334,-505.20263,-545.502387,-520.95033,-642.93966,...,-482.845878,-492.32282,-493.221336,-471.638453,-481.462266,-494.505421,-491.509498,-474.50558,-484.305818,9.378648
50%,0.0,666.0,0.349167,0.007448,0.0097,0.000532,-455.847592,-463.144552,-505.822461,-636.31118,...,-458.361475,-434.416162,-455.135193,-454.303394,-446.035362,-451.138383,-457.617094,-446.478056,-453.404406,13.269908
75%,1.0,1998.0,0.611128,0.015469,0.016587,0.000895,-440.910239,-441.303413,-490.885672,-517.89258,...,-351.225543,-354.203737,-376.843756,-342.868558,-362.668738,-378.461536,-379.67723,-340.325638,-359.694454,15.60744
max,4.0,53946.0,12.048673,0.380723,0.045402,0.007876,-412.207886,-418.419532,-469.796034,-479.26038,...,-212.779166,-224.247036,-248.308589,-223.148825,-222.991958,-241.312269,-247.989555,-218.804361,-228.710951,18.097473


In [24]:
print(grid_results.best_params_)
print(grid_results.best_score_)

{'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'linear', 'model__n_estimators': 100}
-503.8694178664199


In [25]:
joblib.dump(grid_results, '../models/approach4_ab.pkl')

['models/approach4_ab.pkl']

In [8]:
grid_search_model = joblib.load('../models/approach4_ab.pkl')
grid_search_model.best_score_

-503.8694178664199

| Approach | Best RMSE | Best Params |
|:---------|:----------|:------------|
|1|517.18|'model__estimator__max_depth': 5, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 100|
|2|500.43|'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 50|
|3|501.79|'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 50|
|4|503.87|'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'linear', 'model__n_estimators': 100|