In [9]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from src.utils import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Datasets
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
BASELINE_TEST = "data/test/baseline_test.csv"

BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "data/test/baseline-w-feature-eng_test.csv"

TRUNCATED_BASELINE_TRAIN = "data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "data/test/baseline-truncated_test.csv"

TRUNCATED_FEAT_ENG_TRAIN = "data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "data/test/truncated-feat-eng_test.csv"

### Train/Test

In [11]:
TRAIN_DATA = TRUNCATED_BASELINE_TRAIN
VAL_DATA = TRUNCATED_BASELINE_VAL

train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

X_train, y_train = split_features_and_monthly_rent_label(train_set)
X_test, y_test = split_features_and_monthly_rent_label(val_set)

In [12]:
model = AdaBoostRegressor(estimator=DecisionTreeRegressor())
pipeline = Pipeline(steps=[('model', model)])

# define grid search for hyperparameters
grid = {
    'model__estimator__max_depth': [i for i in range(3, 9, 2)],
    'model__n_estimators': [50, 100, 200, 300, 400, 500],
    'model__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'model__loss': ['linear', 'square', 'exponential']
}
cv = KFold(n_splits=10)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, cv=cv, scoring='neg_root_mean_squared_error', verbose=3)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 666
max_resources_: 54000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 216
n_resources: 666
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
[CV 1/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-508.890, test=-482.833) total time=   0.1s
[CV 2/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-497.405, test=-544.950) total time=   0.1s
[CV 3/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-490.074, test=-493.484) total time=   0.1s
[CV 4/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-525.322, test=-531.082) total time=   0.1s
[CV 5/10] END model__estimator__max_depth

In [13]:
grid_results_df = pd.DataFrame(grid_results.cv_results_)
grid_results_df.describe()

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
count,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,...,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0
mean,0.482972,2282.544892,0.591082,0.027085,0.012218,0.000957,-525.245057,-558.938178,-510.500281,-511.97735,...,-398.304162,-408.773296,-405.565446,-417.446934,-421.684991,-415.543319,-416.489631,-428.522632,-413.148611,13.411689
std,0.812664,5812.815937,1.107595,0.073175,0.007846,0.001223,38.607774,26.543716,36.610928,45.162608,...,99.146767,97.454251,92.56757,94.579604,96.916709,94.567795,97.308268,79.157027,93.147006,4.079893
min,0.0,666.0,0.049367,0.000555,0.002972,0.000127,-612.875438,-634.955036,-599.907107,-612.786849,...,-562.82286,-542.621838,-524.081346,-551.479165,-553.034005,-535.831245,-541.734414,-540.130854,-540.816174,0.689127
25%,0.0,666.0,0.165585,0.003432,0.005364,0.000348,-567.97038,-577.672181,-522.622339,-549.029997,...,-485.969955,-491.722001,-489.456134,-499.314713,-505.275347,-503.620336,-498.629544,-499.220713,-496.647174,11.363565
50%,0.0,666.0,0.354194,0.007194,0.010283,0.000533,-507.334206,-559.317528,-498.351408,-509.237103,...,-444.95191,-465.184022,-449.448748,-467.532068,-468.386112,-456.577354,-467.480475,-453.924191,-465.220253,13.957294
75%,1.0,1998.0,0.609813,0.018216,0.017964,0.000953,-496.482997,-535.416947,-485.235389,-470.358059,...,-350.018317,-357.015961,-366.65898,-378.340146,-387.887426,-380.587319,-379.404084,-405.724941,-379.296963,15.677745
max,4.0,53946.0,11.286095,0.970647,0.04727,0.007227,-460.192667,-512.247122,-458.210578,-439.539953,...,-224.989606,-231.729001,-237.212998,-244.054851,-242.84016,-242.446094,-235.157968,-280.77014,-244.496621,24.604027


In [14]:
print(grid_results.best_params_)
print(grid_results.best_score_)

{'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 50}
-501.78647444674397


| Approach | Best RMSE | Best Params |
|:---------|:----------|:------------|
|1|517.18|'model__estimator__max_depth': 5, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 100|
|2|500.43|'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 50|
|3|501.79|'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 50|
|4|490.97|learning_rate: 0.1, max_depth: 3, n_estimators: 100, subsample: 0.5|

In [17]:
joblib.dump(grid_results, 'models/approach3_ab.pkl')

['models/approach3_ab.pkl']

In [18]:
grid_search_model = joblib.load('models/approach3_ab.pkl')
grid_search_model.best_score_

-501.78647444674397

### SHAP

#### Standard SHAP values

In [74]:
X_train_summary = shap.sample(X_train, 100)
explainer = shap.KernelExplainer(model.predict, X_train_summary)
shap_values = explainer.shap_values(X_test[0:100])

# shap.force_plot(explainer.expected_value[0], shap_values[0], X_test)
shap.force_plot(explainer.expected_value, shap_values, X_test[0:100])

100%|██████████| 100/100 [03:16<00:00,  1.96s/it]
