In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from src.utils import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Datasets
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
BASELINE_TEST = "data/test/baseline_test.csv"

BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "data/test/baseline-w-feature-eng_test.csv"

TRUNCATED_BASELINE_TRAIN = "data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "data/test/baseline-truncated_test.csv"

TRUNCATED_FEAT_ENG_TRAIN = "data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "data/test/truncated-feat-eng_test.csv"

### Train/Test

In [3]:
TRAIN_DATA = BASELINE_W_FEAT_ENG_TRAIN
VAL_DATA = BASELINE_W_FEAT_ENG_VAL

train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

X_train, y_train = split_features_and_monthly_rent_label(train_set)
X_test, y_test = split_features_and_monthly_rent_label(val_set)

In [4]:
model = AdaBoostRegressor(estimator=DecisionTreeRegressor())
pipeline = Pipeline(steps=[('model', model)])

# define grid search for hyperparameters
grid = {
    'model__estimator__max_depth': [i for i in range(3, 9, 2)],
    'model__n_estimators': [50, 100, 200, 300, 400, 500],
    'model__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'model__loss': ['linear', 'square', 'exponential']
}
cv = KFold(n_splits=10)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, cv=cv, scoring='neg_root_mean_squared_error', verbose=3)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 666
max_resources_: 54000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 216
n_resources: 666
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
[CV 1/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-505.852, test=-527.204) total time=   0.1s
[CV 2/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-495.145, test=-516.642) total time=   0.2s
[CV 3/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-498.271, test=-558.057) total time=   0.1s
[CV 4/10] END model__estimator__max_depth=3, model__learning_rate=0.1, model__loss=linear, model__n_estimators=50;, score=(train=-491.343, test=-537.626) total time=   0.1s
[CV 5/10] END model__estimator__max_depth

In [5]:
grid_results_df = pd.DataFrame(grid_results.cv_results_)
grid_results_df.describe()

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
count,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,...,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0
mean,0.482972,2282.544892,1.60512,0.037296,0.015574,0.000951,-516.945767,-550.112557,-594.937733,-530.747057,...,-391.397777,-384.988018,-400.674181,-396.880877,-410.81703,-406.767316,-396.536207,-398.240251,-396.801113,11.490697
std,0.812664,5812.815937,4.30911,0.082815,0.012478,0.001248,16.057385,29.342184,58.192415,16.91666,...,90.20147,96.252695,93.016226,89.008097,84.847398,89.842877,97.06005,87.43326,90.795866,3.056717
min,0.0,666.0,0.082388,0.001036,0.003516,9.4e-05,-555.963017,-624.407903,-694.555153,-573.946171,...,-501.984799,-518.630278,-541.534026,-524.38813,-531.690543,-527.712564,-528.567768,-518.153399,-519.549727,0.534027
25%,0.0,666.0,0.316938,0.006102,0.006302,0.000382,-530.789769,-565.160431,-642.014832,-542.351177,...,-483.48947,-471.95019,-486.404388,-475.002631,-491.005624,-492.958848,-486.434292,-474.701842,-485.803938,10.03027
50%,0.0,666.0,0.654799,0.013112,0.014343,0.000595,-515.866774,-545.382609,-620.243161,-529.718297,...,-391.092633,-386.064834,-388.653089,-392.79928,-407.162653,-404.535165,-394.39739,-398.422543,-394.008381,11.245992
75%,1.0,1998.0,1.203845,0.031324,0.020852,0.001,-502.121983,-528.222098,-526.127434,-521.228269,...,-344.938683,-345.513745,-363.762233,-364.721304,-356.899245,-367.533848,-360.006175,-369.236723,-360.628737,13.323871
max,4.0,53946.0,54.800489,1.01222,0.123083,0.009439,-489.260255,-499.806032,-497.182054,-488.076875,...,-227.28709,-214.518855,-239.668403,-233.641941,-251.275423,-242.928677,-222.125129,-237.575588,-235.940009,19.643492


In [6]:
print(grid_results.best_params_)
print(grid_results.best_score_)

{'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 50}
-500.4290676718836


| Approach | Best RMSE | Best Params |
|:---------|:----------|:------------|
|1|517.18|'model__estimator__max_depth': 5, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 100|
|2|500.43|'model__estimator__max_depth': 7, 'model__learning_rate': 0.1, 'model__loss': 'exponential', 'model__n_estimators': 50|
|3|491.55|learning_rate: 0.1, max_depth: 3, n_estimators: 100, subsample: 0.7|
|4|490.97|learning_rate: 0.1, max_depth: 3, n_estimators: 100, subsample: 0.5|

In [7]:
joblib.dump(grid_results, 'models/approach2_ab.pkl')

['models/approach2_ab.pkl']

In [8]:
grid_search_model = joblib.load('models/approach2_ab.pkl')
grid_search_model.best_score_

-500.4290676718836

### SHAP

#### Standard SHAP values

In [74]:
X_train_summary = shap.sample(X_train, 100)
explainer = shap.KernelExplainer(model.predict, X_train_summary)
shap_values = explainer.shap_values(X_test[0:100])

# shap.force_plot(explainer.expected_value[0], shap_values[0], X_test)
shap.force_plot(explainer.expected_value, shap_values, X_test[0:100])

100%|██████████| 100/100 [03:16<00:00,  1.96s/it]
