In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from src.utils import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Datasets
# Relative CSV paths for four dataset variants; each variant has a train, val and
# test file. Note that the *_VAL files sit under data/train/, next to the *_TRAIN
# files, while the *_TEST files sit under data/test/.

# Variant: baseline
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
BASELINE_TEST = "data/test/baseline_test.csv"

# Variant: baseline-truncated
TRUNCATED_BASELINE_TRAIN = "data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "data/test/baseline-truncated_test.csv"

# Variant: baseline-w-feature-eng
BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "data/test/baseline-w-feature-eng_test.csv"

# Variant: truncated-feat-eng
TRUNCATED_FEAT_ENG_TRAIN = "data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "data/test/truncated-feat-eng_test.csv"

### Train/Validation

In [7]:
# Choose the dataset variant for this run. To try another variant, point these
# two names at a different *_TRAIN / *_VAL pair from the path constants.
TRAIN_DATA = BASELINE_TRAIN
VAL_DATA = BASELINE_VAL

train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

# split_features_and_monthly_rent_label is not defined in this notebook
# (presumably it comes from src.utils -- confirm); it is used here as returning
# a (features, label) pair for the given frame.
X_train, y_train = split_features_and_monthly_rent_label(train_set)

# The held-out frame is the *validation* file, so bind it as X_val / y_val,
# which are the names the evaluation cell reads.
X_val, y_val = split_features_and_monthly_rent_label(val_set)

# Backward-compatible aliases: the SHAP cell refers to this same split as
# X_test / y_test. No separate test file is loaded in this cell.
X_test, y_test = X_val, y_val

In [None]:
# The grid below tunes `estimator__max_depth`, a nested parameter of the base
# learner. That only works when `estimator` is a real estimator object; with the
# default `estimator=None` there is nothing to forward `max_depth` to and the
# search fails while setting parameters. Pass the decision tree explicitly.
model = AdaBoostRegressor(estimator=DecisionTreeRegressor())

# define grid search for hyperparameters
grid = {
    'estimator__max_depth': list(range(3, 11, 2)),  # 3, 5, 7, 9
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'learning_rate': [0.1, 0.2, 0.5, 1.0],
    'loss': ['linear', 'square', 'exponential']
}
cv = KFold(n_splits=10)

# NOTE: this MSE scorer is not passed to the search below, which scores with the
# built-in 'neg_root_mean_squared_error' string instead. The binding is kept in
# case another cell relies on it.
mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(
    estimator=model,
    param_grid=grid,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    verbose=3,
)
# Execute the grid search. `fit` returns the fitted search object itself, so
# `grid_results` and `grid_search` refer to the same object afterwards. The
# search fits clones, so `model` above stays unfitted; use
# `grid_search.best_estimator_` for predictions.
grid_results = grid_search.fit(X_train, y_train)

In [10]:
# Tabulate the per-candidate cross-validation results of the halving search.
# The fitted search object is bound to `grid_results` by the search cell; the
# previously used name `result` is never defined in this notebook.
grid_search_results = pd.DataFrame(grid_results.cv_results_)
grid_search_results.describe()

0.4559725597145493
266703.5045288154


In [None]:
# Report the winning hyperparameters and their mean cross-validated score.
# The search scores with 'neg_root_mean_squared_error', so best_score_ is a
# negated RMSE (closer to zero is better).
# `grid_results` is the fitted search object; `result` was never defined.
print(grid_results.best_params_)
print(grid_results.best_score_)

In [None]:
# Evaluate the tuned model on the held-out validation split.
# The search fits clones of `model`, so `model` itself is never fitted; the
# estimator refit with the best parameters lives in `grid_search.best_estimator_`.
best_model = grid_search.best_estimator_

# X_test / y_test are the names the loading cell binds the validation split to.
print(best_model.score(X_test, y_test))  # R^2, the default regressor score
y_pred = best_model.predict(X_test)
print(mean_squared_error(y_test, y_pred))

### SHAP

#### Standard SHAP values

In [74]:
# Explain the tuned model, not the unfitted `model` template: the grid search
# only fits clones, and the refit winner is `grid_search.best_estimator_`.
fitted_model = grid_search.best_estimator_

# KernelExplainer is model-agnostic and slow, so summarise the background data
# with a 100-row sample and explain only the first 100 held-out rows.
X_train_summary = shap.sample(X_train, 100)
explainer = shap.KernelExplainer(fitted_model.predict, X_train_summary)

X_explain = X_test[0:100]
shap_values = explainer.shap_values(X_explain)

# Stacked force plot over all explained rows.
shap.force_plot(explainer.expected_value, shap_values, X_explain)

100%|██████████| 100/100 [03:16<00:00,  1.96s/it]
