In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
import altair as alt
from altair import datum
from tqdm.auto import tqdm, trange

from src.model import tscv

%run constants.py

%matplotlib inline
# Record the interpreter and key library versions alongside the results.
print("Versions:")
print(f"  Python: {sys.version}")
for lib in (pd, np, sns, sklearn, alt):
    print(f"  {lib.__name__}: {lib.__version__}")

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.0
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2
  altair: 4.1.0


# Model tuning pipeline

We're not through with feature engineering, but I want to have my hyperparameter optimization pipeline ready. To make things simple I'll be using [Optuna](https://optuna.readthedocs.io/en/stable/index.html). It's an automated hyperparameter optimization framework that implements some Bayesian algorithms.

I could also use random search or a grid search but I thought this would be a nice opportunity to try something different.

The first thing we need to do is define the loss function. We'll define one for XGB first.

In [4]:
import xgboost as xgb
from src.model.metrics import corrected_rmse
from optuna import Trial


def xgb_feval(y_pred, dtrain):
    """Custom XGBoost evaluation metric wrapping ``corrected_rmse``.

    Args:
        y_pred: predictions handed over by XGBoost.
        dtrain: the matrix being evaluated; its labels are read with
            ``get_label()``.

    Returns:
        A ``(name, value)`` pair where the name is ``'cRMSE'`` and the value
        is ``corrected_rmse(labels, y_pred)``.
    """
    y_true = dtrain.get_label()
    score = corrected_rmse(y_true, y_pred)
    return 'cRMSE', score


def make_xgb_loss(X_train, y_train, cv_splits, verbose=True):
    """Build a loss function that cross-validates one XGBoost parameter set.

    The ``DMatrix`` is created once here and captured by the returned
    closure, so every evaluation reuses the same object.

    Args:
        X_train: training features, passed to ``xgb.DMatrix``.
        y_train: training labels, passed to ``xgb.DMatrix``.
        cv_splits: folds forwarded to ``xgb.cv(folds=...)``. Any iterable of
            ``(train_idx, test_idx)`` pairs is accepted, including one-shot
            generators; ``None`` and objects exposing ``.split`` are
            forwarded untouched.
        verbose: forwarded to ``xgb.cv`` as ``verbose_eval``.

    Returns:
        A callable ``loss(params)`` that runs ``xgb.cv`` with ``params`` and
        returns the minimum of the ``'test-cRMSE-mean'`` column of its result.
    """
    dtrain = xgb.DMatrix(X_train, y_train)

    # The loss is evaluated many times (once per tuning trial). A generator of
    # folds would be consumed by the first evaluation, so materialise it once.
    if cv_splits is not None and not hasattr(cv_splits, 'split'):
        cv_splits = list(cv_splits)

    def loss(params):
        cv_results = xgb.cv(
            params, dtrain, folds=cv_splits, feval=xgb_feval,
            maximize=False, early_stopping_rounds=30,
            num_boost_round=1000, verbose_eval=verbose)
        return cv_results['test-cRMSE-mean'].min()

    return loss

Let's try it in action. Also notice we're using early stopping, which is why we use XGB's built-in CV instead of sklearn's `cross_validate`. We're also using a higher-order function that returns the actual loss function so we can reuse the `DMatrix` object and reduce memory usage.

In [5]:
from src.feature_engineering import df_to_X_y
# Read the processed feature table and split it via df_to_X_y into X_train / y_train.
train_set_path = os.path.join(PROCESSED_DATA_DIR, 'train-set-features-001.parquet')
train_set = pd.read_parquet(train_set_path)
X_train, y_train = df_to_X_y(train_set)

In [6]:
from src.model import tscv
# Derive the CV folds from the date_block_num column; the result is later
# passed to xgb.cv as `folds` (presumably time-ordered splits -- confirm in src.model.tscv).
cv_splits = tscv.split(train_set['date_block_num'])

In [7]:
# Smoke test: build the loss and evaluate it with an empty params dict
# (presumably this falls back to XGBoost's defaults -- confirm).
l = make_xgb_loss(X_train, y_train, cv_splits)
l({})

[0]	train-rmse:2.59019+0.00458	test-rmse:2.42555+0.21618	train-cRMSE:1.23558+0.00224	test-cRMSE:1.15701+0.10312
[1]	train-rmse:2.36037+0.00452	test-rmse:2.22328+0.22623	train-cRMSE:1.12599+0.00210	test-cRMSE:1.06053+0.10791
[2]	train-rmse:2.23398+0.00368	test-rmse:2.11549+0.23570	train-cRMSE:1.06568+0.00176	test-cRMSE:1.00911+0.11243
[3]	train-rmse:2.16498+0.00338	test-rmse:2.05637+0.24291	train-cRMSE:1.03272+0.00164	test-cRMSE:0.98091+0.11587
[4]	train-rmse:2.12670+0.00308	test-rmse:2.02637+0.24968	train-cRMSE:1.01442+0.00151	test-cRMSE:0.96660+0.11910
[5]	train-rmse:2.10466+0.00353	test-rmse:2.00651+0.25183	train-cRMSE:1.00397+0.00170	test-cRMSE:0.95712+0.12013
[6]	train-rmse:2.09047+0.00312	test-rmse:1.99606+0.25456	train-cRMSE:0.99712+0.00151	test-cRMSE:0.95214+0.12143
[7]	train-rmse:2.07994+0.00369	test-rmse:1.98961+0.25619	train-cRMSE:0.99214+0.00175	test-cRMSE:0.94906+0.12220
[8]	train-rmse:2.07217+0.00439	test-rmse:1.98055+0.25324	train-cRMSE:0.98841+0.00209	test-cRMSE:0.94474+

0.9237956666666666

That's good. Now we have to define the objective function we will feed to Optuna.

In [11]:
from optuna import Trial

def trial_to_params(trial: Trial):
    """Translate an Optuna trial into an XGBoost parameter dict.

    Four entries are fixed (``objective``, ``n_jobs``, ``base_score``,
    ``scale_pos_weight``); every other entry is requested from ``trial``.

    Args:
        trial: object providing ``suggest_int``, ``suggest_discrete_uniform``,
            ``suggest_uniform`` and ``suggest_categorical``.

    Returns:
        dict mapping XGBoost parameter names to the fixed or suggested values.
    """
    # Settings that are never tuned.
    params = dict(objective="reg:squarederror",
                  n_jobs=-1,
                  base_score=0.5,
                  scale_pos_weight=1)

    params["max_depth"] = trial.suggest_int('max_depth', 2, 20, 1)

    # The three sampling ratios share the same 0.20..1.00 grid in steps of 0.01.
    for ratio_name in ("subsample", "colsample_bytree", "colsample_bylevel"):
        params[ratio_name] = trial.suggest_discrete_uniform(ratio_name, .20, 1.00, .01)

    params["seed"] = trial.suggest_int('seed', 0, 999999)
    params["learning_rate"] = trial.suggest_uniform('learning_rate', 0.01, 0.15)

    # Categorical candidates. Some values are listed several times
    # (presumably to make them more likely to be drawn -- confirm against
    # the sampler in use).
    categorical_choices = {
        "gamma": [0, 0, 0, 0, 0, 0.01, 0.1, 0.2, 0.3, 0.5, 1., 10., 100.],
        "min_child_weight": [1, 1, 1, 1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11,
                             15, 30, 60, 100, 1, 1, 1],
        "max_delta_step": [0, 0, 0, 0, 0, 1, 2, 5, 8],
        "reg_alpha": [0, 0, 0, 0, 0, 0.00000001, 0.00000005, 0.0000005, 0.000005],
        "reg_lambda": [1, 1, 1, 1, 2, 3, 4, 5, 1],
    }
    for param_name, candidates in categorical_choices.items():
        params[param_name] = trial.suggest_categorical(param_name, candidates)

    return params

def make_xgb_objective(xgb_loss):
    """Wrap a params-based loss into a function that takes a trial.

    Args:
        xgb_loss: callable taking a parameter dict and returning the loss.

    Returns:
        A callable ``objective(trial)`` that converts the trial with
        ``trial_to_params`` and returns ``xgb_loss`` of the resulting dict.
    """
    def objective(trial):
        sampled_params = trial_to_params(trial)
        return xgb_loss(sampled_params)

    return objective

And that's it. Now all we have to do is feed it to optuna. Here's an example of how that'd work (I won't leave it running since this is just an example and the actual optimization will be made through the scripts).

In [15]:
import optuna

# Example run only: wrap the loss built earlier and minimise it over a handful of trials.
objective = make_xgb_objective(l)
study = optuna.create_study(direction='minimize')
study.optimize(
    objective,
    n_trials=10,
    n_jobs=3,
    gc_after_trial=True,
)

[4]	train-rmse:2.94690+0.00523	test-rmse:2.74566+0.21119	train-cRMSE:1.41221+0.00241	test-cRMSE:1.30970+0.10074
[6]	train-rmse:2.72940+0.00549	test-rmse:2.51788+0.22125	train-cRMSE:1.30145+0.00254	test-cRMSE:1.20105+0.10553
[7]	train-rmse:2.70014+0.00472	test-rmse:2.49283+0.22348	train-cRMSE:1.28845+0.00229	test-cRMSE:1.18907+0.10659
[5]	train-rmse:2.94016+0.00524	test-rmse:2.73815+0.21076	train-cRMSE:1.40857+0.00242	test-cRMSE:1.30614+0.10055
[0]	train-rmse:2.90466+0.00520	test-rmse:2.70362+0.21369	train-cRMSE:1.38992+0.00241	test-cRMSE:1.28944+0.10192
[0]	train-rmse:2.94556+0.00544	test-rmse:2.74030+0.21348	train-cRMSE:1.41169+0.00248	test-cRMSE:1.30711+0.10184
[8]	train-rmse:2.67394+0.00426	test-rmse:2.47117+0.22453	train-cRMSE:1.27700+0.00228	test-cRMSE:1.17880+0.10710
[1]	train-rmse:2.90023+0.01200	test-rmse:2.69205+0.21323	train-cRMSE:1.38737+0.00541	test-cRMSE:1.28405+0.10159
[1]	train-rmse:2.83023+0.00460	test-rmse:2.63382+0.21718	train-cRMSE:1.35434+0.00206	test-cRMSE:1.25630+

KeyboardInterrupt: 