In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
import altair as alt
from altair import datum
from tqdm.auto import tqdm, trange

from src.model import tscv

%run constants.py

%matplotlib inline
print("Versions:")
print("  Python: %s" % sys.version)
for module in [pd, np, sns, sklearn, alt]:
    print("  %s: %s" %(module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.0
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2
  altair: 4.1.0


# Model tuning pipeline

We're not through with feature engineering, but I want have my hyperparameter optimization pipeline ready. To make things simple I'll be using [Optuna](https://optuna.readthedocs.io/en/stable/index.html). It's a automated hyperparameter optimization framework that implements some bayesian algorithms.

I could also use random search or a grid search but I thought this would be a nice opportunity to try something different.

The first thing we need to do is define the loss function. We'll define one for XGB first.

In [5]:
import xgboost as xgb
from src.model.metrics import corrected_rmse
from optuna import Trial


def xgb_feval(y_pred, dtrain):
    return 'cRMSE', corrected_rmse(dtrain.get_label(), y_pred)


def make_xgb_loss(X_train, y_train, cv_splits, verbose=True):
    dtrain = xgb.DMatrix(X_train, y_train)
    return lambda params: xgb.cv(
        params, dtrain, folds=cv_splits, feval=xgb_feval,
        maximize=False, verbose_eval=verbose)['test-cRMSE-mean'].min()

Let's try it in action. Also notice we're using XGB's built in CV instead of sklearn's `cross_validate` inside this higher level function that returns the actual loss function so we can reuse the `DMatrix` object and reduce memory usage.

In [6]:
from src.feature_engineering import df_to_X_y
train_set = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set-features-001.parquet'))
X_train, y_train = df_to_X_y(train_set)

In [7]:
from src.model import tscv
cv_splits = tscv.split(train_set['date_block_num'])

In [8]:
l = make_xgb_loss(X_train, y_train, cv_splits)
l({})

[0]	train-rmse:2.59019+0.00458	test-rmse:2.42555+0.21618	train-cRMSE:1.23558+0.00224	test-cRMSE:1.15701+0.10312
[1]	train-rmse:2.36037+0.00452	test-rmse:2.22328+0.22623	train-cRMSE:1.12599+0.00210	test-cRMSE:1.06053+0.10791
[2]	train-rmse:2.23398+0.00368	test-rmse:2.11549+0.23570	train-cRMSE:1.06568+0.00176	test-cRMSE:1.00911+0.11243
[3]	train-rmse:2.16498+0.00338	test-rmse:2.05637+0.24291	train-cRMSE:1.03272+0.00164	test-cRMSE:0.98091+0.11587
[4]	train-rmse:2.12670+0.00308	test-rmse:2.02637+0.24968	train-cRMSE:1.01442+0.00151	test-cRMSE:0.96660+0.11910
[5]	train-rmse:2.10466+0.00353	test-rmse:2.00651+0.25183	train-cRMSE:1.00397+0.00170	test-cRMSE:0.95712+0.12013
[6]	train-rmse:2.09047+0.00312	test-rmse:1.99606+0.25456	train-cRMSE:0.99712+0.00151	test-cRMSE:0.95214+0.12143
[7]	train-rmse:2.07994+0.00369	test-rmse:1.98961+0.25619	train-cRMSE:0.99214+0.00175	test-cRMSE:0.94906+0.12220
[8]	train-rmse:2.07217+0.00439	test-rmse:1.98055+0.25324	train-cRMSE:0.98841+0.00209	test-cRMSE:0.94474+

0.9420986666666668

Looks good. Now let's define the objective function we will feed optuna (that's also where we define the state space for our search).

Also note we're fixing the number of boost rounds for now. This is a special hyperparameter since we can use early stopping afterwards and get an optimal number.

In [48]:
from optuna import Trial
STATIC_PARAMS = {"objective": "reg:squarederror",
                 "n_jobs": -1,
                 "base_score": 0.5,
                 "scale_pos_weight": 1}

def trial_to_params(trial: Trial):
    return {**STATIC_PARAMS,
            "max_depth": trial.suggest_int('max_depth', 2, 20, 1),
            "subsample": trial.suggest_discrete_uniform('subsample', .20, 1.00, .01),
            "colsample_bytree": trial.suggest_discrete_uniform('colsample_bytree', .20, 1., .01),
            "colsample_bylevel": trial.suggest_discrete_uniform('colsample_bylevel', .20, 1., .01),
            "seed": trial.suggest_int('seed', 0, 999999),
            "learning_rate": trial.suggest_uniform('learning_rate', 0.01, 0.15),
            "gamma": trial.suggest_categorical("gamma", [0, 0, 0, 0, 0, 0.01, 0.1, 0.2, 0.3, 0.5, 1., 10., 100.]),
            "min_child_weight": trial.suggest_categorical('min_child_weight', [1, 1, 1, 1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 15, 30, 60, 100, 1, 1, 1]),
            "max_delta_step": trial.suggest_categorical("max_delta_step", [0, 0, 0, 0, 0, 1, 2, 5, 8]),
            "reg_alpha": trial.suggest_categorical("reg_alpha", [0, 0, 0, 0, 0, 0.00000001, 0.00000005, 0.0000005, 0.000005]),
            "reg_lambda": trial.suggest_categorical("reg_lambda", [1, 1, 1, 1, 2, 3, 4, 5, 1])}

def make_xgb_objective(xgb_loss):
    return lambda trial: xgb_loss(trial_to_params(trial))

And that's it. Now all we have to do is feed it to optuna. Here's an example of how that'd work.

In [17]:
# just making sure GC runs before study so we don't get OOM errors
import gc
gc.collect()

import optuna

study = optuna.create_study(direction='minimize')

objective = make_xgb_objective(make_xgb_loss(X_train, y_train, cv_splits, verbose=False))
study.optimize(objective, n_trials=2, n_jobs=3, gc_after_trial=True)

[I 2020-08-27 15:34:16,934] Trial 0 finished with value: 0.9699249999999999 and parameters: {'max_depth': 15, 'subsample': 0.95, 'colsample_bytree': 0.99, 'colsample_bylevel': 0.37, 'seed': 144093, 'learning_rate': 0.124810494766307, 'gamma': 100.0, 'min_child_weight': 2, 'max_delta_step': 0, 'reg_alpha': 0, 'reg_lambda': 3}. Best is trial 0 with value: 0.9699249999999999.
[I 2020-08-27 15:36:36,955] Trial 1 finished with value: 1.1400723333333334 and parameters: {'max_depth': 6, 'subsample': 0.55, 'colsample_bytree': 0.41000000000000003, 'colsample_bylevel': 0.8200000000000001, 'seed': 298054, 'learning_rate': 0.042448496604582955, 'gamma': 100.0, 'min_child_weight': 1, 'max_delta_step': 0, 'reg_alpha': 1e-08, 'reg_lambda': 1}. Best is trial 1 with value: 1.1400723333333334.
[I 2020-08-27 15:38:54,044] Trial 0 finished with value: 1.2238716666666667 and parameters: {'max_depth': 13, 'subsample': 0.75, 'colsample_bytree': 0.99, 'colsample_bylevel': 0.65, 'seed': 275575, 'learning_rate'

The best parameters are stored in the study as an attribute:

In [49]:
best_params = {**STATIC_PARAMS,
               **study.best_params}
best_params

{'objective': 'reg:squarederror',
 'n_jobs': -1,
 'base_score': 0.5,
 'scale_pos_weight': 1,
 'max_depth': 18,
 'subsample': 0.24000000000000002,
 'colsample_bytree': 0.53,
 'colsample_bylevel': 0.5900000000000001,
 'seed': 235545,
 'learning_rate': 0.09267604192580113,
 'gamma': 10.0,
 'min_child_weight': 60,
 'max_delta_step': 0,
 'reg_alpha': 0,
 'reg_lambda': 1}

Now we use this parameters and find the optimal number of boosting rounds with the early stopping method.

In [50]:
train_idx, test_idx = tscv.train_test_split(train_set['date_block_num'])
dtrain = xgb.DMatrix(X_train[train_idx], y_train[train_idx])
dtest = xgb.DMatrix(X_train[test_idx], y_train[test_idx])
bst = xgb.train(best_params, dtrain, early_stopping_rounds=40,
                num_boost_round=1000, evals=[(dtrain, 'dtrain'), (dtest, 'dtest')],
               feval=xgb_feval)

[0]	dtrain-rmse:2.87555	dtest-rmse:2.81223	dtrain-cRMSE:1.37582	dtest-cRMSE:1.34150
Multiple eval metrics have been passed: 'dtest-cRMSE' will be used for early stopping.

Will train until dtest-cRMSE hasn't improved in 40 rounds.
[1]	dtrain-rmse:2.77999	dtest-rmse:2.71688	dtrain-cRMSE:1.32925	dtest-cRMSE:1.29598
[2]	dtrain-rmse:2.68620	dtest-rmse:2.62598	dtrain-cRMSE:1.28392	dtest-cRMSE:1.25262
[3]	dtrain-rmse:2.58686	dtest-rmse:2.54867	dtrain-cRMSE:1.23593	dtest-cRMSE:1.21576
[4]	dtrain-rmse:2.52886	dtest-rmse:2.48750	dtrain-cRMSE:1.20778	dtest-cRMSE:1.18657
[5]	dtrain-rmse:2.45296	dtest-rmse:2.42516	dtrain-cRMSE:1.17132	dtest-cRMSE:1.15682
[6]	dtrain-rmse:2.40156	dtest-rmse:2.38016	dtrain-cRMSE:1.14675	dtest-cRMSE:1.13537
[7]	dtrain-rmse:2.35633	dtest-rmse:2.34078	dtrain-cRMSE:1.12532	dtest-cRMSE:1.11658
[8]	dtrain-rmse:2.31732	dtest-rmse:2.30800	dtrain-cRMSE:1.10680	dtest-cRMSE:1.10095
[9]	dtrain-rmse:2.27001	dtest-rmse:2.27467	dtrain-cRMSE:1.08427	dtest-cRMSE:1.08506
[10]	dtrain-r

In [51]:
bst.best_ntree_limit

59

And we can use this to train the model and generate a submission.

In [52]:
reg = xgb.XGBRegressor(n_estimators=bst.best_ntree_limit, **best_params)
reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree',
             colsample_bylevel=0.5900000000000001, colsample_bynode=1,
             colsample_bytree=0.53, gamma=10.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.09267604192580113, max_delta_step=0, max_depth=18,
             min_child_weight=60, missing=nan, monotone_constraints='()',
             n_estimators=59, n_jobs=-1, num_parallel_tree=1,
             random_state=235545, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=235545, subsample=0.24000000000000002, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [44]:
from src.feature_engineering import drop_non_features
import zipfile
with zipfile.ZipFile(os.path.join(RAW_DATA_DIR, 'competitive-data-science-predict-future-sales.zip'), 'r') as datasets_file:
    test_set = pd.read_csv(datasets_file.open('test.csv'))
test_subset = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'test-subset-features-001.parquet'))

X_test = drop_non_features(test_subset).values

In [53]:
from src.submission import submission_from_subset
test_subset['item_cnt_month'] = reg.predict(X_test)
submission = submission_from_subset(test_subset, test_set)

In [54]:
submission.to_csv(os.path.join(TMP_DIR, 'xgb-dataset-001-tuning.csv'), index=False)

In [55]:
%%bash
kaggle c submit -f ${TMP_DIR}/xgb-dataset-001-tuning.csv -m 'Experimenting with XGB with date ids and lagged item counts and hyperparameter optimization using optuna' competitive-data-science-predict-future-sales

Successfully submitted to Predict Future Sales

100%|██████████| 2.30M/2.30M [00:35<00:00, 68.1kB/s]


`1.08553`. It's an improvement over the last one, but I expected more from the CV scores.