In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
from tqdm.auto import tqdm, trange
import xgboost as xgb
import joblib
from sklearn.model_selection import cross_validate
import zipfile

from src.model import tscv, ClippedOutputRegressor
from src.data import get_feature_cols, df_to_X_y, drop_non_features, add_lagged_features, add_as_features, df_to_X


%run constants.py

baseline_reg = joblib.load(os.path.join(MODELS_DIR, 'xgb-baseline.model'))

%matplotlib inline
print("Versions:")
print("  Python: %s" % sys.version)
for module in [pd, np, sns, sklearn]:
    print("  %s: %s" %(module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.2
  numpy: 1.19.2
  seaborn: 0.11.0
  sklearn: 0.23.2


# Model Stacking

Let's start stacking stuff. We already have some CV predictions from previous experiments, so let's start this by building a model on top of that.

In [3]:
lgb_cv_preds = pd.read_parquet(os.path.join(MODEL_OUTPUTS_DIR, 'cv-lgb-features-025.parquet'))
xgb_cv_preds = pd.read_parquet(os.path.join(MODEL_OUTPUTS_DIR, 'cv-xgb-features-025.parquet'))

In [4]:
lgb_cv_preds.head()

Unnamed: 0,oof_preds,oof_idx,item_cnt_month,fold_id
0,0.453406,3266760,1.0,0
1,0.257648,3266761,1.0,0
2,0.243354,3266762,0.0,0
3,0.147679,3266763,0.0,0
4,0.294278,3266764,1.0,0


Now the function to combine the predictions. We want to make sure the split used to generate them are the same, so let's encode that into our function too.

In [5]:
def combine_cv_preds(df1, df2, suffixes=('_df1', '_df2')):
    df = df1.merge(df2, on=['oof_idx', 'fold_id', 'item_cnt_month'], suffixes=suffixes)
    if not (df.shape[0] == df1.shape[0] and df.shape[0] == df2.shape[0]):
        raise ValueError("CV preds don't align")
    return df

In [6]:
combined_cv_preds = combine_cv_preds(lgb_cv_preds, xgb_cv_preds, suffixes=('_lgb', '_xgb'))

In [7]:
combined_cv_preds.head()

Unnamed: 0,oof_preds_lgb,oof_idx,item_cnt_month,fold_id,oof_preds_xgb
0,0.453406,3266760,1.0,0,0.514854
1,0.257648,3266761,1.0,0,0.272606
2,0.243354,3266762,0.0,0,0.285903
3,0.147679,3266763,0.0,0,0.139986
4,0.294278,3266764,1.0,0,0.324377


Let's try averaging our model outputs first.

In [50]:
combined_cv_preds['oof_preds_avg'] = (combined_cv_preds['oof_preds_xgb'] + combined_cv_preds['oof_preds_lgb']) / 2.0

In [65]:
from sklearn.metrics import mean_squared_error
def cv_df_scores(cv_df, ytrue_col='item_cnt_month', ypred_col='oof_preds', n=3):
    scores = []
    for fold_id in cv_df['fold_id'].unique()[-n:]:
        y_true = cv_df[cv_df['fold_id'] == fold_id][ytrue_col].values
        y_pred = cv_df[cv_df['fold_id'] == fold_id][ypred_col].values
        err = mean_squared_error(y_true, np.clip(y_pred, 0, 20), squared=False)
        scores.append(err)
    return scores

In [66]:
print('XGB: %.5f' % np.mean(cv_df_scores(combined_cv_preds, ypred_col='oof_preds_xgb')))
print('LGB: %.5f' % np.mean(cv_df_scores(combined_cv_preds, ypred_col='oof_preds_lgb')))
print('Avg LGB XGB: %.5f' % np.mean(cv_df_scores(combined_cv_preds, ypred_col='oof_preds_avg')))

XGB: 0.79638
LGB: 0.80421
Avg LGB XGB: 0.79747


Which is to be expected. We can try a weighted average, but instead of guessing the weights I'll just train a linear regression with l1 penalty and positive weights on this.

First let's generate a train and validation set from the CV predictions. Since the data is reduced, let's use just the last month as a validation set. We also need to recalculate our scores here so we have something to compare the stacking results to.

In [69]:
print('XGB: %.5f' % np.mean(cv_df_scores(combined_cv_preds, ypred_col='oof_preds_xgb', n=1)))
print('LGB: %.5f' % np.mean(cv_df_scores(combined_cv_preds, ypred_col='oof_preds_lgb', n=1)))
print('Avg LGB XGB: %.5f' % np.mean(cv_df_scores(combined_cv_preds, ypred_col='oof_preds_avg', n=1)))

XGB: 0.77848
LGB: 0.78570
Avg LGB XGB: 0.77923


In [70]:
from sklearn.metrics import make_scorer, mean_squared_error
def _clipped_rmse(y, ypred):
    return mean_squared_error(y, np.clip(ypred, 0, 20), squared=False)
clipped_rmse_score = make_scorer(_clipped_rmse, greater_is_better=False)

In [71]:
train_set = add_as_features(combined_cv_preds, ['oof_preds_xgb', 'oof_preds_lgb'])
cv_splits = tscv.split(train_set['fold_id'], n=1, window=7, test_months=train_set['fold_id'].unique())
X, y = df_to_X_y(train_set)

In [72]:
from sklearn.linear_model import Lasso

reg = ClippedOutputRegressor(Lasso(random_state=123, alpha=1e-12, positive=True, fit_intercept=True, normalize=False))
scores = cross_validate(reg, X, y, cv=cv_splits, scoring=clipped_rmse_score, verbose=2)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=   1.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished


{'fit_time': array([0.94893312]),
 'score_time': array([0.00551581]),
 'test_score': array([-0.77798477])}

A small improvement over our best model (XGB).

We can also try our baseline XGB on this.

In [74]:
scores = cross_validate(baseline_reg, X, y, cv=cv_splits, scoring=clipped_rmse_score, verbose=2)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=   1.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s finished


{'fit_time': array([1.3246274]),
 'score_time': array([0.57164526]),
 'test_score': array([-0.81622036])}

That went terrible for some reason. Let's test the tuned XGB here just for fun (I don't expect it to improve the solution that much).

In [75]:
xgb_025 = joblib.load(os.path.join(MODELS_DIR, 'xgb-features-025.model'))
scores = cross_validate(xgb_025, X, y, cv=cv_splits, scoring=clipped_rmse_score, verbose=2)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  10.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.0s finished


{'fit_time': array([5.31406832]),
 'score_time': array([4.72536635]),
 'test_score': array([-0.81720215])}

Also bad. I shouldn't be doing this, but I want to investigate why XGB gives very poor results here. Since the dataset is a lot smaller, let's try changing the tree method to `exact`.

In [81]:
xgb_small = xgb.XGBRegressor(random_state=8, n_estimators=10, tree_method='exact', n_jobs=-1)
cross_validate(xgb_small, X, y, cv=cv_splits, scoring=clipped_rmse_score, verbose=2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=   7.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s finished


{'fit_time': array([7.09215951]),
 'score_time': array([0.07775664]),
 'test_score': array([-0.7784614])}

A lot better, but still worse than our plain Lasso regressor.

Another thing we can try is "passthrough", where we just re-add the original features.

In [82]:
train_set_025 = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set-features-025.parquet')).iloc[train_set['oof_idx']]

In [83]:
X_train_set_025 = df_to_X(train_set_025)
del train_set_025

In [84]:
X_passthrough = np.concatenate((X, X_train_set_025), axis=1)

In [85]:
cross_validate(baseline_reg, X_passthrough, y, cv=cv_splits, scoring='neg_root_mean_squared_error', verbose=2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  14.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.6s finished


{'fit_time': array([13.06254673]),
 'score_time': array([1.57453036]),
 'test_score': array([-0.81288667])}

In [86]:
np.mean(scores['test_score'])

-0.8172021453770638

Does not look good either.