In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
from tqdm.auto import tqdm, trange

from src.model import tscv
from src.data import add_lagged_features

%run constants.py

%matplotlib inline
# Log the interpreter and core library versions so the run can be reproduced.
print("Versions:")
print("  Python: %s" % (sys.version,))
for lib in (pd, np, sns, sklearn):
    print("  %s: %s" % (lib.__name__, lib.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.1
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2


Let's build the feature set using the shop item-cat encoding we developed earlier.

In [3]:
# Load the base training set and the shop item-category encoding table from
# the processed-data directory.
train_set, shop_item_cat_encoding = (
    pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, parquet_name))
    for parquet_name in ('train-set.parquet', 'shop-item-cat-encoding.parquet')
)

In [4]:
shop_item_cat_encoding

Unnamed: 0,shop_id,date_block_num,shop_item_cat_enc_0,shop_item_cat_enc_1,shop_item_cat_enc_2,shop_item_cat_enc_3,shop_item_cat_enc_4,shop_item_cat_enc_5,shop_item_cat_enc_6,shop_item_cat_enc_7,shop_item_cat_enc_8,shop_item_cat_enc_9,shop_item_cat_enc_10,shop_item_cat_enc_11,shop_item_cat_enc_12,shop_item_cat_enc_13,shop_item_cat_enc_14
0,0,0,0.025636,0.000000,0.034062,0.008426,0.161169,0.0,0.0,0.165471,0.002510,0.408032,0.012549,0.094658,0.045357,0.042130,0.000000
1,0,1,0.021707,0.000163,0.034764,0.009466,0.166476,0.0,0.0,0.150481,0.003754,0.392035,0.013547,0.105109,0.062510,0.039987,0.000000
2,1,0,0.037326,0.000000,0.038683,0.007805,0.168646,0.0,0.0,0.184595,0.003393,0.392263,0.000000,0.083135,0.045809,0.038344,0.000000
3,1,1,0.044887,0.000000,0.026159,0.009215,0.155172,0.0,0.0,0.181034,0.003864,0.379608,0.001486,0.087990,0.079667,0.030916,0.000000
4,2,0,0.028796,0.000000,0.057592,0.017452,0.209424,0.0,0.0,0.229494,0.002618,0.216405,0.023560,0.048866,0.089005,0.076789,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,59,29,0.016429,0.000000,0.016429,0.005476,0.213582,0.0,0.0,0.221249,0.016429,0.196057,0.004381,0.094195,0.205915,0.009858,0.000000
1582,59,30,0.036290,0.000000,0.012097,0.003024,0.183468,0.0,0.0,0.213710,0.014113,0.154234,0.008065,0.108871,0.241935,0.018145,0.006048
1583,59,31,0.021417,0.000000,0.020593,0.001647,0.254530,0.0,0.0,0.299835,0.009885,0.132619,0.011532,0.071664,0.164745,0.009885,0.001647
1584,59,32,0.020788,0.000000,0.022976,0.004376,0.256018,0.0,0.0,0.258206,0.008753,0.163020,0.014223,0.074398,0.138950,0.014223,0.024070


In [6]:
# Every column except the join keys is an encoding feature to be lagged.
key_cols = ('shop_id', 'date_block_num')
feature_cols = [name for name in shop_item_cat_encoding.columns
                if name not in key_cols]

# Attach a lag-1 copy of each encoding column to the training rows, keyed on
# shop_id (the lagging itself is done by the project helper add_lagged_features).
train_set_item_cat = add_lagged_features(
    train_set,
    shop_item_cat_encoding,
    feature_cols,
    max_lag=1,
    index_cols=['shop_id'],
)
train_set_item_cat.head()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__shop_item_cat_enc_0_1,f__shop_item_cat_enc_1_1,f__shop_item_cat_enc_2_1,f__shop_item_cat_enc_3_1,f__shop_item_cat_enc_4_1,f__shop_item_cat_enc_5_1,f__shop_item_cat_enc_6_1,f__shop_item_cat_enc_7_1,f__shop_item_cat_enc_8_1,f__shop_item_cat_enc_9_1,f__shop_item_cat_enc_10_1,f__shop_item_cat_enc_11_1,f__shop_item_cat_enc_12_1,f__shop_item_cat_enc_13_1,f__shop_item_cat_enc_14_1
0,5037,5,10,0.0,0.000789,0.0,0.020521,0.007103,0.187056,0.0,0.0,0.255722,0.003157,0.29045,0.000789,0.144436,0.084451,0.005525,0.0
1,5320,5,10,0.0,0.000789,0.0,0.020521,0.007103,0.187056,0.0,0.0,0.255722,0.003157,0.29045,0.000789,0.144436,0.084451,0.005525,0.0
2,5233,5,10,0.0,0.000789,0.0,0.020521,0.007103,0.187056,0.0,0.0,0.255722,0.003157,0.29045,0.000789,0.144436,0.084451,0.005525,0.0
3,5232,5,10,0.0,0.000789,0.0,0.020521,0.007103,0.187056,0.0,0.0,0.255722,0.003157,0.29045,0.000789,0.144436,0.084451,0.005525,0.0
4,5268,5,10,0.0,0.000789,0.0,0.020521,0.007103,0.187056,0.0,0.0,0.255722,0.003157,0.29045,0.000789,0.144436,0.084451,0.005525,0.0


In [4]:
from src.data import df_to_X_y

# Split the lagged-feature frame into model inputs and target.
# NOTE(review): df_to_X_y is a project helper not shown here — presumably it
# returns the feature columns as X and item_cnt_month as y; confirm.
X_train, y_train = df_to_X_y(train_set_item_cat)

In [5]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate
# NOTE(review): `tscv` is already imported in the first cell; re-importing it
# here is redundant.
from src.model import ClippedOutputRegressor, tscv

# XGBoost with default hyper-parameters, wrapped in the project's
# ClippedOutputRegressor. Training runs on GPU 0.
reg = ClippedOutputRegressor(XGBRegressor(n_jobs=-1, verbosity=2, tree_method='gpu_hist', gpu_id=0)) #change to hist if you don't have access to a gpu
# Build the cross-validation folds from each row's month index
# (presumably time-ordered splits — confirm against src.model.tscv).
cv_split = tscv.split(train_set_item_cat['date_block_num'])

In [22]:
# Baseline run: score the clipped XGBoost model on the item-category encoding
# features. Train scores are kept so they can be compared with the test scores.
cv_options = dict(scoring='neg_root_mean_squared_error', verbose=2,
                  cv=cv_split, return_train_score=True)
scores = cross_validate(reg, X=X_train, y=y_train, **cv_options)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  12.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.8s remaining:    0.0s


[CV] ................................................. , total=  13.0s
[CV]  ................................................................
[CV] ................................................. , total=  13.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.7min finished


{'fit_time': array([11.28167295, 11.60196567, 12.17813659]),
 'score_time': array([1.4282546 , 1.36682439, 1.20800328]),
 'test_score': array([-1.03316543, -1.10780143, -1.08599005]),
 'train_score': array([-1.10594588, -1.10209658, -1.10209402])}

After seeing little improvement from adding these 15 new features, I realized that all I had done was a mean encoding over the `shop_id, category` pair while keeping all the shop means. Instead, I'll generate different mean encodings now.

I'll go over each of these ids and encode it:
- `item_id`
- `shop_id`
- `category_name`
- `subcategory_name`

This generates the following combinations (we don't need `item` and `category` on the same combination since each item belongs to a single category):
- `item_id`, `shop_id`
- `shop_id`, `category_name`
- `shop_id`, `subcategory_name`

We can do it in two ways: either generate the mean encoding from the same data we use to train, or encode the previous month. Since it's simpler, I'll start with the previous-month encoding, which is basically a lagged feature and can be generated beforehand.

In [6]:
# NOTE(review): this rebinds `train_set`, replacing the frame loaded from
# 'train-set.parquet' earlier in the notebook.
train_set = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set-features-002.parquet')) # load features with category metadata
# Rebuild the folds for the newly loaded frame.
cv_split = tscv.split(train_set['date_block_num'])

In [7]:
# Key sets to mean-encode: each id on its own, followed by the shop-level
# pairs. item_id is never paired with a category because each item belongs to
# a single category. Every entry is a list so it can be concatenated with
# other column lists.
_single_keys = ("item_id", "shop_id", "category_name", "subcategory_name")
_shop_pairs = (("item_id", "shop_id"),
               ("shop_id", "category_name"),
               ("shop_id", "subcategory_name"))
mean_encode_cols = ([[key] for key in _single_keys]
                    + [list(pair) for pair in _shop_pairs])

def mean_encoding_col_name(on, label='item_cnt_month'):
    """Return the column name used for the mean of `label` grouped by `on`.

    The key names in `on` are joined with underscores, so
    ``['shop_id', 'item_id']`` with the default label gives
    ``'mean_item_cnt_month_on_shop_id_item_id'``.
    """
    joined_keys = '_'.join(on)
    return 'mean_%s_on_%s' % (label, joined_keys)

def mean_encoding_df(df, on, label='item_cnt_month'):
    """Compute the mean of `label` for every combination of the `on` columns.

    Args:
        df: DataFrame containing the `on` columns and the `label` column.
        on: list of column names to group by.
        label: name of the column to average.

    Returns:
        DataFrame with one row per group, holding the `on` columns plus the
        group mean in a column named by `mean_encoding_col_name(on, label)`.
    """
    encode_column = mean_encoding_col_name(on, label)
    group_means = df[on + [label]].groupby(on)[label].mean()
    encoded = group_means.reset_index()
    return encoded.rename(columns={label: encode_column})

In [81]:
# For every key set, compute the per-month mean of `item_cnt` and attach it to
# the training rows as a lag-1 feature.
# NOTE(review): `sales_train` is not defined in any visible cell (only
# `train_set` is loaded). Unless constants.py provides it, this cell relies on
# leftover kernel state and would fail after Restart & Run All — load it
# explicitly before this cell.
train_set_with_mes = train_set.copy()
for cols in tqdm(mean_encode_cols):
    # Group by the key set plus the month so each month gets its own mean.
    encoding_df = mean_encoding_df(sales_train, cols + ['date_block_num'], 
                                   label='item_cnt')
    # The only column starting with 'mean_' is the encoding just computed.
    encoding_col = [col for col in encoding_df.columns if col.startswith('mean_')]
    train_set_with_mes = add_lagged_features(
        train_set_with_mes, encoding_df, encoding_col,
        max_lag=1, index_cols=cols)
train_set_with_mes.head()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))





Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__item_cnt_1,f__item_cnt_2,f__item_cnt_3,f__item_cnt_4,f__item_cnt_5,f__item_cnt_6,...,subcategory_name,f__cat__le_category_name,f__cat__le_subcategory_name,f__mean_item_cnt_on_item_id_date_block_num_1,f__mean_item_cnt_on_shop_id_date_block_num_1,f__mean_item_cnt_on_category_name_date_block_num_1,f__mean_item_cnt_on_subcategory_name_date_block_num_1,f__mean_item_cnt_on_item_id_shop_id_date_block_num_1,f__mean_item_cnt_on_shop_id_category_name_date_block_num_1,f__mean_item_cnt_on_shop_id_subcategory_name_date_block_num_1
0,5037,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,1.702957,3.46371,3.765795,0.0,2.194444,2.678571
1,5037,4,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,1.957728,3.46371,3.765795,0.0,2.52381,2.6125
2,5037,6,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,2.291147,3.46371,3.765795,0.0,3.902299,4.543689
3,5037,3,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,1.701717,3.46371,3.765795,0.0,2.27551,2.309091
4,5037,2,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,2.202216,3.46371,3.765795,0.0,2.565574,2.336957


In [82]:
# Rebuild the design matrix from the frame carrying the lag-1 mean encodings
# and score a fresh clipped XGBoost regressor on the same folds.
X_train, y_train = df_to_X_y(train_set_with_mes)

# Change tree_method to 'hist' if you don't have access to a GPU.
xgb_params = dict(n_jobs=-1, verbosity=2, tree_method='gpu_hist', gpu_id=0)
reg = ClippedOutputRegressor(XGBRegressor(**xgb_params))

cv_options = dict(scoring='neg_root_mean_squared_error', verbose=2,
                  cv=cv_split, return_train_score=True)
scores = cross_validate(reg, X=X_train, y=y_train, **cv_options)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  15.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.0s remaining:    0.0s


[CV] ................................................. , total=  15.4s
[CV]  ................................................................
[CV] ................................................. , total=  16.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.5min finished


{'fit_time': array([14.27924633, 14.71596694, 15.29335403]),
 'score_time': array([1.10193229, 0.70656729, 1.15259314]),
 'test_score': array([-0.76510254, -0.89657982, -0.89600143]),
 'train_score': array([-0.72167615, -0.7237208 , -0.7286878 ])}

In [83]:
scores['test_score'].mean()

-0.8525612605780832

Nice improvement. Let's try increasing the number of previous months whose means we use as features.

In [89]:
# Same monthly mean encodings as before, but lagged by 1, 2 and 3 months
# (max_lag=3), followed by a cross-validation run on the resulting features.
# NOTE(review): `sales_train` is not defined in any visible cell — this relies
# on leftover kernel state; load it explicitly so Restart & Run All works.
train_set_with_mes = train_set.copy()
for cols in tqdm(mean_encode_cols):
    encoding_df = mean_encoding_df(sales_train, cols + ['date_block_num'], 
                                   label='item_cnt')
    # The only column starting with 'mean_' is the encoding just computed.
    encoding_col = [col for col in encoding_df.columns if col.startswith('mean_')]
    train_set_with_mes = add_lagged_features(
        train_set_with_mes, encoding_df, encoding_col,
        max_lag=3, index_cols=cols)
    
X_train, y_train = df_to_X_y(train_set_with_mes)
# A fresh regressor with the same settings as in the previous experiments.
reg = ClippedOutputRegressor(XGBRegressor(n_jobs=-1, verbosity=2, tree_method='gpu_hist', gpu_id=0)) #change to hist if you don't have access to a gpu
scores = cross_validate(reg, X=X_train, y=y_train,
                        scoring='neg_root_mean_squared_error', verbose=2, 
                        cv=cv_split, return_train_score=True)
scores

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  21.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   43.7s remaining:    0.0s


[CV] ................................................. , total=  22.0s
[CV]  ................................................................
[CV] ................................................. , total=  22.6s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.3min finished


{'fit_time': array([20.56279349, 20.80930138, 21.3822844 ]),
 'score_time': array([1.24337244, 1.17137599, 1.16880822]),
 'test_score': array([-0.76403668, -0.89122926, -0.89704724]),
 'train_score': array([-0.70870893, -0.7142106 , -0.72097478])}

In [90]:
scores['test_score'].mean()

-0.8507710597986403

A bit better. And let's try with a rolling window now:

In [8]:
def rolling_mean_encoding_col_name(on, label='item_cnt_month', w=20):
    """Return the column name for a `w`-month rolling mean of `label` over `on`.

    The key names in `on` are joined with underscores, so ``['item_id']`` with
    the defaults gives ``'rolling_window_20_mean_item_cnt_month_on_item_id'``.
    """
    joined_keys = '_'.join(on)
    return 'rolling_window_%d_mean_%s_on_%s' % (w, label, joined_keys)


def rolling_mean_encoding_df(df, on, label='item_cnt_month', w=20,
                             date_col='date_block_num', n_months=34):
    """Compute a trailing `w`-month mean of `label` for each group in `on`.

    For every month ``m`` in ``range(w, n_months)`` the rows whose `date_col`
    lies in ``(m - w, m]`` are grouped by `on` and `label` is averaged. The
    per-month results are stacked into one frame.

    Args:
        df: DataFrame containing the `on` columns, `label` and `date_col`.
        on: list of column names to group by.
        label: name of the column to average.
        w: window length in months.
        date_col: name of the integer month-index column. It is used both for
            selecting the window and as the name of the month column in the
            result (previously this argument was accepted but ignored and
            'date_block_num' was always used).
        n_months: exclusive upper bound on the months to emit. The default of
            34 is the bound that used to be hard-coded.

    Returns:
        DataFrame with the `on` columns, the rolling-mean column (named by
        `rolling_mean_encoding_col_name(on, label, w)`) and `date_col` set to
        the last month of each window. The per-month indices are concatenated
        as-is, so the returned index is not unique.

    Raises:
        ValueError: from `pd.concat` when ``w >= n_months``, because no window
            is produced.

    Note:
        The first emitted month is ``w`` (window ``1..w``); month ``w - 1``,
        whose window ``0..w-1`` is also full, is not emitted. This is kept
        unchanged so existing features stay reproducible.
    """
    encode_column = rolling_mean_encoding_col_name(on, label, w)

    # Hoisted out of the loop: the month column does not change per window.
    months = df[date_col]

    dfs = []
    for m in trange(w, n_months):
        in_window = (months <= m) & (months > (m - w))
        tmp_df = (df.loc[in_window, on + [label]]
                  .groupby(on)[label].mean()
                  .reset_index()
                  .rename(columns={label: encode_column}))
        # Tag each row with the month the window ends on.
        tmp_df[date_col] = m
        dfs.append(tmp_df)
    return pd.concat(dfs)

In [92]:
rolling_mean_encoding_df(train_set, ['item_id'])

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




Unnamed: 0,item_id,rolling_window_20_mean_item_cnt_month_on_item_id,date_block_num
0,30,0.510823,20
1,31,0.326840,20
2,32,0.932900,20
3,33,0.445887,20
4,38,0.015152,20
...,...,...,...
5095,22162,0.673810,33
5096,22163,0.088095,33
5097,22164,0.500000,33
5098,22166,0.260714,33


In [93]:
# For every key set, compute the rolling-window mean of `item_cnt` (default
# window of 20 months) and attach it to the training rows as a lag-1 feature.
# NOTE(review): `sales_train` is not defined in any visible cell — this relies
# on leftover kernel state; load it explicitly so Restart & Run All works.
train_set_with_rolling_means = train_set.copy()
for cols in tqdm(mean_encode_cols):
    encoding_df = rolling_mean_encoding_df(
        sales_train, cols, label='item_cnt')
    # The only column starting with 'rolling_window_' is the encoding just computed.
    encoding_col = [col for col in encoding_df.columns if col.startswith('rolling_window_')]
    train_set_with_rolling_means = add_lagged_features(
        train_set_with_rolling_means, encoding_df, encoding_col,
        max_lag=1, index_cols=cols)
train_set_with_rolling_means.head()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))





Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__item_cnt_1,f__item_cnt_2,f__item_cnt_3,f__item_cnt_4,f__item_cnt_5,f__item_cnt_6,...,subcategory_name,f__cat__le_category_name,f__cat__le_subcategory_name,f__rolling_window_20_mean_item_cnt_on_item_id_1,f__rolling_window_20_mean_item_cnt_on_shop_id_1,f__rolling_window_20_mean_item_cnt_on_category_name_1,f__rolling_window_20_mean_item_cnt_on_subcategory_name_1,f__rolling_window_20_mean_item_cnt_on_item_id_shop_id_1,f__rolling_window_20_mean_item_cnt_on_shop_id_category_name_1,f__rolling_window_20_mean_item_cnt_on_shop_id_subcategory_name_1
0,5037,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5037,4,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5037,6,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5037,3,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5037,2,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PS3,4,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Rebuild the design matrix from the frame carrying the rolling-mean features
# and score a fresh clipped XGBoost regressor on the same folds.
X_train, y_train = df_to_X_y(train_set_with_rolling_means)

# Change tree_method to 'hist' if you don't have access to a GPU.
xgb_params = dict(n_jobs=-1, verbosity=2, tree_method='gpu_hist', gpu_id=0)
reg = ClippedOutputRegressor(XGBRegressor(**xgb_params))

cv_options = dict(scoring='neg_root_mean_squared_error', verbose=2,
                  cv=cv_split, return_train_score=True)
scores = cross_validate(reg, X=X_train, y=y_train, **cv_options)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  15.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.8s remaining:    0.0s


[CV] ................................................. , total=  15.9s
[CV]  ................................................................
[CV] ................................................. , total=  16.6s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.6min finished


{'fit_time': array([14.46369553, 15.07717991, 15.30547309]),
 'score_time': array([1.21507454, 0.80455375, 1.30124831]),
 'test_score': array([-0.77202219, -0.8980028 , -0.88608265]),
 'train_score': array([-0.73037889, -0.73424932, -0.73492693])}

In [95]:
scores['test_score'].mean()

-0.8520358800706661

Cool, this improves the score a lot too. Let's see the monthly mean plus the rolling mean together (they may be redundant).

In [11]:
import gc
# Combine both encodings on one frame: monthly means lagged by 1-3 months,
# then rolling-window means lagged by 1 month.
# Unlike the earlier experiments (which read `sales_train` with
# label='item_cnt'), this cell computes the encodings from `train_set` itself
# with the default label 'item_cnt_month'.
train_set_with_all_means = train_set.copy()
for cols in tqdm(mean_encode_cols):
    encoding_df = mean_encoding_df(train_set, cols + ['date_block_num'])
    encoding_col = [col for col in encoding_df.columns if col.startswith('mean_')]
    train_set_with_all_means = add_lagged_features(
        train_set_with_all_means, encoding_df, encoding_col,
        max_lag=3, index_cols=cols)
    # Drop the intermediate encoding table before the next iteration to limit
    # peak memory use.
    del encoding_df
    gc.collect()
    
for cols in tqdm(mean_encode_cols):
    encoding_df = rolling_mean_encoding_df(
        train_set, cols)
    encoding_col = [col for col in encoding_df.columns if col.startswith('rolling_window_')]
    train_set_with_all_means = add_lagged_features(
        train_set_with_all_means, encoding_df, encoding_col,
        max_lag=1, index_cols=cols)
    del encoding_df
    gc.collect()
train_set_with_all_means.head()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))





Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__item_cnt_1,f__item_cnt_2,f__item_cnt_3,f__item_cnt_4,f__item_cnt_5,f__item_cnt_6,...,f__mean_item_cnt_month_on_shop_id_subcategory_name_date_block_num_1,f__mean_item_cnt_month_on_shop_id_subcategory_name_date_block_num_2,f__mean_item_cnt_month_on_shop_id_subcategory_name_date_block_num_3,f__rolling_window_20_mean_item_cnt_month_on_item_id_1,f__rolling_window_20_mean_item_cnt_month_on_shop_id_1,f__rolling_window_20_mean_item_cnt_month_on_category_name_1,f__rolling_window_20_mean_item_cnt_month_on_subcategory_name_1,f__rolling_window_20_mean_item_cnt_month_on_item_id_shop_id_1,f__rolling_window_20_mean_item_cnt_month_on_shop_id_category_name_1,f__rolling_window_20_mean_item_cnt_month_on_shop_id_subcategory_name_1
0,5037,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5037,4,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5037,6,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5037,3,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5037,2,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Rebuild the design matrix from the frame carrying both the monthly and the
# rolling-mean features and score a fresh clipped XGBoost regressor.
X_train, y_train = df_to_X_y(train_set_with_all_means)

# Change tree_method to 'hist' if you don't have access to a GPU.
xgb_params = dict(n_jobs=-1, verbosity=2, tree_method='gpu_hist', gpu_id=0)
reg = ClippedOutputRegressor(XGBRegressor(**xgb_params))

cv_options = dict(scoring='neg_root_mean_squared_error', verbose=2,
                  cv=cv_split, return_train_score=True)
scores = cross_validate(reg, X=X_train, y=y_train, **cv_options)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  25.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.0s remaining:    0.0s


[CV] ................................................. , total=  23.0s
[CV]  ................................................................
[CV] ................................................. , total=  23.8s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.5min finished


{'fit_time': array([24.52456713, 21.68511224, 22.47918677]),
 'score_time': array([0.71460915, 1.27430677, 1.29898357]),
 'test_score': array([-0.75929485, -0.89783685, -0.87959264]),
 'train_score': array([-0.70281605, -0.70237109, -0.71014564])}

In [13]:
scores['test_score'].mean()

-0.8455747816063504

Pretty good. Let's turn it into a submission and check how it goes.

In [14]:
# Build the same feature columns for the test set. The encodings are still
# computed from `train_set`; only the frame they are attached to changes.
test_set = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'test-set-features-002.parquet'))
test_set_with_all_means = test_set.copy()
for cols in tqdm(mean_encode_cols):
    encoding_df = mean_encoding_df(train_set, cols + ['date_block_num'])
    encoding_col = [col for col in encoding_df.columns if col.startswith('mean_')]
    test_set_with_all_means = add_lagged_features(
        test_set_with_all_means, encoding_df, encoding_col,
        max_lag=3, index_cols=cols)
    # Drop the intermediate encoding table before the next iteration to limit
    # peak memory use.
    del encoding_df
    gc.collect()
    
for cols in tqdm(mean_encode_cols):
    encoding_df = rolling_mean_encoding_df(
        train_set, cols)
    encoding_col = [col for col in encoding_df.columns if col.startswith('rolling_window_')]
    test_set_with_all_means = add_lagged_features(
        test_set_with_all_means, encoding_df, encoding_col,
        max_lag=1, index_cols=cols)
    del encoding_df
    gc.collect()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))





In [17]:
from src.data import df_to_X

X_test = df_to_X(test_set_with_all_means)

# Refit on the full training data; cross_validate does not leave `reg` fitted.
reg.fit(X_train, y_train)

# NOTE(review): this writes the predictions into `test_set` in place and
# assumes `test_set_with_all_means` kept the same row order as `test_set` —
# confirm that add_lagged_features preserves row order.
test_set['item_cnt_month'] = reg.predict(X_test)
# Write only the columns needed for the submission file, without the index.
test_set[['ID', 'item_cnt_month']].to_csv(os.path.join(TMP_DIR, 'xgb-nb-09-002.csv'), index=False)

In [18]:
%%bash
# Submit the predictions CSV written by the previous cell to the Kaggle competition.
# NOTE(review): ${TMP_DIR} is expanded by bash, not Python, so it must be
# present in the kernel's environment — confirm it is exported.
kaggle c submit -f ${TMP_DIR}/xgb-nb-09-002.csv -m 'Default XGB with means' competitive-data-science-predict-future-sales

Successfully submitted to Predict Future Sales

100%|██████████| 3.50M/3.50M [00:10<00:00, 342kB/s] 


Although there's a visible improvement using all the features here, it's pretty small and costs a lot of extra memory, so I don't know if I'll use it in the final model (the previous submission got 0.97721 and is a passable submission for the class, but still lacks some hyperparam optimization, stacking, etc).