In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
import altair as alt
from altair import datum
from tqdm.auto import tqdm, trange

from src.feature_engineering import add_lagged_features, drop_non_features, add_as_features, add_as_cat_features
from src.functional import comp, partial

%run constants.py

%matplotlib inline
# Record the interpreter and key library versions alongside the results.
version_lines = ["  %s: %s" % (lib.__name__, lib.__version__) for lib in (pd, np, sns, sklearn, alt)]
print("Versions:")
print("  Python: %s" % sys.version)
print("\n".join(version_lines))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.0
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2
  altair: 4.1.0


# Feature Engineering

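Let's first load our train and test sets, along with the monthly sales table used to build the lagged features. (The CV split itself is set up further down, when we cross-validate.)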

In [2]:
# Read the three preprocessed parquet tables from PROCESSED_DATA_DIR, in order:
# train set, test set, monthly sales table.
train_set, test_set, sales_train_by_month = (
    pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, parquet_name))
    for parquet_name in ('train-set.parquet', 'test-set.parquet', 'sales-train-by-month.parquet')
)

Now let's start with the obvious one: the item counts. I wrote a function that calculates lagged features from a dataset.

Since we're dealing with lagged features, the earliest months have no history to lag from. For now, let's cut both the window of lagged features (down to 18 months) and the train window roughly in half.

In [5]:
# Attach lagged `item_cnt` features (up to max_lag=18) to both splits.
# presumably sales_train_by_month is the lookup table the lags are read from —
# TODO confirm against add_lagged_features.
train_set_w18 = add_lagged_features(train_set, sales_train_by_month, 'item_cnt', max_lag=18)
test_set_w18 = add_lagged_features(test_set, sales_train_by_month, 'item_cnt', max_lag=18)

# Sanity check: a split whose lag columns are missing or identically zero
# carries no lag signal at all, which silently invalidates every score computed
# downstream. Warn (rather than fail) so the notebook still runs end to end.
for split_name, split_frame in (('train', train_set_w18), ('test', test_set_w18)):
    lag_columns = [col for col in split_frame.columns if str(col).startswith('f__item_cnt_')]
    # Column-by-column so the scan short-circuits and never copies the whole frame.
    if not any(split_frame[col].ne(0).any() for col in lag_columns):
        warnings.warn(
            "%s set: lagged item_cnt features are missing or all zero; "
            "check add_lagged_features and its inputs" % split_name
        )

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [6]:
# Summary statistics of the train set after the lag features were attached.
# NOTE(review): in the recorded output every displayed f__item_cnt_* column is
# identically 0.0 (mean, std, min and max), while the same columns in the test
# set summary are non-zero — verify add_lagged_features on the train set
# before trusting any downstream score.
train_set_w18.describe()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__item_cnt_1,f__item_cnt_2,f__item_cnt_3,f__item_cnt_4,f__item_cnt_5,f__item_cnt_6,...,f__item_cnt_9,f__item_cnt_10,f__item_cnt_11,f__item_cnt_12,f__item_cnt_13,f__item_cnt_14,f__item_cnt_15,f__item_cnt_16,f__item_cnt_17,f__item_cnt_18
count,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,...,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0
mean,11019.4,31.64286,21.5,0.2199702,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,6252.631,17.56189,6.922187,1.113889,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,30.0,2.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5381.5,16.0,15.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11203.0,34.5,21.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16071.5,47.0,27.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,22167.0,59.0,33.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Summary statistics of the test set after the lag features were attached.
# In the recorded output the f__item_cnt_* columns here do contain non-zero
# values (e.g. non-zero means and maxima), unlike the train-set summary.
test_set_w18.describe()

Unnamed: 0,ID,shop_id,item_id,date_block_num,f__item_cnt_1,f__item_cnt_2,f__item_cnt_3,f__item_cnt_4,f__item_cnt_5,f__item_cnt_6,...,f__item_cnt_9,f__item_cnt_10,f__item_cnt_11,f__item_cnt_12,f__item_cnt_13,f__item_cnt_14,f__item_cnt_15,f__item_cnt_16,f__item_cnt_17,f__item_cnt_18
count,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,...,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0
mean,107099.5,31.642857,11019.398627,34.0,0.293413,0.274622,0.278137,0.244188,0.241839,0.263193,...,0.254888,0.315481,0.485752,0.3238,0.248922,0.225037,0.247241,0.206004,0.208492,0.199748
std,61834.358168,17.561933,6252.64459,0.0,5.550976,2.498978,2.149646,2.143116,2.286223,3.828952,...,1.879339,4.079211,5.561023,4.229684,3.040342,2.998698,2.83356,2.655728,3.120215,2.763012
min,0.0,2.0,30.0,34.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,53549.75,16.0,5381.5,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,107099.5,34.5,11203.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,160649.25,47.0,16071.5,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,214199.0,59.0,22167.0,34.0,2253.0,473.0,436.0,482.0,444.0,742.0,...,257.0,1000.0,1209.0,772.0,634.0,639.0,591.0,563.0,771.0,602.0


In [30]:
# Build the model inputs: features come from drop_non_features, the target is
# the monthly item count.
# assumes drop_non_features returns a DataFrame with labelled columns — TODO confirm.
train_features = drop_non_features(train_set_w18)
X_train = train_features.values
y_train = train_set_w18['item_cnt_month'].values

# The train and test frames do not list their columns in the same order
# (item_id and shop_id are swapped), and `.values` discards column names.
# Selecting the test features in the train column order keeps the two matrices
# column-aligned; a missing column raises a KeyError instead of silently
# feeding the model misplaced features.
X_test = drop_non_features(test_set_w18)[train_features.columns].values

# Drop the extra reference to the intermediate feature frame.
del train_features

I'll use a default XGB regressor since in my experience it's usually a good default to start with.

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate
from src.model import ClippedOutputRegressor
from src.model import tscv

# XGBoost regressor with default hyper-parameters (only n_jobs and verbosity
# are set), wrapped in the project's ClippedOutputRegressor.
# presumably the wrapper clips predictions to a fixed range — confirm in src.model.
xgb_model = XGBRegressor(n_jobs=-1, verbosity=1)
reg = ClippedOutputRegressor(xgb_model)

In [28]:
# 8-fold cross-validation of `reg`, scored by negative RMSE; train-fold scores
# are kept as well so over/underfitting is visible.
# NOTE(review): an integer `cv` means plain, unshuffled K-fold for a regressor,
# so folds may mix past and future months; `tscv` is imported from src.model
# but never used — TODO confirm whether it was meant to be passed here.
cv_settings = dict(
    scoring='neg_root_mean_squared_error',
    cv=8,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)
scores = cross_validate(reg, X=X_train, y=y_train, **cv_settings)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   26.1s remaining:   15.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   27.3s finished


{'fit_time': array([25.36750627, 26.59036994, 21.67717743, 26.66516399, 20.32476091,
        20.22686911, 20.47635698, 26.70344663]),
 'score_time': array([0.09192777, 0.07116914, 0.09098458, 0.07239461, 0.09095144,
        0.09002376, 0.08779168, 0.06810689]),
 'test_score': array([-0.75167242, -1.18909489, -1.16221947, -1.00335854, -0.74581479,
        -0.78018019, -0.6606931 , -0.69698062]),
 'train_score': array([-0.82447206, -0.76880405, -0.76328491, -0.79633034, -0.82486782,
        -0.82851853, -0.83226782, -0.82897516])}

In [29]:
# Mean and standard deviation of the held-out fold scores (negative RMSE).
held_out_scores = scores['test_score']
held_out_scores.mean(), held_out_scores.std()

(-0.8737517512159962, 0.19878434691882502)

In [31]:
# Refit on the full training data and write the submission file.
reg.fit(X_train, y_train)

# Build the submission as a new frame instead of adding a prediction column to
# test_set_w18 in place: that frame is reused later to derive features, so
# mutating it made the result of re-running cells depend on execution order.
submission = test_set_w18[['ID']].assign(item_cnt_month=reg.predict(X_test))
submission.to_csv(os.path.join(TMP_DIR, 'xgb-dataset-01.csv'), index=False)

In [None]:
%%bash
# Submit the predictions file to the Kaggle competition.
# TMP_DIR must be visible to this shell as an environment variable; `:?` aborts
# with a clear message if it is unset or empty, and the quotes keep paths that
# contain spaces intact.
kaggle c submit -f "${TMP_DIR:?TMP_DIR is not set}/xgb-dataset-01.csv" -m 'Default XGB with ids and lagged item counts' competitive-data-science-predict-future-sales

Pretty good. Let's try adding some categorical features.

In [23]:
# Add item_id and shop_id as categorical features on top of the lagged counts.
id_columns = ['item_id', 'shop_id']
train_set_w18_base_ids = add_as_cat_features(train_set_w18, id_columns)

In [24]:
# Rebuild the training matrix from the frame that includes the categorical ids
# (the target y_train is reused unchanged) and repeat the 8-fold
# cross-validation with negative RMSE scoring.
# NOTE(review): this overwrites the earlier X_train, so cells above now see the
# new matrix if re-run out of order.
X_train = drop_non_features(train_set_w18_base_ids).values

cv_settings = dict(
    scoring='neg_root_mean_squared_error',
    cv=8,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)
scores = cross_validate(reg, X=X_train, y=y_train, **cv_settings)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   30.3s remaining:   18.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   31.6s finished


{'fit_time': array([23.12996221, 28.62978172, 23.28729844, 30.28636765, 23.13221407,
        24.89766264, 30.38679814, 30.56204271]),
 'score_time': array([0.11903572, 0.11405587, 0.13348985, 0.10038733, 0.14433527,
        0.12946177, 0.11193705, 0.11129618]),
 'test_score': array([-0.58648636, -0.99890535, -0.96909378, -0.81985889, -0.61720264,
        -0.76004499, -0.49600374, -0.56115139]),
 'train_score': array([-0.6023677 , -0.5723863 , -0.55554051, -0.58031067, -0.60311393,
        -0.59139223, -0.61272041, -0.60674876])}

In [25]:
# Mean and standard deviation of the held-out fold scores (negative RMSE).
held_out_scores = scores['test_score']
held_out_scores.mean(), held_out_scores.std()

(-0.7260933919342771, 0.17850583438587173)

OK, we can see it's an improvement. Let's just submit that and see how it does on the public LB.

In [17]:
# Derive the test features with the same categorical id columns as the train set.
test_set_w18_base_ids = add_as_cat_features(test_set_w18, ['item_id', 'shop_id'])

# `.values` drops column names, and the train and test frames list item_id and
# shop_id in a different order. Select the test features in the train feature
# order so both matrices are column-aligned; any extra test-only column is
# dropped and a missing one raises a KeyError instead of silently misaligning.
# assumes drop_non_features returns a DataFrame with labelled columns — TODO confirm.
# (Recomputing the train feature frame just to read its columns costs a copy,
# but keeps this cell independent of variables set elsewhere.)
train_feature_columns = drop_non_features(train_set_w18_base_ids).columns
X_test = drop_non_features(test_set_w18_base_ids)[train_feature_columns].values

In [19]:
# Refit on the full training data (features now include the categorical ids)
# and write the submission file.
reg.fit(X_train, y_train)

# Build the submission as a new frame rather than adding the prediction column
# to test_set_w18_base_ids in place, so re-running the feature cell never sees
# a stale prediction column and this cell stays idempotent.
submission = test_set_w18_base_ids[['ID']].assign(item_cnt_month=reg.predict(X_test))

# NOTE(review): this is the same file name used for the first submission, so
# the earlier predictions are overwritten — consider a distinct name (and
# update the submit cell accordingly).
submission.to_csv(os.path.join(TMP_DIR, 'xgb-dataset-01.csv'), index=False)

In [32]:
%%bash
# Submit the predictions file to the Kaggle competition.
# TMP_DIR must be visible to this shell as an environment variable; `:?` aborts
# with a clear message if it is unset or empty, and the quotes keep paths that
# contain spaces intact.
# NOTE(review): file name and message are identical to the first submission,
# so the two runs cannot be told apart on the leaderboard — consider a
# message that mentions the categorical item/shop ids.
kaggle c submit -f "${TMP_DIR:?TMP_DIR is not set}/xgb-dataset-01.csv" -m 'Default XGB with ids and lagged item counts' competitive-data-science-predict-future-sales

Successfully submitted to Predict Future Sales

100%|██████████| 3.55M/3.55M [00:10<00:00, 344kB/s] 


It actually got worse on the public LB, even though the CV score improved. Apparently, adding the item/shop ids makes the model overfit.