In [28]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
import altair as alt
from altair import datum
from tqdm.auto import tqdm, trange

from src.model import tscv
from src.model.metrics import corrected_rmse_score
from src.feature_engineering import add_lagged_features, drop_non_features, add_as_features, add_as_cat_features
from src.functional import comp, partial

%run constants.py

%matplotlib inline
print("Versions:")
print("  Python: %s" % sys.version)
for module in [pd, np, sns, sklearn, alt]:
    print("  %s: %s" %(module.__name__, module.__version__))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.0
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2
  altair: 4.1.0


# Feature Engineering

Let's first load our train and test sets and set up our CV split.

In [29]:
import zipfile
with zipfile.ZipFile(os.path.join(RAW_DATA_DIR, 'competitive-data-science-predict-future-sales.zip'), 'r') as datasets_file:
    test_set = pd.read_csv(datasets_file.open('test.csv'))
train_set = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set.parquet'))
test_subset = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'test-subset.parquet'))
sales_train_by_month = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'sales-train-by-month.parquet'))
cv_splits = tscv.split(train_set['date_block_num'].values)

Now let's start with the obvious one: the item counts. I wrote a function that calculates lagged features from a dataset.

Since we're dealing with lagged features, early months won't have any. Let's cut it down by half on both the window of lagged features and the train window for now.

In [3]:
cv_splits_w18 = tscv.split(train_set['date_block_num'].values, window=18)
train_set_w18 = add_lagged_features(train_set, sales_train_by_month, 'item_cnt', max_lag=18)
test_subset_w18 = add_lagged_features(test_subset, sales_train_by_month, 'item_cnt', max_lag=18)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [4]:
train_set_w18.describe()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_month,f__item_cnt_1,f__item_cnt_2,f__item_cnt_3,f__item_cnt_4,f__item_cnt_5,f__item_cnt_6,...,f__item_cnt_9,f__item_cnt_10,f__item_cnt_11,f__item_cnt_12,f__item_cnt_13,f__item_cnt_14,f__item_cnt_15,f__item_cnt_16,f__item_cnt_17,f__item_cnt_18
count,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,...,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0
mean,14.66479,10680.99,32.80585,2.022806,1.454636,1.288606,1.153339,1.047351,0.9614697,0.8883274,...,0.6866053,0.625697,0.5632481,0.5133849,0.4563769,0.4128818,0.37231,0.3363905,0.3056452,0.2770955
std,9.542322,6238.883,16.53701,2.577964,7.989895,7.630533,7.427049,7.35471,7.217674,7.059026,...,6.602638,6.421906,6.058962,5.836906,5.658653,5.348509,5.066624,5.028181,4.846775,4.738297
min,0.0,0.0,0.0,0.0,-22.0,-22.0,-4.0,-4.0,-2.0,-4.0,...,-4.0,-4.0,-4.0,-2.0,-4.0,-4.0,-4.0,-4.0,-2.0,-4.0
25%,6.0,5045.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,14.0,10497.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,23.0,16060.0,47.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,33.0,22169.0,59.0,20.0,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,...,1305.0,1305.0,1305.0,1305.0,1305.0,1305.0,989.0,1305.0,1305.0,1305.0


In [5]:
test_subset_w18.describe()

Unnamed: 0,item_id,shop_id,date_block_num,f__item_cnt_1,f__item_cnt_2,f__item_cnt_3,f__item_cnt_4,f__item_cnt_5,f__item_cnt_6,f__item_cnt_7,...,f__item_cnt_9,f__item_cnt_10,f__item_cnt_11,f__item_cnt_12,f__item_cnt_13,f__item_cnt_14,f__item_cnt_15,f__item_cnt_16,f__item_cnt_17,f__item_cnt_18
count,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,...,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0
mean,10701.266213,31.995223,34.0,2.191388,1.330021,1.200558,1.001604,0.99613,1.155544,1.20537,...,0.938703,1.216423,1.809484,1.20143,0.936681,0.823989,0.843968,0.705753,0.733264,0.656032
std,6102.659012,16.488541,0.0,15.032652,6.592039,5.628176,5.63741,6.002198,10.252714,11.010553,...,4.748555,10.785793,14.693371,10.818708,7.937403,7.753421,7.355884,6.910303,8.234696,6.645365
min,30.0,2.0,34.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,5005.0,19.0,34.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10659.0,31.0,34.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15494.0,46.0,34.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,22167.0,59.0,34.0,2253.0,473.0,436.0,482.0,444.0,742.0,813.0,...,257.0,1000.0,1209.0,772.0,634.0,639.0,591.0,563.0,771.0,602.0


In [6]:
X_train, y_train = drop_non_features(train_set_w18).values, train_set_w18['item_cnt_month'].values
X_test = drop_non_features(test_subset_w18).values

I'll use a default XGB regressor since in my experience it's usually a good default to start with.

In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate
from src.model import ClippedOutputRegressor

reg = ClippedOutputRegressor(XGBRegressor(n_jobs=-1, verbosity=1))

In [8]:
scores = cross_validate(reg, X=X_train, y=y_train,
                        scoring=corrected_rmse_score, verbose=2, n_jobs=-1, 
                        cv=cv_splits_w18, return_train_score=True)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.9min finished


{'fit_time': array([222.67374158, 218.43320179, 213.79310656]),
 'score_time': array([0.29977012, 0.25102425, 0.26123786]),
 'test_score': array([-0.77165437, -1.02308756, -1.03266588]),
 'train_score': array([-0.92676242, -0.92933981, -0.93133453])}

In [9]:
scores['test_score'].mean(), scores['test_score'].std() 

(-0.9424692693294418, 0.12084765474984473)

Pretty close to our previous dummy test, so not good. Let's try adding some categorical features: the previous test had the date block num, so we'll go even further and add the date ids.

In [10]:
date_ids = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'date-ids.parquet'))
date_ids.head()

Unnamed: 0,date_block_num,month_id,year_id
0,0,0,0
1,1,1,0
2,2,2,0
3,3,3,0
4,4,4,0


In [11]:
train_set_w18_date_ids = add_as_cat_features(train_set_w18.merge(date_ids, on='date_block_num', how='left', sort=False), 
                                             ['date_block_num', 'month_id', 'year_id'])
X_train = drop_non_features(train_set_w18_date_ids).values

In [12]:
scores = cross_validate(reg, X=X_train, y=y_train,
                        scoring=corrected_rmse_score, verbose=2, n_jobs=-1, 
                        cv=cv_splits_w18, return_train_score=True)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.2min finished


{'fit_time': array([244.08122063, 239.6075933 , 233.86898255]),
 'score_time': array([0.3487978 , 0.30139661, 0.31716919]),
 'test_score': array([-0.7724726 , -1.03043161, -1.0483989 ]),
 'train_score': array([-0.91004636, -0.91209181, -0.91578144])}

In [13]:
scores['test_score'].mean(), scores['test_score'].std() 

(-0.9504343724564208, 0.12605157671435444)

Well, it's worse. Let's add the base ids to the train set and run it again.

In [14]:
train_set_w18_date_ids_base_ids = add_as_cat_features(train_set_w18_date_ids, ['item_id', 'shop_id'])

In [15]:
X_train = drop_non_features(train_set_w18_date_ids_base_ids).values

scores = cross_validate(reg, X=X_train, y=y_train,
                        scoring=corrected_rmse_score, verbose=2, n_jobs=-1, 
                        cv=cv_splits, return_train_score=True)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  8.7min finished


{'fit_time': array([469.65439868, 472.22846842, 493.10321259]),
 'score_time': array([0.55858803, 0.49258566, 0.51222968]),
 'test_score': array([-0.7694865 , -0.98612991, -1.06177736]),
 'train_score': array([-0.8874763 , -0.87917883, -0.88494101])}

In [16]:
scores['test_score'].mean(), scores['test_score'].std() 

(-0.9391312550717849, 0.12386859351220451)

A little better. Let's try the whole dataset to see if that's any better

In [34]:
def add_date_ids(df):
    return add_as_cat_features(df.merge(date_ids, on='date_block_num', how='left', sort=False), 
                                        ['date_block_num', 'month_id', 'year_id'])

transform = comp(
    lambda df: add_as_cat_features(df, ['item_id', 'shop_id']),
    add_date_ids,
    lambda df: add_lagged_features(df, sales_train_by_month, 'item_cnt'))

train_set_lagged_date_ids_base_ids = transform(train_set)

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




In [36]:
X_train = drop_non_features(train_set_lagged_date_ids_base_ids).values
scores = cross_validate(reg, X=X_train, y=y_train,
                        scoring=corrected_rmse_score, verbose=2, n_jobs=-1, 
                        cv=cv_splits, return_train_score=True)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 12.0min finished


{'fit_time': array([669.21315479, 669.66440964, 693.05559611]),
 'score_time': array([0.57962251, 0.50677729, 0.52487874]),
 'test_score': array([-0.76700048, -0.98716346, -1.03798434]),
 'train_score': array([-0.88615257, -0.88542616, -0.89052309])}

In [37]:
scores['test_score'].mean(), scores['test_score'].std() 

(-0.9307160921631675, 0.11760893479275158)

OK, we can see it's an improvement. Let's just submit that and see how it does on the public LB.

In [30]:
test_subset_lagged_date_ids_base_ids = transform(test_subset)
X_test = drop_non_features(test_subset_lagged_date_ids_base_ids).values

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




In [38]:
from src.submission import submission_from_subset

reg.fit(X_train, y_train)
test_subset_lagged_date_ids_base_ids['item_cnt_month'] = reg.predict(X_test)

submission = submission_from_subset(test_subset_lagged_date_ids_base_ids, test_set)
submission.to_csv(os.path.join(TMP_DIR, 'xgb-dataset-01.csv'), index=False)

In [39]:
%%bash
kaggle c submit -f ${TMP_DIR}/xgb-dataset-01.csv -m 'Default XGB with date ids and lagged item counts' competitive-data-science-predict-future-sales

Successfully submitted to Predict Future Sales

100%|██████████| 2.30M/2.30M [00:40<00:00, 59.3kB/s]


1.08752, not bad considering we didn't run any hyperparameter optimization.