In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
import altair as alt
from altair import datum
from tqdm.auto import tqdm, trange

from src.model import tscv

%run constants.py

%matplotlib inline
# Report the interpreter and library versions this notebook was run with.
version_lines = ["Versions:", "  Python: %s" % sys.version]
version_lines.extend(
    "  %s: %s" % (lib.__name__, lib.__version__)
    for lib in (pd, np, sns, sklearn, alt)
)
print("\n".join(version_lines))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.0
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2
  altair: 4.1.0


# Feature Engineering

Let's first load our train and test sets and set up our CV split.

In [17]:
# Read the base train and test frames from the processed-data directory.
train_set, test_set = (
    pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, file_name))
    for file_name in ('train-set-base.parquet', 'test-set-base.parquet')
)

# CV splits are computed by the project's `tscv` helper from the
# `date_block_num` value of every training row.
cv_splits = tscv.split(train_set['date_block_num'].values)

Now let's start with the obvious one: the item counts.

In [3]:
# Pre-computed lagged item counts, stored next to the base train/test sets.
item_count_lagged = pd.read_parquet(
    os.path.join(PROCESSED_DATA_DIR, 'item-cnt-lagged.parquet')
)

# Preview the first rows to check the key columns and the lag columns.
item_count_lagged.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_lag_1,item_cnt_lag_2,item_cnt_lag_3,item_cnt_lag_4,item_cnt_lag_5,item_cnt_lag_6,item_cnt_lag_7,...,item_cnt_lag_23,item_cnt_lag_24,item_cnt_lag_25,item_cnt_lag_26,item_cnt_lag_27,item_cnt_lag_28,item_cnt_lag_29,item_cnt_lag_30,item_cnt_lag_31,item_cnt_lag_32
0,33,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,317,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,438,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,471,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,481,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Since we're dealing with lagged features, the early months won't have any history to lag on. For now, let's cut both the window of lagged features and the train window roughly in half, down to 18 months.

In [4]:
# Single source of truth for the reduced window: it sets both the CV train
# window and the deepest lag column that is kept.
LAG_WINDOW = 18

cv_splits_w18 = tscv.split(train_set['date_block_num'].values, window=LAG_WINDOW)

# Derive the columns to drop from the frame itself instead of hard-coding
# range(19, 33), so the cut stays correct whatever the deepest lag stored in
# the file is.
lag_columns_beyond_window = [
    col for col in item_count_lagged.columns
    if col.startswith('item_cnt_lag_') and int(col.rsplit('_', 1)[1]) > LAG_WINDOW
]
item_count_lagged_w18 = item_count_lagged.drop(columns=lag_columns_beyond_window)

In [5]:
# Columns shared by the base frames and the lagged-count table; used as join key.
LAG_MERGE_KEYS = ['item_id', 'shop_id', 'date_block_num']

# `validate='many_to_one'` makes pandas raise a MergeError if the lag table
# holds duplicate keys, instead of silently duplicating train/test rows (which
# would also break the row alignment later cells rely on).
train_set_w18 = train_set.merge(
    item_count_lagged_w18,
    on=LAG_MERGE_KEYS,
    how='left',
    sort=False,
    validate='many_to_one',
)
test_set_w18 = test_set.merge(
    item_count_lagged_w18,
    on=LAG_MERGE_KEYS,
    how='left',
    sort=False,
    validate='many_to_one',
)

In [6]:
# Summary statistics (count, mean, std, quantiles, min/max) of every column of
# the merged training frame.
train_set_w18.describe()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt,item_cnt_lag_1,item_cnt_lag_2,item_cnt_lag_3,item_cnt_lag_4,item_cnt_lag_5,item_cnt_lag_6,...,item_cnt_lag_9,item_cnt_lag_10,item_cnt_lag_11,item_cnt_lag_12,item_cnt_lag_13,item_cnt_lag_14,item_cnt_lag_15,item_cnt_lag_16,item_cnt_lag_17,item_cnt_lag_18
count,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,...,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0,1600855.0
mean,14.73702,10678.28,32.96987,2.022226,1.234869,1.087265,0.9621615,0.8651577,0.7878702,0.723812,...,0.5469146,0.4944164,0.4436685,0.4023031,0.354217,0.3189414,0.2850427,0.2560132,0.2302732,0.2065009
std,9.513026,6238.855,16.41684,2.577318,2.709731,2.598188,2.487749,2.388878,2.305473,2.235577,...,2.003668,1.917901,1.829092,1.757678,1.656151,1.586526,1.508673,1.435958,1.373439,1.313543
min,0.0,30.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,5043.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,14.0,10492.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,23.0,16059.0,47.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,33.0,22167.0,59.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0


In [7]:
# Same summary statistics for the merged test frame.
test_set_w18.describe()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_cnt_lag_1,item_cnt_lag_2,item_cnt_lag_3,item_cnt_lag_4,item_cnt_lag_5,item_cnt_lag_6,...,item_cnt_lag_9,item_cnt_lag_10,item_cnt_lag_11,item_cnt_lag_12,item_cnt_lag_13,item_cnt_lag_14,item_cnt_lag_15,item_cnt_lag_16,item_cnt_lag_17,item_cnt_lag_18
count,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,...,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0
mean,107099.5,31.642857,11019.398627,34.0,0.255649,0.246452,0.260037,0.227605,0.221471,0.213301,...,0.235135,0.273305,0.407246,0.255215,0.213735,0.191116,0.218431,0.180705,0.180523,0.166335
std,61834.358168,17.561933,6252.64459,0.0,1.089856,1.115817,1.031644,0.949685,1.008618,1.018289,...,1.058539,1.214351,1.654933,1.299405,1.130777,1.053212,1.120085,0.968446,1.00961,0.983865
min,0.0,2.0,30.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53549.75,16.0,5381.5,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,107099.5,34.5,11203.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,160649.25,47.0,16071.5,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,214199.0,59.0,22167.0,34.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0


In [8]:
def df_to_features_matrix(df, non_feature_columns=('ID', 'item_id', 'shop_id', 'item_cnt', 'date_block_num')):
    """Return the feature columns of `df` as a 2-D NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding identifier, target and feature columns. It is not modified.
    non_feature_columns : iterable of str, optional
        Columns removed before conversion. Names that are absent from `df`
        are ignored, so the same call works for frames with and without the
        `ID` / `item_cnt` columns. Defaults to the raw id and target columns.

    Returns
    -------
    numpy.ndarray
        One row per row of `df`, one column per remaining column, in the
        frame's column order.
    """
    feature_frame = df.drop(columns=list(non_feature_columns), errors='ignore')
    return feature_frame.to_numpy()

In [9]:
# Feature matrices: whatever is left once the id/target columns are dropped.
X_train = df_to_features_matrix(train_set_w18)
X_test = df_to_features_matrix(test_set_w18)

# Regression target, taken from the same merged training frame.
y_train = train_set_w18['item_cnt'].values

I'll use a default XGB regressor since in my experience it's usually a good default to start with.

In [10]:
from xgboost import XGBRegressor

# Out-of-the-box regressor: only the thread count and the log verbosity are
# set, every model hyper-parameter keeps its library default.
reg = XGBRegressor(
    verbosity=1,
    n_jobs=-1,
)

In [11]:
from sklearn.model_selection import cross_validate

# Cross-validate the regressor on the 18-window splits. The scorer is the
# negated RMSE, so values closer to zero are better.
# NOTE(review): both the estimator and cross_validate use n_jobs=-1, which may
# oversubscribe the CPU — confirm this is intended.
scores = cross_validate(
    estimator=reg,
    X=X_train,
    y=y_train,
    cv=cv_splits_w18,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.6min finished


{'fit_time': array([90.30915022, 88.62080193, 87.34032273]),
 'score_time': array([0.09704518, 0.08903933, 0.09367323]),
 'test_score': array([-1.61784172, -2.14437448, -2.17703911]),
 'train_score': array([-1.94471932, -1.95146995, -1.95555889])}

In [12]:
# Mean and spread of the per-fold test scores (negated RMSE).
np.mean(scores['test_score']), np.std(scores['test_score'])

(-1.9797517705584087, 0.25625626366786247)

Pretty close to our previous dummy test, so not good. Let's try adding some categorical features: the previous test had the date block num, so we'll go even further and add the date ids.

In [13]:
date_ids = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'date-ids.parquet'))

# Attach the date-id columns to both lagged frames, joining on `date_block_num`.
train_set_w18_date_ids, test_set_w18_date_ids = (
    frame.merge(date_ids, on='date_block_num', how='left', sort=False)
    for frame in (train_set_w18, test_set_w18)
)

# Rebuild the feature matrices so they include the new date-id columns.
X_train, X_test = (
    df_to_features_matrix(frame)
    for frame in (train_set_w18_date_ids, test_set_w18_date_ids)
)

In [14]:
# Same evaluation as before, now on the feature matrix that also holds the
# date-id columns. Scores are negated RMSE (closer to zero is better).
scores = cross_validate(
    estimator=reg,
    X=X_train,
    y=y_train,
    cv=cv_splits_w18,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.6min finished


{'fit_time': array([94.98539448, 92.15565038, 91.46346855]),
 'score_time': array([0.10364676, 0.09808302, 0.10697031]),
 'test_score': array([-1.62292303, -2.15322221, -2.18246359]),
 'train_score': array([-1.90972014, -1.91732515, -1.92140387])}

In [15]:
# Mean and spread of the per-fold test scores (negated RMSE).
np.mean(scores['test_score']), np.std(scores['test_score'])

(-1.9862029448698504, 0.2571549284295111)

Well, it's slightly worse (a mean test RMSE of about 1.986 versus 1.980). Let's take a step back and try the full train set with just the original columns to see how they compare with our previous experiment using the random forest.

In [20]:
# Baseline: train on the original id columns only. They are copied under
# `f_cat_*` names because `df_to_features_matrix` drops the raw id columns.
train_set_defaults = train_set.copy()
test_set_defaults = test_set.copy()
train_set_defaults[['f_cat_item_id', 'f_cat_shop_id', 'f_cat_date_block_num']] = train_set[['item_id', 'shop_id', 'date_block_num']]
test_set_defaults[['f_cat_item_id', 'f_cat_shop_id', 'f_cat_date_block_num']] = test_set[['item_id', 'shop_id', 'date_block_num']]

X_train = df_to_features_matrix(train_set_defaults)
X_test = df_to_features_matrix(test_set_defaults)
# Take the target from the same frame as the features, rather than reusing the
# `y_train` built in an earlier cell from `train_set_w18`: that only lines up
# with these rows if the lag merge kept the row count and order.
y_train = train_set_defaults['item_cnt'].values

# Full-window CV splits here (not the 18-window ones used for the lag runs).
scores = cross_validate(reg, X=X_train, y=y_train,
                        scoring='neg_root_mean_squared_error', verbose=2, n_jobs=-1,
                        cv=cv_splits, return_train_score=True)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   55.2s finished


{'fit_time': array([45.27446032, 46.42938328, 47.12165475]),
 'score_time': array([0.15620112, 0.13886952, 0.14787412]),
 'test_score': array([-1.75458612, -2.27860005, -2.21870513]),
 'train_score': array([-2.04748249, -2.0481986 , -2.0512862 ])}

In [21]:
# Mean and spread of the per-fold test scores (negated RMSE).
np.mean(scores['test_score']), np.std(scores['test_score'])

(-2.0839637650582166, 0.23418522070019476)

OK. At least we know our lagged features are better than just using the original ids (a mean test RMSE of about 1.98 versus 2.08).