In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
import altair as alt
from altair import datum
from tqdm.auto import tqdm, trange

from src.model import tscv

%run constants.py

%matplotlib inline
# Report the interpreter and library versions this notebook was executed with,
# so the scores further down can be tied to a concrete environment.
print("Versions:")
print("  Python: %s" % sys.version)
for module in (pd, np, sns, sklearn, alt):
    print("  %s: %s" % (module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.0
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2
  altair: 4.1.0


# Feature Engineering - category metadata and shop encodings

Let's first load our datasets and set up our CV split.

In [2]:
# Read the three processed parquet tables in one pass; the file order below
# matches the order of the names on the left-hand side.
train_set, item_categories_metadata, shop_item_cat_encoding = (
    pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, parquet_name))
    for parquet_name in (
        'train-set-base-cats.parquet',
        'item-categories-metadata.parquet',
        'shop-item-cat-encoding.parquet',
    )
)

# Build the cross-validation splits from the date block of every training row.
cv_splits = tscv.split(train_set['date_block_num'].values)

def df_to_features_matrix(df):
    """Return the feature columns of ``df`` as a NumPy array.

    The identifier and target columns listed below are removed when present;
    columns that are absent from ``df`` are ignored rather than raising.
    Every remaining column is kept, in its original order.
    """
    non_feature_columns = [
        'ID',
        'item_id',
        'shop_id',
        'item_cnt',
        'date_block_num',
        'item_category_id',
    ]
    feature_frame = df.drop(columns=non_feature_columns, errors='ignore')
    return feature_frame.values

Let's label-encode the categories.

In [3]:
from sklearn.preprocessing import LabelEncoder
# Add a label-encoded `cat__<name>` column next to each of the two name columns.
# A fresh LabelEncoder is fitted per column, so the two encodings are independent.
for name_column in ('category_name', 'subcategory_name'):
    item_categories_metadata['cat__' + name_column] = LabelEncoder().fit_transform(
        item_categories_metadata[name_column]
    )

Now let's generate the train set and run it through our validation pipeline.

In [4]:
# Attach the category metadata to every training row via a left join on
# `item_id`; the raw `category_name` / `subcategory_name` columns are dropped
# before merging. `validate='many_to_one'` makes pandas raise a MergeError if
# `item_id` is not unique in the metadata table, instead of silently
# duplicating training rows (which would distort the scores below).
train_set_cat_metadata = train_set.merge(
    item_categories_metadata.drop(columns=['category_name', 'subcategory_name']),
    on='item_id',
    how='left',
    sort=False,
    validate='many_to_one',
)
train_set_cat_metadata.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt,cat__item_id,cat__shop_id,cat__date_block_num,item_category_id,cat__category_name,cat__subcategory_name
0,0,33,2,1.0,33,2,0,37,9,1
1,0,317,2,1.0,317,2,0,45,10,25
2,0,438,2,1.0,438,2,0,45,10,25
3,0,471,2,2.0,471,2,0,49,10,38
4,0,481,2,1.0,481,2,0,49,10,38


In [5]:
# Split the merged frame into the feature matrix and the target vector.
X_train = df_to_features_matrix(train_set_cat_metadata)
y_train = train_set_cat_metadata['item_cnt'].values

In [6]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate
from src.model import ClippedOutputRegressor

# XGBoost regressor using all available cores, wrapped in the project's
# ClippedOutputRegressor (presumably clips the predictions; see src.model).
reg = ClippedOutputRegressor(
    XGBRegressor(verbosity=1, n_jobs=-1)
)

In [7]:
# Score the regressor on the precomputed CV splits. The metric is negated
# RMSE, so values closer to zero are better; train scores are kept as well
# to compare against the test scores.
scores = cross_validate(
    reg,
    X=X_train,
    y=y_train,
    cv=cv_splits,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.3min finished


{'fit_time': array([66.23088074, 67.97157407, 69.28774571]),
 'score_time': array([0.174366  , 0.15217304, 0.15889645]),
 'test_score': array([-1.70838406, -2.23847939, -2.16835285]),
 'train_score': array([-2.00175247, -2.00441966, -2.00644994])}

In [8]:
# Mean and standard deviation of the per-fold test scores (negated RMSE).
np.mean(scores['test_score']), np.std(scores['test_score'])

(-2.038405431553157, 0.23510992115168114)

This beats the score we obtained without the category metadata. I think the power of this feature will be more visible once we throw the lagged features into the mix, so let's try that next.

In [9]:
# Load the training set that already carries the lagged item counts and the
# date ids.
train_set_full = pd.read_parquet(
    os.path.join(PROCESSED_DATA_DIR, 'train-set-base-cats-item-cnt-lagged-date-ids.parquet')
)

# Attach the category metadata with a left join on `item_id`; the raw
# `category_name` / `subcategory_name` columns are dropped before merging.
# `validate='many_to_one'` makes pandas raise a MergeError if `item_id` is not
# unique in the metadata table, instead of silently duplicating training rows.
train_set_full_cat_metadata = train_set_full.merge(
    item_categories_metadata.drop(columns=['category_name', 'subcategory_name']),
    on='item_id',
    how='left',
    sort=False,
    validate='many_to_one',
)
train_set_full_cat_metadata.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt,cat__item_id,cat__shop_id,cat__date_block_num,item_cnt_lag_1,item_cnt_lag_2,item_cnt_lag_3,...,item_cnt_lag_14,item_cnt_lag_15,item_cnt_lag_16,item_cnt_lag_17,item_cnt_lag_18,month_id,year_id,item_category_id,cat__category_name,cat__subcategory_name
0,0,33,2,1.0,33,2,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,37,9,1
1,0,317,2,1.0,317,2,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,45,10,25
2,0,438,2,1.0,438,2,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,45,10,25
3,0,471,2,2.0,471,2,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,49,10,38
4,0,481,2,1.0,481,2,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,49,10,38


In [10]:
# Rebuild the feature matrix and target from the lagged + category frame.
X_train = df_to_features_matrix(train_set_full_cat_metadata)
y_train = train_set_full_cat_metadata['item_cnt'].values

# Same evaluation as before: negated RMSE on the precomputed CV splits,
# keeping the train scores for comparison.
scores = cross_validate(
    reg,
    X=X_train,
    y=y_train,
    cv=cv_splits,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.7min finished


{'fit_time': array([206.61573505, 210.17860579, 215.39448953]),
 'score_time': array([0.19201756, 0.17068624, 0.17363071]),
 'test_score': array([-1.55400961, -2.00968467, -2.12546095]),
 'train_score': array([-1.82507594, -1.81249815, -1.80803364])}

In [11]:
# Mean and standard deviation of the per-fold test scores (negated RMSE).
np.mean(scores['test_score']), np.std(scores['test_score'])

(-1.8963850772011142, 0.2466667902439824)

This is a little better (mean test RMSE of about 1.90, down from about 2.04), but the benchmark model is still a lot better because the fake data added to it is all zero. We should investigate this further to take full advantage of it; otherwise we'll just keep inching towards the benchmark model.