In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (e.g. the project's `src` package imported below) are reloaded before each
# cell executes and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
from tqdm.auto import tqdm, trange
import xgboost as xgb
import joblib
from sklearn.model_selection import cross_validate
import zipfile

from src.model import tscv, ClippedOutputRegressor
from src.feature_engineering import get_feature_cols, df_to_X_y, drop_non_features, add_lagged_features


%run constants.py

baseline_reg = joblib.load(os.path.join(MODELS_DIR, 'xgb-baseline.model'))

%matplotlib inline
print("Versions:")
print("  Python: %s" % sys.version)
for module in [pd, np, sns, sklearn]:
    print("  %s: %s" %(module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.1
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2


# Rank features

We can compute the rank of each sample's price and sales count relative to the other samples in its group.

We want, for each date block, the ranks in the following groups:

- item_id
- category
- shop + category

In [2]:
# Load the three processed tables this notebook works with, all stored as
# parquet files under PROCESSED_DATA_DIR. The files are read in the order listed.
prices_stats, train_set, categories_meta = (
    pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, filename))
    for filename in (
        'prices-statistics.parquet',
        'train-set.parquet',
        'item-categories-metadata.parquet',
    )
)

In [3]:
# Attach the item category metadata to both frames, joining on item_id
# (default inner join, so items without metadata are dropped).
#
# validate='many_to_one' makes pandas raise a MergeError if categories_meta ever
# contains more than one row for the same item_id. Without the check such a
# duplicate would silently multiply rows in both frames and corrupt every
# group-wise rank computed from them.
#
# NOTE: this cell rebinds its own inputs, so run it exactly once after the load
# cell. Running it a second time merges the metadata again and pandas then
# suffixes the overlapping metadata columns with _x/_y.
prices_stats = prices_stats.merge(categories_meta, on='item_id', validate='many_to_one')
train_set = train_set.merge(categories_meta, on='item_id', validate='many_to_one')

In [5]:
# Groups (each combined with date_block_num) within which samples are ranked.
ranks_index_cols = [['item_id'],
                    ['category_name'],
                    ['shop_id', 'category_name']]

# Columns identifying one (item, shop, month) row in both train_set and
# prices_stats. The price ranks are computed on prices_stats and must be joined
# back on these keys: assigning them straight into ranks_df would align them by
# row index, which is only correct if both frames happen to share the exact
# same row order.
row_key_cols = ['item_id', 'shop_id', 'date_block_num']

ranks_df = train_set.copy()
price_ranks = prices_stats[row_key_cols].copy()
rank_cols = []
for idx_cols in tqdm(ranks_index_cols):
    group_cols = idx_cols + ['date_block_num']
    group_tag = '_'.join(idx_cols)
    price_rank_col = 'rank_%s_item_price_median' % group_tag
    item_cnt_rank_col = 'rank_%s_item_cnt_month' % group_tag

    # Dense rank (ties share a rank, no gaps) of each row within its group and month.
    # price_ranks shares its index with prices_stats, so this assignment is row-exact.
    price_ranks[price_rank_col] = (
        prices_stats.groupby(group_cols)['item_price_median'].rank(method='dense'))
    # ranks_df is a copy of train_set, so index alignment is row-exact here as well.
    ranks_df[item_cnt_rank_col] = (
        train_set.groupby(group_cols)['item_cnt_month'].rank(method='dense'))
    rank_cols += [price_rank_col, item_cnt_rank_col]

# Join the price ranks onto the training rows by key. A left join keeps every
# training row (price ranks are NaN where prices_stats has no matching row).
# validate='many_to_one' raises if prices_stats holds several rows for the same
# (item_id, shop_id, date_block_num) instead of silently duplicating training rows.
ranks_df = ranks_df.merge(price_ranks, on=row_key_cols, how='left', validate='many_to_one')

# Keep the column layout stable: original train_set columns first, then the
# price / item-count rank pair of each group in turn.
ranks_df = ranks_df[list(train_set.columns) + rank_cols]

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [6]:
# Summary statistics of the numeric columns, to sanity-check the ranges of the new rank_* columns.
ranks_df.describe()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,item_category_id,rank_item_id_item_price_median,rank_item_id_item_cnt_month,rank_category_name_item_price_median,rank_category_name_item_cnt_month,rank_shop_id_category_name_item_price_median,rank_shop_id_category_name_item_cnt_month
count,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0
mean,11019.4,31.64286,21.5,0.2199702,46.30961,1.0,1.194424,27.64095,1.218751,27.64095,1.199101
std,6252.631,17.56189,6.922187,1.113889,16.71654,0.0,0.8550523,19.54779,1.096516,19.54779,0.8842873
min,30.0,2.0,10.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,5381.5,16.0,15.75,0.0,37.0,1.0,1.0,13.0,1.0,13.0,1.0
50%,11203.0,34.5,21.5,0.0,43.0,1.0,1.0,24.0,1.0,24.0,1.0
75%,16071.5,47.0,27.25,0.0,58.0,1.0,1.0,39.0,1.0,39.0,1.0
max,22167.0,59.0,33.0,20.0,83.0,1.0,21.0,144.0,21.0,144.0,21.0


In [7]:
# Number of missing values per column; a non-zero count in a rank_* column
# means some rows did not receive a rank.
ranks_df.isnull().sum(axis=0)

item_id                                         0
shop_id                                         0
date_block_num                                  0
item_cnt_month                                  0
item_category_id                                0
category_name                                   0
subcategory_name                                0
rank_item_id_item_price_median                  0
rank_item_id_item_cnt_month                     0
rank_category_name_item_price_median            0
rank_category_name_item_cnt_month               0
rank_shop_id_category_name_item_price_median    0
rank_shop_id_category_name_item_cnt_month       0
dtype: int64

And now that we have them for each month we can use them as lagged features:

In [8]:
# Every rank column computed above becomes a source for lagged features.
rank_feature_cols = [name for name in ranks_df.columns if name.startswith('rank_')]

# add_lagged_features is a project helper (src.feature_engineering); it presumably
# attaches the values of the previous 1..max_lag months to each train_set row and
# uses fill_value where no earlier month exists -- TODO confirm against its source.
train_set_ranks = add_lagged_features(
    train_set,
    ranks_df,
    rank_feature_cols,
    max_lag=6,
    fill_value=-999,
)
train_set_ranks.describe()

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,item_category_id,f__rank_item_id_item_price_median_1,f__rank_item_id_item_cnt_month_1,f__rank_category_name_item_price_median_1,f__rank_category_name_item_cnt_month_1,f__rank_shop_id_category_name_item_price_median_1,...,f__rank_category_name_item_price_median_5,f__rank_category_name_item_cnt_month_5,f__rank_shop_id_category_name_item_price_median_5,f__rank_shop_id_category_name_item_cnt_month_5,f__rank_item_id_item_price_median_6,f__rank_item_id_item_cnt_month_6,f__rank_category_name_item_price_median_6,f__rank_category_name_item_cnt_month_6,f__rank_shop_id_category_name_item_price_median_6,f__rank_shop_id_category_name_item_cnt_month_6
count,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,...,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0
mean,11019.4,31.64286,21.5,0.2199702,46.30961,-40.66667,-40.48181,-15.41115,-40.45852,-15.41115,...,-187.8008,-207.1649,-187.8008,-207.1811,-249.0,-248.8592,-230.8202,-248.8404,-230.8202,-248.8558
std,6252.631,17.56189,6.922187,1.113889,16.71654,199.8263,199.8666,205.9706,199.8726,205.9706,...,416.4409,406.2041,416.4409,406.1954,433.0127,433.0947,443.769,433.106,443.769,433.0967
min,30.0,2.0,10.0,0.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
25%,5381.5,16.0,15.75,0.0,37.0,1.0,1.0,11.0,1.0,11.0,...,5.0,1.0,5.0,1.0,-249.0,-249.0,-249.0,-249.0,-249.0,-249.0
50%,11203.0,34.5,21.5,0.0,43.0,1.0,1.0,22.0,1.0,22.0,...,15.0,1.0,15.0,1.0,1.0,1.0,14.0,1.0,14.0,1.0
75%,16071.5,47.0,27.25,0.0,58.0,1.0,1.0,38.0,1.0,38.0,...,32.0,1.0,32.0,1.0,1.0,1.0,31.0,1.0,31.0,1.0
max,22167.0,59.0,33.0,20.0,83.0,1.0,21.0,144.0,21.0,144.0,...,120.0,21.0,120.0,21.0,1.0,21.0,120.0,21.0,120.0,21.0


In [9]:
# Split the lagged-rank frame into a feature matrix and a target via the project helper.
X_train, y_train = df_to_X_y(train_set_ranks)

# Train/test splits from the project's tscv helper, driven by the month index
# (presumably a time-series split with a 16-month window -- TODO confirm in src.model).
cv_splits = tscv.split(train_set_ranks['date_block_num'], window=16)

# Score the baseline regressor on every split. The scorer returns the negated
# RMSE, so values closer to zero are better.
scores = cross_validate(
    baseline_reg,
    X_train,
    y=y_train,
    cv=cv_splits,
    scoring='neg_root_mean_squared_error',
    verbose=2,
)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  14.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.4s remaining:    0.0s


[CV] ................................................. , total=  13.2s
[CV]  ................................................................
[CV] ................................................. , total=  10.6s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   38.3s finished


{'fit_time': array([13.21297026, 11.95881104,  9.8157568 ]),
 'score_time': array([1.18980646, 1.2430625 , 0.8322978 ]),
 'test_score': array([-0.79122826, -0.92876298, -0.89849347])}

In [10]:
# Mean and standard deviation of the per-fold test scores (negated RMSE).
fold_scores = scores['test_score']
fold_scores.mean(), fold_scores.std()

(-0.8728282372454572, 0.059008350773385475)

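Considering this model uses only the lagged rank features and none of the previously engineered features, a mean cross-validated RMSE of about 0.87 (standard deviation about 0.06) is a pretty good score :)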