In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
from tqdm.auto import tqdm, trange
import xgboost as xgb
import joblib
from sklearn.model_selection import cross_validate
import zipfile

from src.model import tscv, ClippedOutputRegressor
from src.feature_engineering import get_feature_cols, df_to_X_y, drop_non_features, add_lagged_features


%run constants.py

baseline_reg = joblib.load(os.path.join(MODELS_DIR, 'xgb-baseline.model'))

%matplotlib inline
print("Versions:")
print("  Python: %s" % sys.version)
for module in [pd, np, sns, sklearn]:
    print("  %s: %s" %(module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.1
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2


Last experiment on features relationships was a disaster, but I want to try it again. I'll make it simpler: calculate lagged feature deltas in some time windows. For now I'll just calculate it for prices and item counts.

In [2]:
sales_by_month = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'sales-train-by-month.parquet'))
prices_stats = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'prices-statistics.parquet'))
train_set = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set.parquet'))

In [6]:
def feature_delta(df, feature_cols, index_cols=['item_id', 'shop_id'],
                  window=1, date_col='date_block_num'):
    df = df[feature_cols + index_cols + [date_col]]
    aux_df = df.copy()

    aux_df[date_col] = aux_df[date_col] + window
    df2 = df.merge(aux_df, on=index_cols + [date_col], how='left', sort=False,
                   suffixes=['_now', '_then'])
    # we fill with 0 so the delta is just the new value
    df2.fillna(0, inplace=True)
    features_now_cols = ['%s_now' % col for col in feature_cols]
    features_then_cols = ['%s_then' % col for col in feature_cols]
    delta_cols = ['%s_%s_delta_w%d' % ('_'.join(index_cols), col, window)
                  for col in feature_cols]
    df2[delta_cols] = df2[features_now_cols].values - df2[features_then_cols].values
    return df2[delta_cols]

In [7]:
def add_feature_deltas(df, feature_cols, windows=[1, 2, 3, 6, 9, 12], **kwargs):
    return pd.concat([df] + [feature_delta(df, feature_cols, window=w, **kwargs) for w in tqdm(windows)])

Now let's try our new function:

In [8]:
sales_train_with_deltas = add_feature_deltas(sales_by_month, ['item_cnt'])

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [9]:
sales_train_with_deltas.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt,item_id_shop_id_item_cnt_delta_w1,item_id_shop_id_item_cnt_delta_w2,item_id_shop_id_item_cnt_delta_w3,item_id_shop_id_item_cnt_delta_w6,item_id_shop_id_item_cnt_delta_w9,item_id_shop_id_item_cnt_delta_w12
count,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0
mean,14.66479,32.80585,10680.99,2.2672,0.8125645,0.9785939,1.113861,1.378873,1.580595,1.753815
std,9.542322,16.53701,6238.883,8.649882,6.954732,7.626672,7.77495,8.150903,8.359922,8.257252
min,0.0,0.0,0.0,-22.0,-1240.0,-1071.0,-1020.0,-1073.0,-736.0,-761.0
25%,6.0,21.0,5045.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
50%,14.0,31.0,10497.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,23.0,47.0,16060.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0
max,33.0,59.0,22169.0,2253.0,2007.0,2178.0,2052.0,1961.0,2120.0,2185.0


Looking good. Now let's see how this affects our score.

In [7]:
train_set_sales_delta = add_lagged_features(train_set, sales_train_with_deltas,
                                            [c for c in sales_train_with_deltas.columns
                                             if c.startswith('item_cnt_delta_')], 
                                            max_lag=1, fill_value=0)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
train_set_sales_delta

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__item_cnt_delta_w1_1,f__item_cnt_delta_w2_1,f__item_cnt_delta_w3_1,f__item_cnt_delta_w6_1,f__item_cnt_delta_w9_1,f__item_cnt_delta_w12_1
0,5037,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5320,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5233,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5232,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5268,5,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5140795,18454,45,33,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5140796,16188,45,33,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5140797,15757,45,33,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5140798,19648,45,33,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
X_train, y_train = df_to_X_y(train_set_sales_delta)

scores = cross_validate(baseline_reg, X_train, y=y_train,
                        cv=tscv.split(train_set_sales_delta['date_block_num'], window=16),
                        verbose=2, scoring='neg_root_mean_squared_error')
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=   6.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.3s remaining:    0.0s


[CV] ................................................. , total=   5.4s
[CV]  ................................................................
[CV] ................................................. , total=   6.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.7s finished


{'fit_time': array([6.20678902, 5.31006551, 5.85935163]),
 'score_time': array([0.06795406, 0.10453439, 0.10542536]),
 'test_score': array([-1.03234395, -1.11599645, -1.09011108])}

In [10]:
np.mean(scores['test_score']), np.std(scores['test_score'])

(-1.0794838257372612, 0.03496797674404297)