In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
from tqdm.auto import tqdm, trange
import xgboost as xgb
import joblib
from sklearn.model_selection import cross_validate
import zipfile

from src.model import tscv, ClippedOutputRegressor
from src.feature_engineering import get_feature_cols, df_to_X_y, drop_non_features, add_lagged_features


%run constants.py

baseline_reg = joblib.load(os.path.join(MODELS_DIR, 'xgb-baseline.model'))

%matplotlib inline
print("Versions:")
print("  Python: %s" % sys.version)
for module in [pd, np, sns, sklearn]:
    print("  %s: %s" %(module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.1
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2


Now I want to focus on features relationships. We have quite a few of features so far, so we can try generating features by combining them.

The first one will be the price changes with relation to economics indicators.

In [2]:
train_set = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set.parquet'))
economics_hist = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'economics-history.parquet'))
prices_stats = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'prices-statistics.parquet'))

In [3]:
aux_df = prices_stats.copy()
aux_df['date_block_num'] = aux_df['date_block_num'] + 1
prices_stats_gains = prices_stats.merge(aux_df, on=['item_id', 'shop_id', 'date_block_num'], how='left', sort=False, suffixes=['_now', '_prev'])
prices_stats_gains.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_shop_price_median_now,cat_price_median_now,cat_shop_price_median_now,item_price_median_now,item_shop_price_median_prev,cat_price_median_prev,cat_shop_price_median_prev,item_price_median_prev
0,5037,5,10,1699.0,1699.0,1999.0,1699.0,,,,
1,5037,4,10,1699.0,1699.0,1649.0,1699.0,,,,
2,5037,6,10,1699.0,1699.0,1999.0,1699.0,,,,
3,5037,3,10,1699.0,1699.0,1499.0,1699.0,,,,
4,5037,2,10,1699.0,1699.0,1499.0,1699.0,,,,


In [5]:
prices_cols = ['item_shop_price_median', 'cat_price_median', 'cat_shop_price_median', 'item_price_median']
for col in prices_cols:
    prices_stats_gains['%s_gain' % col] = ((prices_stats_gains['%s_now' % col] - prices_stats_gains['%s_prev' % col]) / prices_stats_gains['%s_now' % col]).fillna(1.0)

In [6]:
prices_stats_gains.describe()

Unnamed: 0,item_id,shop_id,date_block_num,item_shop_price_median_now,cat_price_median_now,cat_shop_price_median_now,item_price_median_now,item_shop_price_median_prev,cat_price_median_prev,cat_shop_price_median_prev,item_price_median_prev,item_shop_price_median_gain,cat_price_median_gain,cat_shop_price_median_gain,item_price_median_gain
count,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,4926600.0,4926600.0,4926600.0,4926600.0,5140800.0,5140800.0,5140800.0,5140800.0
mean,11019.4,31.64286,21.5,717.1264,703.3555,711.2942,785.0688,709.7926,696.4006,705.0839,776.4684,-0.1020204,0.04440177,0.001239136,-0.2565242
std,6252.631,17.56189,6.922187,1516.97,1497.071,1506.284,1560.655,1482.71,1463.426,1475.41,1529.924,19.89264,0.2291482,1.439572,29.51799
min,30.0,2.0,10.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-5197.0,-0.7109737,-99.8,-5197.0
25%,5381.5,16.0,15.75,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,0.0,0.0,-0.07296137,0.0
50%,11203.0,34.5,21.5,399.0,359.0,399.0,399.0,359.0,359.0,399.0,399.0,0.0,0.0,0.0,0.0
75%,16071.5,47.0,27.25,699.0,659.0,699.0,799.0,699.0,659.0,699.0,749.0,0.05042017,0.03861004,0.1253133,0.0
max,22167.0,59.0,33.0,42990.0,28392.0,42990.0,41990.0,33490.0,25990.0,32990.0,32990.0,6.453057,6.453057,6.453057,6.453057


In [7]:
economics_hist_gains = economics_hist.copy()

econ_cols = ['CNYRUB', 'EURRUB', 'USDRUB', 'MOEX']
for col in econ_cols:
    economics_hist_gains['%s_gain' % col] =  ((economics_hist_gains['%s_close' % col] - economics_hist_gains['%s_open' % col]) / economics_hist_gains['%s_close' % col]).fillna(1)
economics_hist_gains.describe()

Unnamed: 0,date_block_num,CNYRUB_open,CNYRUB_close,EURRUB_open,EURRUB_close,USDRUB_open,USDRUB_close,MOEX_open,MOEX_high,MOEX_low,MOEX_close,CNYRUB_gain,EURRUB_gain,USDRUB_gain,MOEX_gain
count,35.0,35.0,35.0,35.0,35.0,35.0,35.0,33.0,33.0,33.0,33.0,35.0,35.0,35.0,35.0
mean,17.0,6.912036,7.051546,52.920974,53.6908,42.858722,43.782481,1502.956667,1572.121515,1444.673333,1510.738788,0.017022,0.012547,0.017666,0.060733
std,10.246951,2.055718,2.108321,11.777598,11.940725,13.119323,13.555482,127.848997,142.924084,132.22998,136.920868,0.059836,0.057412,0.061565,0.238257
min,0.0,4.814853,4.838964,39.7617,39.7617,29.983436,30.093358,1304.93,1345.48,1182.89,1306.01,-0.152468,-0.154986,-0.148345,-0.08183
25%,8.5,5.386638,5.403534,43.6845,43.9145,32.946348,32.914444,1401.34,1469.4,1343.99,1400.71,-0.008252,-0.006852,-0.012579,-0.025385
50%,17.0,5.79467,5.828424,48.442,48.78,35.714368,35.714368,1473.54,1518.12,1429.28,1476.38,0.010889,0.013194,0.017914,0.016192
75%,25.5,8.764054,9.174595,61.9769,62.3975,54.326402,56.881987,1625.21,1706.29,1570.46,1642.97,0.041868,0.039173,0.04014,0.037459
max,34.0,11.314571,11.314571,79.925,79.925,70.698806,70.698806,1759.25,1873.53,1701.25,1771.05,0.151615,0.122056,0.157259,1.0


In [8]:
combo_gains = prices_stats_gains.merge(economics_hist_gains, on='date_block_num', how='left', sort=False).fillna(1)

In [9]:
combo_gains[[col for col in combo_gains.columns if col.endswith('_gain')]].describe()

Unnamed: 0,item_shop_price_median_gain,cat_price_median_gain,cat_shop_price_median_gain,item_price_median_gain,CNYRUB_gain,EURRUB_gain,USDRUB_gain,MOEX_gain
count,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0
mean,-0.1020204,0.04440177,0.001239136,-0.2565242,0.02127412,0.01531425,0.02272461,0.003351139
std,19.89264,0.2291482,1.439572,29.51799,0.07000779,0.06740996,0.07181589,0.04518638
min,-5197.0,-0.7109737,-99.8,-5197.0,-0.1524676,-0.1549855,-0.1483448,-0.08182981
25%,0.0,0.0,-0.07296137,0.0,-0.01684404,-0.02102955,-0.02221238,-0.04080483
50%,0.0,0.0,0.0,0.0,0.02358764,0.01583342,0.02487894,0.01241884
75%,0.05042017,0.03861004,0.1253133,0.0,0.06308251,0.06757043,0.07199375,0.03742547
max,6.453057,6.453057,6.453057,6.453057,0.1516149,0.1220556,0.1572593,0.08875512


Now let's combine them.

In [10]:
eps=1e-5
for col1 in prices_cols:
    for col2 in econ_cols:
        combo_gains['%s_%s_ratio' % (col1, col2)] = combo_gains["%s_gain" % col2] / (eps + combo_gains["%s_gain" % col1])

In [11]:
combo_gains[[col for col in combo_gains.columns if col.endswith('_ratio')]].describe()

Unnamed: 0,item_shop_price_median_CNYRUB_ratio,item_shop_price_median_EURRUB_ratio,item_shop_price_median_USDRUB_ratio,item_shop_price_median_MOEX_ratio,cat_price_median_CNYRUB_ratio,cat_price_median_EURRUB_ratio,cat_price_median_USDRUB_ratio,cat_price_median_MOEX_ratio,cat_shop_price_median_CNYRUB_ratio,cat_shop_price_median_EURRUB_ratio,cat_shop_price_median_USDRUB_ratio,cat_shop_price_median_MOEX_ratio,item_price_median_CNYRUB_ratio,item_price_median_EURRUB_ratio,item_price_median_USDRUB_ratio,item_price_median_MOEX_ratio
count,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0
mean,1209.329,760.3219,1290.475,104.2615,1282.672,799.629,1369.355,95.68334,782.93,543.3781,835.8754,142.0852,1382.058,922.654,1472.624,239.3324
std,5338.264,5175.086,5508.567,3351.85,5476.001,5331.673,5654.321,3448.416,4407.906,4165.654,4527.423,2705.737,5746.051,5505.024,5921.249,3559.434
min,-28390.41,-28697.79,-29373.56,-8816.421,-15246.76,-15498.55,-14834.48,-8182.981,-15246.76,-15498.55,-14834.48,-8182.981,-15246.76,-15498.55,-14834.48,-20923.86
25%,-0.3662868,-0.3321772,-0.3919688,-0.6384027,-0.5075127,-0.4349926,-0.5163403,-1.48738,-0.3055762,-0.3094522,-0.3463319,-0.2664626,-0.4262676,-0.4349926,-0.4267191,-0.6531246
50%,0.1720154,0.1368554,0.1962821,0.03668978,0.2489838,0.2723728,0.3031264,0.07039656,0.03286104,0.03439673,0.04706361,0.03282013,0.2395958,0.2368885,0.246419,0.09201127
75%,1951.789,1574.993,2293.977,1619.179,3679.364,1591.692,3112.447,1619.179,1.794396,1.8585,1.831858,0.9543563,3949.074,2066.941,3547.056,2688.344
max,25903.57,16517.54,25726.54,8875.512,15161.49,12205.56,15725.93,8875.512,15161.49,12205.56,15725.93,8875.512,50449.37,51282.53,49085.19,8875.512


Let's build a train set using the new features as lagged features and check how it goes with the baseline model.

First let's check how the gains go.

In [18]:
train_set_combo_gains = add_lagged_features(train_set, combo_gains, [c for c in combo_gains.columns if c.endswith('_gain')], max_lag=6, fill_value=1)
train_set_combo_gains.describe()

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__item_shop_price_median_gain_1,f__cat_price_median_gain_1,f__cat_shop_price_median_gain_1,f__item_price_median_gain_1,f__CNYRUB_gain_1,f__EURRUB_gain_1,...,f__USDRUB_gain_2,f__MOEX_gain_2,f__item_shop_price_median_gain_3,f__cat_price_median_gain_3,f__cat_shop_price_median_gain_3,f__item_price_median_gain_3,f__CNYRUB_gain_3,f__EURRUB_gain_3,f__USDRUB_gain_3,f__MOEX_gain_3
count,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,...,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0,5140800.0
mean,11019.4,31.64286,21.5,0.2199702,-0.06232891,0.08168011,0.04191892,-0.2123943,0.0636911,0.05846026,...,0.1067443,0.08664679,0.025893,0.1644863,0.1256323,-0.1196137,0.1441486,0.1371966,0.1448004,0.1267499
std,6252.631,17.56189,6.922187,1.113889,19.88943,0.2973046,1.452155,29.51826,0.2072445,0.2073046,...,0.2785488,0.2788439,19.86649,0.3881752,1.472308,29.5188,0.3307605,0.3323056,0.330712,0.3328724
min,30.0,2.0,10.0,0.0,-5197.0,-0.693787,-99.8,-5197.0,-0.1524676,-0.1549855,...,-0.1483448,-0.08182981,-5197.0,-0.693787,-99.8,-5197.0,-0.1524676,-0.1549855,-0.1483448,-0.08182981
25%,5381.5,16.0,15.75,0.0,0.0,0.0,-0.06396588,0.0,-0.006126432,-0.007407411,...,-0.0125726,-0.03004562,0.0,0.0,-0.01957447,0.0,-0.006126432,-0.007407411,-0.0125726,-0.03004562
50%,11203.0,34.5,21.5,0.0,0.0,0.0,0.0,0.0,0.03222552,0.01829316,...,0.03329752,0.01801681,0.0,0.0,0.0,0.0,0.03814219,0.02753324,0.03329752,0.01801681
75%,16071.5,47.0,27.25,0.0,0.06671114,0.05722461,0.1432665,0.03539823,0.07217984,0.07480448,...,0.08772002,0.0409629,0.1519199,0.122807,0.2367942,0.1432665,0.09577566,0.08365995,0.09853186,0.05157805
max,22167.0,59.0,33.0,20.0,6.453057,6.453057,6.453057,6.453057,1.0,1.0,...,1.0,1.0,6.453057,6.453057,6.453057,6.453057,1.0,1.0,1.0,1.0


In [19]:
X_train, y_train = df_to_X_y(train_set_combo_gains)

scores = cross_validate(baseline_reg, X_train, y=y_train,
                        cv=tscv.split(train_set_combo_gains['date_block_num'], window=16),
                        verbose=2, scoring='neg_root_mean_squared_error')
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  10.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.6s remaining:    0.0s


[CV] ................................................. , total=  10.7s
[CV]  ................................................................
[CV] ................................................. , total=  10.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   31.7s finished


{'fit_time': array([8.78158283, 8.96164536, 8.83346486]),
 'score_time': array([1.81970167, 1.72876048, 1.55873203]),
 'test_score': array([-1.04456123, -1.20960406, -1.10802251])}

In [20]:
np.mean(scores['test_score']), np.std(scores['test_score'])

(-1.1207292669442335, 0.06797490087464367)

Well, that's worse than using either economics indicators or price medians directly. Because of this I'll stop this experiment and move on. I'll probably get back to it eventually because this is something I strongly believe should add information to our models.