In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
from tqdm.auto import tqdm, trange

from src.model import tscv
from src.data import add_lagged_features

%run constants.py

%matplotlib inline
# Record the interpreter and core library versions next to the results below.
print("Versions:")
print("  Python: %s" % sys.version)
# Each of these modules exposes __name__ and __version__.
for module in [pd, np, sns, sklearn]:
    print("  %s: %s" %(module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.1
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2


# Economics indicators

This is another leak we can exploit. Since we know which month the test set is in we can just use external data for that month. I'll use the following indicators:

- Exchange rates of the Russian ruble against the currencies of Russia's major trading partners (euro, Chinese yuan, and US dollar)
- MOEX index (Moscow Exchange)
- Russian inflation rate

All of them should affect sales, directly or indirectly.

First let's define the months:

In [8]:
import datetime
import calendar
def next_month(m):
    """Return the calendar month following ``m``.

    ``m`` is a ``(first_day, last_day)`` pair of ``datetime.date`` objects;
    only ``last_day`` is used. The result is a pair of the same shape
    describing the month that starts the day after ``last_day``.
    """
    _unused_start, last_day = m
    # The day after a month's last day is the first day of the next month.
    start = last_day + datetime.timedelta(days=1)
    # monthrange returns (weekday of the 1st, number of days in the month).
    _weekday, days_in_month = calendar.monthrange(start.year, start.month)
    end = start.replace(day=days_in_month)
    return (start, end)

# Month 0 of the series: January 2013 as a (first_day, last_day) pair.
first_month = (datetime.date(2013, 1, 1), datetime.date(2013, 1, 31))

# Grow the list one month at a time from its tail until it holds 35 months.
months = [first_month]
while len(months) < 35:
    months.append(next_month(months[-1]))

# Later cells read the final month through this name.
last_month = months[-1]
months

[(datetime.date(2013, 1, 1), datetime.date(2013, 1, 31)),
 (datetime.date(2013, 2, 1), datetime.date(2013, 2, 28)),
 (datetime.date(2013, 3, 1), datetime.date(2013, 3, 31)),
 (datetime.date(2013, 4, 1), datetime.date(2013, 4, 30)),
 (datetime.date(2013, 5, 1), datetime.date(2013, 5, 31)),
 (datetime.date(2013, 6, 1), datetime.date(2013, 6, 30)),
 (datetime.date(2013, 7, 1), datetime.date(2013, 7, 31)),
 (datetime.date(2013, 8, 1), datetime.date(2013, 8, 31)),
 (datetime.date(2013, 9, 1), datetime.date(2013, 9, 30)),
 (datetime.date(2013, 10, 1), datetime.date(2013, 10, 31)),
 (datetime.date(2013, 11, 1), datetime.date(2013, 11, 30)),
 (datetime.date(2013, 12, 1), datetime.date(2013, 12, 31)),
 (datetime.date(2014, 1, 1), datetime.date(2014, 1, 31)),
 (datetime.date(2014, 2, 1), datetime.date(2014, 2, 28)),
 (datetime.date(2014, 3, 1), datetime.date(2014, 3, 31)),
 (datetime.date(2014, 4, 1), datetime.date(2014, 4, 30)),
 (datetime.date(2014, 5, 1), datetime.date(2014, 5, 31)),
 (dateti

Now to fetch market indexes I'll use [yfinance library](https://pypi.org/project/yfinance/)

In [9]:
from yfinance import Ticker

# Monthly price history for the IMOEX.ME ticker, bounded by the first day of
# the first month and the last day of the last month defined above.
moex = Ticker('IMOEX.ME').history(
    interval='1mo',
    start=first_month[0],
    end=last_month[1],
)
moex.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,1502.956667,1572.121515,1444.673333,1510.738788,0.0,0.0,0.0
std,127.848997,142.924084,132.22998,136.920868,0.0,0.0,0.0
min,1304.93,1345.48,1182.89,1306.01,0.0,0.0,0.0
25%,1401.34,1469.4,1343.99,1400.71,0.0,0.0,0.0
50%,1473.54,1518.12,1429.28,1476.38,0.0,0.0,0.0
75%,1625.21,1706.29,1570.46,1642.97,0.0,0.0,0.0
max,1759.25,1873.53,1701.25,1771.05,0.0,0.0,0.0


In [53]:
# Look at the first rows to see which date the downloaded history starts on.
moex.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-03-01,1473.54,1506.42,1416.78,1436.62,0,0,0
2013-04-01,1428.29,1432.37,1334.52,1386.69,0,0,0
2013-05-01,1377.45,1449.62,1343.99,1343.99,0,0,0
2013-06-01,1337.52,1345.48,1279.55,1331.24,0,0,0
2013-07-01,1334.44,1432.81,1332.26,1377.6,0,0,0


Unfortunately, the data only starts on 2013-03-01, so the first two months of our period have no MOEX values. Anyway, let's reformat this dataset.

In [10]:
# Derive the month index (0 = first_month, i.e. January 2013) from each row's
# own date instead of assuming the download is a gap-free run of months 2..34.
# A shorter, longer or gappy download then still lines up with the other
# monthly tables when they are merged on date_block_num.
moex_dates = pd.DatetimeIndex(moex.index)
moex_block_num = ((moex_dates.year - first_month[0].year) * 12
                  + (moex_dates.month - first_month[0].month))

# Keep only the price columns and prefix them so they stay identifiable after
# merging with the currency tables.
moex = moex.reset_index().drop(columns=['Date', 'Volume', 'Dividends', 'Stock Splits'])
moex = moex.rename(columns=lambda col: 'MOEX_%s' % col.lower())
# Assign as a plain int64 array so the values are set positionally.
moex['date_block_num'] = np.asarray(moex_block_num, dtype='int64')
moex.head()

Unnamed: 0,MOEX_open,MOEX_high,MOEX_low,MOEX_close,date_block_num
0,1473.54,1506.42,1416.78,1436.62,2
1,1428.29,1432.37,1334.52,1386.69,3
2,1377.45,1449.62,1343.99,1343.99,4
3,1337.52,1345.48,1279.55,1331.24,5
4,1334.44,1432.81,1332.26,1377.6,6


And for currency quotes I'll use [forex-python](https://forex-python.readthedocs.io/en/latest/usage.html)

In [11]:
from forex_python.converter import CurrencyRates
# Client object used for every currency rate lookup below.
forex_src = CurrencyRates()
# Quick probe: a single CNY/RUB lookup for the first day of the first month.
forex_src.get_rate('CNY', 'RUB', date_obj=first_month[0])

4.9058474339

In [12]:
import calendar

def build_forex_df(quote, base):
    """Build a per-month table of opening and closing exchange rates.

    For every ``(first_day, last_day)`` pair in the module-level ``months``
    list, ``forex_src.get_rate(quote, base, date_obj=...)`` is queried once
    for each of the two dates.

    Parameters
    ----------
    quote, base : str
        Currency codes. They are passed to ``get_rate`` in this order and
        concatenated to form the column prefix, e.g. ``'CNY', 'RUB'`` gives
        ``CNYRUB_open`` and ``CNYRUB_close``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``months`` with the columns ``date_block_num``
        (position of the month in ``months``, starting at 0),
        ``<quote><base>_open`` and ``<quote><base>_close``.
    """
    pair = '%s%s' % (quote, base)
    open_prices = []
    close_prices = []
    for beg_month, end_month in tqdm(months):
        open_prices.append(forex_src.get_rate(quote, base, date_obj=beg_month))
        close_prices.append(forex_src.get_rate(quote, base, date_obj=end_month))
    # Size the month index from the collected data rather than a hard-coded 35,
    # so the function keeps working if the list of months changes.
    return pd.DataFrame({'date_block_num': range(len(open_prices)),
                         '%s_open' % pair: open_prices,
                         '%s_close' % pair: close_prices})

# Monthly open/close table for the CNY/RUB pair; describe() gives a quick
# check of the value ranges.
cny_rub = build_forex_df('CNY', 'RUB')
cny_rub.describe()

HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




Unnamed: 0,date_block_num,CNYRUB_open,CNYRUB_close
count,35.0,35.0,35.0
mean,17.0,6.912036,7.051546
std,10.246951,2.055718,2.108321
min,0.0,4.814853,4.838964
25%,8.5,5.386638,5.403534
50%,17.0,5.79467,5.828424
75%,25.5,8.764054,9.174595
max,34.0,11.314571,11.314571


In [13]:
# Same monthly open/close tables for the USD/RUB and EUR/RUB pairs.
usd_rub = build_forex_df('USD', 'RUB')
eur_rub = build_forex_df('EUR', 'RUB')

HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




Great. Now let's merge everything into one df and evaluate them.

In [14]:
# Join all monthly indicator tables on the month index. The first join is a
# right join so that every month of the CNY/RUB table is kept even where the
# MOEX table has no row; the remaining tables are attached with left joins.
macros_history = (
    moex
    .merge(cny_rub, how='right', on='date_block_num', sort=False)
    .merge(usd_rub, how='left', on='date_block_num', sort=False)
    .merge(eur_rub, how='left', on='date_block_num', sort=False)
)
macros_history.head()

Unnamed: 0,MOEX_open,MOEX_high,MOEX_low,MOEX_close,date_block_num,CNYRUB_open,CNYRUB_close,USDRUB_open,USDRUB_close,EURRUB_open,EURRUB_close
0,,,,,0,4.905847,4.838964,30.566545,30.093358,40.3295,40.7765
1,,,,,1,4.814853,4.904956,29.983436,30.530353,40.9094,40.0833
2,1473.54,1506.42,1416.78,1436.62,2,4.940797,4.995188,30.75,31.051699,39.975,39.7617
3,1428.29,1432.37,1334.52,1386.69,3,4.995188,5.048749,31.051699,31.128672,39.7617,40.6914
4,1377.45,1449.62,1343.99,1343.99,4,5.048749,5.193698,31.128672,31.862217,40.6914,41.44


In [2]:
from src.data import add_lagged_features

# Load the previously built training feature set.
# NOTE(review): PROCESSED_DATA_DIR is not defined in this notebook; presumably
# it comes from the `%run constants.py` line in the first cell — confirm.
train_set = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set-features-001.parquet'))

In [70]:
# Add lagged copies of every raw macro indicator (prices and exchange rates)
# to the training set, with max_lag=5.
# NOTE(review): the meaning of index_cols=[] depends on add_lagged_features in
# src.data, which is not shown here — confirm.
train_set_macros = add_lagged_features(
    train_set,
    macros_history,
    [
        'MOEX_open', 'MOEX_high', 'MOEX_low', 'MOEX_close',
        'CNYRUB_open', 'CNYRUB_close',
        'USDRUB_open', 'USDRUB_close',
        'EURRUB_open', 'EURRUB_close',
    ],
    max_lag=5,
    index_cols=[],
)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [62]:
# Inspect the first rows to check that the lagged macro columns were added.
train_set_macros.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__MOEX_open_1,f__MOEX_high_1,f__MOEX_low_1,f__MOEX_close_1,f__CNYRUB_open_1,f__CNYRUB_close_1,...,f__MOEX_open_5,f__MOEX_high_5,f__MOEX_low_5,f__MOEX_close_5,f__CNYRUB_open_5,f__CNYRUB_close_5,f__USDRUB_open_5,f__USDRUB_close_5,f__EURRUB_open_5,f__EURRUB_close_5
0,5037,5,10,0.0,1472.45,1538.14,1452.26,1509.62,5.261293,5.267504,...,1337.52,1345.48,1279.55,1331.24,5.193698,5.336946,31.862217,32.756116,41.44,42.845
1,5320,5,10,0.0,1472.45,1538.14,1452.26,1509.62,5.261293,5.267504,...,1337.52,1345.48,1279.55,1331.24,5.193698,5.336946,31.862217,32.756116,41.44,42.845
2,5233,5,10,0.0,1472.45,1538.14,1452.26,1509.62,5.261293,5.267504,...,1337.52,1345.48,1279.55,1331.24,5.193698,5.336946,31.862217,32.756116,41.44,42.845
3,5232,5,10,0.0,1472.45,1538.14,1452.26,1509.62,5.261293,5.267504,...,1337.52,1345.48,1279.55,1331.24,5.193698,5.336946,31.862217,32.756116,41.44,42.845
4,5268,5,10,0.0,1472.45,1538.14,1452.26,1509.62,5.261293,5.267504,...,1337.52,1345.48,1279.55,1331.24,5.193698,5.336946,31.862217,32.756116,41.44,42.845


In [3]:
from src.data import df_to_X_y
from src.model import ClippedOutputRegressor, tscv
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor
# Features and target for the training set enriched with the macro indicators.
X_train, y_train = df_to_X_y(train_set_macros)

# The same folds are reused by every cross_validate call further down, so
# materialize them once. If tscv.split returns a one-shot generator, it would
# otherwise be exhausted after the first call and later calls would get no folds.
cv_split = list(tscv.split(train_set_macros['date_block_num']))

# GPU-backed XGBoost regressor wrapped in the project's ClippedOutputRegressor.
reg = ClippedOutputRegressor(XGBRegressor(tree_method='gpu_hist', gpu_id=0))

NameError: name 'train_set_macros' is not defined

In [75]:
# Cross-validate the model on the training set with the raw macro indicators.
# Scores are negative RMSE, so values closer to zero are better.
scores = cross_validate(
    reg,
    X=X_train,
    y=y_train,
    cv=cv_split,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=2,
)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  28.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.4s remaining:    0.0s


[CV] ................................................. , total=  34.0s
[CV]  ................................................................
[CV] ................................................. , total=  36.2s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.0min finished


{'fit_time': array([27.82393217, 33.51648545, 35.80896282]),
 'score_time': array([0.34668732, 0.45320368, 0.35907745]),
 'test_score': array([-0.78619799, -0.92847565, -0.89679642]),
 'train_score': array([-0.77031535, -0.77181526, -0.77619987])}

In [1]:
# Average test score over the CV folds (negative RMSE: closer to zero is better).
scores['test_score'].mean()

-0.8704900200000001

For comparison, here's the CV score for the model without the new variables:

In [5]:
# Baseline: the same estimator and the same folds, but on the original
# training set without any macro indicator columns.
X_train, y_train = df_to_X_y(train_set)
scores = cross_validate(
    reg,
    X=X_train,
    y=y_train,
    cv=cv_split,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=2,
)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  20.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.5s remaining:    0.0s


[CV] ................................................. , total=  36.1s
[CV]  ................................................................
[CV] ................................................. , total= 1.5min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.7min finished


{'fit_time': array([20.66307259, 35.83074737, 87.13734627]),
 'score_time': array([0.27966952, 0.27983689, 0.30369496]),
 'test_score': array([-0.78521879, -0.92859643, -0.90362674]),
 'train_score': array([-0.7754801 , -0.77474973, -0.78020316])}

In [6]:
# Average baseline test score over the CV folds (negative RMSE).
scores['test_score'].mean()

-0.8724806561687966

Which means we got an improvement. Not a big one, but it's a start.

Let's see if we can use fewer features for the same improvement. One idea is to combine the open and close prices of each indicator into a single change ratio (close / open), which I'll call the "gain".

In [15]:
# For each indicator, the monthly gain is its closing value divided by its
# opening value; a value above 1 means it rose during the month.
for prefix in ('MOEX', 'CNYRUB', 'USDRUB', 'EURRUB'):
    macros_history['%s_gain' % prefix] = (
        macros_history['%s_close' % prefix] / macros_history['%s_open' % prefix]
    )
macros_history.head()

Unnamed: 0,MOEX_open,MOEX_high,MOEX_low,MOEX_close,date_block_num,CNYRUB_open,CNYRUB_close,USDRUB_open,USDRUB_close,EURRUB_open,EURRUB_close,MOEX_gain,CNYRUB_gain,USDRUB_gain,EURRUB_gain
0,,,,,0,4.905847,4.838964,30.566545,30.093358,40.3295,40.7765,,0.986367,0.984519,1.011084
1,,,,,1,4.814853,4.904956,29.983436,30.530353,40.9094,40.0833,,1.018714,1.018241,0.979807
2,1473.54,1506.42,1416.78,1436.62,2,4.940797,4.995188,30.75,31.051699,39.975,39.7617,0.974945,1.011009,1.009811,0.994664
3,1428.29,1432.37,1334.52,1386.69,3,4.995188,5.048749,31.051699,31.128672,39.7617,40.6914,0.970874,1.010722,1.002479,1.023382
4,1377.45,1449.62,1343.99,1343.99,4,5.048749,5.193698,31.128672,31.862217,40.6914,41.44,0.975709,1.02871,1.023565,1.018397


In [16]:
# Same lagging step as for the raw indicators, but only for the four
# gain ratios, which gives far fewer added columns.
train_set_macros_gains = add_lagged_features(
    train_set,
    macros_history,
    ['MOEX_gain', 'CNYRUB_gain', 'USDRUB_gain', 'EURRUB_gain'],
    max_lag=5,
    index_cols=[],
)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [18]:
# Cross-validate with the lagged gain ratios in place of the raw indicators,
# using the same estimator and folds as the previous runs.
X_train, y_train = df_to_X_y(train_set_macros_gains)
scores = cross_validate(
    reg,
    X=X_train,
    y=y_train,
    cv=cv_split,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=2,
)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  16.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.2s remaining:    0.0s


[CV] ................................................. , total=  17.3s
[CV]  ................................................................
[CV] ................................................. , total=  18.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


{'fit_time': array([16.46649837, 16.89188051, 17.70034838]),
 'score_time': array([0.40208316, 0.42681313, 0.42794752]),
 'test_score': array([-0.77533171, -0.91754171, -0.88763681]),
 'train_score': array([-0.75570923, -0.75706349, -0.76062456])}

In [19]:
# Average test score with the gain features (negative RMSE), for comparison
# with the two means above.
scores['test_score'].mean()

-0.8601700781697751

Cool, the score actually improved.