In [1]:
# %pip install 'etna[all]'
# %pip install lightgbm xgboost
# %pip install pandas numpy matplotlib 

В домашнем задании вы будете работать с датасетом по продажам товаров. 

https://www.kaggle.com/competitions/demand-forecasting-kernels-only/overview

1. Постройте графики рядов, постройте корреляционную матрицу между некоторыми рядами. Выведите коррелограммы некоторых рядов. 

2. Постройте на данных модели Prophet и AutoARIMA, выведите метрики качества.

3. Найдите, с каким сегментом ваша модель справилась хуже всего.


Опционально: 

4. Постройте на данных модели градиентного бустинга CatBoost и LightGBM.

5. Добейтесь качества по SMAPE не более 13%.


In [2]:
import pandas as pd

from etna.datasets.tsdataset import TSDataset
from etna.transforms import (
    StandardScalerTransform,
    MeanTransform, 
    LagTransform,
    DateFlagsTransform)

from etna.pipeline import Pipeline
from etna.metrics import SMAPE

import warnings
warnings.filterwarnings('ignore')

from etna.analysis import plot_forecast

In [3]:
# Forecast horizon in time steps (the dataset built in this notebook is daily).
# Presumably passed as `horizon` to train_and_evaluate_model — confirm at call sites.
HORIZON = 90

# Shared SMAPE metric instance, created with the library's default settings.
smape = SMAPE()

In [4]:
def train_and_evaluate_model(ts,
                             model,
                             transforms,
                             horizon: int,
                             metrics,
                             print_metrics: bool = True,
                             return_forecast: bool = False,
                             n_train_samples: int = 0):
    """
    Fit a pipeline on the training part of ``ts`` and score it on the hold-out.

    The last ``horizon`` timestamps of ``ts`` are split off as the test set,
    a ``Pipeline(model, transforms, horizon)`` is fitted on the remainder,
    a forecast is produced and ``metrics`` is evaluated against the hold-out.

    Parameters
    ----------
    ts
        Dataset exposing ``train_test_split(test_size=...)``.
    model
        Model forwarded unchanged to ``Pipeline``.
    transforms
        Transforms forwarded unchanged to ``Pipeline``.
    horizon
        Number of steps to forecast; also the size of the hold-out set.
    metrics
        Callable invoked as ``metrics(test_ts, forecast_ts)``; its result is
        wrapped in a ``pandas.Series`` (one value per segment).
    print_metrics
        If true, print the average of the per-segment metric values.
    return_forecast
        If true, also return the forecast dataset as a third element.
    n_train_samples
        When greater than zero, plot the forecast against the test data and
        the last ``n_train_samples`` training points; no plot otherwise.

    Returns
    -------
    tuple
        ``(segment_metrics, avg_metric)`` or, when ``return_forecast`` is
        true, ``(segment_metrics, avg_metric, forecast_ts)``.
    """
    train_ts, test_ts = ts.train_test_split(test_size=horizon)

    pipe = Pipeline(model=model,
                    transforms=transforms,
                    horizon=horizon)
    pipe.fit(train_ts)
    forecast_ts = pipe.forecast()

    # Per-segment quality: the metric is called with (actual, predicted).
    segment_metrics = pd.Series(metrics(test_ts, forecast_ts))

    # skipna=False keeps a NaN segment score visible in the average instead of
    # silently dropping it; an empty result yields NaN rather than raising
    # ZeroDivisionError.
    avg_metric = segment_metrics.mean(skipna=False)

    if print_metrics:
        print(f"Avg (by segments) metric {metrics} is: {avg_metric}")

    if n_train_samples > 0:
        plot_forecast(forecast_ts, test_ts,
                      train_ts, n_train_samples=n_train_samples)

    if return_forecast:
        return segment_metrics, avg_metric, forecast_ts

    return segment_metrics, avg_metric

In [22]:
# Load the training history and rename the date/sales columns to the
# timestamp/target names used by the rest of the notebook.
df = (
    pd.read_csv('demand-forecasting-kernels-only/train.csv', parse_dates=['date'])
    .rename(columns={'date': 'timestamp', 'sales': 'target'})
)
df

Unnamed: 0,timestamp,store,item,target
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [23]:
# Distinct store and item identifiers present in the training data.
df['store'].unique(), df['item'].unique()

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]))

In [6]:
# Load the future (test) period, apply the same column renaming as for the
# training frame, and discard the submission `id` column.
df_new = (
    pd.read_csv('demand-forecasting-kernels-only/test.csv', parse_dates=['date'])
    .rename(columns={'date': 'timestamp', 'sales': 'target'})
    .drop(columns='id')
)
df_new

Unnamed: 0,timestamp,store,item
0,2018-01-01,1,1
1,2018-01-02,1,1
2,2018-01-03,1,1
3,2018-01-04,1,1
4,2018-01-05,1,1
...,...,...,...
44995,2018-03-27,10,50
44996,2018-03-28,10,50
44997,2018-03-29,10,50
44998,2018-03-30,10,50


In [7]:
# Build the per-series key "<store> + <item>" in both frames.
df['segment'] = df['store'].astype(str).str.cat(df['item'].astype(str), sep=' + ')
df_new['segment'] = df_new['store'].astype(str).str.cat(df_new['item'].astype(str), sep=' + ')

df.head()

Unnamed: 0,timestamp,store,item,target,segment
0,2013-01-01,1,1,13,1 + 1
1,2013-01-02,1,1,11,1 + 1
2,2013-01-03,1,1,14,1 + 1
3,2013-01-04,1,1,13,1 + 1
4,2013-01-05,1,1,10,1 + 1


In [8]:
# Cast the identifier columns of both frames to the categorical dtype.
for col in ('store', 'item'):
    for frame in (df, df_new):
        frame[col] = frame[col].astype('category')

In [9]:
# Independent copy of the key columns plus the store/item regressors.
regressor_df = df.loc[:, ['timestamp', 'segment', 'store', 'item']].copy()
regressor_df

Unnamed: 0,timestamp,segment,store,item
0,2013-01-01,1 + 1,1,1
1,2013-01-02,1 + 1,1,1
2,2013-01-03,1 + 1,1,1
3,2013-01-04,1 + 1,1,1
4,2013-01-05,1 + 1,1,1
...,...,...,...,...
912995,2017-12-27,10 + 50,10,50
912996,2017-12-28,10 + 50,10,50
912997,2017-12-29,10 + 50,10,50
912998,2017-12-30,10 + 50,10,50


In [10]:
# Exogenous variables (store and item) for the future period.
regressor_df_new = df_new.copy(deep=True)
regressor_df_new

Unnamed: 0,timestamp,store,item,segment
0,2018-01-01,1,1,1 + 1
1,2018-01-02,1,1,1 + 1
2,2018-01-03,1,1,1 + 1
3,2018-01-04,1,1,1 + 1
4,2018-01-05,1,1,1 + 1
...,...,...,...,...
44995,2018-03-27,10,50,10 + 50
44996,2018-03-28,10,50,10 + 50
44997,2018-03-29,10,50,10 + 50
44998,2018-03-30,10,50,10 + 50


In [11]:
# Put the columns of both regressor frames into the same (descending) order.
regressor_df, regressor_df_new = (
    frame.sort_index(axis='columns', ascending=False)
    for frame in (regressor_df, regressor_df_new)
)

In [14]:
# Stack historical and future regressors, then derive calendar-quarter features
# from the timestamp column.
regressor_df = (
    pd.concat([regressor_df, regressor_df_new], axis='index')
    .assign(
        quarter=lambda frame: frame['timestamp'].dt.quarter,
        quarter_start=lambda frame: frame['timestamp'].dt.is_quarter_start,
        quarter_end=lambda frame: frame['timestamp'].dt.is_quarter_end,
    )
)
regressor_df

Unnamed: 0,timestamp,store,segment,item,quarter,quarter_start,quarter_end
0,2013-01-01,1,1 + 1,1,1,True,False
1,2013-01-02,1,1 + 1,1,1,False,False
2,2013-01-03,1,1 + 1,1,1,False,False
3,2013-01-04,1,1 + 1,1,1,False,False
4,2013-01-05,1,1 + 1,1,1,False,False
...,...,...,...,...,...,...,...
44995,2018-03-27,10,10 + 50,50,1,False,False
44996,2018-03-28,10,10 + 50,50,1,False,False
44997,2018-03-29,10,10 + 50,50,1,False,False
44998,2018-03-30,10,10 + 50,50,1,False,False


In [15]:
# Prepare the historical endogenous data: only timestamp, target and segment
# remain once the store/item columns are removed.
df.drop(columns=['store', 'item'], inplace=True)
df

Unnamed: 0,timestamp,target,segment
0,2013-01-01,13,1 + 1
1,2013-01-02,11,1 + 1
2,2013-01-03,14,1 + 1
3,2013-01-04,13,1 + 1
4,2013-01-05,10,1 + 1
...,...,...,...
912995,2017-12-27,63,10 + 50
912996,2017-12-28,59,10 + 50
912997,2017-12-29,74,10 + 50
912998,2017-12-30,62,10 + 50


In [16]:
# Convert the long frame (timestamp, target, segment) into the wide layout:
# one row per timestamp, columns indexed by (segment, feature).
# NOTE(review): `df` is rebound to the converted frame, so this cell is presumably
# not safe to re-run without first re-running the cells that build the long `df`.
df = TSDataset.to_dataset(df)
df

segment,1 + 1,1 + 10,1 + 11,1 + 12,1 + 13,1 + 14,1 + 15,1 + 16,1 + 17,1 + 18,...,9 + 46,9 + 47,9 + 48,9 + 49,9 + 5,9 + 50,9 + 6,9 + 7,9 + 8,9 + 9
feature,target,target,target,target,target,target,target,target,target,target,...,target,target,target,target,target,target,target,target,target,target
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2013-01-01,13,37,37,33,37,22,42,14,13,38,...,34,6,28,11,9,36,29,30,45,27
2013-01-02,11,34,43,35,31,35,33,11,18,51,...,28,14,38,16,11,44,33,24,43,36
2013-01-03,14,32,34,41,50,26,45,12,15,42,...,41,18,24,20,8,29,19,35,34,25
2013-01-04,13,45,52,45,45,32,39,15,19,50,...,41,15,30,19,15,43,33,35,41,31
2013-01-05,10,35,45,46,49,31,47,22,16,56,...,42,13,33,16,13,53,36,28,49,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-27,14,55,43,47,49,42,62,16,29,54,...,49,19,41,26,18,52,39,44,52,44
2017-12-28,19,63,64,49,68,51,82,24,13,69,...,42,23,36,37,18,73,56,54,76,48
2017-12-29,15,56,60,58,73,42,65,11,27,66,...,58,17,48,15,20,68,56,59,73,54
2017-12-30,27,78,66,52,70,57,77,28,32,67,...,49,24,55,31,21,62,54,67,74,59


In [17]:
# Same wide conversion for the regressors; the rows now extend past the end of
# the target history because the future period was concatenated in beforehand.
# NOTE(review): `regressor_df` is rebound here, so re-running this cell alone
# would presumably pass the already-converted frame — confirm before re-executing.
regressor_df = TSDataset.to_dataset(regressor_df)
regressor_df

segment,1 + 1,1 + 1,1 + 1,1 + 1,1 + 1,1 + 10,1 + 10,1 + 10,1 + 10,1 + 10,...,9 + 8,9 + 8,9 + 8,9 + 8,9 + 8,9 + 9,9 + 9,9 + 9,9 + 9,9 + 9
feature,item,quarter,quarter_end,quarter_start,store,item,quarter,quarter_end,quarter_start,store,...,item,quarter,quarter_end,quarter_start,store,item,quarter,quarter_end,quarter_start,store
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2013-01-01,1,1,False,True,1,10,1,False,True,1,...,8,1,False,True,9,9,1,False,True,9
2013-01-02,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9
2013-01-03,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9
2013-01-04,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9
2013-01-05,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-03-27,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9
2018-03-28,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9
2018-03-29,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9
2018-03-30,1,1,False,False,1,10,1,False,False,1,...,8,1,False,False,9,9,1,False,False,9


In [18]:
# Build the combined dataset: daily target series plus the exogenous features,
# all of which are declared as known in the future.
ts = TSDataset(
    df=df,
    df_exog=regressor_df,
    freq='D',
    known_future='all',
)
ts

segment,1 + 1,1 + 1,1 + 1,1 + 1,1 + 1,1 + 1,1 + 10,1 + 10,1 + 10,1 + 10,...,9 + 8,9 + 8,9 + 8,9 + 8,9 + 9,9 + 9,9 + 9,9 + 9,9 + 9,9 + 9
feature,item,quarter,quarter_end,quarter_start,store,target,item,quarter,quarter_end,quarter_start,...,quarter_end,quarter_start,store,target,item,quarter,quarter_end,quarter_start,store,target
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2013-01-01,1,1,False,True,1,13.0,10,1,False,True,...,False,True,9,45.0,9,1,False,True,9,27.0
2013-01-02,1,1,False,False,1,11.0,10,1,False,False,...,False,False,9,43.0,9,1,False,False,9,36.0
2013-01-03,1,1,False,False,1,14.0,10,1,False,False,...,False,False,9,34.0,9,1,False,False,9,25.0
2013-01-04,1,1,False,False,1,13.0,10,1,False,False,...,False,False,9,41.0,9,1,False,False,9,31.0
2013-01-05,1,1,False,False,1,10.0,10,1,False,False,...,False,False,9,49.0,9,1,False,False,9,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-27,1,4,False,False,1,14.0,10,4,False,False,...,False,False,9,52.0,9,4,False,False,9,44.0
2017-12-28,1,4,False,False,1,19.0,10,4,False,False,...,False,False,9,76.0,9,4,False,False,9,48.0
2017-12-29,1,4,False,False,1,15.0,10,4,False,False,...,False,False,9,73.0,9,4,False,False,9,54.0
2017-12-30,1,4,False,False,1,27.0,10,4,False,False,...,False,False,9,74.0,9,4,False,False,9,59.0
