# Imports

In [109]:
import json

import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

from pandas.plotting import register_matplotlib_converters
from IPython.display import display
from tqdm import tqdm
# from tsa_functions import *
from tsa_tools import *  # See last cell

register_matplotlib_converters()
sns.set_style('darkgrid')

# Show four decimal places in NumPy and pandas output.
np.set_printoptions(precision=4)
# Use the fully qualified option name: the bare 'precision' pattern can match
# more than one option in newer pandas releases and then raises OptionError.
pd.set_option('display.precision', 4)

import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.exponential_smoothing.ets import ETSModel

# Loading the Data

In [110]:
# Raw input tables, read from the ../data/m5 directory.
df_calendar = pd.read_csv('../data/m5/calendar.csv')
df_price = pd.read_csv('../data/m5/sell_prices.csv')
df_sales = pd.read_csv('../data/m5/sales_train_validation.csv')
df_sales1 = pd.read_csv('../data/m5/sales_train_evaluation.csv')
df_weights = pd.read_csv('../data/m5/weights_validation.csv')

# levels1 is looked up by level number as a string (e.g. levels1['9']) and the
# value is used as the grouping key when aggregating columns.
# A context manager guarantees the file handle is closed after parsing.
with open('levels1.json', 'r') as levels_file:
    levels1 = json.load(levels_file)

In [111]:
# Index the evaluation sales by the first six columns (taken in reverse
# order), transpose so every row is one day and every column one series,
# and label the rows with the first 1941 calendar dates.
id_columns = list(df_sales.columns[5::-1])
sales_dates = pd.DatetimeIndex(df_calendar.date)[:1941]
full_df = df_sales1.set_index(id_columns).T.set_index(sales_dates)

In [112]:
# Aggregate the series to hierarchy level 9 by summing all columns that share
# the column-index levels named in levels1['9'].
# DataFrame.sum(level=...) is deprecated (removed in pandas 2); grouping the
# transposed frame with sort=False keeps the groups in their original order.
lvl9 = full_df.T.groupby(level=levels1['9'], sort=False).sum().T
lvl9.head()

store_id,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_2,CA_2,CA_2,...,WI_2,WI_2,WI_2,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3
state_id,CA,CA,CA,CA,CA,CA,CA,CA,CA,CA,...,WI,WI,WI,WI,WI,WI,WI,WI,WI,WI
cat_id,HOBBIES,HOBBIES,HOUSEHOLD,HOUSEHOLD,FOODS,FOODS,FOODS,HOBBIES,HOBBIES,HOUSEHOLD,...,FOODS,FOODS,FOODS,HOBBIES,HOBBIES,HOUSEHOLD,HOUSEHOLD,FOODS,FOODS,FOODS
dept_id,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS_1,FOODS_2,FOODS_3,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,...,FOODS_1,FOODS_2,FOODS_3,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS_1,FOODS_2,FOODS_3
date,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
2011-01-29,528,28,361,181,297,674,2268,522,16,529,...,191,155,1269,256,22,584,148,152,583,2293
2011-01-30,489,9,350,170,284,655,2198,381,16,461,...,184,128,1121,342,14,541,195,138,585,2383
2011-01-31,409,6,279,114,214,396,1398,352,16,306,...,154,199,1233,228,20,420,106,127,575,1841
2011-02-01,383,9,278,123,175,476,1607,344,6,270,...,205,244,1564,183,11,327,94,98,533,1965
2011-02-02,263,5,195,135,182,354,1496,283,13,217,...,96,147,724,70,4,151,53,87,340,1427


# Pre-processing

## Pre-processing Full Dataset

## Pre-processing Lvl9

In [113]:
# Number of daily observations with fewer than 10 sales, over all series
(lvl9 < 10).sum(axis=0).sum()

2403

In [114]:
# Treat observations below 10 as missing, fill them by linear interpolation
# along the date axis, then back-fill whatever is still missing (leading
# gaps have no earlier value to interpolate from).
# .bfill() replaces the deprecated fillna(method='bfill').
df_lvl9 = (lvl9.apply(lambda series: np.where(series < 10, np.nan, series))
           .interpolate(method='linear', axis=0)
           .bfill())

In [115]:
# Sanity check: no value below 10 should remain after the imputation
(df_lvl9 < 10).sum(axis=0).sum()

0

# Q1. Baseline Methods

In [116]:
# The first 1913 days form the training window; all later days are the test window.
train, test = df_lvl9.iloc[:1913, :], df_lvl9.iloc[1913:, :]

In [117]:
# Forecast horizon and seasonal period passed to the baseline methods.
h = 28
m = 7

# One list of forecasts per method; entries follow the order of train.columns.
naive, snaive, ses, hl, ahl = [], [], [], [], []

for series_id in train.columns:
    history = train[series_id]

    naive.append(naivef(history, h))
    snaive.append(snaivef(history, h, m))

    # SES and Holt's linear are fitted on the raw values, while the additive
    # Holt-Winters model is fitted on the date-indexed Series.
    ses_fit = ETSModel(history.values).fit()  # SES
    holt_fit = ETSModel(history.values, trend="add", seasonal=None).fit()  # Holt's Linear
    hw_fit = ETSModel(history, error="add", trend="add",
                      seasonal="add", damped_trend=False,
                      seasonal_periods=m).fit()  # Additive Holt-Winter

    ses.append(ses_fit.forecast(h))
    hl.append(holt_fit.forecast(h))
    ahl.append(hw_fit.forecast(h))

# Method name -> list of per-series forecasts.
base_forcast = {
    'Naive': naive,
    'Seasonal Naive': snaive,
    'SES': ses,
    'Holt\'s Linear': hl,
    'Additive Holt-Winters': ahl,
}


In [118]:
# RMSSE of every baseline for every series. Each list follows the order of
# train.columns, which is also the order of the forecasts in base_forcast.
rmsse_res = {
    method: [rmsse(test[col], forecasts[pos], train[col])
             for pos, col in enumerate(train.columns)]
    for method, forecasts in base_forcast.items()
}


In [119]:
# Show every row of the table displayed at the end of this cell. The fully
# qualified option name is used because the bare 'max_rows' pattern can match
# more than one option in newer pandas releases.
pd.set_option('display.max_rows', None)

# Per-series RMSSE of each baseline, indexed by the level-9 column labels.
lvl9_rmsse = pd.DataFrame(rmsse_res, index=train.columns)
lvl9_rmsse_conc = (lvl9_rmsse
                   .reset_index(level=['state_id', 'cat_id'], drop=True))

# Level-9 weights, re-keyed so the index level names match those of
# lvl9_rmsse and the two frames can be joined on them.
lvl9_weights = (df_weights[df_weights['Level_id'] == 'Level9']
                .set_index(['Agg_Level_1', 'Agg_Level_2'])
                [['Weight']])
lvl9_weights.index.names = ['store_id', 'dept_id']

lvl9_fin = (lvl9_rmsse.join(lvl9_weights)
            .reorder_levels(['store_id','state_id','cat_id', 'dept_id']))

lvl9_fin

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Naive,Seasonal Naive,SES,Holt's Linear,Additive Holt-Winters,Weight
store_id,state_id,cat_id,dept_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CA_1,CA,HOBBIES,HOBBIES_1,1.4583,0.7619,0.8819,0.881,0.6311,0.0172
CA_1,CA,HOBBIES,HOBBIES_2,1.934,1.1429,0.8831,0.8748,0.7203,0.0006
CA_1,CA,HOUSEHOLD,HOUSEHOLD_1,2.1052,0.5172,1.1514,1.1636,0.4433,0.0228
CA_1,CA,HOUSEHOLD,HOUSEHOLD_2,2.2997,0.5228,1.2208,1.1789,0.5276,0.0083
CA_1,CA,FOODS,FOODS_1,0.9319,0.7312,0.9179,0.9362,0.7301,0.0052
CA_1,CA,FOODS,FOODS_2,2.0535,0.8269,2.0534,2.3136,0.593,0.0149
CA_1,CA,FOODS,FOODS_3,1.7113,0.4944,1.0825,1.0185,0.5045,0.0419
CA_2,CA,HOBBIES,HOBBIES_1,1.2053,0.7055,1.1179,1.118,0.6542,0.0116
CA_2,CA,HOBBIES,HOBBIES_2,1.3546,1.4893,1.3887,1.379,1.1641,0.0006
CA_2,CA,HOUSEHOLD,HOUSEHOLD_1,2.0287,0.6373,1.396,1.3853,0.5825,0.0256


In [120]:
# Display label -> RMSSE column of lvl9_fin.
wrmsse_columns = {
    'Naive': 'Naive',
    'S. Naive': 'Seasonal Naive',
    'SES': 'SES',
    'Holt\'s Linear': 'Holt\'s Linear',
    'Additive Holt-Winters': 'Additive Holt-Winters',
}

# Weighted sum of the per-series RMSSE for every method.
lvl9_wrmsse = {label: sum(lvl9_fin[column] * lvl9_fin['Weight'])
               for label, column in wrmsse_columns.items()}
pd.DataFrame.from_dict(lvl9_wrmsse, orient='index', columns=['WRMSSE'])

Unnamed: 0,WRMSSE
Naive,1.6286
S. Naive,0.93
SES,1.2075
Holt's Linear,1.2367
Additive Holt-Winters,0.8549


# Part 4: Middle-Out Method

## Bottom-Up

In [137]:
def compute_bottomup(df_orig, df_pred, lvl_pred):
    """Aggregate actuals and forecasts upward and score every level with RMSSE.

    For each hierarchical level from ``lvl_pred - 1`` down to 1, both frames
    are summed over the column-index levels given by ``levels[str(level)]``.
    Actual values below 10 are replaced by linear interpolation along the
    date axis (then back-filled). The first 1913 rows of the aggregated
    actuals are the training window and the remaining rows the test window.

    Parameters
    ----------
    df_orig : DataFrame
        DataFrame containing the original data (index=date, columns=hts).
    df_pred : DataFrame
        DataFrame containing the predictions using best model
        (index=date, columns=hts).
    lvl_pred : int
        Specified hierarchical level of the df_pred.

    Returns
    -------
    res_bylvl : dict
        Nested dictionary ``{level: {column: rmsse}}``. Level 1 holds a single
        entry keyed ``'Total'`` that is computed on the whole frames.

    Notes
    -----
    Relies on the notebook-level names ``levels`` and ``rmsse``.
    NOTE(review): ``levels`` is not defined in the visible cells (only
    ``levels1`` is) -- confirm where it comes from.
    """
    res_bylvl = {}

    for level in range(lvl_pred - 1, 0, -1):
        # Levels 2 and above pass the grouping key wrapped in a list,
        # level 1 passes it as stored -- kept exactly as before.
        level_key = levels[str(level)]
        if level >= 2:
            level_key = [level_key]

        # DataFrame.sum(level=...) is deprecated; an unsorted groupby on the
        # transposed frame gives the same column sums in the same order.
        orig = (df_orig.T.groupby(level=level_key, sort=False).sum().T
                .apply(lambda series: np.where(series < 10, np.nan, series))
                .interpolate(method='linear', axis=0)
                .bfill())
        pred = df_pred.T.groupby(level=level_key, sort=False).sum().T

        # Test and Train Split
        train = orig.iloc[:1913, ]
        test = orig.iloc[1913:, ]

        if level >= 2:
            res_bycol = {col: rmsse(test[col], pred[col], train[col])
                         for col in orig.columns}
        else:
            res_bycol = {'Total': rmsse(test, pred, train)}

        res_bylvl[level] = res_bycol

    return res_bylvl


In [129]:
# Naive forecasts laid out like the level-9 data: dates as rows, series as columns.
naive_by_series = pd.DataFrame(base_forcast['Naive'],
                               index=train.columns, columns=test.index)
sample = naive_by_series.T
sample.head()

store_id,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_2,CA_2,CA_2,...,WI_2,WI_2,WI_2,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3
state_id,CA,CA,CA,CA,CA,CA,CA,CA,CA,CA,...,WI,WI,WI,WI,WI,WI,WI,WI,WI,WI
cat_id,HOBBIES,HOBBIES,HOUSEHOLD,HOUSEHOLD,FOODS,FOODS,FOODS,HOBBIES,HOBBIES,HOUSEHOLD,...,FOODS,FOODS,FOODS,HOBBIES,HOBBIES,HOUSEHOLD,HOUSEHOLD,FOODS,FOODS,FOODS
dept_id,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS_1,FOODS_2,FOODS_3,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,...,FOODS_1,FOODS_2,FOODS_3,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS_1,FOODS_2,FOODS_3
date,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
2016-04-25,641.0,73.0,1022.0,324.0,329.0,734.0,2990.0,447.0,60.0,1131.0,...,347.0,932.0,2380.0,230.0,21.0,766.0,201.0,317.0,557.0,2233.0
2016-04-26,641.0,73.0,1022.0,324.0,329.0,734.0,2990.0,447.0,60.0,1131.0,...,347.0,932.0,2380.0,230.0,21.0,766.0,201.0,317.0,557.0,2233.0
2016-04-27,641.0,73.0,1022.0,324.0,329.0,734.0,2990.0,447.0,60.0,1131.0,...,347.0,932.0,2380.0,230.0,21.0,766.0,201.0,317.0,557.0,2233.0
2016-04-28,641.0,73.0,1022.0,324.0,329.0,734.0,2990.0,447.0,60.0,1131.0,...,347.0,932.0,2380.0,230.0,21.0,766.0,201.0,317.0,557.0,2233.0
2016-04-29,641.0,73.0,1022.0,324.0,329.0,734.0,2990.0,447.0,60.0,1131.0,...,347.0,932.0,2380.0,230.0,21.0,766.0,201.0,317.0,557.0,2233.0


In [131]:
# The function defined in this notebook is compute_bottomup (no second
# underscore); the previous spelling fails on a fresh kernel.
compute_bottomup(full_df, sample, 9).keys()

dict_keys([8, 7, 6, 5, 4, 3, 2, 1])

## Top-Down Approach

In [172]:
def compute_topdown(df_full, df_pred, lvl_pred, approach='AHP'):
    """Disaggregate a total forecast downward and score each level with RMSSE.

    With ``approach='AHP'`` the total forecast (row sum of ``df_pred``) is
    split across the series of every level listed after position ``lvl_pred``
    in ``levels1.json``. Each series receives its mean share of the level
    total, averaged over all rows of the aggregated actuals. Actual values
    below 10 are replaced by linear interpolation (then back-filled); the
    first 1913 rows are the training window and the rest the test window.

    Parameters
    ----------
    df_full : DataFrame
        DataFrame containing the original data (index=date, columns=hts).
    df_pred : DataFrame
        DataFrame containing the predictions using best model
        (index=date, columns=hts).
    lvl_pred : int
        Specified hierarchical level of the df_pred.
    approach : str, default 'AHP'
        Disaggregation method. Only 'AHP' is implemented.

    Returns
    -------
    res_bylvl : dict
        Nested dictionary ``{level_key: {column: rmsse}}``.

    Raises
    ------
    ValueError
        If ``approach`` is not 'AHP'.

    Notes
    -----
    Relies on the notebook-level names ``levels`` and ``rmsse``.
    NOTE(review): level keys come from ``levels1.json`` but the grouping keys
    are read from ``levels``, which is not defined in the visible cells --
    confirm this is intended.
    NOTE(review): the shares are averaged over all rows, test window
    included -- confirm that this is intended.
    """
    if approach != 'AHP':
        raise ValueError(f"Unsupported approach: {approach!r}")

    with open('levels1.json', 'r') as levels_file:
        levels1 = json.load(levels_file)
    # Keys stored after position lvl_pred; for lvl_pred=9 this is the
    # previously hard-coded list(levels1.keys())[9:].
    lvl_preds = list(levels1.keys())[lvl_pred:]

    # The total forecast does not depend on the level, so compute it once.
    top_forecast = df_pred.sum(axis=1)

    res_bylvl = {}
    for level in lvl_preds:
        # DataFrame.sum(level=...) is deprecated; an unsorted groupby on the
        # transposed frame gives the same column sums in the same order.
        lvl = (df_full.T.groupby(level=levels[level], sort=False).sum().T
               .apply(lambda series: np.where(series < 10, np.nan, series))
               .interpolate(method='linear', axis=0)
               .bfill())

        # Test and Train Split
        train = lvl.iloc[:1913, ]
        test = lvl.iloc[1913:, ]

        # Row totals are the same for every column; hoisted out of the loop.
        row_totals = lvl.sum(axis=1)
        n_rows = len(lvl)

        res_bycol = {}
        for col in lvl.columns.tolist():
            share = sum(lvl[col] / row_totals) * (1 / n_rows)
            res_bycol[col] = rmsse(test[col], top_forecast * share, train[col])

        res_bylvl[level] = res_bycol

    return res_bylvl
        

In [173]:
# Run the top-down evaluation, passing the naive level-9 forecasts as df_pred.
compute_topdown(full_df, sample, 9)

In [175]:
! pip install tqdm



In [151]:
# Mean share of every level-10 series in the level total, and the total
# forecast split by that share.
proportions = {}
next_level_forecasted = {}
# DataFrame.sum(level=...) is deprecated; an unsorted groupby on the
# transposed frame gives the same column sums in the same order.
lvl = full_df.T.groupby(level=levels[str(10)], sort=False).sum().T
# Both totals are loop-invariant: compute them once, not once per column.
lvl_row_totals = lvl.sum(axis=1)
sample_total = sample.sum(axis=1)
for col in lvl.columns.tolist():
    proportions[col] = sum(lvl[col] / lvl_row_totals) * (1/len(lvl))
    next_level_forecasted[col] = sample_total * proportions[col]

Unnamed: 0,HOBBIES_1_001,HOBBIES_1_002,HOBBIES_1_003,HOBBIES_1_004,HOBBIES_1_005,HOBBIES_1_006,HOBBIES_1_007,HOBBIES_1_008,HOBBIES_1_009,HOBBIES_1_010,...,FOODS_3_818,FOODS_3_819,FOODS_3_820,FOODS_3_821,FOODS_3_822,FOODS_3_823,FOODS_3_824,FOODS_3_825,FOODS_3_826,FOODS_3_827
prop,5.8774e-05,7.6434e-05,2.0139e-05,0.0006,0.0002,0.0002,4.3144e-05,0.0014,0.0002,0.0002,...,0.0005,0.0006,0.0005,9.671e-05,0.0009,0.0002,0.0001,0.0002,0.0002,0.0002


In [150]:
# One-row table ('prop') holding the share computed for every series.
props = pd.DataFrame(proportions, index=['prop'])
props

Unnamed: 0,HOBBIES_1_001,HOBBIES_1_002,HOBBIES_1_003,HOBBIES_1_004,HOBBIES_1_005,HOBBIES_1_006,HOBBIES_1_007,HOBBIES_1_008,HOBBIES_1_009,HOBBIES_1_010,...,FOODS_3_818,FOODS_3_819,FOODS_3_820,FOODS_3_821,FOODS_3_822,FOODS_3_823,FOODS_3_824,FOODS_3_825,FOODS_3_826,FOODS_3_827
prop,5.8774e-05,7.6434e-05,2.0139e-05,0.0006,0.0002,0.0002,4.3144e-05,0.0014,0.0002,0.0002,...,0.0005,0.0006,0.0005,9.671e-05,0.0009,0.0002,0.0001,0.0002,0.0002,0.0002


In [155]:
# Spot check: total forecast scaled by the share stored for 'HOBBIES_1_001'.
sample.sum(axis=1) * proportions['HOBBIES_1_001'] 

date
2016-04-25    2.9267
2016-04-26    2.9267
2016-04-27    2.9267
2016-04-28    2.9267
2016-04-29    2.9267
2016-04-30    2.9267
2016-05-01    2.9267
2016-05-02    2.9267
2016-05-03    2.9267
2016-05-04    2.9267
2016-05-05    2.9267
2016-05-06    2.9267
2016-05-07    2.9267
2016-05-08    2.9267
2016-05-09    2.9267
2016-05-10    2.9267
2016-05-11    2.9267
2016-05-12    2.9267
2016-05-13    2.9267
2016-05-14    2.9267
2016-05-15    2.9267
2016-05-16    2.9267
2016-05-17    2.9267
2016-05-18    2.9267
2016-05-19    2.9267
2016-05-20    2.9267
2016-05-21    2.9267
2016-05-22    2.9267
dtype: float64

## Compute for the WRMSSE per level

In [None]:
def compute_wrmsse(df_rmsse):