## Upload data under data/ and hts_utils.py under utils/

This notebook was heavily modified from here:

<a href="https://colab.research.google.com/github/Nixtla/hierarchicalforecast/blob/main/nbs/examples/NonNegativeReconciliation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %%capture
# !pip install hierarchicalforecast statsforecast

In [4]:
import numpy as np
import pandas as pd

from utils.hts_eda_utils import *

from hierarchicalforecast.utils import HierarchicalPlot
from statsforecast.models import * # ARIMA, ETS, etc.
from statsforecast.core import StatsForecast

# TODO TopDown() reconciler causes KeyError 'ETS, Naive'. Same with Empirical Risk Minimization. Why?
from hierarchicalforecast.methods import * # Reconcialiation methods: BottomUp, TopDown, MinTrace etc.
from hierarchicalforecast.core import HierarchicalReconciliation

from hierarchicalforecast.evaluation import HierarchicalEvaluation

In [5]:
# dataset subset to use? # Use full initially
#     deal_w_zeros_method = remove_zero_columns(df, any_or_all='any')

SELECT_TOP_K_PRODUCTS = None # None = keep all


# CHOOSE TIME SERIES METHODS HERE! https://nixtla.github.io/statsforecast/src/core/models_intro.html
TSModels = [
    ETS(season_length=7, model='ZAA'),
    Naive(),
    AutoETS(season_length=7, model='ZAA'), # I think this is newer version of ETS()
    ARIMA(),
    SeasonalExponentialSmoothingOptimized(season_length=7),
    AutoRegressive(lags=6),
    RandomWalkWithDrift()
    ]

# https://nixtla.github.io/hierarchicalforecast/methods.html
reconciliation_methods = [
    BottomUp(),
    TopDown(method='forecast_proportions'), # 'average_proportions' causes KeyError below
    MinTrace(method='wls_struct'), # Ols seems to not converge (SVD error)
    OptimalCombination(method='wls_struct'), # Same
    # ERM(method='closed') # Empirical Risk Minimization - KeyError
]

TIME_SERIES_FREQ = 'M'
df = pd.read_excel('data/Quarterly_smoothing.xlsx', index_col=0)#.iloc[:,:5])

  ETS._warn()


In [6]:
dataset_hierarchy_delimiter = ' - ' # The delimiter currently used in the dataset
HIERARCHY_DELIMITER = '_' # '_' is needed by HierarchicalForecast. Need to replace

## 1. Load Data

In [7]:
df.columns = df.columns.str.replace(' - ', HIERARCHY_DELIMITER) # Replace Hierarchy delimiter

##### Columns of all zeros cause errors (Division by zero in Covariance calc.). Need to fix

In [8]:
# TODO make this transform a parameter too
df = add_1_to_all_df_cells(df)

df.columns[20:50]

Index(['Дальневосточный ФО_AMBROBENE_Ambrobene tabs 30 mg #20',
       'Дальневосточный ФО_AMBROBENE_Stoptussin tabs 4 mg + 100 mg #20',
       'Дальневосточный ФО_AMLODIPINE-TEVA_Amlodipine-Teva tabs 10 mg #30',
       'Дальневосточный ФО_AMLODIPINE-TEVA_Amlodipine-Teva tabs 5 mg #30',
       'Дальневосточный ФО_ANASTROSOLE_Anastrozole-Teva FC tabs 1 mg #28',
       'Дальневосточный ФО_ATORVASTATIN-TEVA_Atorvastatin-Teva FC tabs 10 mg #30',
       'Дальневосточный ФО_ATORVASTATIN-TEVA_Atorvastatin-Teva FC tabs 20 mg #30',
       'Дальневосточный ФО_ATORVASTATIN-TEVA_Atorvastatin-Teva FC tabs 40 mg #30',
       'Дальневосточный ФО_AZILECT_Azilect tabs 1 mg #100',
       'Дальневосточный ФО_AZILECT_Azilect tabs 1 mg #30',
       'Дальневосточный ФО_BECLASONE ECO_Beclazone Eco aerosol for inh 100 mcg 200 doses #1',
       'Дальневосточный ФО_BECLASONE ECO_Beclazone Eco aerosol for inh 250 mcg 200 doses #1',
       'Дальневосточный ФО_BECLASONE ECO_Beclazone Eco aerosol for inh 50 mcg 200

##### Optional: Select only top Products

Saves compute

In [9]:
# if SELECT_TOP_K_PRODUCTS is not None:
#     df = select_top_n_brands(df, n=SELECT_TOP_K_PRODUCTS)

# # df.head(5)
brand_name = 'BISOPROLOL-TEVA'
df_brand = select_brand(df, brand_name, HIERARCHY_DELIMITER='_')
df_brand.columns = [c.replace(brand_name + HIERARCHY_DELIMITER, "") for c in df_brand.columns]
df_brand.columns = ["_".join(c.split(HIERARCHY_DELIMITER)[::-1]) for c in df_brand.columns]
df_brand.columns

Index(['Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО',
       'Bisoprolol-Teva FC tabs 10 mg #50_Дальневосточный ФО',
       'Bisoprolol-Teva FC tabs 5 mg #30_Дальневосточный ФО',
       'Bisoprolol-Teva FC tabs 5 mg #50_Дальневосточный ФО',
       'Bisoprolol-Teva FC tabs 10 mg #30_Приволжский ФО',
       'Bisoprolol-Teva FC tabs 10 mg #50_Приволжский ФО',
       'Bisoprolol-Teva FC tabs 5 mg #30_Приволжский ФО',
       'Bisoprolol-Teva FC tabs 5 mg #50_Приволжский ФО',
       'Bisoprolol-Teva FC tabs 10 mg #30_Северо-западный ФО',
       'Bisoprolol-Teva FC tabs 10 mg #50_Северо-западный ФО',
       'Bisoprolol-Teva FC tabs 5 mg #30_Северо-западный ФО',
       'Bisoprolol-Teva FC tabs 5 mg #50_Северо-западный ФО',
       'Bisoprolol-Teva FC tabs 10 mg #30_Северо-кавказский ФО',
       'Bisoprolol-Teva FC tabs 10 mg #50_Северо-кавказский ФО',
       'Bisoprolol-Teva FC tabs 5 mg #30_Северо-кавказский ФО',
       'Bisoprolol-Teva FC tabs 5 mg #50_Северо-кавказский ФО',
       '

In [10]:
df_brand

Unnamed: 0_level_0,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Дальневосточный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Дальневосточный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Дальневосточный ФО,Bisoprolol-Teva FC tabs 10 mg #30_Приволжский ФО,Bisoprolol-Teva FC tabs 10 mg #50_Приволжский ФО,Bisoprolol-Teva FC tabs 5 mg #30_Приволжский ФО,Bisoprolol-Teva FC tabs 5 mg #50_Приволжский ФО,Bisoprolol-Teva FC tabs 10 mg #30_Северо-западный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Северо-западный ФО,...,Bisoprolol-Teva FC tabs 5 mg #30_Уральский ФО,Bisoprolol-Teva FC tabs 5 mg #50_Уральский ФО,Bisoprolol-Teva FC tabs 10 mg #30_Центральный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Центральный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Центральный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Центральный ФО,Bisoprolol-Teva FC tabs 10 mg #30_Южный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Южный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Южный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Южный ФО
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-03-01,4475,170,8920,611,39810,983,74980,1066,13483,523,...,39333,677,26757,168,13903,769,10272,181,11379,69
2018-04-01,7394,273,19969,707,67274,1027,119736,1474,14813,1454,...,43712,725,53317,639,66399,3856,17122,407,20741,993
2018-05-01,8836,1063,26304,1307,67153,1145,117562,2954,10093,1636,...,60264,1614,65601,2254,103821,7359,19740,629,33587,1455
2018-06-01,20156,1958,36987,3477,43315,1125,58628,4257,13125,1393,...,45874,1858,77633,3354,484082,7346,32751,1265,39578,2711
2018-07-01,17290,2535,26707,4806,19195,1457,21094,4126,12175,497,...,70248,2192,83843,3334,459791,4732,29467,1406,63421,1859
2018-08-01,20649,1745,24489,4508,53201,1487,123196,4588,12018,333,...,99915,1604,60404,1783,426612,1893,28649,1314,50877,1563
2018-09-01,7772,681,13567,2020,80486,1694,178477,2904,7892,354,...,76702,1444,59757,1598,43716,2118,9021,578,40711,787
2018-10-01,13244,271,21275,845,91392,1484,190313,5881,12832,439,...,98857,6167,39614,1353,30651,1917,27981,901,177315,735
2018-11-01,9290,277,19030,645,76317,792,115860,4583,15564,521,...,89657,5848,48800,2667,57036,3234,27510,805,179778,973
2018-12-01,9188,547,19530,1066,51041,1597,66043,4776,17937,359,...,194050,10568,295792,2347,606728,4195,38012,1834,196483,1321


In [11]:
%%capture
df_with_aggregates, hierarchy = prep_data_for_scikit_hts_prod_region(df_brand)

In [12]:
df_with_aggregates

Unnamed: 0_level_0,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Дальневосточный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Дальневосточный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Дальневосточный ФО,Bisoprolol-Teva FC tabs 10 mg #30_Приволжский ФО,Bisoprolol-Teva FC tabs 10 mg #50_Приволжский ФО,Bisoprolol-Teva FC tabs 5 mg #30_Приволжский ФО,Bisoprolol-Teva FC tabs 5 mg #50_Приволжский ФО,Bisoprolol-Teva FC tabs 10 mg #30_Северо-западный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Северо-западный ФО,...,Bisoprolol-Teva FC tabs 5 mg #50_Центральный ФО,Bisoprolol-Teva FC tabs 10 mg #30_Южный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Южный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Южный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Южный ФО,Bisoprolol-Teva FC tabs 10 mg #30,Bisoprolol-Teva FC tabs 5 mg #50,Bisoprolol-Teva FC tabs 5 mg #30,Bisoprolol-Teva FC tabs 10 mg #50,Total
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-03-01,4475,170,8920,611,39810,983,74980,1066,13483,523,...,769,10272,181,11379,69,146480,6713,204641,3293,722254
2018-04-01,7394,273,19969,707,67274,1027,119736,1474,14813,1454,...,3856,17122,407,20741,993,241009,13725,366968,6296,1255996
2018-05-01,8836,1063,26304,1307,67153,1145,117562,2954,10093,1636,...,7359,19740,629,33587,1455,260685,24380,485199,10623,1561774
2018-06-01,20156,1958,36987,3477,43315,1125,58628,4257,13125,1393,...,7346,32751,1265,39578,2711,270577,29466,807629,12296,2239936
2018-07-01,17290,2535,26707,4806,19195,1457,21094,4126,12175,497,...,4732,29467,1406,63421,1859,260287,27448,756040,13116,2113782
2018-08-01,20649,1745,24489,4508,53201,1487,123196,4588,12018,333,...,1893,28649,1314,50877,1563,309475,19383,883745,9230,2443666
2018-09-01,7772,681,13567,2020,80486,1694,178477,2904,7892,354,...,2118,9021,578,40711,787,263570,13234,499672,8857,1570666
2018-10-01,13244,271,21275,845,91392,1484,190313,5881,12832,439,...,1917,27981,901,177315,735,321531,19715,784626,7367,2266478
2018-11-01,9290,277,19030,645,76317,792,115860,4583,15564,521,...,3234,27510,805,179778,973,310227,24997,677462,16975,2059322
2018-12-01,9188,547,19530,1066,51041,1597,66043,4776,17937,359,...,4195,38012,1834,196483,1321,616191,34572,1413178,17980,4163842


In [13]:
df_with_aggregates.shape

(57, 37)

In [14]:
hierarchy

{'Total': ['Bisoprolol-Teva FC tabs 10 mg #30',
  'Bisoprolol-Teva FC tabs 5 mg #50',
  'Bisoprolol-Teva FC tabs 5 mg #30',
  'Bisoprolol-Teva FC tabs 10 mg #50'],
 'Bisoprolol-Teva FC tabs 10 mg #30': ['Bisoprolol-Teva FC tabs 10 mg #30_Сибирский ФО',
  'Bisoprolol-Teva FC tabs 10 mg #30_Южный ФО',
  'Bisoprolol-Teva FC tabs 10 mg #30_Северо-кавказский ФО',
  'Bisoprolol-Teva FC tabs 10 mg #30_Центральный ФО',
  'Bisoprolol-Teva FC tabs 10 mg #30_Приволжский ФО',
  'Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО',
  'Bisoprolol-Teva FC tabs 10 mg #30_Уральский ФО',
  'Bisoprolol-Teva FC tabs 10 mg #30_Северо-западный ФО'],
 'Bisoprolol-Teva FC tabs 5 mg #50': ['Bisoprolol-Teva FC tabs 5 mg #50_Сибирский ФО',
  'Bisoprolol-Teva FC tabs 5 mg #50_Южный ФО',
  'Bisoprolol-Teva FC tabs 5 mg #50_Северо-кавказский ФО',
  'Bisoprolol-Teva FC tabs 5 mg #50_Центральный ФО',
  'Bisoprolol-Teva FC tabs 5 mg #50_Приволжский ФО',
  'Bisoprolol-Teva FC tabs 5 mg #50_Дальневосточный ФО',
  'Bis

<font color='cyan'>HierarchicalForecast likes data to be Drug | Date | Sales, rather than having DrugName as columns</font>


### Melt data into format required by HierarchicalForecast

Following how their example code's data looks

In [15]:
# Melt the DataFrame - convert ColNames to rows to match input to HierForecast
df_with_aggregates.reset_index(inplace=True) # Move Month index to column (package requirement)

# TODO Check these for prediction error
melted_df = df_with_aggregates.melt(id_vars=['Month'], var_name='Drug', value_name='Sales')

# Convert melted DataFrame to the required format
melted_df = melted_df[['Drug', 'Month', 'Sales']]

# Col names seem to need to be thus for package
melted_df.rename(columns={'Drug': 'unique_id', 'Month':'ds', 'Sales':'y'}, inplace=True)


print(melted_df.head())
print(melted_df.tail())


                                           unique_id         ds      y
0  Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ... 2018-03-01   4475
1  Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ... 2018-04-01   7394
2  Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ... 2018-05-01   8836
3  Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ... 2018-06-01  20156
4  Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ... 2018-07-01  17290
     unique_id         ds        y
2104     Total 2022-07-01  3102086
2105     Total 2022-08-01  4226404
2106     Total 2022-09-01  4372050
2107     Total 2022-10-01  3895336
2108     Total 2022-11-01  3644116


### Creating `S_df`

All colored font is Ariel

<font color='turquoise'>We've created `Y_df, tags`. All we need is `S_df`</font>
This is like a tree representing the hierarchy, with aggregations at each level

<font color='blue'>`S_df` is a representation of the Hierarchy - 1 means that column name (item, Drugs in our case), belongs to the Total row. Rows represent totals at each level of the hierarchy, for each node</font>

In [16]:
S_df = create_S_df(df_brand)

S_df

Unnamed: 0,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Дальневосточный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Дальневосточный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Дальневосточный ФО,Bisoprolol-Teva FC tabs 10 mg #30_Приволжский ФО,Bisoprolol-Teva FC tabs 10 mg #50_Приволжский ФО,Bisoprolol-Teva FC tabs 5 mg #30_Приволжский ФО,Bisoprolol-Teva FC tabs 5 mg #50_Приволжский ФО,Bisoprolol-Teva FC tabs 10 mg #30_Северо-западный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Северо-западный ФО,...,Bisoprolol-Teva FC tabs 5 mg #30_Уральский ФО,Bisoprolol-Teva FC tabs 5 mg #50_Уральский ФО,Bisoprolol-Teva FC tabs 10 mg #30_Центральный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Центральный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Центральный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Центральный ФО,Bisoprolol-Teva FC tabs 10 mg #30_Южный ФО,Bisoprolol-Teva FC tabs 10 mg #50_Южный ФО,Bisoprolol-Teva FC tabs 5 mg #30_Южный ФО,Bisoprolol-Teva FC tabs 5 mg #50_Южный ФО
Total,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Bisoprolol-Teva FC tabs 10 mg #30,1,0,0,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Приволжский ФО,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Северо-западный ФО,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Северо-кавказский ФО,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Сибирский ФО,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Уральский ФО,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Центральный ФО,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Bisoprolol-Teva FC tabs 10 mg #30_Южный ФО,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [17]:
S_df.shape

(37, 32)

### Create `tags`, which is a description of the Hierarchy as `dict`

Original `tags` loaded from example Dataset - they didn't create it programmatically

In [18]:
# TODO wtf did chatgpt do here?
transformed_data = { # Need names for hierarchy levels IMO
    "Molecule": ["Total"],
    "Molecule/Product": hierarchy['Total'],
    "Molecule/Product/Region": sum([hierarchy[region] for region in hierarchy['Total']], []),
    #"Sales/Region/DrugName/DrugDosage": sum([hierarchy[key] for key in sum([hierarchy[region] for region in hierarchy['Total']], [])], []),
}

# Convert the lists to numpy arrays for consistency with the format
for key in transformed_data:
    transformed_data[key] = np.array(transformed_data[key], dtype=object)

# print(transformed_data)
tags = transformed_data

In [19]:
tags

{'Molecule': array(['Total'], dtype=object),
 'Molecule/Product': array(['Bisoprolol-Teva FC tabs 10 mg #30',
        'Bisoprolol-Teva FC tabs 5 mg #50',
        'Bisoprolol-Teva FC tabs 5 mg #30',
        'Bisoprolol-Teva FC tabs 10 mg #50'], dtype=object),
 'Molecule/Product/Region': array(['Bisoprolol-Teva FC tabs 10 mg #30_Сибирский ФО',
        'Bisoprolol-Teva FC tabs 10 mg #30_Южный ФО',
        'Bisoprolol-Teva FC tabs 10 mg #30_Северо-кавказский ФО',
        'Bisoprolol-Teva FC tabs 10 mg #30_Центральный ФО',
        'Bisoprolol-Teva FC tabs 10 mg #30_Приволжский ФО',
        'Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО',
        'Bisoprolol-Teva FC tabs 10 mg #30_Уральский ФО',
        'Bisoprolol-Teva FC tabs 10 mg #30_Северо-западный ФО',
        'Bisoprolol-Teva FC tabs 5 mg #50_Сибирский ФО',
        'Bisoprolol-Teva FC tabs 5 mg #50_Южный ФО',
        'Bisoprolol-Teva FC tabs 5 mg #50_Северо-кавказский ФО',
        'Bisoprolol-Teva FC tabs 5 mg #50_Центральный Ф

We split the dataframe in train/test splits.

In [20]:
Y_df = melted_df

Y_df

Unnamed: 0,unique_id,ds,y
0,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ...,2018-03-01,4475
1,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ...,2018-04-01,7394
2,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ...,2018-05-01,8836
3,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ...,2018-06-01,20156
4,Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточ...,2018-07-01,17290
...,...,...,...
2104,Total,2022-07-01,3102086
2105,Total,2022-08-01,4226404
2106,Total,2022-09-01,4372050
2107,Total,2022-10-01,3895336


In [21]:
Y_test_df = Y_df.groupby('unique_id').tail(10) # Original code
Y_train_df = Y_df.drop(Y_test_df.index)

Y_test_df = Y_test_df.set_index('unique_id')
Y_train_df = Y_train_df.set_index('unique_id')

In [22]:
print(Y_test_df.head())
print(Y_test_df.tail())

                                                           ds      y
unique_id                                                           
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточн... 2022-02-01  17466
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточн... 2022-03-01  21130
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточн... 2022-04-01  20656
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточн... 2022-05-01  20823
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточн... 2022-06-01  25081
                  ds        y
unique_id                    
Total     2022-07-01  3102086
Total     2022-08-01  4226404
Total     2022-09-01  4372050
Total     2022-10-01  3895336
Total     2022-11-01  3644116


## 2. Base Forecasts

The following cell computes the *base forecast* for each time series using the `ETS` and `naive` models. Observe that `Y_hat_df` contains the forecasts but they are not coherent.

In [23]:
%%capture
fcst = StatsForecast(
    df=Y_train_df,
    models=TSModels,
    # models=[ETS(season_length=7, model='ZZA'), Naive()],
    freq=TIME_SERIES_FREQ,
    n_jobs=-1
)
Y_hat_df = fcst.forecast(h=10, fitted = True) # TODO What is h=7?
Y_fitted_df = fcst.forecast_fitted_values()

In [24]:
Y_fitted_df

Unnamed: 0_level_0,ds,y,ETS,Naive,AutoETS,ARIMA,SeasESOpt,AutoRegressive,RWD
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bisoprolol-Teva FC tabs 10 mg #30,2018-03-01,146480.0,1.441883e+05,,1.441883e+05,4.674462e+05,,3.041423e+05,
Bisoprolol-Teva FC tabs 10 mg #30,2018-04-01,241009.0,1.893092e+05,146480.0,1.893092e+05,4.674462e+05,,2.206679e+05,1.534954e+05
Bisoprolol-Teva FC tabs 10 mg #30,2018-05-01,260685.0,3.211246e+05,241009.0,3.211246e+05,4.674462e+05,,3.010313e+05,2.480244e+05
Bisoprolol-Teva FC tabs 10 mg #30,2018-06-01,270577.0,3.077253e+05,260685.0,3.077253e+05,4.674462e+05,,3.259725e+05,2.677004e+05
Bisoprolol-Teva FC tabs 10 mg #30,2018-07-01,260287.0,2.931906e+05,270577.0,2.931906e+05,4.674462e+05,,2.495378e+05,2.775924e+05
...,...,...,...,...,...,...,...,...,...
Total,2021-09-01,3109588.0,1.437279e+06,1470772.0,1.437279e+06,3.209535e+06,2810029.50,2.090496e+06,1.542184e+06
Total,2021-10-01,4012742.0,3.084791e+06,3109588.0,3.084791e+06,3.209535e+06,2752011.75,4.291136e+06,3.181000e+06
Total,2021-11-01,5621248.0,4.522824e+06,4012742.0,4.522824e+06,3.209535e+06,3459664.25,4.157858e+06,4.084154e+06
Total,2021-12-01,4681078.0,5.870255e+06,5621248.0,5.870255e+06,3.209535e+06,3804654.50,4.609287e+06,5.692660e+06


In [25]:
Y_hat_df

Unnamed: 0_level_0,ds,ETS,Naive,AutoETS,ARIMA,SeasESOpt,AutoRegressive,RWD
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bisoprolol-Teva FC tabs 10 mg #30,2022-01-31,4.878302e+05,469190.0,4.878302e+05,4.674462e+05,2.094909e+05,3.928502e+05,4.762054e+05
Bisoprolol-Teva FC tabs 10 mg #30,2022-02-28,4.496908e+05,469190.0,4.496908e+05,4.674462e+05,2.045327e+05,4.281569e+05,4.832209e+05
Bisoprolol-Teva FC tabs 10 mg #30,2022-03-31,5.005894e+05,469190.0,5.005894e+05,4.674462e+05,3.494135e+05,4.617206e+05,4.902363e+05
Bisoprolol-Teva FC tabs 10 mg #30,2022-04-30,5.439860e+05,469190.0,5.439860e+05,4.674462e+05,4.057328e+05,4.990850e+05,4.972518e+05
Bisoprolol-Teva FC tabs 10 mg #30,2022-05-31,6.287009e+05,469190.0,6.287009e+05,4.674462e+05,5.470935e+05,4.701790e+05,5.042672e+05
...,...,...,...,...,...,...,...,...
Total,2022-06-30,4.117565e+06,4007200.0,4.117565e+06,3.209535e+06,3.813419e+06,3.301046e+06,4.435671e+06
Total,2022-07-31,4.409718e+06,4007200.0,4.409718e+06,3.209535e+06,3.618966e+06,2.749932e+06,4.507083e+06
Total,2022-08-31,4.281864e+06,4007200.0,4.281864e+06,3.209535e+06,3.305472e+06,3.055143e+06,4.578495e+06
Total,2022-09-30,3.763056e+06,4007200.0,3.763056e+06,3.209535e+06,1.491639e+06,3.416078e+06,4.649907e+06


Observe that the ETS model computes negative forecasts for some series.

<font color='pink'>Does `Y_hat_df` have a `ds` column in the original code?</font>

Yes

In [None]:
#Y_hat_df['ds'] = Y_test_df['ds'] ## the model doesn't compute the dates well
Y_hat_df
#Y_test_df

In [29]:
Y_test_df

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,2022-02-01,17466
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,2022-03-01,21130
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,2022-04-01,20656
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,2022-05-01,20823
Bisoprolol-Teva FC tabs 10 mg #30_Дальневосточный ФО,2022-06-01,25081
...,...,...
Total,2022-07-01,3102086
Total,2022-08-01,4226404
Total,2022-09-01,4372050
Total,2022-10-01,3895336


In [30]:
# `S_df` should have 1 entry for each unique row in `Y_hat_df`
assert(len(S_df.index) == len(set(Y_hat_df.index)))
assert(set(Y_train_df.index) - set(S_df.index) == set())
assert(set(S_df.index) - set(Y_train_df.index) == set())

## 3. Non-Negative Reconciliation

The following cell makes the previous forecasts coherent and nonnegative using the `HierarchicalReconciliation` class.

In [31]:
hrec = HierarchicalReconciliation(reconcilers=reconciliation_methods)


Y_rec_df = hrec.reconcile(Y_hat_df=Y_hat_df, Y_df=Y_fitted_df,
                          S=S_df, tags=tags)

Y_rec_df.head()

Unnamed: 0_level_0,ds,ETS,Naive,AutoETS,ARIMA,SeasESOpt,AutoRegressive,RWD,ETS/BottomUp,Naive/BottomUp,...,SeasESOpt/MinTrace_method-wls_struct,AutoRegressive/MinTrace_method-wls_struct,RWD/MinTrace_method-wls_struct,ETS/OptimalCombination_method-wls_struct,Naive/OptimalCombination_method-wls_struct,AutoETS/OptimalCombination_method-wls_struct,ARIMA/OptimalCombination_method-wls_struct,SeasESOpt/OptimalCombination_method-wls_struct,AutoRegressive/OptimalCombination_method-wls_struct,RWD/OptimalCombination_method-wls_struct
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Total,2022-01-31,3890069.75,4007200.0,3890069.75,3209535.0,3305472.0,2826829.75,4078611.75,2049985.0,2003600.0,...,2003350.0,1897186.0,2719075.0,2679160.0,2671467.0,2679160.0,2139690.0,2003350.0,1897186.0,2719075.0
Total,2022-02-28,3371261.0,4007200.0,3371261.0,3209535.0,1491638.875,3354780.75,4150023.75,1992756.0,2003600.0,...,1160971.0,2155543.0,2766682.0,2358044.0,2671467.0,2358044.0,2139690.0,1160971.0,2155543.0,2766682.0
Total,2022-03-31,3346584.5,4007200.0,3346584.5,3209535.0,3106592.5,3776429.25,4221435.5,1978182.625,2003600.0,...,1981212.0,2410851.0,2814290.0,2348451.0,2671467.0,2348451.0,2139690.0,1981212.0,2410851.0,2814290.0
Total,2022-04-30,3340011.5,4007200.0,3340011.5,3209535.0,3455022.25,4398597.0,4292847.5,2018615.625,2003600.0,...,2177949.0,2756703.0,2861898.0,2380859.0,2671467.0,2380859.0,2139690.0,2177949.0,2756703.0,2861898.0
Total,2022-05-31,3859134.25,4007200.0,3859134.25,3209535.0,5387509.5,3841195.0,4364259.5,2390530.75,2003600.0,...,3172695.0,2478952.0,2909506.0,2769503.0,2671467.0,2769503.0,2139690.0,3172695.0,2478952.0,2909506.0


In [50]:
# assign correct time stamps, the package doesn't compute them correctly
Y_rec_df['ds'] = list(Y_test_df['ds'])
Y_hat_df['ds'] = list(Y_test_df['ds'])

## 4. Evaluation

The `HierarchicalForecast` package includes the `HierarchicalEvaluation` class to evaluate the different hierarchies and also is capable of compute scaled metrics compared to a benchmark model.

In [46]:
# TODO enhance this
def mse(y, y_hat):
    return np.mean((y-y_hat)**2)

evaluator = HierarchicalEvaluation(evaluators=[mean_absolute_percentage_error, symmetric_mean_absolute_percentage_error])
evaluation = evaluator.evaluate(
        Y_hat_df=Y_rec_df, Y_test_df=Y_test_df,Y_df=Y_train_df,
        tags=tags#, benchmark='Naive'
)
evaluation = evaluation.applymap('{:.2f}'.format)
evaluation = evaluation.reset_index()

In [47]:
evaluation['metric'] = evaluation['metric'].apply(lambda x: "".join([y[0] for y in x.split("_")]))

In [48]:
evaluation_ARIMA = evaluation[['level','metric']+[c for c in evaluation.columns if "ARIMA" in c]]
evaluation_ARIMA

Unnamed: 0,level,metric,ARIMA,ARIMA/BottomUp,ARIMA/TopDown_method-forecast_proportions,ARIMA/MinTrace_method-wls_struct,ARIMA/OptimalCombination_method-wls_struct
0,Overall,mape,863.55,864.47,1768.78,26878.04,26878.04
1,Overall,smape,731.21,745.76,929.23,1061.56,1061.56
2,Molecule,mape,18.88,52.86,18.88,37.15,37.15
3,Molecule,smape,193.42,731.59,193.42,473.15,473.15
4,Molecule/Product,mape,67.19,67.19,165.47,889.51,889.51
5,Molecule/Product,smape,518.49,518.49,732.29,779.46,779.46
6,Molecule/Product/Region,mape,989.49,989.49,2023.88,30965.38,30965.38
7,Molecule/Product/Region,smape,774.61,774.61,976.84,1115.21,1115.21


In [40]:
evaluation_ETS = evaluation[['level','metric']+[c for c in evaluation.columns if "ETS" in c]]
evaluation_ETS

Unnamed: 0,level,metric,ETS,AutoETS,ETS/BottomUp,AutoETS/BottomUp,ETS/TopDown_method-forecast_proportions,AutoETS/TopDown_method-forecast_proportions,ETS/MinTrace_method-wls_struct,AutoETS/MinTrace_method-wls_struct,ETS/OptimalCombination_method-wls_struct,AutoETS/OptimalCombination_method-wls_struct
0,Overall,mape,781.17,781.17,783.96,783.96,879.34,879.34,25308.28,25308.28,25308.28,25308.28
1,Overall,smape,883.88,883.88,890.43,890.43,1046.9,1046.9,1109.31,1109.31,1109.31,1109.31
2,Molecule,mape,17.69,17.69,32.27,32.27,17.69,17.69,24.84,24.84,24.84,24.84
3,Molecule,smape,151.73,151.73,396.72,396.72,151.73,151.73,286.99,286.99,286.99,286.99
4,Molecule/Product,mape,55.54,55.54,77.71,77.71,135.0,135.0,867.37,867.37,867.37,867.37
5,Molecule/Product,smape,564.52,564.52,563.91,563.91,779.02,779.02,866.5,866.5,866.5,866.5
6,Molecule/Product/Region,mape,895.73,895.73,895.73,895.73,999.31,999.31,29153.51,29153.51,29153.51,29153.51
7,Molecule/Product/Region,smape,946.67,946.67,946.67,946.67,1108.36,1108.36,1165.36,1165.36,1165.36,1165.36


In [51]:
evaluation_AutoRegressive = evaluation[['level','metric']+[c for c in evaluation.columns if "AutoRegressive" in c]]
evaluation_AutoRegressive

Unnamed: 0,level,metric,AutoRegressive,AutoRegressive/BottomUp,AutoRegressive/TopDown_method-forecast_proportions,AutoRegressive/MinTrace_method-wls_struct,AutoRegressive/OptimalCombination_method-wls_struct
0,Overall,mape,816.42,817.23,1696.79,30574.61,30574.61
1,Overall,smape,746.59,764.04,969.0,1091.4,1091.4
2,Molecule,mape,15.56,55.37,15.56,34.97,34.97
3,Molecule,smape,162.56,775.57,162.56,438.1,438.1
4,Molecule/Product,mape,65.51,63.08,166.31,999.15,999.15
5,Molecule/Product,smape,496.21,504.31,733.43,813.13,813.13
6,Molecule/Product/Region,mape,935.31,935.31,1940.64,35225.9,35225.9
7,Molecule/Product/Region,smape,796.14,796.14,1023.65,1146.6,1146.6


In [52]:
evaluation_SeasESOpt = evaluation[['level','metric']+[c for c in evaluation.columns if "SeasESOpt" in c]]
evaluation_SeasESOpt

Unnamed: 0,level,metric,SeasESOpt,SeasESOpt/BottomUp,SeasESOpt/TopDown_method-forecast_proportions,SeasESOpt/MinTrace_method-wls_struct,SeasESOpt/OptimalCombination_method-wls_struct
0,Overall,mape,830.58,829.82,2268.72,21919.89,21919.89
1,Overall,smape,797.84,801.06,946.54,1084.5,1084.5
2,Molecule,mape,31.16,54.76,31.16,39.86,39.86
3,Molecule,smape,338.6,774.0,338.6,534.26,534.26
4,Molecule/Product,mape,70.44,57.55,159.91,745.99,745.99
5,Molecule/Product,smape,649.92,570.89,726.89,797.93,797.93
6,Molecule/Product/Region,mape,950.58,950.58,2602.25,25250.38,25250.38
7,Molecule/Product/Region,smape,830.68,830.68,992.99,1137.52,1137.52


In [53]:
evaluation_AutoETS = evaluation[['level','metric']+[c for c in evaluation.columns if "AutoETS" in c]]
evaluation_AutoETS

Unnamed: 0,level,metric,AutoETS,AutoETS/BottomUp,AutoETS/TopDown_method-forecast_proportions,AutoETS/MinTrace_method-wls_struct,AutoETS/OptimalCombination_method-wls_struct
0,Overall,mape,781.17,783.96,879.34,25308.28,25308.28
1,Overall,smape,883.88,890.43,1046.9,1109.31,1109.31
2,Molecule,mape,17.69,32.27,17.69,24.84,24.84
3,Molecule,smape,151.73,396.72,151.73,286.99,286.99
4,Molecule/Product,mape,55.54,77.71,135.0,867.37,867.37
5,Molecule/Product,smape,564.52,563.91,779.02,866.5,866.5
6,Molecule/Product/Region,mape,895.73,895.73,999.31,29153.51,29153.51
7,Molecule/Product/Region,smape,946.67,946.67,1108.36,1165.36,1165.36


Observe that the nonnegative reconciliation method performs better that its unconstrained counterpart.

## Plot Hierarchy & Evaluations

In [None]:
a = Y_test_df.sort_values(by=['unique_id', 'ds'], ascending=True)

# TODO programmatically get these by subtracting column names (set)
b = Y_rec_df.sort_values(by=['unique_id', 'ds'], ascending=True)
#b = b[['ETS', 'Naive', 'ETS/BottomUp', 'Naive/BottomUp']]
'''The months are incorrect  for b!!!'''

b['ds']

In [None]:
a

In [None]:
b

In [None]:
#temp = Y_rec_df.loc[Y_test_df.index]
merged_test_preds_df = pd.concat([a,b], axis=1)
merged_test_preds_df

In [None]:
pd.concat([Y_test_df,temp[['ETS', 'Naive', 'ETS/BottomUp', 'Naive/BottomUp']]])

In [None]:
merged_test_preds_df = merged_test_preds_df.sort_values(by='ds', ascending=True)
merged_test_preds_df

In [None]:
hplt = HierarchicalPlot(S=S_df, tags=tags)

hplt.plot_hierarchical_predictions_gap(Y_df=merged_test_preds_df, models = 'ETS')#['ETS', 'Naive', 'ETS/BottomUp', 'Naive/BottomUp'])

In [None]:
hplt.plot_hierarchically_linked_series(bottom_series='Северо-западный ФО_Bisoprolol-Teva FC tabs 5 mg #50', Y_df=Y_train_df)

In [None]:
# pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv')

In [None]:
# Y_df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv')
# Y_df = Y_df.rename({'Trips': 'y', 'Quarter': 'ds'}, axis=1)
# Y_df.insert(0, 'Country', 'Australia')
# Y_df = Y_df[['Country', 'Region', 'State', 'Purpose', 'ds', 'y']]
# Y_df['ds'] = Y_df['ds'].str.replace(r'(\d+) (Q\d)', r'\1-\2', regex=True)
# Y_df['ds'] = pd.to_datetime(Y_df['ds'])
# Y_df.head()