In [21]:
#---- Manipulação de dados:

import pandas as pd
import numpy as np

#---- Modelagem:

from hierarchicalforecast.utils import aggregate
from statsforecast import StatsForecast
from statsforecast.models import Naive, AutoARIMA, HoltWinters, AutoETS
from mlforecast import MLForecast
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

#---- Reconciliação

from hierarchicalforecast.methods import BottomUp, TopDown, ERM, OptimalCombination, MinTrace, MiddleOut
from hierarchicalforecast.core import HierarchicalReconciliation

In [22]:
#---- Read data

def read_data(path: str):

    df = pd.read_csv(path)

    return df

path_data = 'https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-hierarchical-forecasting/main/retail-usa-clothing.csv'

dados = read_data(path = path_data)

df = dados\
    .copy()\
    .query('date >= "2005-01-01"')

df.head()

Unnamed: 0,date,state,item,quantity,region,country
2594,2005-01-01,NewYork,mens_clothing,11,Mid-Alantic,USA
2595,2005-01-02,NewYork,mens_clothing,11,Mid-Alantic,USA
2596,2005-01-03,NewYork,mens_clothing,9,Mid-Alantic,USA
2597,2005-01-04,NewYork,mens_clothing,12,Mid-Alantic,USA
2598,2005-01-05,NewYork,mens_clothing,12,Mid-Alantic,USA


In [23]:
#---- Transform data:

def clean_data(df: pd.DataFrame):

    #---- 1. Excluindo a variável de country:

    df = df\
        .drop(columns = 'country')

    #---- 2. Mudando o tipo da variável de date para datetime:

    df['date'] = pd.to_datetime(df['date'])

    #---- 3. Renomeando as variáveis de quantidade de vendas e data:
    # date -> ds
    # quantity -> y

    df = df\
        .rename(columns = {'date': 'ds', 
                           'quantity': 'y'})

    return df

df = clean_data(df = df)

df.head()

Unnamed: 0,ds,state,item,y,region
2594,2005-01-01,NewYork,mens_clothing,11,Mid-Alantic
2595,2005-01-02,NewYork,mens_clothing,11,Mid-Alantic
2596,2005-01-03,NewYork,mens_clothing,9,Mid-Alantic
2597,2005-01-04,NewYork,mens_clothing,12,Mid-Alantic
2598,2005-01-05,NewYork,mens_clothing,12,Mid-Alantic


In [24]:
#---- Hierachical data format:

def format_hierarchical_df(df: pd.DataFrame, cols_hierarchical: list):

    #---- 1. Cria uma lista de listas: [[col1], [col1, col2], ..., [col1, col2, coln]]

    hier_list = [cols_hierarchical[:i] for i in range(1, len(cols_hierarchical) + 1)]

    #---- 2. Aplica a função aggregate que formata os dados em que a lib hierarchical pede

    Y_df, S_df, tags = aggregate(df = df, spec = hier_list)

    return Y_df, S_df, tags

cols_hierarchical = ['region', 'state', 'item']

Y_df, S_df, tags = format_hierarchical_df(df = df, cols_hierarchical = cols_hierarchical)

In [25]:
Y_df.head()

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
EastNorthCentral,2005-01-01,684
EastNorthCentral,2005-01-02,687
EastNorthCentral,2005-01-03,683
EastNorthCentral,2005-01-04,687
EastNorthCentral,2005-01-05,693


In [26]:
S_df.head()

Unnamed: 0,EastNorthCentral/Illinois/kids_clothing,EastNorthCentral/Illinois/kids_shoes,EastNorthCentral/Illinois/mens_clothing,EastNorthCentral/Illinois/womens_clothing,EastNorthCentral/Illinois/womens_shoes,EastNorthCentral/Indiana/kids_clothing,EastNorthCentral/Indiana/kids_shoes,EastNorthCentral/Indiana/mens_clothing,EastNorthCentral/Indiana/womens_clothing,EastNorthCentral/Indiana/womens_shoes,...,SouthCentral/Mississippi/kids_clothing,SouthCentral/Mississippi/kids_shoes,SouthCentral/Mississippi/mens_clothing,SouthCentral/Mississippi/womens_clothing,SouthCentral/Mississippi/womens_shoes,SouthCentral/Tennessee/kids_clothing,SouthCentral/Tennessee/kids_shoes,SouthCentral/Tennessee/mens_clothing,SouthCentral/Tennessee/womens_clothing,SouthCentral/Tennessee/womens_shoes
EastNorthCentral,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mid-Alantic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NewEngland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pacific,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SouthCentral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
#---- Fit time series

hw = HoltWinters(season_length = 7, error_type = 'M') # Holtwinters com sazonalidade de 7 dias e erro do tipo Aditivo

In [30]:
model_ts = StatsForecast(models = [hw], freq = 'D', n_jobs = -1)
model_ts.fit(Y_df)

StatsForecast(models=[HoltWinters])

In [32]:
#---- Predict

horizon_forecast = 19

Y_hat_df_ts = model_ts.forecast(h = horizon_forecast, fitted = True)

Y_hat_df_ts.head()

Forecast: 100%|█████████████████████████████████| 14/14 [00:59<00:00,  4.24s/it]

Forecast: 100%|█████████████████████████████████| 14/14 [00:59<00:00,  4.26s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [00:59<00:00,  4.28s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [00:59<00:00,  4.28s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [01:00<00:00,  4.32s/it]
Forecast: 100%|█████████████████████████████████| 15/15 [01:00<00:00,  4.04s/it]
Forecast: 100%|█████████████████████████████████| 15/15 [01:01<00:00,  4.07s/it]


Unnamed: 0_level_0,ds,HoltWinters
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
EastNorthCentral,2009-07-29,961.616882
EastNorthCentral,2009-07-30,960.224792
EastNorthCentral,2009-07-31,960.192383
EastNorthCentral,2009-08-01,960.561157
EastNorthCentral,2009-08-02,960.006042


In [41]:
#---- Reconciliação

reconcilers_ts = [BottomUp()]

hrec_ts = HierarchicalReconciliation(reconcilers = reconcilers_ts)

Y_rec_df_ts = hrec_ts.reconcile(Y_hat_df = Y_hat_df_ts, 
                                S = S_df,
                                tags = tags)

In [43]:
Y_rec_df_ts

Unnamed: 0,unique_id,ds,HoltWinters,HoltWinters/BottomUp
0,EastNorthCentral,2009-07-29,961.616882,962.696716
1,EastNorthCentral,2009-07-30,960.224792,961.207642
2,EastNorthCentral,2009-07-31,960.192383,961.224121
3,EastNorthCentral,2009-08-01,960.561157,962.099609
4,EastNorthCentral,2009-08-02,960.006042,961.382935
...,...,...,...,...
2161,SouthCentral/Tennessee/womens_shoes,2009-08-12,29.664722,29.664722
2162,SouthCentral/Tennessee/womens_shoes,2009-08-13,29.660910,29.660910
2163,SouthCentral/Tennessee/womens_shoes,2009-08-14,29.586050,29.586050
2164,SouthCentral/Tennessee/womens_shoes,2009-08-15,29.643011,29.643011


In [34]:
# Fit machine learning

lin_reg = LinearRegression() # Regressão linear

#---- Features de data:

from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_7(x):
    return rolling_mean(x, window_size = 7)

@njit
def rolling_mean_14(x):
    return rolling_mean(x, window_size = 14)

@njit
def rolling_mean_21(x):
    return rolling_mean(x, window_size = 21)

@njit
def rolling_mean_28(x):
    return rolling_mean(x, window_size = 28)

In [51]:
model_ml = MLForecast(models = [lin_reg], # Lista com 9 modelos
                   freq = 'D', # Frequência diária
                   num_threads = 6,
                   lags = [1, 7, 14, 21, 28, 30], # Criação de novas features de lags de 1, 7, ..., 30 dias da variável resposta
                   date_features = ['dayofweek', 'month', 'year', 'quarter', 'day', 'week'], # Features de data
                   lag_transforms = {
                       1: [expanding_mean],
                       7: [rolling_mean_7],
                       14: [rolling_mean_14],
                       21: [rolling_mean_21],
                       28: [rolling_mean_28],
                   }
           )

model_ml.fit(Y_df.reset_index(), id_col = 'unique_id', time_col = 'ds', target_col = 'y', fitted = True)

MLForecast(models=[LinearRegression], freq=D, lag_features=['lag1', 'lag7', 'lag14', 'lag21', 'lag28', 'lag30', 'expanding_mean_lag1', 'rolling_mean_7_lag7', 'rolling_mean_14_lag14', 'rolling_mean_21_lag21', 'rolling_mean_28_lag28'], date_features=['dayofweek', 'month', 'year', 'quarter', 'day', 'week'], num_threads=6)

In [52]:
#---- Predict

Y_hat_df_ml = model_ml.predict(h = horizon_forecast)

In [53]:
#---- Reconciliação

reconcilers_ml = [OptimalCombination(method = 'ols', nonnegative = True)]

hrec_ml = HierarchicalReconciliation(reconcilers = reconcilers_ml)

Y_rec_df_ml = hrec_ml.reconcile(Y_hat_df = Y_hat_df_ml, 
                                S = S_df,
                                tags = tags)

In [54]:
Y_rec_df_ml

Unnamed: 0_level_0,index,ds,LinearRegression,index/OptimalCombination_method-ols_nonnegative-True,LinearRegression/OptimalCombination_method-ols_nonnegative-True
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
EastNorthCentral,0,2009-07-29,961.552246,277.399994,961.543640
EastNorthCentral,1,2009-07-30,961.636475,279.200012,961.614746
EastNorthCentral,2,2009-07-31,962.255371,281.000000,962.216553
EastNorthCentral,3,2009-08-01,962.925781,282.799988,962.875000
EastNorthCentral,4,2009-08-02,962.776611,284.600006,962.709961
...,...,...,...,...,...
SouthCentral/Tennessee/womens_shoes,2161,2009-08-12,30.173487,307.890717,30.199209
SouthCentral/Tennessee/womens_shoes,2162,2009-08-13,30.207083,308.005463,30.233267
SouthCentral/Tennessee/womens_shoes,2163,2009-08-14,30.200855,308.120209,30.232256
SouthCentral/Tennessee/womens_shoes,2164,2009-08-15,30.220806,308.234985,30.262022


In [None]:
#---- Próximos passos:

# - Juntar tudo isso em uma função
# - Colocar ifs se depois a pessoa pode querer ou não usar ts ou ml
# - Criar outra função para organizar em um dataframe do pandas

In [None]:
def apply_models(Y_df: pd.DataFrame, S_df: pd.DataFrame, tags: dict, ts_models: list, ml_models: list, hier_methods: list, horizon_forecast: int)