In [1]:
%load_ext autotime

time: 101 µs (started: 2024-01-17 17:56:42 -03:00)


In [2]:
#---- Manipulação de dados:

import pandas as pd
import numpy as np

#---- Modelagem:

from hierarchicalforecast.utils import aggregate
from statsforecast import StatsForecast
from statsforecast.models import Naive, AutoARIMA, HoltWinters, AutoETS
from mlforecast import MLForecast
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

#---- Reconciliação

from hierarchicalforecast.methods import BottomUp, TopDown, ERM, OptimalCombination, MinTrace, MiddleOut
from hierarchicalforecast.core import HierarchicalReconciliation

  from tqdm.autonotebook import tqdm


time: 2.02 s (started: 2024-01-17 17:56:42 -03:00)


In [3]:
#---- Read data

def read_data(path: str):
    """Load the CSV located at ``path`` (a local path or a URL) into a DataFrame."""
    return pd.read_csv(path)

# CSV hosted in the aws-samples GitHub repository (read over HTTPS on every run).
path_data = 'https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-hierarchical-forecasting/main/retail-usa-clothing.csv'

# Raw frame exactly as read from the CSV.
dados = read_data(path = path_data)

# Work on a copy so `dados` keeps the raw data; the trailing comment is an optional date filter.
df = dados.copy()#.query('date >= "2005-01-01"')

df.head()

Unnamed: 0,date,state,item,quantity,region,country
0,1997-11-25,NewYork,mens_clothing,8,Mid-Alantic,USA
1,1997-11-26,NewYork,mens_clothing,9,Mid-Alantic,USA
2,1997-11-27,NewYork,mens_clothing,11,Mid-Alantic,USA
3,1997-11-28,NewYork,mens_clothing,11,Mid-Alantic,USA
4,1997-11-29,NewYork,mens_clothing,10,Mid-Alantic,USA


time: 1.02 s (started: 2024-01-17 17:56:44 -03:00)


In [4]:
#---- Transform data:

def clean_data(df: pd.DataFrame):
    """Prepare the raw sales frame for the forecasting libraries.

    Steps, in order:
      1. drop the ``country`` column;
      2. parse ``date`` as datetime;
      3. rename ``date`` -> ``ds`` and ``quantity`` -> ``y``.

    A new DataFrame is returned; the frame passed in is not modified.
    """
    new_names = {'date': 'ds', 'quantity': 'y'}

    return (
        df
        .drop(columns = 'country')
        .assign(date = lambda frame: pd.to_datetime(frame['date']))
        .rename(columns = new_names)
    )

# NOTE: this cell rebinds `df` to its cleaned version, so it is not idempotent.
# Running it twice passes a frame that no longer has `country`/`date`, and
# `clean_data` fails on the missing column. Re-run the loading cell first.
df = clean_data(df = df)

df.head()

Unnamed: 0,ds,state,item,y,region
0,1997-11-25,NewYork,mens_clothing,8,Mid-Alantic
1,1997-11-26,NewYork,mens_clothing,9,Mid-Alantic
2,1997-11-27,NewYork,mens_clothing,11,Mid-Alantic
3,1997-11-28,NewYork,mens_clothing,11,Mid-Alantic
4,1997-11-29,NewYork,mens_clothing,10,Mid-Alantic


time: 109 ms (started: 2024-01-17 17:56:45 -03:00)


In [5]:
#---- Hierachical data format:

def format_hierarchical_df(df: pd.DataFrame, cols_hierarchical: list):
    """Aggregate ``df`` along the hierarchy described by ``cols_hierarchical``.

    ``cols_hierarchical`` lists the hierarchy columns from the top level down.
    Each level of the spec handed to ``aggregate`` is a cumulative prefix of
    that list: ``[[c1], [c1, c2], ..., [c1, ..., cn]]``.

    Returns the three objects produced by ``aggregate``, in the same order:
    ``(Y_df, S_df, tags)``.
    """
    # Build the cumulative prefixes, one per hierarchy depth.
    level_specs = []
    for depth in range(1, len(cols_hierarchical) + 1):
        level_specs.append(cols_hierarchical[:depth])

    # Delegate the actual aggregation to hierarchicalforecast.
    series_df, summing_df, level_tags = aggregate(df = df, spec = level_specs)

    return series_df, summing_df, level_tags

# Hierarchy columns ordered from the top level down: region > state > item.
cols_hierarchical = ['region', 'state', 'item']

# Y_df: aggregated series, S_df: aggregation matrix, tags: series ids per level (as returned by `aggregate`).
Y_df, S_df, tags = format_hierarchical_df(df = df, cols_hierarchical = cols_hierarchical)

time: 765 ms (started: 2024-01-17 17:56:45 -03:00)


In [6]:
# Preview of the aggregated series: indexed by unique_id, with `ds` and `y` columns.
Y_df.head()

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
EastNorthCentral,1997-11-25,507
EastNorthCentral,1997-11-26,504
EastNorthCentral,1997-11-27,510
EastNorthCentral,1997-11-28,507
EastNorthCentral,1997-11-29,513


time: 7.81 ms (started: 2024-01-17 17:56:46 -03:00)


In [7]:
# Preview of the aggregation matrix: one row per series id, one column per region/state/item series.
S_df.head()

Unnamed: 0,EastNorthCentral/Illinois/kids_clothing,EastNorthCentral/Illinois/kids_shoes,EastNorthCentral/Illinois/mens_clothing,EastNorthCentral/Illinois/womens_clothing,EastNorthCentral/Illinois/womens_shoes,EastNorthCentral/Indiana/kids_clothing,EastNorthCentral/Indiana/kids_shoes,EastNorthCentral/Indiana/mens_clothing,EastNorthCentral/Indiana/womens_clothing,EastNorthCentral/Indiana/womens_shoes,...,SouthCentral/Mississippi/kids_clothing,SouthCentral/Mississippi/kids_shoes,SouthCentral/Mississippi/mens_clothing,SouthCentral/Mississippi/womens_clothing,SouthCentral/Mississippi/womens_shoes,SouthCentral/Tennessee/kids_clothing,SouthCentral/Tennessee/kids_shoes,SouthCentral/Tennessee/mens_clothing,SouthCentral/Tennessee/womens_clothing,SouthCentral/Tennessee/womens_shoes
EastNorthCentral,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mid-Alantic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NewEngland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pacific,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SouthCentral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


time: 43.9 ms (started: 2024-01-17 17:56:46 -03:00)


In [8]:
#---- Time-series model

# Holt-Winters with a 7-observation seasonal period (weekly when the data is daily).
# NOTE(review): error_type = 'M' selects the multiplicative error form in statsforecast;
# the earlier comment described it as additive, which would be 'A'. Confirm which is intended.
hw = HoltWinters(season_length = 7, error_type = 'M')

time: 609 µs (started: 2024-01-17 17:56:46 -03:00)


In [9]:
#---- Machine-learning model

lin_reg = LinearRegression() # Linear regression

#---- Rolling-mean transforms (passed later as lag transforms):

@njit
def rolling_mean_7(x):
    """Rolling mean of ``x`` over a window of 7 observations (numba-compiled)."""
    return rolling_mean(x, window_size = 7)

@njit
def rolling_mean_14(x):
    """Rolling mean of ``x`` over a window of 14 observations (numba-compiled)."""
    return rolling_mean(x, window_size = 14)

@njit
def rolling_mean_21(x):
    """Rolling mean of ``x`` over a window of 21 observations (numba-compiled)."""
    return rolling_mean(x, window_size = 21)

@njit
def rolling_mean_28(x):
    """Rolling mean of ``x`` over a window of 28 observations (numba-compiled)."""
    return rolling_mean(x, window_size = 28)

time: 4.61 ms (started: 2024-01-17 17:56:46 -03:00)


In [10]:
def apply_time_series_models(Y_df: pd.DataFrame, 
                             S_df: pd.DataFrame,
                             tags: dict,
                             freq: str,
                             ts_models: list,
                             reconcilers_ts: list,
                             horizon_forecast: int = 30):
    """Forecast every series with statsforecast models and reconcile the result.

    Parameters
    ----------
    Y_df : pd.DataFrame
        Series to forecast, as produced by ``format_hierarchical_df``.
    S_df : pd.DataFrame
        Aggregation matrix, passed to the reconciliation step as ``S``.
    tags : dict
        Hierarchy tags, passed to the reconciliation step.
    freq : str
        Frequency string handed to ``StatsForecast`` (e.g. ``'D'``).
    ts_models : list
        statsforecast model instances.
    reconcilers_ts : list
        hierarchicalforecast reconciler instances.
    horizon_forecast : int, default 30
        Number of future steps to forecast.

    Returns
    -------
    pd.DataFrame
        The reconciled forecasts with the index reset into columns.
    """
    model_ts = StatsForecast(ts_models, 
                             freq = freq,
                             n_jobs = -1)

    # `forecast` estimates the models itself, so a separate `fit` call beforehand
    # only repeats that work. Passing the data here also avoids relying on state
    # left behind by an earlier `fit`.
    Y_hat_df_ts = model_ts.forecast(df = Y_df, h = horizon_forecast)

    hrec_ts = HierarchicalReconciliation(reconcilers = reconcilers_ts)

    Y_rec_df_ts = hrec_ts.reconcile(Y_hat_df = Y_hat_df_ts, 
                                    S = S_df,
                                    tags = tags)

    return Y_rec_df_ts.reset_index()

time: 2.35 ms (started: 2024-01-17 17:56:46 -03:00)


In [11]:
# a = apply_time_series_models(Y_df = Y_df,
#                          S_df = S_df,
#                          tags = tags,
#                          freq = 'D',
#                          ts_models = [hw],
#                          reconcilers_ts = [BottomUp()],
#                          horizon_forecast = 30)

# a

time: 1.31 ms (started: 2024-01-17 17:56:46 -03:00)


In [12]:
def apply_machine_learning_models(Y_df: pd.DataFrame, 
                                  S_df: pd.DataFrame,
                                  tags: dict,
                                  freq: str,
                                  ml_models: list, 
                                  lags_ml: list,
                                  date_features_ml: list,
                                  lag_transforms_ml: dict,
                                  reconcilers_ml: list,
                                  horizon_forecast: int = 30,
                                  num_threads: int = 6):
    """Forecast every series with mlforecast models and reconcile the result.

    Parameters
    ----------
    Y_df : pd.DataFrame
        Series to forecast; its index is reset before fitting so that
        ``unique_id``, ``ds`` and ``y`` are available as columns.
    S_df : pd.DataFrame
        Aggregation matrix, passed to the reconciliation step as ``S``.
    tags : dict
        Hierarchy tags, passed to the reconciliation step.
    freq : str
        Frequency string handed to ``MLForecast`` (e.g. ``'D'``).
    ml_models : list
        Regressor instances handed to ``MLForecast`` as ``models``.
    lags_ml : list
        Lags of the target used as features.
    date_features_ml : list
        Date attributes used as features.
    lag_transforms_ml : dict
        Mapping ``lag -> list of transforms`` applied to the lagged target.
    reconcilers_ml : list
        hierarchicalforecast reconciler instances.
    horizon_forecast : int, default 30
        Number of future steps to forecast.
    num_threads : int, default 6
        Thread count handed to ``MLForecast``; the default keeps the value
        that used to be hard-coded.

    Returns
    -------
    pd.DataFrame
        The reconciled forecasts with the index reset into columns.
    """
    model_ml = MLForecast(models = ml_models,
                          freq = freq, 
                          num_threads = num_threads,
                          lags = lags_ml, 
                          date_features = date_features_ml, 
                          lag_transforms = lag_transforms_ml
                         )

    model_ml.fit(Y_df.reset_index(), id_col = 'unique_id', time_col = 'ds', target_col = 'y')

    Y_hat_df_ml = model_ml.predict(h = horizon_forecast)

    hrec_ml = HierarchicalReconciliation(reconcilers = reconcilers_ml)

    Y_rec_df_ml = hrec_ml.reconcile(Y_hat_df = Y_hat_df_ml, 
                                    S = S_df,
                                    tags = tags)

    # Drop every column whose name contains 'index'.
    # NOTE(review): presumably these are leftovers of an earlier index reset; verify.
    kept_columns = [col for col in Y_rec_df_ml.columns if 'index' not in col]
    Y_rec_df_ml = Y_rec_df_ml[kept_columns]

    return Y_rec_df_ml.reset_index()

time: 2.92 ms (started: 2024-01-17 17:56:46 -03:00)


In [13]:
# b = apply_machine_learning_models(Y_df = Y_df,
#                               S_df = S_df,
#                               tags = tags,
#                               freq = 'D',
#                               ml_models = [lin_reg],
#                               lags_ml = [1, 7, 14, 21, 28, 30],
#                               date_features_ml = ['dayofweek', 'month', 'year', 'quarter', 'day', 'week'],
#                               lag_transforms_ml = {
#                                   1: [expanding_mean],
#                                   7: [rolling_mean_7],
#                                   14: [rolling_mean_14],
#                                   21: [rolling_mean_21],
#                                   28: [rolling_mean_28],
#                               },
#                               reconcilers_ml = [OptimalCombination(method = 'ols', nonnegative = True)],
#                               horizon_forecast = 5)

# b

time: 4.85 ms (started: 2024-01-17 17:56:46 -03:00)


In [14]:
def apply_models(Y_df: pd.DataFrame, 
                 S_df: pd.DataFrame,
                 tags: dict,
                 freq: str,
                 ts_models: list = None,
                 reconcilers_ts: list = None,
                 ml_models: list = None, 
                 lags_ml: list = None,
                 date_features_ml: list = None,
                 lag_transforms_ml: dict = None,
                 reconcilers_ml: list = None,
                 horizon_forecast: int = 30):
    """Run the time-series and/or machine-learning pipelines and join their forecasts.

    Each model family is optional: a family is skipped when its model list is
    ``None`` or empty, so its arguments may simply be omitted.

    Parameters
    ----------
    Y_df, S_df, tags, freq
        Forwarded unchanged to the pipeline functions.
    ts_models, reconcilers_ts : list, optional
        Models and reconcilers for ``apply_time_series_models``.
    ml_models, reconcilers_ml : list, optional
        Models and reconcilers for ``apply_machine_learning_models``.
    lags_ml, date_features_ml : list, optional
        Feature settings for the machine-learning pipeline.
    lag_transforms_ml : dict, optional
        Lag transforms for the machine-learning pipeline.
    horizon_forecast : int, default 30
        Number of future steps to forecast (same default as both pipelines).

    Returns
    -------
    pd.DataFrame
        Outer join of both result sets on ``ds`` and ``unique_id``. When neither
        family is requested the frame is empty with just those two columns.
    """
    join_keys = ['ds', 'unique_id']

    if ts_models:

        print('Executando os modelos de séries temporais...')

        ts_recommendations = apply_time_series_models(Y_df = Y_df,
                                                      S_df = S_df,
                                                      tags = tags,
                                                      freq = freq,
                                                      ts_models = ts_models,
                                                      reconcilers_ts = reconcilers_ts,
                                                      horizon_forecast = horizon_forecast)
    else:

        # Placeholder holding only the join keys, so the merge below still works.
        ts_recommendations = pd.DataFrame(columns = join_keys)

    if ml_models:

        print('Executando os modelos de Machine Learning')

        ml_recommendations = apply_machine_learning_models(Y_df = Y_df,
                                                           S_df = S_df,
                                                           tags = tags,
                                                           freq = freq,
                                                           ml_models = ml_models,
                                                           lags_ml = lags_ml,
                                                           date_features_ml = date_features_ml,
                                                           lag_transforms_ml = lag_transforms_ml,
                                                           reconcilers_ml = reconcilers_ml,
                                                           horizon_forecast = horizon_forecast)
    else:

        ml_recommendations = pd.DataFrame(columns = join_keys)

    result_df = ts_recommendations.merge(ml_recommendations, on = join_keys, how = 'outer')

    return result_df

time: 3.9 ms (started: 2024-01-17 17:56:46 -03:00)


In [15]:
# Run both pipelines on the aggregated series: daily frequency, 30-step horizon.
# Time series: Holt-Winters reconciled with BottomUp.
# Machine learning: linear regression on target lags, date attributes and
# expanding/rolling means of the lagged target, reconciled with non-negative OLS.
df_recommendations =  apply_models(Y_df = Y_df,
                                   S_df = S_df,
                                   tags = tags,
                                   freq = 'D',
                                   ts_models = [hw],
                                   reconcilers_ts = [BottomUp()],
                                   ml_models = [lin_reg],
                                   lags_ml = [1, 7, 14, 21, 28, 30],
                                   date_features_ml = ['dayofweek', 'month', 'year', 'quarter', 'day', 'week'],
                                   lag_transforms_ml = {
                                       1: [expanding_mean],
                                       7: [rolling_mean_7],
                                       14: [rolling_mean_14],
                                       21: [rolling_mean_21],
                                       28: [rolling_mean_28],
                                   },
                                   reconcilers_ml = [OptimalCombination(method = 'ols', nonnegative = True)],
                                   horizon_forecast = 30)

Executando os modelos de séries temporais...


Forecast: 100%|█████████████████████████████████| 14/14 [01:04<00:00,  4.62s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [01:04<00:00,  4.63s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [01:04<00:00,  4.64s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [01:05<00:00,  4.67s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [01:05<00:00,  4.67s/it]
Forecast: 100%|█████████████████████████████████| 14/14 [01:05<00:00,  4.71s/it]
Forecast: 100%|█████████████████████████████████| 15/15 [01:06<00:00,  4.41s/it]
Forecast: 100%|█████████████████████████████████| 15/15 [01:06<00:00,  4.45s/it]


Executando os modelos de Machine Learning
time: 2min 23s (started: 2024-01-17 17:56:46 -03:00)


In [16]:
def clean_recommendations(df_rec: pd.DataFrame, hierarchy_cols: list = None):
    """Keep only bottom-level forecasts and split ``unique_id`` into hierarchy columns.

    Parameters
    ----------
    df_rec : pd.DataFrame
        Forecasts with ``unique_id``, ``ds`` and one column per model/reconciler
        pair. Model columns are recognised by a ``/`` in their name.
    hierarchy_cols : list, optional
        Hierarchy column names ordered from the top level down. When omitted,
        the notebook-level ``cols_hierarchical`` list is used.

    Returns
    -------
    pd.DataFrame
        One row per bottom-level series and date, with columns
        ``hierarchy_cols + ['date'] + model columns`` and a fresh 0..n-1 index.
        An empty frame with those columns is returned when ``df_rec`` has no
        bottom-level row. ``df_rec`` itself is not modified.
    """
    if hierarchy_cols is None:
        # Fall back to the notebook global so existing one-argument calls keep working.
        hierarchy_cols = cols_hierarchical
    hierarchy_cols = list(hierarchy_cols)
    n_levels = len(hierarchy_cols)

    model_col = [col for col in df_rec.columns if '/' in col]

    # Depth of a series in the hierarchy = number of '/' separators + 1.
    depth = df_rec['unique_id'].str.count('/') + 1
    bottom = df_rec.loc[depth == n_levels, ['unique_id', 'ds'] + model_col]

    # Split ids such as 'region/state/item' into one column per level. The
    # reindex guarantees n_levels columns even when `bottom` has no rows.
    parts = bottom['unique_id'].str.split('/', expand = True)
    parts = parts.reindex(columns = range(n_levels))
    parts.columns = hierarchy_cols

    values = bottom[['ds'] + model_col].rename(columns = {'ds': 'date'})

    # `parts` and `values` share the index of `bottom`, so they align row by row.
    return pd.concat([parts, values], axis = 1).reset_index(drop = True)

time: 2.72 ms (started: 2024-01-17 17:59:10 -03:00)


In [17]:
# Bottom-level forecasts only, with unique_id split back into region/state/item columns.
df_result = clean_recommendations(df_rec = df_recommendations)

df_result

Unnamed: 0,region,state,item,date,HoltWinters/BottomUp,LinearRegression/OptimalCombination_method-ols_nonnegative-True
0,EastNorthCentral,Illinois,kids_clothing,2009-07-29,95.041901,94.126701
1,EastNorthCentral,Illinois,kids_clothing,2009-07-30,95.028450,94.189064
2,EastNorthCentral,Illinois,kids_clothing,2009-07-31,95.250153,94.463600
3,EastNorthCentral,Illinois,kids_clothing,2009-08-01,95.227325,94.793335
4,EastNorthCentral,Illinois,kids_clothing,2009-08-02,94.962692,95.064285
...,...,...,...,...,...,...
2725,SouthCentral,Tennessee,womens_shoes,2009-08-23,29.568060,29.952673
2726,SouthCentral,Tennessee,womens_shoes,2009-08-24,29.719357,29.963165
2727,SouthCentral,Tennessee,womens_shoes,2009-08-25,29.792229,29.969330
2728,SouthCentral,Tennessee,womens_shoes,2009-08-26,29.675941,29.978373


time: 38.8 ms (started: 2024-01-17 17:59:10 -03:00)
