In [1]:
import pandas as pd
from statsforecast import StatsForecast


  from tqdm.autonotebook import tqdm


In [2]:
# Column dtypes applied while parsing: series identifier as a categorical,
# target as an integer.
col_dtypes = {
    'unique_id': 'category',
    'y': int,
}

# Read the processed dataset and convert the `ds` column to datetimes.
df = pd.read_csv(
    '../data/processed/subgrups-dataset.csv',
    encoding='utf8',
    dtype=col_dtypes,
).assign(ds=lambda frame: pd.to_datetime(frame['ds']))

df

Unnamed: 0,ds,unique_id,y
0,2018-01-01,0101,0
1,2018-02-01,0101,20
2,2018-03-01,0101,1
3,2018-04-01,0101,40
4,2018-05-01,0101,6
...,...,...,...
2824,2023-05-01,0702,55
2825,2023-06-01,0702,33
2826,2023-07-01,0702,27
2827,2023-08-01,0702,21


In [3]:
# Visual check of the loaded series before any modelling.
# plot_random=False: presumably picks series deterministically instead of
# sampling them at random -- confirm against the StatsForecast.plot docs.
StatsForecast.plot(
    df,
    plot_random=False,
)

In [4]:
from statsforecast.models import MSTL

# Models to fit, each already instantiated with its parameters.
# Here: a single MSTL model with a season length of 12.
models = [
    MSTL(season_length=12),
]


In [5]:
"""To instantiate a new StatsForecast object, we need the following parameters:

    df: The dataframe with the training data.
    models: The list of models defined in the previous step.
    freq: A string indicating the frequency of the data. See pandas’ available frequencies.
    n_jobs: An integer that indicates the number of jobs used in parallel processing. Use -1 to select all cores.
"""
# Build the forecaster from the training frame and the model list.
# freq='MS' is the pandas month-start alias (the `ds` values shown are all
# first-of-month dates); n_jobs=-1 asks for all available cores.
sf = StatsForecast(df=df, models=models, freq='MS', n_jobs=-1)

In [6]:
# Forecast settings: 12 steps ahead, with a 99% prediction interval.
horizon = 12
levels = [99]

# fitted=True is presumably what makes the in-sample fitted values available
# to the later forecast_fitted_values() call -- confirm in the docs.
# reset_index() moves the returned index into regular columns.
fcst = sf.forecast(h=horizon, level=levels, fitted=True).reset_index()
fcst.head()


Unnamed: 0,unique_id,ds,MSTL,MSTL-lo-99,MSTL-hi-99
0,101,2023-10-01,336.709625,263.557678,409.861572
1,101,2023-11-01,432.298065,354.153168,510.442963
2,101,2023-12-01,454.318695,367.444794,541.192566
3,101,2024-01-01,472.415558,372.958405,571.872742
4,101,2024-02-01,485.497955,369.913513,601.082397


In [7]:
# Plot the history together with the forecasts for two selected series.
StatsForecast.plot(
    df,
    fcst,
    plot_random=False,
    unique_ids=['0202', '0301'],
)


In [8]:
"""Recover insample forecasts and identify anomalies

In this example, an anomaly will be any observation outside the prediction interval of the insample forecasts for a given confidence level (here we selected 99%). 
Hence, we first need to recover the insample forecasts using the forecast_fitted_values method.
"""
# Fetch the in-sample fitted values from the forecaster and move the
# returned index into regular columns.
insample_forecasts = (
    sf
    .forecast_fitted_values()
    .reset_index()
)
insample_forecasts.head()

Unnamed: 0,unique_id,ds,y,MSTL,MSTL-lo-99,MSTL-hi-99
0,101,2018-01-01,0.0,-3.644952,-76.22316,68.933258
1,101,2018-02-01,20.0,11.617954,-60.960251,84.196159
2,101,2018-03-01,1.0,22.8382,-49.740009,95.416405
3,101,2018-04-01,40.0,20.241737,-52.336472,92.819946
4,101,2018-05-01,6.0,1.567997,-71.010208,74.146202


In [9]:
# Keep the rows whose observed value lies on or beyond either bound of the
# prediction interval for the first configured level (comparisons are
# inclusive, so a value exactly on a bound is flagged as well).
anomalies = insample_forecasts.loc[
    insample_forecasts['y'].ge(insample_forecasts[f'MSTL-hi-{levels[0]}'])
    | insample_forecasts['y'].le(insample_forecasts[f'MSTL-lo-{levels[0]}'])
]
anomalies.head()


Unnamed: 0,unique_id,ds,y,MSTL,MSTL-lo-99,MSTL-hi-99
50,101,2022-03-01,14.0,-68.730148,-141.308365,3.848056
58,101,2022-11-01,136.0,60.154892,-12.423313,132.733093
62,101,2023-03-01,5.0,83.5467,10.968489,156.124908
122,102,2022-06-01,1.0,0.299953,0.029203,0.570704
134,102,2023-06-01,0.0,0.388075,0.117324,0.658826


In [10]:
# Plot the in-sample forecasts with anomaly highlighting enabled.
StatsForecast.plot(
    insample_forecasts,
    plot_random=False,
    plot_anomalies=True,
)


In [12]:
# Same anomaly view, restricted to two selected series.
StatsForecast.plot(
    insample_forecasts,
    unique_ids=['0202', '0301'],
    plot_anomalies=True,
)
