In [1]:
import datasets
from datasets import load_dataset
import pandas as pd
from tsfresh.feature_extraction import MinimalFCParameters
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("autogluon/chronos_datasets", "m4_weekly",)

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'timestamp', 'target', 'category'],
        num_rows: 359
    })
})

In [4]:
df = ds['train'].to_pandas()

In [5]:
# You can train test split df here
df

Unnamed: 0,id,timestamp,target,category
0,T000000,"[1975-08-03T12:00:00.000, 1975-08-10T12:00:00....","[1089.2, 1078.91, 1079.88, 1063.58, 1060.61, 1...",Other
1,T000001,"[1984-04-01T12:00:00.000, 1984-04-08T12:00:00....","[195.928, 194.796, 192.71, 190.288, 188.77, 18...",Other
2,T000002,"[1975-06-01T12:00:00.000, 1975-06-08T12:00:00....","[258.7, 259.8, 260.2, 260.8, 261.1, 263.1, 264...",Other
3,T000003,"[1967-07-02T12:00:00.000, 1967-07-09T12:00:00....","[3460.0, 3340.0, 2770.0, 2520.0, 2740.0, 2760....",Other
4,T000004,"[1986-08-03T12:00:00.000, 1986-08-10T12:00:00....","[4696.0, 4765.0, 4144.0, 3945.0, 4733.0, 4330....",Other
...,...,...,...,...
354,T000354,"[2016-01-03T12:00:00.000, 2016-01-10T12:00:00....","[4798.0, 5192.0, 4194.0, 4217.0, 3926.0, 3987....",Micro
355,T000355,"[2016-01-03T12:00:00.000, 2016-01-10T12:00:00....","[4923.0, 6382.0, 4387.0, 4979.0, 4660.0, 4437....",Micro
356,T000356,"[2016-01-03T12:00:00.000, 2016-01-10T12:00:00....","[4401.0, 4955.0, 3680.0, 4214.0, 4139.0, 3384....",Micro
357,T000357,"[2016-01-03T12:00:00.000, 2016-01-10T12:00:00....","[5977.0, 7770.0, 5843.0, 6169.0, 5990.0, 5537....",Micro


In [6]:
def to_pandas(ds: datasets.Dataset) -> "pd.DataFrame":
    """Convert dataset to long data frame format."""
    sequence_columns = [col for col in ds.features if isinstance(ds.features[col], datasets.Sequence)]
    return ds.to_pandas().explode(sequence_columns).infer_objects()

In [7]:
pandas_ds = to_pandas(ds['train'])

## Example of it working with 1 time series 

In [8]:
t = df[['timestamp','target']].iloc[0] 

# Do not split S
S = pd.DataFrame( t.target, index=t.timestamp)

# The id of the time series
S['id'] = 1
S = S.reset_index()

In [9]:
features =  MinimalFCParameters()
# add additional features if needed: https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html
additional = {
    'kurtosis':None}
features.update(additional)



# This is taking features for all of time 
# You should filter to the last N days and then do extract_features
from tsfresh import extract_features

for time_range in [30, 60, 180, ]:
    filtered_time_series = S.iloc[time_range]
    result_for_1_time_range = extract_features(S, column_id="id", column_sort="index", default_fc_parameters=features)

    # Append the results for each time range into a new object
    result_for_1_time_range

Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.48s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.20s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.45s/it]


In [10]:
result_for_1_time_range

Unnamed: 0,0__sum_values,0__median,0__mean,0__length,0__standard_deviation,0__variance,0__root_mean_square,0__maximum,0__absolute_maximum,0__minimum,0__kurtosis
1,19606106.67,4392.895,8944.391729,2192.0,10999.09575,120980100.0,14176.820896,41304.72,41304.72,1049.32,1.933077


## Example working with all time series at once

In [11]:
pandas_ds

Unnamed: 0,id,timestamp,target,category
0,T000000,1975-08-03 12:00:00,1089.20,Other
0,T000000,1975-08-10 12:00:00,1078.91,Other
0,T000000,1975-08-17 12:00:00,1079.88,Other
0,T000000,1975-08-24 12:00:00,1063.58,Other
0,T000000,1975-08-31 12:00:00,1060.61,Other
...,...,...,...,...
358,T000358,2017-09-10 12:00:00,3386.00,Micro
358,T000358,2017-09-17 12:00:00,3627.00,Micro
358,T000358,2017-09-24 12:00:00,3299.00,Micro
358,T000358,2017-10-01 12:00:00,3743.00,Micro


In [12]:
pandas_ds['id'] = pandas_ds['id'].str.replace('T','').astype(int)

In [13]:
pandas_ds = pandas_ds.drop(columns='category')

In [14]:
pandas_ds.dtypes

id                    int64
timestamp    datetime64[ns]
target              float64
dtype: object

In [15]:
result_for_all_ids_and_alldates = extract_features(
    pandas_ds,
    column_id="id",
    column_sort="timestamp",
    default_fc_parameters=MinimalFCParameters()
    )

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.97it/s]


In [16]:
result_for_all_ids_and_alldates

Unnamed: 0,target__sum_values,target__median,target__mean,target__length,target__standard_deviation,target__variance,target__root_mean_square,target__maximum,target__absolute_maximum,target__minimum
0,1.960611e+07,4392.895,8944.391729,2192.0,10999.095750,1.209801e+08,14176.820896,41304.72,41304.72,1049.320
1,1.892440e+06,584.531,1098.339961,1723.0,1159.626742,1.344734e+06,1597.211586,4155.19,4155.19,186.721
2,7.989222e+06,2507.900,3646.381561,2191.0,2605.907085,6.790752e+06,4481.835586,10089.10,10089.10,258.700
3,9.333605e+06,3354.785,3576.093851,2610.0,1155.673787,1.335582e+06,3758.194930,10735.00,10735.00,1330.000
4,5.507312e+06,3108.000,3407.990099,1616.0,1497.075882,2.241236e+06,3722.315504,21215.00,21215.00,1148.000
...,...,...,...,...,...,...,...,...,...,...
354,3.006700e+05,2880.000,3233.010753,93.0,985.815927,9.718330e+05,3379.969167,5448.00,5448.00,1971.000
355,4.019760e+05,4387.000,4322.322581,93.0,782.039079,6.115851e+05,4392.500155,7267.00,7267.00,2405.000
356,3.367430e+05,3521.000,3620.892473,93.0,587.196812,3.448001e+05,3668.196069,5546.00,5546.00,2249.000
357,5.077660e+05,5445.000,5459.849462,93.0,911.822327,8.314200e+05,5535.465302,7770.00,7770.00,3556.000


In [27]:
df['y'] = df['target'].apply(lambda x: np.array(x[-40:]))

In [32]:
# This is important 
df['y'][0].shape

(40,)

In [33]:
dataset = pd.merge(result_for_all_ids_and_alldates, df[['y']], left_index=True, right_index=True)

In [57]:
y = np.array(dataset['y'].to_list())

In [70]:
X = dataset.iloc[:,list(dataset.columns != 'y')]

In [71]:
# This is important 

X.shape

(359, 10)

In [59]:
# This is important 

y.shape

(359, 40)

In [None]:
# Note: I am skipping cross validation here
# get this working before adding cross validation to ur project

In [72]:
import numpy as np
from xgboost import XGBRegressor

model = XGBRegressor()
model.fit(X, y)

# test on same data for demo (do not do this in ur project)
preds = model.predict(X)
preds

array([[38874.13  , 38872.086 , 38658.95  , ..., 35777.7   , 34051.926 ,
        34058.85  ],
       [ 3831.4788,  3925.2114,  3906.888 , ...,  3523.983 ,  3458.1658,
         3365.8467],
       [ 9354.389 ,  9509.618 ,  9685.683 , ..., 10075.11  , 10000.262 ,
         9817.71  ],
       ...,
       [ 4059.9307,  4482.8306,  3675.1565, ...,  3282.4524,  4031.137 ,
         3759.3281],
       [ 5704.4834,  5276.334 ,  5657.639 , ...,  5348.448 ,  6163.2896,
         5868.8916],
       [ 3135.7136,  4160.616 ,  3728.5403, ...,  3304.361 ,  3766.5054,
         3942.3076]], dtype=float32)