In [1]:
import warnings
import importlib

warnings.filterwarnings("ignore")

from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import pandas as pd

from tsururu.dataset import Pipeline, TSDataset
from tsururu.model_training.validator import HoldOutValidator
from tsururu.models.torch_based.dlinear import DLinear_NN
from tsururu.strategies import MIMOStrategy
from tsururu.transformers import (LagTransformer, SequentialTransformer,
                                  TargetGenerator, UnionTransformer, StandardScalerTransformer)

from tsururu.model_training.trainer import DLTrainer

In [2]:
def get_results(
    cv: int,
    regime: str,
    y_true: Optional[List[np.ndarray]] = None,
    y_pred: Optional[List[np.ndarray]] = None,
    ids: Optional[List[Union[float, str]]] = None,
) -> pd.DataFrame:
    def _get_fold_value(
        value: Optional[Union[float, np.ndarray]], idx: int
    ) -> List[Optional[Union[float, np.ndarray]]]:
        if value is None:
            return [None]
        if isinstance(value[idx], float):
            return value[idx]
        if isinstance(value[idx], np.ndarray):
            return value[idx].reshape(-1)
        raise TypeError(f"Unexpected value type. Value: {value}")

    df_res_dict = {}

    for idx_fold in range(cv):
        # Fill df_res_dict
        for name, value in [("y_true", y_true), ("y_pred", y_pred)]:
            df_res_dict[f"{name}_{idx_fold+1}"] = _get_fold_value(value, idx_fold)
        if regime != "local":
            df_res_dict[f"id_{idx_fold+1}"] = _get_fold_value(ids, idx_fold)

    # Save datasets to specified directory
    df_res = pd.DataFrame(df_res_dict)
    return df_res

In [3]:
def expand_val_with_train(train_data, val_data, id_column, date_column, history):
    L_split_data = train_data[date_column].values[(len(train_data) - history)]
    L_last_train_data = train_data[train_data[date_column] >= L_split_data]
    val_data_expanded = pd.concat((L_last_train_data, val_data))
    val_data_expanded = val_data_expanded.sort_values([id_column, date_column]).reset_index(
        drop=True
    )
    return val_data_expanded


def expand_test_with_val_and_train(
    train_data, val_data, test_data, id_column, date_column, history
):
    unqiue_id_cnt = val_data[id_column].nunique()
    L_split_data = val_data[date_column].values[
        (
            (len(val_data) - history)
            if (len(val_data) // val_data[id_column].nunique() - history) > 0
            else 0
        )
    ]
    L_last_val_data = val_data[val_data[date_column] >= L_split_data]
    if len(val_data) // unqiue_id_cnt - history < 0:
        if (len(train_data) - (history - len(L_last_val_data) / unqiue_id_cnt)) > 0:
            L_split_data = train_data[date_column].values[
                (
                    len(train_data) // unqiue_id_cnt
                    - (history - len(L_last_val_data) // unqiue_id_cnt)
                )
            ]
        else:
            L_split_data = 0
        L_last_train_data = train_data[train_data[date_column] >= L_split_data]
        test_data_expanded = pd.concat((L_last_train_data, L_last_val_data, test_data))
    else:
        test_data_expanded = pd.concat((L_last_val_data, test_data))
    test_data_expanded = test_data_expanded.sort_values([id_column, date_column]).reset_index(
        drop=True
    )
    return test_data_expanded


def get_train_val_test_datasets(dataset_path, columns_params, train_size, test_size, history):
    data = pd.read_csv(dataset_path)

    date_column = columns_params["date"]["columns"][0]
    id_column = columns_params["id"]["columns"][0]

    if dataset_path.parts[-1] in ["ETTh1.csv", "ETTh2.csv", "ETTm1.csv", "ETTm2.csv"]:
        train_val_split_data = "2017-06-25 23:00:00"
        val_test_slit_data = "2017-10-23 23:00:00"
    else:
        train_val_split_data = data[date_column].values[
            int(data[date_column].nunique() * train_size)
        ]
        val_test_slit_data = data[date_column].values[
            int(data[date_column].nunique() * (1 - test_size))
        ]

    train_data = data[data[date_column] <= train_val_split_data]
    val_data = data[
        (data[date_column] > train_val_split_data) & (data[date_column] <= val_test_slit_data)
    ]
    test_data = data[data[date_column] > val_test_slit_data]
    val_data = expand_val_with_train(train_data, val_data, id_column, date_column, history)
    test_data_expanded = expand_test_with_val_and_train(
        train_data, val_data, test_data, id_column, date_column, history
    )

    # train, val and test TSDataset initialization
    train_dataset = TSDataset(
        data=train_data,
        columns_params=columns_params,
    )

    val_dataset = TSDataset(
        data=val_data,
        columns_params=columns_params,
    )

    test_dataset = TSDataset(
        data=test_data_expanded,
        columns_params=columns_params,
    )

    return train_dataset, val_dataset, test_dataset

## Initialize TSDataset, Pipeline, Model, Validator, Strategy

### TSDataset

In [4]:
TRAIN_SIZE = 0.7
TEST_SIZE = 0.2
history = 7

df_path = Path("datasets/global/simulated_data_to_check.csv")

columns_params = {
    "target": {
        "columns": ["value"],
        "type": "continious",
    },
    "date": {
        "columns": ["date"],
        "type": "datetime",
    },
    "id": {
        "columns": ["id"],
        "type": "categorical",
    }
}

train_dataset, val_dataset, test_dataset = get_train_val_test_datasets(
    df_path, columns_params, TRAIN_SIZE, TEST_SIZE, history
)

### Pipeline

In [128]:
pipeline_params = {
        "target": {
            "columns": ["value"],
            "features": {
                "StandardScalerTransformer":
                    {
                        "transform_target": True, 
                        "transform_features": True
                    },
                "LagTransformer": {"lags": 7},
            },
        },
    }
    
pipeline = Pipeline.from_dict(pipeline_params, multivariate=True)

### Trainer

In [129]:
# Configure the model parameters
model = DLinear_NN
model_params = {"moving_avg": 7, "individual": False, "enc_in": None}

validation = HoldOutValidator
validation_params = {"validation_data": val_dataset}

trainer_params = {
    "device": "cpu",
    "num_workers": 0,
    "stop_by_metric": True,
}

trainer = DLTrainer(
    model, 
    model_params, 
    validation, 
    validation_params, 
    **trainer_params
)

### Strategy

In [130]:
horizon = 7
model_horizon = 7
history = 7
step = 1

In [131]:
strategy = MIMOStrategy(
    pipeline=pipeline,
    trainer=trainer,
    horizon=horizon,
    history=history,
    step=step,
)

In [132]:
strategy.fit(train_dataset)

Epoch 1/10, Loss: 0.0908
Validation, Loss: 0.0918, Metric: -0.0918
Validation, Loss: 0.0918, Metric: -0.0918
Model saved to checkpoints/fold_0/model_0.pth
Epoch 2/10, Loss: 0.0738
Validation, Loss: 0.0677, Metric: -0.0677
Validation, Loss: 0.0677, Metric: -0.0677
Model saved to checkpoints/fold_0/model_1.pth
Epoch 3/10, Loss: 0.0606
Validation, Loss: 0.0630, Metric: -0.0630
Validation, Loss: 0.0630, Metric: -0.0630
Model saved to checkpoints/fold_0/model_2.pth
Epoch 4/10, Loss: 0.0500
Validation, Loss: 0.0469, Metric: -0.0469
Validation, Loss: 0.0469, Metric: -0.0469
Model saved to checkpoints/fold_0/model_3.pth
Epoch 5/10, Loss: 0.0415
Validation, Loss: 0.0366, Metric: -0.0366
Validation, Loss: 0.0366, Metric: -0.0366
Model saved to checkpoints/fold_0/model_4.pth
Epoch 6/10, Loss: 0.0344
Validation, Loss: 0.0353, Metric: -0.0353
Validation, Loss: 0.0353, Metric: -0.0353
Removing worst model snapshot: checkpoints/fold_0/model_0.pth
Model saved to checkpoints/fold_0/model_5.pth
Epoch 7/

(22.10271978378296, <tsururu.strategies.mimo.MIMOStrategy at 0x7f1950548400>)

In [133]:
test_df = strategy.predict(test_dataset, test_all=True)[1]

Validation, Loss: nan, Metric: nan


In [134]:
test_df

Unnamed: 0,id,date,value
0,0,2022-03-15,1737.127652
1,0,2022-03-16,1804.999369
2,0,2022-03-17,1805.999753
3,0,2022-03-18,1807.001054
4,0,2022-03-19,1801.094394
...,...,...,...
1955,9,2022-09-22,10994.999592
1956,9,2022-09-23,10996.001086
1957,9,2022-09-24,10989.713151
1958,9,2022-09-25,10992.67563


In [149]:
true = test_dataset.data

merged_df = pd.merge(test_df, true, on=['id', 'date'], suffixes=('_df1', '_df2'))

# Step 3. Select relevant rows from second dataframe
filtered_df = merged_df[['id', 'date', 'value_df2']].rename(columns={'value_df2': 'value'})

print(filtered_df)

     id       date    value
0     0 2022-03-15   1804.0
1     0 2022-03-16   1805.0
2     0 2022-03-17   1806.0
3     0 2022-03-18   1807.0
4     0 2022-03-19   1808.0
...  ..        ...      ...
1955  9 2022-09-22  10995.0
1956  9 2022-09-23  10996.0
1957  9 2022-09-24  10997.0
1958  9 2022-09-25  10998.0
1959  9 2022-09-26  10999.0

[1960 rows x 3 columns]


In [150]:
def scale(df, pipeline): 
    def get_scaler(obj):
        print(obj)
        if isinstance(obj, StandardScalerTransformer):
            return obj
            
        if "transformers" in dir(obj):
            return get_scaler(obj.transformers)
                
        if "transformers_list" in dir(obj):
            curr_obj = None
            for tf in obj.transformers_list:
                curr_obj = get_scaler(tf)
                if isinstance(curr_obj, StandardScalerTransformer):
                    return curr_obj
            
        return None

    scaler = get_scaler(pipeline)
    if scaler is not None:
        new_df = pd.DataFrame(columns=df.columns)
        for i in df["id"].unique():
            temp = df[df["id"] == int(i)].copy()
            temp = scaler._transform_segment(temp, "id")
            temp = temp.drop("value", axis=1)
            temp = temp.rename({"value__standard_scaler": "value"}, axis=1)
            new_df = pd.concat((new_df, temp), axis=0)
        return new_df
    else:
        return df


In [151]:
filtered_df = scale(filtered_df, pipeline)
test_df = scale(test_df, pipeline)

<tsururu.dataset.pipeline.Pipeline object at 0x7f1950544c10>
<tsururu.transformers.base.UnionTransformer object at 0x7f19505450c0>
<tsururu.transformers.base.SequentialTransformer object at 0x7f195032b8b0>
<tsururu.transformers.numeric.StandardScalerTransformer object at 0x7f1950548610>
<tsururu.dataset.pipeline.Pipeline object at 0x7f1950544c10>
<tsururu.transformers.base.UnionTransformer object at 0x7f19505450c0>
<tsururu.transformers.base.SequentialTransformer object at 0x7f195032b8b0>
<tsururu.transformers.numeric.StandardScalerTransformer object at 0x7f1950548610>


In [152]:
np.abs(filtered_df["value"].values - test_df["value"].values).mean()

31.598307548505822

In [154]:
(filtered_df[filtered_df["id"] == 0]["value"].values).mean()

2.7233820755907043

In [42]:
pipeline.fitted_params

AttributeError: 'Pipeline' object has no attribute 'fitted_params'

In [157]:
filtered_df

Unnamed: 0,id,date,value
0,0,2022-03-15,2.241914
1,0,2022-03-16,2.246852
2,0,2022-03-17,2.251790
3,0,2022-03-18,2.256728
4,0,2022-03-19,2.261666
...,...,...,...
1955,9,2022-09-22,3.185098
1956,9,2022-09-23,3.190036
1957,9,2022-09-24,3.194974
1958,9,2022-09-25,3.199912
