In [None]:
# OLD: Baseline notebook for ETH_contest_v2 dataset (see _README.md in 2023-11_ETH_contest_v2)
# NEW 2023-12: this notebook is devoted to ETNA framework approach on synthetic data

# Instructions

In [None]:
# Setup notes (kept as comments so that this code cell does not raise SyntaxError on "Run All"):
# 1. Python 3.10 (ETNA does not support 3.11 yet).
# 2. pip install etna[all]  (a plain `pip install etna` will not work).
# 3. The "MLSD Macro" section needs the data file "demo_data/stockdata_merged.csv.gz".
# 4. Run the notebook; the "MLSD Macro experiments" section should execute.

# Check env

In [None]:
# Print the interpreter version, then list the installed packages whose names
# match (case-insensitively) one of: etn, pand, nump, sci, catb.
!python -V
!pip list | grep -Ei "etn|pand|nump|sci|catb"

# (12-23) MLSD Macro experiments

## Imports

In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
import time

from etna.analysis import plot_backtest
from etna.datasets.tsdataset import TSDataset
from etna.datasets import generate_from_patterns_df
from etna.metrics import MAE, MAPE, SMAPE
from etna.models import NaiveModel, LinearPerSegmentModel, CatBoostPerSegmentModel, ProphetModel, AutoARIMAModel, SARIMAXModel, SeasonalMovingAverageModel
from etna.models.nn import RNNModel
from etna.pipeline import Pipeline
from etna.transforms import LagTransform, LogTransform, LambdaTransform, MeanTransform, MedianTransform, BoxCoxTransform, FilterFeaturesTransform

## Defs

In [None]:
def create_pipeline_and_launch_backtest(model, transforms, tsd_dataset, horizon: int, n_folds: int, metrics=None,
                                        refit=1, return_components=True):
    """Build an ETNA ``Pipeline`` and run ``Pipeline.backtest`` on the given dataset.

    Parameters
    ----------
    model:
        Model instance passed to ``Pipeline``.
    transforms:
        List of transforms passed to ``Pipeline``.
    tsd_dataset:
        Dataset passed to ``backtest`` as ``ts``; it is also printed before the run.
    horizon:
        Forecast horizon of the pipeline.
    n_folds:
        Number of backtest folds.
    metrics:
        List of metric objects for ``backtest``. ``None`` (the default) means ``[MAE()]``.
    refit:
        Forwarded to ``backtest`` unchanged.
    return_components:
        Forwarded to ``backtest`` inside ``forecast_params``.

    Returns
    -------
    tuple
        ``(df_metrics, df_forecast, df_fold_info)`` exactly as returned by ``Pipeline.backtest``.
    """
    # A fresh list per call: a list literal used as a default value would be created once
    # at definition time and shared (together with its MAE instance) between all calls.
    if metrics is None:
        metrics = [MAE()]

    print(tsd_dataset)

    print(f"{horizon=} {n_folds=}")

    pipeline = Pipeline(
        model=model,
        transforms=transforms,
        horizon=horizon,
    )

    df_metrics, df_forecast, df_fold_info = pipeline.backtest(
        ts=tsd_dataset,
        metrics=metrics,
        n_folds=n_folds,
        forecast_params=dict(return_components=return_components),
        refit=refit,
    )

    return df_metrics, df_forecast, df_fold_info

In [None]:
def dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, history_len: int, main_metric="MAE", ts=None):
    """Print a short summary of backtest results and plot the forecasts.

    Parameters
    ----------
    df_metrics, df_forecast, df_fold_info:
        The three objects returned by ``Pipeline.backtest``.
    history_len:
        Forwarded to ``plot_backtest`` as ``history_len``.
    main_metric:
        Name of the column of ``df_metrics`` whose mean is printed.
    ts:
        Dataset to plot the forecasts against. When omitted, the notebook-level
        ``tsd_train_test`` is used (the behaviour before this parameter existed).
    """
    if ts is None:
        # Backward-compatible fallback to the global dataset of the notebook.
        ts = tsd_train_test

    # Print info about folds
    print(f"=== df_fold_info ===\nShape:{df_fold_info.shape}, head:\n{df_fold_info.head(3)}")

    # Print metrics and forecast
    print(f"\n=== df_metrics ===\nShape:{df_metrics.shape}, head:\n{df_metrics.head(4)}")
    print(f"{df_metrics[main_metric].mean()=:.3f}")

    print(f"\n=== df_forecast ===\n{type(df_forecast)=}")

    plot_backtest(df_forecast, ts, history_len=history_len)

## Prepare data

In [None]:
FNAME = r"demo_data/stockdata_merged.csv.gz"

In [None]:
# Load the merged stock data; the "ts" column is parsed as dates and used as the index.
raw_candles = pd.read_csv(FNAME, index_col="ts", parse_dates=["ts"])

# Convert the index to UTC and drop the timezone info (required by Prophet!).
raw_candles.index = raw_candles.index.tz_convert(None)

# Resample to daily candles, forward-filling missing candles, then forward-fill any
# remaining NaNs (rarely occurred).
# TBD: could be critical for some features with large missing data in the end
daily_candles = raw_candles.resample(rule="1d").ffill().ffill()

# Keep only the required period (both bounds inclusive).
# An alternative end date used in earlier runs: 2023-09-30.
period_start = datetime.strptime("2022-05-01", '%Y-%m-%d')
period_end = datetime.strptime("2023-10-30", '%Y-%m-%d')
in_period = (daily_candles.index >= period_start) & (daily_candles.index <= period_end)
df_src = daily_candles[in_period]

df_src

In [None]:
# Prepare columns "timestamp", "segment", "target" that are required by ETNA.
# `assign` builds a new frame instead of writing columns into the filtered `df_src`,
# which avoids SettingWithCopyWarning and leaves the frame shown by the previous cell intact.
# TBD: check if "target" already exists in the source data.
df_etna = df_src.assign(
    timestamp=df_src.index,              # Copy of the index. TBD: try to rename the index instead
    segment="OOT predictions",           # Segments are required by ETNA
    target=df_src["adj_close_ALI=F"],
)

# Exogenous features. Earlier experiments also used "adj_close_GC=F" here.
df_exo = df_etna[["timestamp", "segment", "adj_close_BTC=F"]]

df_src = df_etna[["timestamp", "segment", "target"]]

In [None]:
tsd_train_test = TSDataset(df=TSDataset.to_dataset(df_src), df_exog=TSDataset.to_dataset(df_exo), freq="D")
#tsd_train_test = TSDataset(df=TSDataset.to_dataset(df_src), freq="D")
tsd_train_test

## Single cycle

In [None]:
%%time
transforms=[
    #FilterFeaturesTransform(include=[])
    LogTransform(in_column="target"),
    #MeanTransform(in_column="target", window=7, out_column="target_mean_7"),
    #MedianTransform(in_column="target", window
    ##LagTransform(in_column="target", lags=[1, 2, 3, 4]),
    #LagTransform(lags=[1], in_column="target"),
    ##LagTransform(in_column="adj_close_BTC=F", lags=[1, 2]),
    #FilterFeaturesTransform(exclude=[("dummy_segment", "adj_close_BTC=F"), ("dummy_segment", "adj_close_GC=F")])  # CatBoostError: Input data must have at least one feature
]
#transforms=[]
#model = CatBoostPerSegmentModel(iterations=100, random_state=42)
#model = NaiveModel(lag=1)
#model = ProphetModel()
#model = AutoARIMAModel()
model = SARIMAXModel()
#model = LinearPerSegmentModel()
df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(model=model, tsd_dataset=tsd_train_test, 
                                                                            horizon=14, n_folds=2, transforms=transforms, refit=1, metrics=[MAPE()])

In [None]:
dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, main_metric="MAPE", history_len=60)
df_forecast

In [None]:
#df_metrics

## Big cycle

In [None]:
tsd_train_test

In [None]:
%%time
res = []
# Wider grids tried in earlier experiments (kept for reference):
#   iters in [100, 200, 50] (also 10, 20, 500, 1000),
#   max_lags_target in [1, 2, 3, 4, 5], max_lags_btc in [1, 2, 3, 4, 5]
for iters in [100]:
    for max_lags_target in [4]:
        for max_lags_btc in [2]:
            for model in [
                CatBoostPerSegmentModel(iterations=iters, random_state=42),
                NaiveModel(lag=1),
                # LinearPerSegmentModel() is left out: it crashes on horizon > 1
                SeasonalMovingAverageModel(),
                AutoARIMAModel(),
                SARIMAXModel(),
                ProphetModel(),
                RNNModel(
                    decoder_length=1,
                    encoder_length=2 * 1,
                    input_size=2,  # Number of features (including target)
                    trainer_params=dict(max_epochs=10),
                    lr=1E-3,
                ),
            ]:
                start_time = time.time()

                # Every model is trained on the log-transformed target.
                transforms = [
                    LogTransform(in_column="target"),
                ]
                # The per-segment regression models additionally get lag features.
                # The type is checked directly instead of searching for the class name in str(model).
                # NOTE(review): these lags are shorter than horizon=14, so the lag features are
                # presumably NaN for most of each forecast window — confirm this is intended.
                if isinstance(model, (LinearPerSegmentModel, CatBoostPerSegmentModel)):
                    transforms.extend([
                        LagTransform(in_column="target", lags=range(1, max_lags_target+1)),
                        LagTransform(in_column="adj_close_BTC=F", lags=range(1, max_lags_btc + 1)),
                    ])

                # An earlier variant used horizon=1, n_folds=30.
                df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(
                    model=model, tsd_dataset=tsd_train_test,
                    horizon=14, n_folds=2, transforms=transforms, refit=1, metrics=[MAPE()], return_components=False)

                metric = df_metrics["MAPE"].mean()
                iter_seconds = time.time() - start_time
                print(f"{iters=} {max_lags_target=} {max_lags_btc=} {metric=:.5f} {iter_seconds=:.1f} {model=}")
                res.append(dict(iters=iters, metric=metric, iter_seconds=iter_seconds, max_lags_target=max_lags_target, max_lags_btc=max_lags_btc, model=model))

                # The results file is rewritten after every model, so partial results survive an interrupted run.
                df_res = pd.DataFrame(res)
                df_res.to_csv("__T1523.csv")

In [None]:
df_res = pd.DataFrame(res)
df_res.plot.bar(x="iters", y="metric", figsize=(20, 10))
#df_res.sort_values(by=["metric"])
df_res

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue into the sections below — confirm.
raise KeyboardInterrupt

## (12-23) Experiments with custom transform for percentage changes (draft)

In [None]:
from etna.transforms.base import OneSegmentTransform

In [None]:
# Class for processing one segment.
class _PercChangeOneSegmentTransform(OneSegmentTransform):
    """One-segment transform that replaces a column with its percentage changes.

    ``fit`` remembers the first value of the column; ``inverse_transform`` rebuilds
    values as the cumulative product of ``1 + change`` scaled by that remembered value.
    """

    def __init__(self, in_column: str):
        """Remember the name of the column the transformation is applied to."""
        self.base_value = None
        self.in_column = in_column

    def fit(self, df: pd.DataFrame) -> "_PercChangeOneSegmentTransform":
        """Store the first value of ``in_column`` as the base for the inverse transform."""
        column = df[self.in_column]
        self.base_value = column.iloc[0]
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``in_column`` replaced by its ``pct_change()``."""
        assert self.base_value is not None

        changed = df.copy()
        changed[self.in_column] = changed[self.in_column].pct_change()
        return changed

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``in_column`` converted back from percentage changes."""
        assert self.base_value is not None

        restored = df.copy()
        # Missing changes are treated as 0 before adding 1, so they contribute a factor of 1.
        growth_factors = restored[self.in_column].add(1, fill_value=0)
        restored[self.in_column] = growth_factors.cumprod() * self.base_value

        return restored

In [None]:
from etna.transforms.base import ReversiblePerSegmentWrapper
from typing import List

In [None]:
class PercChangePerSegmentTransform(ReversiblePerSegmentWrapper):
    """Per-segment wrapper around ``_PercChangeOneSegmentTransform`` (values -> percent changes)."""

    def __init__(self, in_column: str):
        """Create the wrapped one-segment transform for ``in_column``."""
        self.in_column = in_column
        one_segment_transform = _PercChangeOneSegmentTransform(in_column=self.in_column)
        super().__init__(transform=one_segment_transform, required_features=[in_column])

    def get_regressors_info(self) -> List[str]:
        """Return the list with regressors created by the transform.

        Returns
        -------
        :
            Always an empty list: the column is changed in place and no new columns are added.
        """
        created_regressors: List[str] = []
        return created_regressors

In [None]:
tr_perc = PercChangePerSegmentTransform(in_column="target")

In [None]:
#tr_perc.fit_transform(tsd_train_test)

In [None]:
#tsd_train_test

In [None]:
#tr_perc.inverse_transform(tsd_train_test)

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue into the sections below — confirm.
raise KeyboardInterrupt

In [None]:
# import logging
# logger = logging.getLogger('etna')
# logger.setLevel(logging.CRITICAL)
# import time

In [None]:
#10 * (ds_pct+1).cumprod()

In [None]:
#perc_trans = LambdaTransform(in_column = "target", transform_func=lambda x: x.pct_change, inverse_transform_func=lambda x: (x+1).cumprod())

In [None]:
#lag_trans = LagTransform(in_column = "target", lags=[1,2])

In [None]:
#lag_trans.fit_transform(tsd_train_test)

In [None]:
#perc_trans.fit_transform(tsd_train_test)

# (12-21) Approach 0b. ETNA Synthetic datasets with exogenous features

## Imports

## Generate pandas df1+df2, df_target

In [None]:
NOIZE_SIGMA = 0.1
TRAIN_PERIODS = 24
TEST_PERIODS = 8
TRAIN_TEST_PERIODS = TRAIN_PERIODS + TEST_PERIODS

In [None]:
# Feature 1 - "saw" signal with period 4
df1= generate_from_patterns_df(periods=TRAIN_TEST_PERIODS, start_time="2023-01-01", add_noise=True, sigma=NOIZE_SIGMA, random_seed=42, patterns=[[2,4,6,4],])
df1.target.plot(marker="o")

In [None]:
# Feature 2 - "saw" signal with period 8
df2 = generate_from_patterns_df(periods=TRAIN_TEST_PERIODS, start_time="2023-01-01", add_noise=True, sigma=NOIZE_SIGMA, random_seed=43, patterns=[[2,3,4,5,6,5,4,3],])
df2.target.plot(marker="o")

In [None]:
# Just plot feature1 + feature2
(df1.target + df2.target).plot(marker="o")

In [None]:
# Create target separately from features to get non-coinciding version (due to different noize patterns).
TARGET_PATTERN = [4,7,10,9,8,9,10,7]
df_target = generate_from_patterns_df(periods=TRAIN_TEST_PERIODS, start_time="2023-01-01", add_noise=True, sigma=NOIZE_SIGMA, random_seed=44, patterns=[TARGET_PATTERN,])
df_target.target.plot(marker="o")

In [None]:
df_target.info()

In [None]:
df_target.head(3)

## Convert to TSDataset

In [None]:
from etna.datasets.tsdataset import TSDataset

In [None]:
# Combine features and target
assert all(df_target.timestamp == df1.timestamp) and all(df_target.timestamp == df2.timestamp)
assert all(df_target.segment == df1.segment) and all(df_target.segment == df2.segment)

# Dataframe with target only (recommended by ETNA)
df_train_test = pd.concat({
    "timestamp": df_target.timestamp,
    "segment": df_target.segment,
    "target": df_target.target
}, axis="columns")

# Dataframe with exogenous features
df_train_test_exog = pd.concat({
    "timestamp": df_target.timestamp,
    "segment": df_target.segment,
    "feat1": df1.target,
    "feat2": df2.target,
}, axis="columns")

In [None]:
df_train_test.head(3)

In [None]:
df_train_test_exog.head(3)

In [None]:
# Convert to TSDataset, combining "only-target dataset" with exogenous dataset
tsd_train_test = TSDataset(df=TSDataset.to_dataset(df_train_test), df_exog=TSDataset.to_dataset(df_train_test_exog), freq="D")

In [None]:
# Check shapes (missing candles could appear)
print(f"{tsd_train_test.to_pandas().shape=}")
tsd_train_test.info()
#tsd_train_test.to_pandas().info()
tsd_train_test.head(3)

## Split to train and test parts

In [None]:
tsd_train, tsd_test = tsd_train_test.train_test_split(
    # train_start="1980-01-01",
    # train_end="1993-12-01",
    # test_start="1994-01-01",
    # test_end="1994-08-01",
    test_size=TEST_PERIODS  # Alternative method
)
assert len(tsd_train.to_pandas()) == TRAIN_PERIODS
assert len(tsd_test.to_pandas()) == TEST_PERIODS

In [None]:
print(type(tsd_train))
tsd_train.head(3)

## Calculate "mean baseline" (MAE for pure mean-line predictions) (1.487)

In [None]:
y_true = tsd_test.to_pandas()[('segment_0', 'target')]
y_true.head()

In [None]:
# Calculate baseline MAE for constant predictions equal to the mean of the target pattern
from sklearn.metrics import mean_absolute_error
m = np.mean(TARGET_PATTERN)
y_pred = [m] * len(y_true)
f"{mean_absolute_error(y_true, y_pred):.3f}"

## Calculate "ideal baseline" (MAE for signal with original pattern of target) (0.049)

In [None]:
y_true = tsd_test.to_pandas()[('segment_0', 'target')]
y_true.head()

In [None]:
# Calculate baseline MAE for the noise-free target pattern used as predictions
from sklearn.metrics import mean_absolute_error

print(f"Pattern used for target: {TARGET_PATTERN}")
# NOTE(review): the noise-free series is generated for len(y_true) periods only; it presumably
# lines up with the test window because TRAIN_PERIODS is a multiple of len(TARGET_PATTERN) — confirm.
df_ideal_target = generate_from_patterns_df(
    periods=len(y_true), start_time="2023-01-01", add_noise=False, sigma=0, random_seed=44, patterns=[TARGET_PATTERN,])

f"{mean_absolute_error(y_true, df_ideal_target.target):.3f}"

## Using ETNA "pipeline.backtest"

### Naive model (lag=1) (1.988)

In [None]:
%%time
transforms = []
model = NaiveModel(lag=1)
# horizon and n_folds are required arguments: one-step-ahead forecasts, one fold per test period.
df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(
    model=model, transforms=transforms, tsd_dataset=tsd_train_test, horizon=1, n_folds=TEST_PERIODS)

In [None]:
# history_len is a required argument: plot the whole training window before the forecasts.
dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, history_len=TRAIN_PERIODS)
df_forecast

### Naive model (lag=8) (0.113)

In [None]:
%%time
transforms = []
model = NaiveModel(lag=8)
# horizon and n_folds are required arguments: one-step-ahead forecasts, one fold per test period.
df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(
    model=model, transforms=transforms, tsd_dataset=tsd_train_test, horizon=1, n_folds=TEST_PERIODS)

In [None]:
# history_len is a required argument: plot the whole training window before the forecasts.
dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, history_len=TRAIN_PERIODS)
df_forecast

### Prophet model (1.851)

In [None]:
%%time
transforms = []
model = ProphetModel()
# horizon and n_folds are required arguments: one-step-ahead forecasts, one fold per test period.
df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(
    model=model, transforms=transforms, tsd_dataset=tsd_train_test, horizon=1, n_folds=TEST_PERIODS)

In [None]:
# history_len is a required argument: plot the whole training window before the forecasts.
dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, history_len=TRAIN_PERIODS)
df_forecast

### Catboost model on f1+f2 (0.112, 10seconds)

In [None]:
%%time
transforms = []
model = CatBoostPerSegmentModel(random_state=42)
# horizon and n_folds are required arguments: one-step-ahead forecasts, one fold per test period.
df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(
    model=model, transforms=transforms, tsd_dataset=tsd_train_test, horizon=1, n_folds=TEST_PERIODS)

In [None]:
# history_len is a required argument: plot the whole training window before the forecasts.
dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, history_len=TRAIN_PERIODS)
df_forecast

### Catboost on lags=[1] features only (1.498)

In [None]:
%%time
transforms = [
    LagTransform(lags=[1], in_column="target"),
    # The lag feature must be added before the exogenous features are dropped; otherwise:
    # "CatBoostError: Input data must have at least one feature"
    FilterFeaturesTransform(exclude=["feat1", "feat2"]),
]
model = CatBoostPerSegmentModel(random_state=42)
# horizon and n_folds are required arguments: one-step-ahead forecasts, one fold per test period.
df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(
    model=model, transforms=transforms, tsd_dataset=tsd_train_test, horizon=1, n_folds=TEST_PERIODS)

In [None]:
# history_len is a required argument: plot the whole training window before the forecasts.
dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, history_len=TRAIN_PERIODS)
df_forecast

### Catboost on lags=[8] features only (0.056)

In [None]:
from etna.transforms import FilterFeaturesTransform, LagTransform

In [None]:
%%time
transforms = [
    # An earlier variant used all lags 1..8: LagTransform(lags=[1,2,3,4,5,6,7,8], in_column="target")
    LagTransform(lags=[8], in_column="target"),
    # The lag feature must be added before the exogenous features are dropped; otherwise:
    # "CatBoostError: Input data must have at least one feature"
    FilterFeaturesTransform(exclude=["feat1", "feat2"]),
]
model = CatBoostPerSegmentModel(random_state=42)
# horizon and n_folds are required arguments: one-step-ahead forecasts, one fold per test period.
df_metrics, df_forecast, df_fold_info = create_pipeline_and_launch_backtest(
    model=model, transforms=transforms, tsd_dataset=tsd_train_test, horizon=1, n_folds=TEST_PERIODS)

In [None]:
# history_len is a required argument: plot the whole training window before the forecasts.
dump_folds_info_and_metrics(df_metrics, df_forecast, df_fold_info, history_len=TRAIN_PERIODS)
df_forecast

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue into the obsolete section below — confirm.
raise KeyboardInterrupt

# (12-18) [OBSOLETED by 0b] Approach 0a. ETNA Synthetic datasets

## Imports

In [None]:
from etna.analysis import plot_backtest
from etna.datasets import generate_from_patterns_df
from etna.metrics import MAE, MAPE, SMAPE
from etna.models import NaiveModel, CatBoostPerSegmentModel, ProphetModel
from etna.pipeline import Pipeline

## Generate pandas df1+df2, df_target

In [None]:
NOIZE_SIGMA = 0.1
TRAIN_PERIODS = 24
TEST_PERIODS = 8
TRAIN_TEST_PERIODS = TRAIN_PERIODS + TEST_PERIODS

In [None]:
df1= generate_from_patterns_df(periods=TRAIN_TEST_PERIODS, start_time="2023-01-01", add_noise=True, sigma=NOIZE_SIGMA, random_seed=42, patterns=[[2,4,6,4],])
df1.target.plot(marker="o")

In [None]:
df2 = generate_from_patterns_df(periods=TRAIN_TEST_PERIODS, start_time="2023-01-01", add_noise=True, sigma=NOIZE_SIGMA, random_seed=43, patterns=[[2,3,4,5,6,5,4,3],])
df2.target.plot(marker="o")

In [None]:
(df1.target + df2.target).plot(marker="o")

In [None]:
# Create target separately from features to get non-coinciding version (due to different noize patterns).
TARGET_PATTERN = [4,7,10,9,8,9,10,7]
df_target = generate_from_patterns_df(periods=TRAIN_TEST_PERIODS, start_time="2023-01-01", add_noise=True, sigma=NOIZE_SIGMA, random_seed=44, patterns=[TARGET_PATTERN,])
df_target.target.plot(marker="o")

In [None]:
df_target.info()

In [None]:
df_target.head(3)

## Convert to TSDataset

In [None]:
from etna.datasets.tsdataset import TSDataset

In [None]:
# Combine features and target
assert all(df_target.timestamp == df1.timestamp) and all(df_target.timestamp == df2.timestamp)
assert all(df_target.segment == df1.segment) and all(df_target.segment == df2.segment)

df_train_test = pd.concat({
    "timestamp": df_target.timestamp,
    "segment": df_target.segment,
    "feat1": df1.target,
    "feat2": df2.target,
    "target": df_target.target
}, axis="columns")

In [None]:
df_train_test

In [None]:
# Prepocess by ETNA (move timestamp to index and create multiindex for segment + all columns)
df_train_test_preprocessed = TSDataset.to_dataset(df_train_test)
type(df_train_test_preprocessed)

In [None]:
# Finally - convert to TSDataset 
tsd_train_test = TSDataset(df_train_test_preprocessed, freq="D")

# Check shapes (missing candles could appear)
print(f"{tsd_train_test.to_pandas().shape=}")
tsd_train_test.info()
tsd_train_test.to_pandas().info()
tsd_train_test.head(3)

## Split to train and test parts

In [None]:
tsd_train, tsd_test = tsd_train_test.train_test_split(
    # train_start="1980-01-01",
    # train_end="1993-12-01",
    # test_start="1994-01-01",
    # test_end="1994-08-01",
    test_size=TEST_PERIODS  # Alternative method
)
assert len(tsd_train.to_pandas()) == TRAIN_PERIODS
assert len(tsd_test.to_pandas()) == TEST_PERIODS

In [None]:
print(type(tsd_train))
tsd_train.head(3)

## Calculate "mean baseline" (MAE for pure mean-line predictions) (1.487)

In [None]:
y_true = tsd_test.to_pandas()[('segment_0', 'target')]
y_true.head()

In [None]:
# Calculate baseline MAE for constant predictions equal to the mean of the target pattern
from sklearn.metrics import mean_absolute_error
m = np.mean(TARGET_PATTERN)
f"{mean_absolute_error(y_true, [m] * len(y_true)):.3f}"

## Using ETNA "pipeline.backtest"

### Naive model (lag=1) (1.988)

In [None]:
%%time
HORIZON = 1
N_FOLDS = TEST_PERIODS
print(f"{HORIZON=} {N_FOLDS=}")

transforms=[]
model = NaiveModel(lag=1)

pipeline = Pipeline(
    model = model,
    transforms=transforms,
    horizon=HORIZON
)

df_metrics, df_forecast, df_fold_info = pipeline.backtest(ts=tsd_train_test, metrics=[MAE(),], n_folds=N_FOLDS)  # MSE(), SMAPE()], n_folds=10)

In [None]:
print(df_fold_info.shape)
print(df_fold_info.head(2))

print(df_metrics.shape)
print(df_metrics.head(2))

print(f"{df_metrics.MAE.mean()=:.3f}")

df_forecast

In [None]:
plot_backtest(df_forecast, tsd_train_test, history_len=TRAIN_PERIODS)

### Naive model (lag=8) (0.113)

In [None]:
%%time
HORIZON = 1
N_FOLDS = TEST_PERIODS
print(f"{HORIZON=} {N_FOLDS=}")

transforms=[]
model = NaiveModel(lag=8)

pipeline = Pipeline(
    model = model,
    transforms=transforms,
    horizon=HORIZON
)

df_metrics, df_forecast, df_fold_info = pipeline.backtest(ts=tsd_train_test, metrics=[MAE(),], n_folds=N_FOLDS)  # MSE(), SMAPE()], n_folds=10)

In [None]:
print(df_fold_info.shape)
print(df_fold_info.head(2))

print(df_metrics.shape)
print(df_metrics.head(2))

print(f"{df_metrics.MAE.mean()=:.3f}")

df_forecast

In [None]:
plot_backtest(df_forecast, tsd_train_test, history_len=TRAIN_PERIODS)

In [None]:
df_target.target.plot(marker="o")

### Prophet model (1.851)

In [None]:
%%time
HORIZON = 1
N_FOLDS = TEST_PERIODS
print(f"{HORIZON=} {N_FOLDS=}")

transforms=[]
model = ProphetModel()

pipeline = Pipeline(
    model = model,
    transforms=transforms,
    horizon=HORIZON
)

df_metrics, df_forecast, df_fold_info = pipeline.backtest(ts=tsd_train_test, metrics=[MAE(),], n_folds=N_FOLDS)  # MSE(), SMAPE()], n_folds=10)

In [None]:
print(df_fold_info.shape)
print(df_fold_info.head(2))

print(df_metrics.shape)
print(df_metrics.head(5))

print(f"{df_metrics.MAE.mean()=:.3f}")

df_forecast

In [None]:
plot_backtest(df_forecast, tsd_train_test, history_len=TRAIN_PERIODS)

In [None]:
df_target.target.plot(marker="o")

### Catboost model (4.015)

In [None]:
%%time
HORIZON = 1
N_FOLDS = TEST_PERIODS
print(f"{HORIZON=} {N_FOLDS=}")

transforms=[]
model = CatBoostPerSegmentModel(random_state=42)

pipeline = Pipeline(
    model = model,
    transforms=transforms,
    horizon=HORIZON
)

df_metrics, df_forecast, df_fold_info = pipeline.backtest(
    ts=tsd_train_test, metrics=[MAE(),], 
    n_folds=N_FOLDS,
    forecast_params=dict(return_components=True)
) 

In [None]:
print(df_fold_info.shape)
print(df_fold_info.head(2))

print(df_metrics.shape)
print(df_metrics.head(5))

print(f"{df_metrics.MAE.mean()=:.3f}")

df_forecast

In [None]:
plot_backtest(df_forecast, tsd_train_test, history_len=TRAIN_PERIODS)

In [None]:
df_target.target.plot(marker="o")

## Using ETNA "model.fit"

### "model.fit": Naive model (lag=1), (1.877)

In [None]:
%%time
HORIZON = len(tsd_test.to_pandas())
print(f"{HORIZON=}")

# Fit the model
model = NaiveModel(lag=1)
model.fit(tsd_train)
model

In [None]:
# Make dataset for forcasting (1 candle of train set + HORIZON empty candles)
print(f"{model.context_size=}")
tsd_future = tsd_train.make_future(future_steps=HORIZON, tail_steps=model.context_size)
print(type(tsd_future))
tsd_future

In [None]:
%%time
tsd_forecast = model.forecast(tsd_future, prediction_size=HORIZON)
# Here we get dataset of all HORIZON candles (i.e. the first one is dropped)
print(type(tsd_forecast))
tsd_forecast

In [None]:
mae = MAE()
mae(y_true=tsd_test, y_pred=tsd_forecast)

#### Another attempt - use manual tsd_future -> ERROR (Given context isn't big enough)


In [None]:
# Generate manual tsd_future2 TDataset - all test set + required context part of train
_, tsd_future2 = tsd_train_test.train_test_split(test_size=TEST_PERIODS + 1)

In [None]:
print(type(tsd_future2))
tsd_future2

In [None]:
# Use this new tsd_future2 for making predictions (done inplace!)
# model.forecast(tsd_test, prediction_size=HORIZON)  # Gives error
dummy = model.forecast(tsd_future2, prediction_size=HORIZON, return_components=True)
assert dummy is tsd_future2  # Just same object
tsd_future2

### "model.fit": Naive model (lag=7), ()

In [None]:
%%time
HORIZON = len(tsd_test.to_pandas())
print(f"{HORIZON=}")

# Fit the model
model = NaiveModel(lag=7)
model.fit(tsd_train)
model

In [None]:
# Make dataset for forcasting (1 candle of train set + HORIZON empty candles)
print(f"{model.context_size=}")
tsd_future = tsd_train.make_future(future_steps=HORIZON, tail_steps=model.context_size)
print(type(tsd_future))
tsd_future

In [None]:
%%time
tsd_forecast = model.forecast(tsd_future, prediction_size=HORIZON)
# Here we get dataset of all HORIZON candles (i.e. the first one is dropped)
print(type(tsd_forecast))
tsd_forecast

In [None]:
mae = MAE()
mae(y_true=tsd_test, y_pred=tsd_forecast)

#### Another attempt - use manual tsd_future -> OK

In [None]:
# Generate manual tsd_future2 TDataset - all test set + required context part of train
_, tsd_future2 = tsd_train_test.train_test_split(test_size=TEST_PERIODS + 7)

In [None]:
print(type(tsd_future2))
tsd_future2

In [None]:
# Use this new tsd_future2 for making predictions (done inplace!)
# model.forecast(tsd_test, prediction_size=HORIZON)  # Gives error
dummy = model.forecast(tsd_future2, prediction_size=HORIZON, return_components=True)
assert dummy is tsd_future2  # Just same object
tsd_future2

### "model.fit": Naive model (lag=8), (0.113)

In [None]:
%%time
HORIZON = len(tsd_test.to_pandas())
print(f"{HORIZON=}")

# Fit the model
model = NaiveModel(lag=8)
model.fit(tsd_train)
model

In [None]:
# Make dataset for forcasting (1 candle of train set + HORIZON empty candles)
print(f"{model.context_size=}")
tsd_future = tsd_train.make_future(future_steps=HORIZON, tail_steps=model.context_size)
print(type(tsd_future))
tsd_future

In [None]:
%%time
# Make autoregressive forecast (done inplace!)
dummy = model.forecast(tsd_future, prediction_size=HORIZON, return_components=True)
assert dummy is tsd_future  # Just same object
print(type(tsd_future))
tsd_future

In [None]:
mae = MAE()
# The lag=8 forecast was written in place into tsd_future by the previous cell;
# tsd_forecast still holds the forecast of the lag=7 model from the section above.
mae(y_true=tsd_test, y_pred=tsd_future)

### "model.fit": CatBoost model (no lags), ()

In [None]:
%%time
HORIZON = len(tsd_test.to_pandas())
print(f"{HORIZON=}")

# Fit the model
model = CatBoostPerSegmentModel(random_state=42)
model.fit(tsd_train)
model

#### Approach 1 via make_future -> NaNs

In [None]:
# Make dataset for forcasting (0 candles of train set + HORIZON empty candles)
print(f"{model.context_size=}")
tsd_future = tsd_train.make_future(future_steps=HORIZON, tail_steps=model.context_size)
print(type(tsd_future))
tsd_future

In [None]:
%%time
# Make autoregressive forecast (done inplace!)
#dummy = model.forecast(tsd_future, prediction_size=HORIZON, return_components=True)
dummy = model.forecast(tsd_future, return_components=True)
assert dummy is tsd_future  # Just same object
print(type(tsd_future))
tsd_future

In [None]:
mae = MAE()
mae(y_true=tsd_test, y_pred=tsd_future)

#### Approach 2 via manual tsd_future (0.095)

In [None]:
# Generate manual tsd_future2 TDataset - all test set + required context part of train
_, tsd_future = tsd_train_test.train_test_split(test_size=TEST_PERIODS + 0)

In [None]:
print(type(tsd_future))
tsd_future

In [None]:
# Use this new tsd_future for making predictions (done inplace!)
#dummy = model.forecast(tsd_future, prediction_size=HORIZON, return_components=True)
dummy = model.forecast(tsd_future, return_components=True)
assert dummy is tsd_future  # Just same object
tsd_future

In [None]:
tsd_future.df.columns

In [None]:
tsd_future.df[('segment_0', 'target')].plot()

In [None]:
mae = MAE()
mae(y_true=tsd_test, y_pred=tsd_future)

## Using ETNA "pipeline.backtest" with exogenous features

In [None]:
# NOTE(review): `df` and `df_exog` are not defined in any cell above this one; they appear to come
# from the "Snippet from ETNA developers" cell below, so this presumably raises NameError on a fresh kernel — confirm.
ts = TSDataset(df=TSDataset.to_dataset(df), df_exog=TSDataset.to_dataset(df_exog), freq="D")

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue into the section below — confirm.
raise KeyboardInterrupt

# (12-20) Snippet from ETNA developers

In [None]:
import pandas as pd
import numpy as np

from etna.pipeline import Pipeline
from etna.models import CatBoostPerSegmentModel
from etna.datasets import TSDataset
from etna.metrics import MAE


#def main():
# Synthetic example: two random exogenous features and a target equal to their sum plus small noise.
rng = np.random.default_rng(0)
df_exog = pd.DataFrame({
    "timestamp": pd.date_range(start="2020-01-01", periods=100, freq="D"),
    "segment": ["segment_0"]*100,
    "feature_1": rng.normal(size=100),
    "feature_2": rng.normal(size=100),
})
df = df_exog.copy()
# Draw one noise value per row. Without `size`, a single scalar is drawn and added to every row,
# which is a constant offset rather than noise.
df["target"] = df["feature_1"] + df["feature_2"] + rng.normal(scale=0.01, size=len(df))
# The target-only frame must not contain the exogenous columns.
df = df.drop(columns=["feature_1", "feature_2"])

print(df)
print(df_exog)

ts = TSDataset(df=TSDataset.to_dataset(df), df_exog=TSDataset.to_dataset(df_exog), freq="D")

pipeline = Pipeline(
    model=CatBoostPerSegmentModel(random_state=42),
    transforms=[],
    horizon=1
)
df_metrics, df_forecast, df_fold_info = pipeline.backtest(
    ts=ts, metrics=[MAE()],
    n_folds=2,
)


# if __name__ == "__main__":
#     main()

#main()

In [None]:
df_metrics

In [None]:
df_forecast