In [None]:
# Print the path of the `python` executable that the shell resolves (quick check of the active environment).
!which python

In [None]:
import matplotlib.pyplot as plt
import os
import pandas as pd

from etna.datasets.tsdataset import TSDataset
from etna.metrics import MAE, MSE, SMAPE, MAPE
from etna.pipeline import Pipeline
from etna.models import ProphetModel
from etna.analysis import plot_backtest

# Prepare dataset in format "timestamp - segment - target"

In [None]:
# Path of the source CSV, relative to the notebook's working directory.
# A single-argument os.path.join returns its argument unchanged, so the literal is used directly.
FNAME_DATA_SRC = r'../../data/PCPS_06-08-2023 20-05-34-68_timeSeries.csv'

# Stop right here if the file is missing; the assertion message shows the path that was checked.
assert os.path.isfile(FNAME_DATA_SRC), f"{FNAME_DATA_SRC=}"
print(f"Successfully checked: {FNAME_DATA_SRC=}")



In [None]:
# Load the raw CSV; index_col=False tells pandas not to use the first column as the index.
df_src = pd.read_csv(FNAME_DATA_SRC, index_col=False)
print(df_src.shape)
# Display the loaded frame as the cell output.
df_src

In [None]:
# Transform data to row-level time series
# Commodity Code to select (may be wider than actually used for modeling)
COLS__INTERESTING_CC = [
    "PALUM",    # Aluminum
    "PCOAL",    # Coal index
    "PALLMETA"  # All Metals Index
]

# Unit Code to select
UNIT_CODE = "IX"

# Keep only the rows for the requested commodity codes expressed in the requested unit.
is_wanted_code = df_src["Commodity Code"].isin(COLS__INTERESTING_CC)
is_wanted_unit = df_src["Unit Code"] == UNIT_CODE
df_tmp = df_src[is_wanted_code & is_wanted_unit]

# Exactly one row per requested code is expected. A bare row-count comparison would also
# pass if one code were missing while another appeared twice, so the codes themselves are
# compared: no duplicates, and the same set as requested.
found_codes = df_tmp["Commodity Code"]
assert found_codes.is_unique and set(found_codes) == set(COLS__INTERESTING_CC), (
    f"Expected exactly one row per code in {COLS__INTERESTING_CC} "
    f"with Unit Code {UNIT_CODE!r}, got {found_codes.to_list()}"
)
df_tmp

In [None]:
# Get names of feature columns (order may be different from ours)
# The codes are read back from the filtered rows, so they follow the row order of df_tmp
# rather than the order in which COLS__INTERESTING_CC was written.
CC_LABELS = df_tmp["Commodity Code"].to_list()
CC_LABELS

In [None]:
# Prepare resulting dataframe (transposed)
# Range of dates to select
COL__BEGIN_TS_LABEL = "1990M1"
COL__END_TS_LABEL = "2023M5"

# Label-slice the period columns (both ends inclusive), flip the frame so that every
# period becomes a row, and name the resulting columns after the commodity codes.
df_main = (
    df_tmp
    .loc[:, COL__BEGIN_TS_LABEL:COL__END_TS_LABEL]
    .T
    .set_axis(CC_LABELS, axis="columns")
)

# Show the result
df_main

In [None]:
# Convert string dates to datetime. Example: "1990M1" -> "1990-01-01"
# The index of df_main holds the period labels that were column names before the transpose.
df_main["timestamp"] = pd.to_datetime(df_main.index, format="%YM%m")

In [None]:
# Create dummy segment (required by ETNA)
# Every row gets the same segment label, i.e. the data is treated as a single series.
DUMMY_SEGMENT = "dummy_segment"
df_main["segment"] = DUMMY_SEGMENT

In [None]:
# Use the aluminum series as the forecasting target; the original "PALUM" column stays in the frame as well.
df_main["target"] = df_main["PALUM"]

In [None]:
# Show the frame with the added "timestamp", "segment" and "target" columns.
df_main

# Convert data to TSDataset

In [None]:
# From ETNA docs:
# Convert pandas dataframe to ETNA Dataset format.
# Columns "timestamp" and "segment" are required.
# NOTE(review): df_main is passed whole, so besides "target" the dataset also carries
# "PALUM" (the column "target" was copied from), "PCOAL" and "PALLMETA" - confirm that
# the models below are meant to see these columns.
df = TSDataset.to_dataset(df_main)
# freq="MS" is the pandas "month start" alias, matching the first-of-month timestamps built earlier.
tsd_full = TSDataset(df, freq="MS")

In [None]:
# Display the dataset object.
tsd_full

In [None]:
# T1245
# First 20 rows of the dataset.
tsd_full.head(20)

In [None]:
# T1245
# Last 26 rows of the dataset.
tsd_full.tail(26)

In [None]:
# Plot the series of the single dummy segment.
tsd_full.plot(segments=[DUMMY_SEGMENT])

# Modeling

## Prophet

In [None]:
horizon = 1  # Set the horizon for predictions
model = ProphetModel()  # Create a model
transforms = []  # A list of transforms -  we will not use any of them

In [None]:
# Bundle the model, the (empty) transform list and the horizon into one ETNA pipeline.
pipeline = Pipeline(model=model, transforms=transforms, horizon=horizon)

In [None]:
# Backtest over 24 folds, scoring with MAE, MSE, SMAPE and MAPE.
# The call returns three frames: metrics, forecasts and fold information.
metrics_df, forecast_df, fold_info_df = pipeline.backtest(ts=tsd_full, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=24)

In [None]:
# Metric values returned by the backtest.
metrics_df

In [None]:
# Mean of the MAPE column of metrics_df.
metrics_df.MAPE.mean()

In [None]:
# NEW 2023-11
# First 32 rows of the backtest forecasts.
forecast_df.head(32)

In [None]:
# First 10 rows of the fold information frame.
fold_info_df.head(10)

In [None]:
# Plot the backtest forecasts together with the dataset, passing history_len=70.
plot_backtest(forecast_df, tsd_full, history_len=70)

In [None]:
# !!!! (2023-11-27) Problem - there is a gap in the "predict" line on the plot

## Naive model

In [None]:
from etna.models import NaiveModel

In [None]:
horizon = 1  # Set the horizon for predictions
model = NaiveModel(lag=1)  # Create a model
transforms = []  # A list of transforms -  we will not use any of them

In [None]:
# Rebuild the pipeline around the naive model; the name `pipeline` is reused.
pipeline = Pipeline(model=model, transforms=transforms, horizon=horizon)

In [None]:
# Same backtest setup as for Prophet (24 folds, MAE/MSE/SMAPE/MAPE).
# metrics_df / forecast_df / fold_info_df are reassigned, so the previous model's results are overwritten.
metrics_df, forecast_df, fold_info_df = pipeline.backtest(ts=tsd_full, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=24)

In [None]:
# Metric values returned by the backtest.
metrics_df

In [None]:
# Mean of the MAPE column of metrics_df.
metrics_df.MAPE.mean()

In [None]:
# NEW 2023-11
# First 32 rows of the backtest forecasts.
forecast_df.head(32)

In [None]:
# First 10 rows of the fold information frame.
fold_info_df.head(10)

In [None]:
# Plot the backtest forecasts together with the dataset, passing history_len=70.
plot_backtest(forecast_df, tsd_full, history_len=70)

## CatBoost model (ETNA)

In [None]:
from etna.models import CatBoostMultiSegmentModel

In [None]:
horizon = 1  # Set the horizon for predictions
model = CatBoostMultiSegmentModel(iterations=100, random_state=42)   # Create a model
transforms = []  # A list of transforms -  we will not use any of them

In [None]:
# Rebuild the pipeline around the CatBoost model; the name `pipeline` is reused.
pipeline = Pipeline(model=model, transforms=transforms, horizon=horizon)

In [None]:
%%time
# Same backtest setup as for the previous models (24 folds, MAE/MSE/SMAPE/MAPE), timed with %%time.
# metrics_df / forecast_df / fold_info_df are reassigned, so the previous model's results are overwritten.
metrics_df, forecast_df, fold_info_df = pipeline.backtest(ts=tsd_full, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=24)

In [None]:
# Metric values returned by the backtest.
metrics_df

In [None]:
# Mean of the MAPE column of metrics_df.
metrics_df.MAPE.mean()

In [None]:
# NEW 2023-11
# First 32 rows of the backtest forecasts.
forecast_df.head(32)

In [None]:
# First 10 rows of the fold information frame.
fold_info_df.head(10)

In [None]:
# Plot the backtest forecasts together with the dataset, passing history_len=70.
plot_backtest(forecast_df, tsd_full, history_len=70)

# Metrics visualization (from the last model)

In [None]:
from etna.analysis import (
    metric_per_segment_distribution_plot,
    plot_residuals,
    plot_metric_per_segment,
    prediction_actual_scatter_plot,
)

In [None]:
# metrics_df / forecast_df hold the results of whichever backtest ran last
# (the CatBoost one when the notebook is executed top to bottom).
metric_per_segment_distribution_plot(metrics_df=metrics_df, metric_name="MAPE", plot_type="box")

In [None]:
# NEW 2023-11
# Check "mean" mode
metric_per_segment_distribution_plot(metrics_df=metrics_df, metric_name="MAPE", plot_type="box",
                                    per_fold_aggregation_mode="mean")

In [None]:
# MAPE per segment, passing ascending=True (the dataset has a single dummy segment).
plot_metric_per_segment(metrics_df=metrics_df, metric_name="MAPE", ascending=True)

In [None]:
# Residuals of the backtest forecasts with respect to the full dataset.
plot_residuals(forecast_df=forecast_df, ts=tsd_full)

In [None]:
# Scatter plot of predicted versus actual values.
prediction_actual_scatter_plot(forecast_df=forecast_df, ts=tsd_full)

# Dump output predictions to file

In [None]:
# Write the forecast of the single segment to CSV, with the value column renamed to "prediction".
# forecast_df is whatever the most recently executed backtest produced (the CatBoost run when
# the notebook is executed top to bottom). The segment is addressed through DUMMY_SEGMENT
# rather than a repeated string literal, so it cannot drift from the label used to build the data.
forecast_df[(DUMMY_SEGMENT, 'target')].rename("prediction").to_csv("pipeline_v2_out.csv")

# Part 2 - from tutorial 04 (regressors)

In [None]:
# Frame prepared in the first part, including the added "timestamp", "segment" and "target" columns.
df_main

In [None]:
df_main.info()  # Ensure timestamp is in datetime format

In [None]:
# Work on a copy so that df_main keeps its datetime "timestamp" column.
df_main2 = df_main.copy()
# Timestamps are turned into 'YYYY-MM-DD' strings here.
df_main2["timestamp"] = df_main2["timestamp"].dt.strftime('%Y-%m-%d')  # TBD T1338: try to rollback to datetime format
df_main2.info() 

In [None]:
# First rows of the copy with string timestamps.
df_main2.head()

## Separate target and features dataset

In [None]:
# Remove the last target for features to be larger than the target (ETNA requirement)
# Only "timestamp", "target" and "segment" go into the target data; .iloc[:-1, :] drops the final row.
tsd_target = TSDataset.to_dataset(df_main2[["timestamp", "target", "segment"]].iloc[:-1, :])  # TBD 1316
tsd_target.tail()

In [None]:
tsd_target.info()

In [None]:
# Everything except "target" becomes exogenous data: "PALUM", "PCOAL" and "PALLMETA",
# over the full date range (one row more than the target data).
tsd_regressor = TSDataset.to_dataset(df_main2.drop("target", axis="columns"))
tsd_regressor.tail()

In [None]:
# known_future="all" marks every exogenous column as known in the future.
# NOTE(review): these are same-month values of other indices (and "PALUM" is the series the
# target was copied from) - confirm they would really be available at forecast time,
# otherwise the backtest looks ahead.
tsd_combined = TSDataset(df=tsd_target, freq="MS", df_exog=tsd_regressor, known_future="all")
tsd_combined.head()

## EDA

In [None]:
# Plot the "PCOAL" column of the combined dataset for one segment.
tsd_combined.plot(column="PCOAL", n_segments=1)

## 3. Forecast with regressors <a class="anchor" id="chapter3"></a>

The model cell below selects `CatBoostMultiSegmentModel`; `LinearPerSegmentModel`, a simple model that works with regressors, is kept there as a commented-out alternative.

> Note: some models do not work with regressors. In this case, they will warn you about it.

We should forecast the aluminum price index (`PALUM`) one month ahead, using the other commodity indices (`PCOAL`, `PALLMETA`) as regressors.

In [None]:
from etna.models import LinearPerSegmentModel, NaiveModel, CatBoostPerSegmentModel, CatBoostMultiSegmentModel

# Settings shared by the transform, pipeline and backtest cells that follow.
HORIZON = 1
FOLDS = 24
# Alternative models tried for this part; exactly one assignment is left active.
#model = LinearPerSegmentModel()
#model = NaiveModel(lag=1)
#model = CatBoostPerSegmentModel(random_state=42, iterations=100)
model = CatBoostMultiSegmentModel(random_state=42, iterations=100)

In [None]:
from etna.transforms import FilterFeaturesTransform

from etna.transforms import MeanTransform  # math
from etna.transforms import DateFlagsTransform, HolidayTransform  # datetime
from etna.transforms import LagTransform  # lags

# Feature pipeline for the regressor-based models:
#   1. the target lagged by HORIZON steps (TBD T1420),
#   2. "PCOAL" lagged by 1 and 2 steps - the output prefix "tavg_lag" is kept unchanged,
#      although the columns it names hold PCOAL lags,
#   3. removal of "PALUM", the series the target was copied from.
transforms = [
    LagTransform(in_column="target", out_column="target_lag", lags=[HORIZON]),
    LagTransform(in_column="PCOAL", out_column="tavg_lag", lags=[1, 2]),
    FilterFeaturesTransform(exclude=["PALUM"]),
]

In [None]:
from etna.pipeline import Pipeline

# Pipeline for the regressor experiment: the selected model, the feature transforms and HORIZON.
pipeline = Pipeline(model=model, transforms=transforms, horizon=HORIZON)

In [None]:
%%time
from etna.metrics import SMAPE

# Backtest on the combined dataset over FOLDS folds, with aggregate_metrics=True;
# the third returned frame (fold information) is discarded.
metrics, forecasts, _ = pipeline.backtest(tsd_combined, metrics=[MAPE(), SMAPE()], aggregate_metrics=True, n_folds=FOLDS)

In [None]:
# MAPE and SMAPE values returned by the backtest.
metrics

In [None]:
from etna.analysis import plot_backtest

# Plot the backtest forecasts together with the combined dataset, passing history_len=12.
plot_backtest(forecasts, tsd_combined, history_len=12)

In [None]:
# Results noted down from earlier runs of the cells above.
# NOTE(review): presumably one of the metrics shown in `metrics` - confirm which one.
# NOTE(review): the first group presumably corresponds to FOLDS=24, HORIZON=1 - verify.
# 6.591814 - Linear model
# 6.099095 - CB(100) CatBoostPerSegmentModel
# 6.099095 - CB(100) CatBoostMultiSegmentModel

# FOLDS=1, HORIZON=24
# 9.67797 - CB(100) - CatBoostPerSegmentModel
# 9.67797 - CB(100) - CatBoostMultiSegmentModel