In [None]:
# Show the interpreter this kernel is actually running.
# `!which python` only reports the first `python` found on the shell PATH, which can
# differ from the kernel's interpreter, and `which` is not available on every platform.
import sys

sys.executable

In [None]:
import matplotlib.pyplot as plt
import os
import pandas as pd

from etna.datasets.tsdataset import TSDataset
from etna.metrics import MAE, MSE, SMAPE, MAPE
from etna.pipeline import Pipeline
from etna.models import ProphetModel
from etna.analysis import plot_backtest

# Prepare dataset in format "timestamp - segment - target"

In [None]:
# Path to the source CSV, relative to the notebook's working directory.
# (os.path.join() with a single argument returns that argument unchanged,
# so the literal is assigned directly.)
FNAME_DATA_SRC = r'../../data/PCPS_06-08-2023 20-05-34-68_timeSeries.csv'

# Fail fast when the file is missing; the assertion message shows the path that was tried.
assert os.path.isfile(FNAME_DATA_SRC), f"{FNAME_DATA_SRC=}"
print(f"Successfully checked: {FNAME_DATA_SRC=}")



In [None]:
# Load the raw CSV; index_col=False tells pandas not to use the first column as the index.
df_src = pd.read_csv(FNAME_DATA_SRC, index_col=False)
# Report (rows, columns), then display the frame itself.
print(df_src.shape)
df_src

In [None]:
# Transform data to row-level time series: pick the rows of interest from the raw frame.

# Commodity codes to select (may be wider than actually used for modeling)
COLS__INTERESTING_CC = [
    "PALUM",    # Aluminum
    "PCOAL",    # Coal index
    "PALLMETA"  # All Metals Index
]

# Unit code to select
UNIT_CODE = "IX"

# Keep only rows whose commodity code is requested AND whose unit code matches.
df_tmp = df_src[(df_src["Commodity Code"].isin(COLS__INTERESTING_CC)) & (df_src["Unit Code"] == UNIT_CODE)]

# Every requested code must be present exactly once. Comparing row counts alone would
# also accept a duplicated code paired with a missing one, which only surfaces later
# as a confusing error when a column is looked up by its commodity code.
assert sorted(df_tmp["Commodity Code"]) == sorted(COLS__INTERESTING_CC), (
    f"Expected exactly one {UNIT_CODE!r} row per commodity code {sorted(COLS__INTERESTING_CC)}, "
    f"got {df_tmp['Commodity Code'].to_list()}"
)
df_tmp

In [None]:
# Commodity codes of the selected rows, in df_tmp's row order (which may differ
# from the order of COLS__INTERESTING_CC). Used as column names after transposing.
CC_LABELS = df_tmp["Commodity Code"].to_list()
CC_LABELS

In [None]:
# Prepare resulting dataframe (transposed): one row per time label, one column per commodity.

# Range of date columns to select (label-based .loc slice: both endpoints are included)
COL__BEGIN_TS_LABEL = "1990M1"
COL__END_TS_LABEL = "2023M5"

# Take the columns between the two labels and transpose so the time labels become the index.
# NOTE(review): assumes the date columns in the CSV are contiguous and in chronological
# order — confirm against the file.
df_main = df_tmp.loc[:, COL__BEGIN_TS_LABEL:COL__END_TS_LABEL].T

# Assign column names: after the transpose the columns carry df_tmp's row index,
# so replace them with the commodity codes (CC_LABELS is in the same row order).
df_main.columns = CC_LABELS

# Show the result
df_main

In [None]:
# Convert the string time labels in the index to datetimes and store them in a
# "timestamp" column. Example: "1990M1" -> 1990-01-01
df_main["timestamp"] = pd.to_datetime(df_main.index, format="%YM%m")

In [None]:
# Create dummy segment (required by ETNA): every row gets the same segment name.
DUMMY_SEGMENT = "dummy_segment"
df_main["segment"] = DUMMY_SEGMENT

In [None]:
# Use the "PALUM" (Aluminum) series as the forecasting target.
# NOTE(review): "PALUM" is copied, not renamed, so it also stays in the frame as a
# separate column identical to the target — confirm this is intended.
df_main["target"] = df_main["PALUM"]

In [None]:
# Inspect the frame now that "timestamp", "segment" and "target" have been added.
df_main

# Convert data to TSDataset

In [None]:
# From ETNA docs:
# Convert pandas dataframe to ETNA Dataset format.
# Columns "timestamp" and "segment" are required.
df = TSDataset.to_dataset(df_main)
# freq="MS" (month start): the timestamps were parsed from year/month labels only,
# so each one falls on the first day of its month.
ts = TSDataset(df, freq="MS")

In [None]:
# Display the resulting TSDataset.
ts

In [None]:
# T1245
# Preview the first 20 rows of the dataset.
ts.head(20)

In [None]:
# T1245
# Preview the last 26 rows of the dataset.
ts.tail(26)

In [None]:
# Plot the series for the single dummy segment.
ts.plot(segments=[DUMMY_SEGMENT])

# Modeling

## Prophet

In [None]:
# Settings for the Prophet pipeline.
horizon = 1  # Set the horizon for predictions (number of steps forecast per fold)
model = ProphetModel()  # Create a model with default parameters
transforms = []  # A list of transforms -  we will not use any of them

In [None]:
# Bundle the model, the (empty) transform list and the horizon into a single pipeline.
pipeline = Pipeline(model=model, transforms=transforms, horizon=horizon)

In [None]:
# Backtest the pipeline over 24 folds, scoring each with MAE, MSE, SMAPE and MAPE.
# Returns the per-fold/segment metrics, the forecasts and the fold boundaries.
# NOTE(review): with horizon=1 this presumably evaluates the last 24 points, one per
# fold — confirm against ETNA's backtest defaults.
metrics_df, forecast_df, fold_info_df = pipeline.backtest(
    ts=ts,
    metrics=[MAE(), MSE(), SMAPE(), MAPE()],
    n_folds=24,
)

In [None]:
# Metrics table returned by the Prophet backtest.
metrics_df

In [None]:
# Mean MAPE over all rows of the metrics table.
metrics_df["MAPE"].mean()

In [None]:
# NEW 2023-11
# Preview the first 32 rows of the backtest forecasts.
forecast_df.head(32)

In [None]:
# Preview the first 10 rows of the fold information table.
fold_info_df.head(10)

In [None]:
# Plot the backtest forecasts against the series (history_len=70).
plot_backtest(forecast_df, ts, history_len=70)

## Naive model

In [None]:
from etna.models import NaiveModel

In [None]:
# Settings for the naive baseline.
# NOTE: this rebinds `horizon`, `model` and `transforms` from the Prophet section.
horizon = 1  # Set the horizon for predictions (number of steps forecast per fold)
model = NaiveModel(lag=1)  # Create a model (lag=1)
transforms = []  # A list of transforms -  we will not use any of them

In [None]:
# Bundle the naive model into a pipeline. NOTE: this replaces the Prophet `pipeline`.
pipeline = Pipeline(model=model, transforms=transforms, horizon=horizon)

In [None]:
# Backtest the naive pipeline with the same metrics and fold count as the Prophet run.
# NOTE: this overwrites `metrics_df`, `forecast_df` and `fold_info_df` from that run.
metrics_df, forecast_df, fold_info_df = pipeline.backtest(
    ts=ts,
    metrics=[MAE(), MSE(), SMAPE(), MAPE()],
    n_folds=24,
)

In [None]:
# Metrics table returned by the naive-model backtest.
metrics_df

In [None]:
# Mean MAPE over all rows of the metrics table.
metrics_df["MAPE"].mean()

In [None]:
# NEW 2023-11
# Preview the first 32 rows of the backtest forecasts.
forecast_df.head(32)

In [None]:
# Preview the first 10 rows of the fold information table.
fold_info_df.head(10)

In [None]:
# Plot the backtest forecasts against the series (history_len=70).
plot_backtest(forecast_df, ts, history_len=70)

# Metrics visualization

In [None]:
from etna.analysis import (
    metric_per_segment_distribution_plot,
    plot_residuals,
    plot_metric_per_segment,
    prediction_actual_scatter_plot,
)

In [None]:
# Box plot of the MAPE distribution.
# NOTE: `metrics_df` is whatever the most recent backtest produced — the naive
# model when the notebook is run top to bottom.
metric_per_segment_distribution_plot(metrics_df=metrics_df, metric_name="MAPE", plot_type="box")

In [None]:
# NEW 2023-11
# Check "mean" mode: same box plot, with per_fold_aggregation_mode="mean".
metric_per_segment_distribution_plot(metrics_df=metrics_df, metric_name="MAPE", plot_type="box",
                                    per_fold_aggregation_mode="mean")

In [None]:
# Plot MAPE per segment, with ascending=True.
plot_metric_per_segment(metrics_df=metrics_df, metric_name="MAPE", ascending=True)

In [None]:
# Plot residuals of the most recent backtest forecasts against the dataset.
plot_residuals(forecast_df=forecast_df, ts=ts)

In [None]:
# Scatter plot of predicted vs. actual values for the most recent backtest forecasts.
prediction_actual_scatter_plot(forecast_df=forecast_df, ts=ts)

# Dump output predictions to file

In [None]:
# Write the forecast "target" column of the single segment to CSV, under the column
# name "prediction". The segment is looked up via DUMMY_SEGMENT instead of a repeated
# string literal, so changing the segment name cannot silently break this cell.
# NOTE: `forecast_df` holds the most recent backtest — the naive model when the
# notebook is run top to bottom.
forecast_df[(DUMMY_SEGMENT, "target")].rename("prediction").to_csv("pipeline_v2_out.csv")