# Cross-validation for time series (package [`sktime`](https://www.sktime.net/en/stable/))

We use an ARIMA model as the running example.

In [None]:
import numpy as np
import pandas as pd

# Forecasting model, e.g. ARIMA
from sktime.forecasting.arima import ARIMA
# Time series visualization
from sktime.utils.plotting import plot_series
# Cross-validation tools
from sktime.split import temporal_train_test_split, ExpandingWindowSplitter, SlidingWindowSplitter, SingleWindowSplitter
from sktime.forecasting.model_evaluation import evaluate
from sktime.performance_metrics.forecasting import MeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError # MSE, MAE, MAPE metrics

import pandas_datareader.data as web

# Plotting setup
import matplotlib.pyplot as plt

# Do not show warnings
import warnings
warnings.simplefilter(action='ignore', category=Warning)
# Do not show ValueWarning and ConvergenceWarning from statsmodels
from statsmodels.tools.sm_exceptions import ValueWarning, ConvergenceWarning
warnings.simplefilter('ignore', category=ValueWarning)
warnings.simplefilter('ignore', category=ConvergenceWarning)

Load monthly data on the Market Yield on U.S. Treasury Securities at 10-Year Constant Maturity (symbol [`GS10`](https://fred.stlouisfed.org/series/GS10)) from [`FRED`](https://fred.stlouisfed.org/), covering 2000-01-01 to 2023-12-31, into the DataFrame `y`.

In [None]:
# Download the monthly GS10 series from FRED
y = web.DataReader(name='GS10', data_source='fred', start='2000-01', end='2023-12')
# Convert the date index to monthly periods.
# The periods are derived from the dates actually returned, so each observation keeps
# its own month even if the download has a different number of rows than expected
# (a hard-coded period_range would raise on a length mismatch or mislabel rows).
y.index = y.index.to_period('M')
# Length of the series
len(y)

## Validation through train/test split

We use the function [`temporal_train_test_split`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.split.temporal_train_test_split.html) from the `sktime` package

We split the series into a training part (the first observations, for instance 80%) and a test part (the last observations, for instance 20%)

Metrics [`MeanSquaredError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanSquaredError.html)

Consider ARIMA(2,1,2) as example

In [None]:
# Specify the forecasting model, e.g. ARIMA(2,1,2)
forecaster = ARIMA(order=(2, 1, 2), trend='n')

# Split the sample into a training part (the first 80%) and a test part
y_train, y_test = temporal_train_test_split(y=y, train_size=0.8)

# Fit the model on the training set
forecaster.fit(y=y_train)

# Forecast for the test observations
y_pred = forecaster.predict(fh=y_test.index)

# Initialise the metric (square_root=False, i.e. MSE rather than RMSE)
metric = MeanSquaredError(square_root=False)
# Compute the metric on the test data
metric.evaluate(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot the training data, the test data and the forecast on one chart
series_labels = ['train', 'test', 'pred']
plot_series(y_train, y_test, y_pred, labels=series_labels)
plt.show()

Alternatively we can use  [`SingleWindowSplitter`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.split.SingleWindowSplitter.html) 

Metrics [`MeanSquaredError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanSquaredError.html), [`MeanAbsoluteError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanAbsoluteError.html), [`MeanAbsolutePercentageError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanAbsolutePercentageError.html#sktime.performance_metrics.forecasting.MeanAbsolutePercentageError)

Finally, we use [`evaluate`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_evaluation.evaluate.html)

Consider ARIMA(2,1,2) as example

In [None]:
# Specify the forecasting model, e.g. ARIMA(2,1,2) without drift
forecaster = ARIMA(order=(2, 1, 2), trend='n')

# Split the sample into a training part (the first 150 observations) and a test part
train_len = 150
test_steps = np.arange(1, len(y) - train_len + 1)
cv = SingleWindowSplitter(fh=test_steps, window_length=train_len)

# Initialise the metrics: MSE, MAE, MAPE
metric = [
    MeanSquaredError(square_root=False),
    MeanAbsoluteError(),
    MeanAbsolutePercentageError(),
]

df = evaluate(
    forecaster=forecaster,
    y=y,
    cv=cv,
    strategy="refit",
    scoring=metric,
    return_data=False,
)
df

In [None]:
# MSE, MAE, MAPE (the first three columns of the result)
df.iloc[:, :3]

## Validation with k-Fold method (expanded train set)

We use  [`ExpandingWindowSplitter`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.split.ExpandingWindowSplitter.html) 

Metrics [`MeanSquaredError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanSquaredError.html), [`MeanAbsoluteError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanAbsoluteError.html), [`MeanAbsolutePercentageError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanAbsolutePercentageError.html#sktime.performance_metrics.forecasting.MeanAbsolutePercentageError)

Finally, we use [`evaluate`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_evaluation.evaluate.html)

Consider ARIMA(2,1,2) as example

In [None]:
# Specify the forecasting model, e.g. ARIMA(2,1,2) without drift
forecaster = ARIMA(order=(2, 1, 2), trend='n')

# Split the sample into a training part (starting with the first 100 observations)
# and a test part of length 10
test_steps = np.arange(1, 11)
cv = ExpandingWindowSplitter(fh=test_steps, initial_window=100, step_length=10)

# Initialise the metrics: MSE, MAE, MAPE
metric = [
    MeanSquaredError(square_root=False),
    MeanAbsoluteError(),
    MeanAbsolutePercentageError(),
]

df = evaluate(
    forecaster=forecaster,
    y=y,
    cv=cv,
    strategy="refit",
    scoring=metric,
    return_data=False,
)
df

In [None]:
# Mean MSE, MAE, MAPE over the folds (the first three columns of the result)
df.iloc[:, :3].mean()

## Validation with k-Fold method (sliding train set)

We use method [`SlidingWindowSplitter`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.split.SlidingWindowSplitter.html)

Metrics [`MeanSquaredError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanSquaredError.html), [`MeanAbsoluteError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanAbsoluteError.html), [`MeanAbsolutePercentageError`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.performance_metrics.forecasting.MeanAbsolutePercentageError.html#sktime.performance_metrics.forecasting.MeanAbsolutePercentageError)

Finally, we call [`evaluate`](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_evaluation.evaluate.html)

Consider ARIMA(2,1,2) as example

In [None]:
# Specify the forecasting model, e.g. ARIMA(2,1,2) without drift
forecaster = ARIMA(order=(2,1,2), trend='n')

# Split the sample into a training part (length 100) and a test part (length 10).
# The training length is set with window_length, which applies to every fold;
# initial_window would only size the first fold and leave the later folds
# at the splitter's default window_length.
cv = SlidingWindowSplitter(fh=np.arange(1, 11), window_length=100, step_length=10)

# Initialise the metrics: MSE, MAE, MAPE
metric = [MeanSquaredError(square_root=False), MeanAbsoluteError(), MeanAbsolutePercentageError()]

df = evaluate(forecaster=forecaster, y=y, cv=cv, strategy="refit", return_data=False, scoring=metric)
df

In [None]:
# Mean MSE, MAE, MAPE over the folds (the first three columns of the result)
df.iloc[:, :3].mean()

## Cross-validation for a collection of models

Consider the following collection of models
* ARIMA(1,1,1) without drift
* ARIMA(1,1,1) with drift
* ARIMA(1,2,1) without drift

In [None]:
# List of specified forecasting models
forecasters = [ARIMA(order=(1,1,1), trend='n'), ARIMA(order=(1,1,1), trend='c'), ARIMA(order=(1,2,1), trend='n')]

# Specify the cross-validation method, e.g. SlidingWindowSplitter.
# The training length is set with window_length so that every fold trains on
# 100 observations; initial_window would only size the first fold.
cv = SlidingWindowSplitter(fh=np.arange(1, 11), window_length=100, step_length=10)

# Initialise the metrics: MSE, MAE, MAPE
metric = [MeanSquaredError(square_root=False), MeanAbsoluteError(), MeanAbsolutePercentageError()]

# Collect one row of fold-averaged metrics per model, in the order of `forecasters`
mean_scores = []
for model in forecasters:
    print(model)
    df = evaluate(forecaster=model, y=y, cv=cv, strategy="refit", return_data=False, scoring=metric)
    # The first three columns of the result hold the three metrics
    fold_means = df.iloc[:,[0,1,2]].mean()
    print(fold_means)
    mean_scores.append(fold_means.values)

# DataFrame with one column per metric, built once instead of growing row by row
cv_data = pd.DataFrame(data=mean_scores, columns=['MSE', 'MAE', 'MAPE'])

In [None]:
# Cross-validation results as a DataFrame
cv_data

In [None]:
# For each metric, print the position of the model with the smallest value
for metric_name, scores in cv_data.items():
    print(f'{metric_name}: model #={scores.argmin()}')