In [None]:
import pickle
from typing import List

import matplotlib.pyplot as plt
from matplotlib import patches

from src.applications.vedur_is import VedurHarmonicMagnitudes
from src.projects.fagradalsfjall.common.project_settings import (
    FILE_DATASET_FULL,
    FILE_DATASET_SELECTION,
    FILE_DATASET_TEST,
    FILE_DATASET_TRAIN,
    FILE_DATASET_CROSS_VALIDATION,
    DATASET_TRAIN_TEST_TS_FROM,
    DATASET_TRAIN_TEST_TS_TO,
    DATASET_TRAIN_SAMPLE_FROM,
    DATASET_TRAIN_SAMPLE_TO,
    DATASET_TEST_SAMPLE_FROM,
    DATASET_TEST_SAMPLE_TO,
    CV_MIN_SAMPLES_TRAIN,
    CV_MIN_SAMPLES_VALIDATE,
)
from src.tools.datetime import ts_to_float
from src.tools.matplotlib import plot_style_matplotlib_default
from src.base.forecasting.evaluation.cross_validation.cv_splits import TimeSeriesCVSplitter

In [None]:
# -------------------------------------------------------------------------
#  Load dataset
# -------------------------------------------------------------------------
# The full dataset is stored as a pickle next to the other dataset files.
# NOTE: unpickling can execute arbitrary code - only load files produced by this project.
dataset_path = FILE_DATASET_FULL + ".pkl"

print("Loading dataset...    ", end="")
with open(dataset_path, "rb") as pkl_file:
    all_data = pickle.load(pkl_file)
print("Done.")

In [None]:
# -------------------------------------------------------------------------
#  Default plot settings
# -------------------------------------------------------------------------
# Apply the project's plot-style helper (imported from src.tools.matplotlib) once,
# before any of the figures below are created.
# NOTE(review): presumably this configures global matplotlib defaults - confirm in the helper.
plot_style_matplotlib_default()

In [None]:
# -------------------------------------------------------------------------
#  Make selection & split
# -------------------------------------------------------------------------

# Number of extra samples shown on each side of the selection in the overview plots.
# Corresponds to 1 day of data (presumably 15-minute sampling - TODO confirm).
N_SAMPLES_EXTRA = 96

# convert datetime -> sample index
i_from = all_data.get_closest_index(DATASET_TRAIN_TEST_TS_FROM)
i_to = all_data.get_closest_index(DATASET_TRAIN_TEST_TS_TO)

# main selection (=train+test)
data_selection = all_data.slice(i_from, i_to)  # type: VedurHarmonicMagnitudes

# extended selection (for visualization only; with 1 day extra on each side).
# The start index is clamped at 0: if the selection starts within the first day of the
# dataset, i_from - N_SAMPLES_EXTRA would be negative, which does not mean "before the
# first sample" (with Python-style indexing it would count from the end).
# NOTE(review): the end index is not clamped - confirm slice() tolerates an end past the last sample.
i_extra_from = max(i_from - N_SAMPLES_EXTRA, 0)
i_extra_to = i_to + N_SAMPLES_EXTRA
data_selection_extra = all_data.slice(i_extra_from, i_extra_to)  # type: VedurHarmonicMagnitudes

# split in train & test (sample indices are relative to data_selection)
data_train = data_selection.slice(DATASET_TRAIN_SAMPLE_FROM, DATASET_TRAIN_SAMPLE_TO)
data_test = data_selection.slice(DATASET_TEST_SAMPLE_FROM, DATASET_TEST_SAMPLE_TO)

# report the size of every subset
print()
print(f"dataset - ext. selection : {data_selection_extra.n_samples} samples")
print(f"dataset - selection      : {data_selection.n_samples} samples")
print(f"dataset - train          : {data_train.n_samples} samples.")
print(f"dataset - test           : {data_test.n_samples} samples.")

In [None]:
# Datasets to persist, as (base file name without extension, dataset) pairs.
# The PKL and CSV save cells below iterate over this list and append their own extension.
save_config = [
    (FILE_DATASET_SELECTION, data_selection),
    (FILE_DATASET_TRAIN, data_train),
    (FILE_DATASET_TEST, data_test),
]

In [None]:
# -------------------------------------------------------------------------
#  Save - PKL
# -------------------------------------------------------------------------
# Pickle every dataset listed in save_config to "<base file name>.pkl",
# echoing each target path as it is written.
for file_stem, dataset in save_config:  # type: str, VedurHarmonicMagnitudes
    pkl_path = file_stem + ".pkl"
    print(pkl_path)
    with open(pkl_path, "wb") as pkl_file:
        pickle.dump(dataset, pkl_file)

print("Done")

In [None]:
# -------------------------------------------------------------------------
#  Save - CSV
# -------------------------------------------------------------------------
# Export every dataset listed in save_config to "<base file name>.csv" via its
# dataframe representation, echoing each target path as it is written.
for file_stem, dataset in save_config:  # type: str, VedurHarmonicMagnitudes
    csv_path = file_stem + ".csv"
    print(csv_path)
    dataset_frame = dataset.to_dataframe()
    dataset_frame.to_csv(csv_path)

print("Done")

In [None]:
# -------------------------------------------------------------------------
#  Save - PNG
# -------------------------------------------------------------------------

# (dataset, base file name, plot title) for each figure; train first, then test
png_config = [
    (data_train, FILE_DATASET_TRAIN, "Fagradalsfjall (faf) - TRAINING SET"),
    (data_test, FILE_DATASET_TEST, "Fagradalsfjall (faf) - TEST SET"),
]

for dataset, file_stem, plot_title in png_config:
    # plot the dataset, write it to "<base file name>.png" and echo the path
    fig, _ = dataset.create_plot(title=plot_title)
    png_path = file_stem + ".png"
    fig.savefig(png_path, dpi=450)
    print(png_path)

In [None]:
# --- illustrate train/test split -------------------------
fig, ax = data_selection_extra.create_plot(title="Fagradalsfjall (faf) - TRAINING & TEST SETS")

# x-coordinates of the first & last sample of the train and test sets
# (these four names are reused by the cross-validation illustration cell further down)
x_train_from, x_train_to, x_test_from, x_test_to = (
    ts_to_float(data_selection.time[i_sample])
    for i_sample in (
        DATASET_TRAIN_SAMPLE_FROM,
        DATASET_TRAIN_SAMPLE_TO - 1,
        DATASET_TEST_SAMPLE_FROM,
        DATASET_TEST_SAMPLE_TO - 1,
    )
)

# horizontal offset of each label from the left edge of its region
# (6 hours if ts_to_float returns seconds - TODO confirm)
label_offset = 6 * 60 * 60

# shade each region and label it: training set in green, test set in blue
for x_from, x_to, color, label in (
    (x_train_from, x_train_to, "green", "TRAINING DATA"),
    (x_test_from, x_test_to, "blue", "TEST DATA"),
):
    region = patches.Rectangle((x_from, 250), x_to - x_from, 6500, alpha=0.1, edgecolor=None, facecolor=color)
    ax.add_patch(region)
    ax.text(x_from + label_offset, 6250, label, fontsize=16, fontweight=600)

png_path = FILE_DATASET_SELECTION + ".png"
print(png_path)
fig.savefig(png_path, dpi=450)

In [None]:
# -------------------------------------------------------------------------
#  Illustrate cross-validation splits
# -------------------------------------------------------------------------
# One subplot per CV split, each showing the signals of the extended selection,
# dashed outlines of the overall train & test sets, and shaded regions for the
# split's own train (green) and validation (yellow) samples.
# Relies on x_train_from / x_train_to / x_test_from / x_test_to as computed in the
# train/test illustration cell above.

# --- obtain CV splits ------------------------------------
n_splits = 5
cv_splitter = TimeSeriesCVSplitter(
    min_samples_train=CV_MIN_SAMPLES_TRAIN, min_samples_validate=CV_MIN_SAMPLES_VALIDATE, n_splits=n_splits
)
splits = cv_splitter.get_splits(n_samples_tot=DATASET_TRAIN_SAMPLE_TO - DATASET_TRAIN_SAMPLE_FROM)

# --- create plot -----------------------------------------
# squeeze=False always returns a 2D array of axes, so the per-split indexing below
# also works when n_splits == 1 (plt.subplots would otherwise return a bare Axes).
fig, axes_grid = plt.subplots(n_splits, 1, squeeze=False)
axes = axes_grid[:, 0]  # type: List[plt.Axes]

# y-coordinates shared by all rectangles & labels
rect_y0, rect_height = 250, 6500
y_label_inside, y_label_above = 6250, 6900

# loop-invariant values, computed once
black_rect_kwargs = dict(edgecolor="black", fill=False, ls="--", lw=2)
x_cv_train_from = ts_to_float(data_selection.time[DATASET_TRAIN_SAMPLE_FROM])

for i_split, (n_samples_cv_train, n_samples_cv_val) in enumerate(splits):

    ax = axes[i_split]  # type: plt.Axes

    # actual signals
    data_selection_extra.create_plot(fig_ax=(fig, ax))
    ax.set_title(f"Cross-validation - SPLIT {i_split+1} of {n_splits}.", fontdict={"fontsize": 18})

    # train & test set (dashed outlines); both rectangles are anchored at their left edge
    # with a positive width
    ax.add_patch(
        patches.Rectangle((x_train_from, rect_y0), x_train_to - x_train_from, rect_height, **black_rect_kwargs)
    )
    ax.text(x_train_from + (3 * 60 * 60), y_label_above, "TRAINING DATA", fontsize=16, fontweight=600)

    ax.add_patch(
        patches.Rectangle((x_test_from, rect_y0), x_test_to - x_test_from, rect_height, **black_rect_kwargs)
    )
    ax.text(x_test_from + (3 * 60 * 60), y_label_above, "TEST DATA", fontsize=16, fontweight=600)
    ax.set_axisbelow(True)

    # CV train & validation data
    # The CV-train region runs up to the first validation sample, so both shaded regions touch.
    # The validation region ends at its LAST sample (hence the -1), matching the inclusive
    # "TO - 1" convention used for the dashed train/test outlines; this keeps the final split
    # inside the training outline and avoids indexing one sample past the validation set.
    i_val_first = DATASET_TRAIN_SAMPLE_FROM + n_samples_cv_train
    i_val_last = i_val_first + n_samples_cv_val - 1

    x_cv_train_to = ts_to_float(data_selection.time[i_val_first])
    x_cv_validation_from = x_cv_train_to
    x_cv_validation_to = ts_to_float(data_selection.time[i_val_last])

    ax.add_patch(
        patches.Rectangle(
            (x_cv_train_from, rect_y0),
            x_cv_train_to - x_cv_train_from,
            rect_height,
            alpha=0.1,
            edgecolor=None,
            facecolor="green",
        )
    )
    ax.text(
        x_cv_train_from + (6 * 60 * 60), y_label_inside, f"SPLIT {i_split+1} - TRAIN", fontsize=16, fontweight=500
    )

    ax.add_patch(
        patches.Rectangle(
            (x_cv_validation_from, rect_y0),
            x_cv_validation_to - x_cv_validation_from,
            rect_height,
            alpha=0.15,
            edgecolor=None,
            facecolor="yellow",
        )
    )
    ax.text(
        x_cv_validation_from + (6 * 60 * 60), y_label_inside, f"SPLIT {i_split+1} - VAL", fontsize=16, fontweight=500
    )

# --- finalize --------------------------------------------
w = fig.get_size_inches()[0]
fig.set_size_inches(w=w, h=w)  # make square without changing width
fig.tight_layout()

# --- save ------------------------------------------------
print(FILE_DATASET_CROSS_VALIDATION + ".png")
fig.savefig(FILE_DATASET_CROSS_VALIDATION + ".png", dpi=300)