In [None]:
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp

from src.projects.fagradalsfjall.common.dataset import load_train_data_numpy, load_test_data_numpy
from src.projects.fagradalsfjall.common.paths import get_blog_post_subfolder
from src.tools.matplotlib import plot_style_matplotlib_default

In [None]:
# -------------------------------------------------------------------------
#  Load TRAINING & TEST set
# -------------------------------------------------------------------------
# flush=True: the message has no trailing newline, so a line-buffered stdout
# might otherwise not show it until "Done." is printed after loading.
print("Loading training & test datasets...    ", end="", flush=True)
x_train = load_train_data_numpy()
x_test = load_test_data_numpy()
print("Done.")

In [None]:
# -------------------------------------------------------------------------
#  Output path settings
# -------------------------------------------------------------------------
# Base path under which the figures produced by this notebook are saved.
# NOTE(review): presumably 3 is the blog post number and "eda" the subfolder
# name -- confirm against get_blog_post_subfolder.
path_figures = get_blog_post_subfolder(3, "eda")

In [None]:
# -------------------------------------------------------------------------
#  Helpers
# -------------------------------------------------------------------------
def compute_cdf_xy(samples: np.ndarray, x_min: float, x_max: float) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute the vertices of the empirical CDF of `samples` as a staircase polyline.

    The polyline starts at (x_min, 0.0) and ends at (x_max, 1.0). In between, every
    sample contributes two vertices at its own x-position: one at the CDF level just
    before the sample (i / n) and one at the level just after it ((i + 1) / n), so that
    a plain line plot of the result renders as a step function.

    :param samples: sample values (array-like); flattened to 1D and sorted internally,
                    the input itself is not modified.
    :param x_min: x-coordinate of the leading (x_min, 0.0) vertex.
    :param x_max: x-coordinate of the trailing (x_max, 1.0) vertex.
    :return: (x_values, y_values), two float arrays of length 2 * n + 2 each,
             with n the number of samples.
    """

    sorted_samples = np.sort(np.asarray(samples, dtype=float).ravel())
    n = sorted_samples.size

    # every sample appears twice on the x-axis: bottom and top of its vertical step
    x_values = np.concatenate(([x_min], np.repeat(sorted_samples, 2), [x_max]))

    if n == 0:
        # no samples: only the two end points remain (also avoids dividing by zero)
        y_values = np.array([0.0, 1.0])
    else:
        # levels 0, 1/n, ..., 1 each occur twice: once at the end of a horizontal
        # segment and once at the start of the next one (this includes both end points)
        y_values = np.repeat(np.arange(n + 1) / n, 2)

    return x_values, y_values


def compare_samples(
    a: np.ndarray, b: np.ndarray, x_min: float, x_max: float, a_name: str, b_name: str
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Plot the empirical CDFs of two samples in a single figure and run a two-sample KS test.

    Besides creating the figure, this prints the sample count, mean and standard deviation
    of both samples. The p-value of the KS test is shown in the figure title.

    :param a: first sample.
    :param b: second sample.
    :param x_min: lower x-axis limit; also the start of both CDF curves.
    :param x_max: upper x-axis limit; also the end of both CDF curves.
    :param a_name: name of sample `a`, used in the printed statistics, legend and title.
    :param b_name: name of sample `b`, used in the printed statistics, legend and title.
    :return: (fig, ax) tuple of the newly created figure and its single axes.
    """

    # --- prep --------------------------------------------
    plot_style_matplotlib_default()
    fig, ax = plt.subplots(1, 1)  # type: plt.Figure, plt.Axes
    cdf_a_x, cdf_a_y = compute_cdf_xy(a, x_min, x_max)
    cdf_b_x, cdf_b_y = compute_cdf_xy(b, x_min, x_max)

    # --- KS test -----------------------------------------
    # only the p-value is reported; the KS statistic itself is not used
    _, p_value = ks_2samp(a, b)

    # --- simple statistics -------------------------------
    print(f"{a_name}:  {len(a):_} samples - mean={np.mean(a)} - std={np.std(a)}")
    print(f"{b_name}:  {len(b):_} samples - mean={np.mean(b)} - std={np.std(b)}")

    # --- actual plotting ---------------------------------
    # ax.plot returns a list with one Line2D per curve; unpack the single line
    (h_cdf_a,) = ax.plot(cdf_a_x, cdf_a_y)
    (h_cdf_b,) = ax.plot(cdf_b_x, cdf_b_y)

    # --- decorate ----------------------------------------
    ax.set_xlim(x_min, x_max)
    # bind each name to its own line handle instead of relying on artist order
    ax.legend([h_cdf_a, h_cdf_b], [a_name, b_name])
    ax.grid(True)
    ax.set_ylabel("CDF")
    ax.set_title(f"Comparison of distributions '{a_name}' & '{b_name}'\nKS test: p={p_value:6.2e}")

    fig.tight_layout()

    # --- return ------------------------------------------
    return fig, ax

In [None]:
# -------------------------------------------------------------------------
#  TRAIN vs TEST
# -------------------------------------------------------------------------
fig, ax = compare_samples(a=x_train, b=x_test, x_min=0, x_max=7000, a_name="TRAIN set", b_name="TEST set")
fig.set_size_inches(w=10, h=7)
# compare_samples() ran tight_layout() at the default figure size; that adjustment is
# one-shot, so redo it now that the figure has its final size.
fig.tight_layout()

fig.savefig(path_figures / "compare_distributions_train_vs_test", dpi=300)

In [None]:
# -------------------------------------------------------------------------
#  TRAIN: first 20 vs remaining days
# -------------------------------------------------------------------------
SAMPLES_PER_DAY = 96  # this cell treats 20 * 96 samples as the first 20 days
N_DAYS_FIRST_PART = 20
split_idx = N_DAYS_FIRST_PART * SAMPLES_PER_DAY

x_train_a = x_train[:split_idx]
x_train_b = x_train[split_idx:]

# NOTE(review): the "last 11 days" label and the "20_vs_11" file name are hard-coded;
# they presumably assume the training set spans 31 days -- confirm.
fig, ax = compare_samples(
    a=x_train_a, b=x_train_b, x_min=0, x_max=7000, a_name="TRAIN set - first 20 days", b_name="TRAIN set - last 11 days"
)
fig.set_size_inches(w=10, h=7)
# compare_samples() ran tight_layout() at the default figure size; that adjustment is
# one-shot, so redo it now that the figure has its final size.
fig.tight_layout()

fig.savefig(path_figures / "compare_distributions_train_20_vs_11", dpi=300)