In [None]:
import pandas as pd
import numpy as np
import torch
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from torch import nn
from skorch import NeuralNetRegressor
from tqdm import tqdm
from utils.neural_nets import NN_Module_2C
from utils.params import fig_dir
from utils.metrics import ds_size_rel_errors
from utils.charts import ds_size_rel_error_plot

%config InlineBackend.figure_format ='retina'
mpl.style.use("ggplot")
torch.manual_seed(42)
torch.cuda.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [None]:
# Read the 2c_2 flow dataset and cast every column to float32
# (presumably to match torch's default tensor dtype -- confirm).
df = (
    pd.read_csv("data/flow_data_2c_2.csv")
    .astype(np.float32)
)
df.shape

In [None]:
# Hold out 10% of the rows as a fixed test set (seeded for reproducibility).
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

# Inputs are the last 140 columns; targets are the first 5 columns.
X_test = df_test.iloc[:, -140:]
Y_test = df_test.iloc[:, :5]

print(df_train.shape, df_test.shape)

In [None]:
# Persist the test targets next to the per-size predictions written by the
# training loop; np.save appends the ".npy" extension to the file name.
np.save(
    file="data/predictions/ds_size/2c_2/Y_test",
    arr=Y_test,
    allow_pickle=False,
)

In [None]:
# Train one model per training-set size (5000, 10000, ... up to the size of
# df_train) and store each model's predictions on the fixed test set.
sample_sizes = np.arange(5000, len(df_train) + 1, 5000)

# The device choice does not change between iterations, so resolve it once.
device = "cuda" if torch.cuda.is_available() else "cpu"

for n_samples in tqdm(sample_sizes):
    # Draw the training subset with a fixed seed so each size is reproducible.
    train_sample = df_train.sample(n=round(n_samples), random_state=42)
    # Same column layout as the test split: last 140 columns are the inputs,
    # first 5 columns are the targets.
    X_train = train_sample.iloc[:, -140:]
    Y_train = train_sample.iloc[:, :5]

    net = NeuralNetRegressor(
        module=NN_Module_2C,
        module__num_targets=5,
        criterion=nn.MSELoss,
        optimizer=torch.optim.AdamW,
        lr=2e-05,
        max_epochs=100,
        verbose=0,
        device=device,
    )
    # Targets are min-max scaled for training and mapped back on predict.
    tt = TransformedTargetRegressor(regressor=net, transformer=MinMaxScaler())
    # Inputs are min-max scaled before they reach the network.
    pipe = Pipeline(steps=[("scale", MinMaxScaler()), ("tt", tt)])

    pipe.fit(X_train, Y_train)
    Y_pred = pipe.predict(X_test)

    # One prediction file per training-set size, keyed by the sample count.
    np.save(
        f"data/predictions/ds_size/2c_2/Y_pred_{n_samples}",
        Y_pred,
        allow_pickle=False,
    )

In [None]:
# Training-set sizes to evaluate: 5000, 10000, ..., 180000. The upper bound is
# hard-coded here rather than derived from df_train.
sample_sizes = np.arange(5000, 180_000 + 1, 5000)

# Relative errors of the "2c_2" dataset for each size and target label
# (presumably computed from the saved prediction files -- see
# utils.metrics.ds_size_rel_errors to confirm).
df_rel_errs = ds_size_rel_errors(
    "2c_2",
    sample_sizes,
    ["$M_w^s$", "$PDI^s$", "$M_w^l$", "$PDI^l$"],
)

In [None]:
# Draw the relative-error-vs-training-set-size chart for the 2c_2 dataset.
ds_size_rel_error_plot(df_rel_errs)

# Export the current figure at print resolution, cropped tightly with no padding.
plt.savefig(fig_dir + "/dataset_size_2c_2.png", dpi=300, bbox_inches="tight", pad_inches=0)

In [None]:
# Target labels passed to ds_size_rel_errors for every dataset variant.
labels = ["$M_w^s$", "$PDI^s$", "$M_w^l$", "$PDI^l$"]

# One relative-error table per dataset variant, in the order they are plotted.
datasets_errors = [
    ds_size_rel_errors(ds_name, sample_sizes, labels)
    for ds_name in ("2c_None", "2c_1", "2c_1_5", "2c_2")
]

In [None]:
# Legend entries, one per dataset variant, in the same order as datasets_errors.
labels = [
    "No restrictions",
    r"$\frac{M_w^l}{M_w^s}>PDI_{max}^1$",
    r"$\frac{M_w^l}{M_w^s}>PDI_{max}^{1.5}$",
    r"$\frac{M_w^l}{M_w^s}>PDI_{max}^{2}$",
]

fig, ax = plt.subplots()
fig.set_figheight(5)
fig.set_figwidth(7.5)

# The loop variable is deliberately NOT called `df`: that name holds the raw
# dataset loaded at the top of the notebook, and rebinding it here would make
# a later re-run of the train/test split cell operate on an error table.
for i, df_errs in enumerate(datasets_errors):
    ax.plot(
        # Row-wise mean over the per-target error columns; the x values come
        # from the frame's index.
        np.mean(df_errs, axis=1),
        linewidth=0.8,
        color=plt.cm.tab10(i),
        label=labels[i],
    )

ax.set_xlabel("Training Set Size")
ax.set_ylabel("Avg. MRE ($M_w^s$, $PDI^s$, $M_w^l$, $PDI^l$)")
# Show errors as whole percentages and sizes with thousands separators.
ax.yaxis.set_major_formatter(StrMethodFormatter("{x:.0%}"))
ax.xaxis.set_major_formatter(StrMethodFormatter("{x:,.0f}"))
ax.legend(loc="upper right", framealpha=1, edgecolor="None")

# Export at print resolution, cropped tightly with no padding.
fig.savefig(
    fig_dir + "/dataset_size_2c_all.png",
    dpi=300,
    bbox_inches="tight",
    pad_inches=0,
)