In [None]:
import os
import random

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from skorch import NeuralNetRegressor
from torch import nn
from tqdm import tqdm

from utils.charts import ds_size_rel_error_plot
from utils.metrics import ds_size_rel_errors
from utils.neural_nets import NN_Module_1C
from utils.params import fig_dir

%config InlineBackend.figure_format ='retina'
mpl.style.use("ggplot")
torch.manual_seed(42)
torch.cuda.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [None]:
# Read the raw dataset and cast every column to float32.
df = (
    pd.read_csv("data/flow_data_1c.csv")
    .astype(np.float32)
)

# Display (n_rows, n_columns) as the cell output.
df.shape

In [None]:
# Hold out 10 % of the rows as the test set; the fixed random_state makes the
# split reproducible.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the log directory exists before persisting the split (no-op if it is
# already there).
os.makedirs("logs/ds_size/1c", exist_ok=True)

# Persist both partitions without the index column so later cells can reload
# exactly the same split from disk.
df_train.to_csv("logs/ds_size/1c/df_train.csv", index=False)
df_test.to_csv("logs/ds_size/1c/df_test.csv", index=False)

In [None]:
# Reload the persisted train/test partitions from disk as float32 frames.
df_train = (
    pd.read_csv("logs/ds_size/1c/df_train.csv")
    .astype(np.float32)
)
df_test = (
    pd.read_csv("logs/ds_size/1c/df_test.csv")
    .astype(np.float32)
)

# Test inputs are the trailing 140 columns, test targets the leading 2.
X_test = df_test.iloc[:, -140:]
Y_test = df_test.iloc[:, :2]

print(df_train.shape, df_test.shape)

In [None]:
# Dataset-size sweep: for each training-subset size, fit a fresh model on a
# random subset of df_train and store its predictions for X_test on disk.
#
# The grid runs from 2500 up to len(df_train) in steps of 2500 so that it
# contains every size requested by the evaluation cell further down
# (2500 ... 135_000, step 2500), which presumably reads these prediction
# files -- confirm in utils.metrics.ds_size_rel_errors.  Sizes whose
# prediction file already exists are skipped, so re-running the notebook
# resumes an interrupted sweep instead of retraining everything.
sample_sizes = np.arange(start=2500, stop=len(df_train) + 1, step=2500)

# Loop-invariant: pick the training device once.
device = "cuda" if torch.cuda.is_available() else "cpu"

# np.save does not create missing parent directories.
os.makedirs("logs/ds_size/1c", exist_ok=True)

for n_samples in tqdm(sample_sizes):
    # np.save appends ".npy" when the name lacks it; spelling the extension
    # out keeps the existence check and the write pointing at the same file.
    pred_path = f"logs/ds_size/1c/Y_pred_{n_samples}.npy"
    if os.path.exists(pred_path):
        continue

    # Re-seed torch for every size so a model's initialisation does not depend
    # on how many earlier sizes were trained (or skipped) in this session.
    torch.manual_seed(42)

    # Seeded random subset of the training data; inputs are the trailing 140
    # columns, targets the leading 2 columns.
    train_sample = df_train.sample(int(n_samples), random_state=42)
    X_train, Y_train = train_sample.iloc[:, -140:], train_sample.iloc[:, :2]

    net = NeuralNetRegressor(
        module=NN_Module_1C,
        module__num_targets=2,
        criterion=nn.MSELoss,
        optimizer=torch.optim.AdamW,
        lr=1e-05,
        max_epochs=170,
        verbose=0,
        device=device,
    )
    # Min-max scale the targets for training; TransformedTargetRegressor
    # inverts the scaling again when predicting.
    tt = TransformedTargetRegressor(regressor=net, transformer=MinMaxScaler())
    # Min-max scale the inputs, then fit the target-transformed network.
    pipe = Pipeline(
        [
            ("scale", MinMaxScaler()),
            ("tt", tt),
        ]
    )

    pipe.fit(X_train, Y_train)

    Y_pred = pipe.predict(X_test)
    np.save(pred_path, Y_pred, allow_pickle=False)

In [None]:
# Training-subset sizes to evaluate: 2500, 5000, ..., 135000.
sample_sizes = np.arange(2500, 135_000 + 1, 2500)

# ds_size_rel_errors is a project helper; presumably it builds a table of
# relative errors per subset size for the two labelled targets of run "1c"
# -- TODO confirm in utils.metrics.
df_rel_errs = ds_size_rel_errors(
    "1c",
    sample_sizes,
    ["$M_w$", "$PDI$"],
)
df_rel_errs

In [None]:
# Draw the error-vs-dataset-size chart with the project plotting helper.
ds_size_rel_error_plot(df_rel_errs)

# Export the current figure as a tightly cropped 300-dpi PNG under fig_dir.
plt.savefig(
    fig_dir + "/dataset_size_1c.png",
    dpi=300,
    bbox_inches="tight",
    pad_inches=0,
)