In [None]:
import pandas as pd
import numpy as np
import torch
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from torch import nn
from skorch import NeuralNetRegressor
from skorch.callbacks import EarlyStopping
from utils.neural_nets import NN_Module_1C
from utils.params import fig_dir, tab_dir, label_color
from utils.charts import loss_plot, abs_error_plot, rel_error_plot
from utils.metrics import mean_absolute_errors, mean_relative_errors

# Ask the inline backend to render figures in the 'retina' format.
%config InlineBackend.figure_format ='retina'
# Apply the "ggplot" style sheet to every matplotlib figure in this notebook.
mpl.style.use("ggplot")
# Seed every random number generator used below with the same fixed value:
# torch (CPU and CUDA), Python's `random` module and NumPy's global RNG.
torch.manual_seed(42)
torch.cuda.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [None]:
# Read the dataset from disk and cast every column to 32-bit floats.
flow_data_path = "data/flow_data_1c.csv"
df = pd.read_csv(flow_data_path).astype(np.float32)

# Show (rows, columns) as the cell output.
df.shape

In [None]:
# Keep only the rows inside the restricted target range
# (PDI >= 2 and M_W >= 1,287,000). A vectorised boolean mask selects exactly
# the same rows, in the same order, as looping over `df.iterrows()` and
# collecting matching index labels, but without the per-row Python overhead.
in_target_range = (df["PDI"] >= 2) & (df["M_W"] >= 1_287_000)

# Draw 120,000 of the remaining rows. No `random_state` is passed, so the draw
# depends on the global NumPy seed set at the top of the notebook.
# `sample` raises a ValueError if fewer than 120,000 rows pass the filter.
df = df.loc[in_target_range].sample(120_000)

df.shape

In [None]:
# Inputs are the last 140 columns of the frame, targets are its first two.
feature_columns = df.iloc[:, -140:]
target_columns = df.iloc[:, :2]

# Hold out one sixth of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_columns, target_columns, test_size=1 / 6
)

print(X_train.shape, Y_train.shape)

In [None]:
# Persist the held-out targets so the error tables can be rebuilt from disk.
# np.save appends the ".npy" extension to the given path.
y_test_path = "data/predictions/nn/1c_best/Y_test_2"
np.save(y_test_path, Y_test, allow_pickle=False)

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

# Stop training once the monitored loss has not improved for 5 epochs.
early_stopping = EarlyStopping(patience=5)

# skorch regressor around the project network: 100 input features, 2 targets,
# MSE loss, AdamW optimiser, at most 500 epochs.
net = NeuralNetRegressor(
    module=NN_Module_1C,
    module__num_targets=2,
    module__num_features=100,
    criterion=nn.MSELoss,
    optimizer=torch.optim.AdamW,
    lr=1e-05,
    max_epochs=500,
    callbacks=[early_stopping],
    device=torch_device,
)

# Min-max scale the targets around the network and the inputs in front of it.
tt = TransformedTargetRegressor(regressor=net, transformer=MinMaxScaler())
pipe = Pipeline(steps=[("scale", MinMaxScaler()), ("tt", tt)])

# Positions 0-49 and 70-119 of the 140 feature columns (100 columns in total).
# Per the table caption further down these are the first 50 features each of
# G' and G'' -- assumes each modulus occupies 70 columns; TODO confirm.
col_indices = np.r_[0:50, 70:120]

pipe.fit(X_train.iloc[:, col_indices], Y_train)

In [None]:
# Turn the fitted network's training history into a frame indexed by epoch,
# keeping only the loss and duration columns.
history = (
    pd.DataFrame(pipe.named_steps["tt"].regressor_.history)
    .loc[:, ["epoch", "train_loss", "valid_loss", "dur"]]
    .set_index("epoch")
)

In [None]:
# Plot training against validation loss over all recorded epochs.
train_loss = history["train_loss"]
valid_loss = history["valid_loss"]
loss_plot(train_loss, valid_loss)

# Write the figure next to the other report figures.
loss_fig_path = fig_dir + "/nn_1c_loss.png"
plt.savefig(loss_fig_path, dpi=300, bbox_inches="tight", pad_inches=0)

In [None]:
# Same loss plot, restricted to rows whose index label is 10 or later.
later_epochs = history.loc[10:]
loss_plot(later_epochs["train_loss"], later_epochs["valid_loss"])

trunc_fig_path = fig_dir + "/nn_1c_loss_trunc.png"
plt.savefig(trunc_fig_path, dpi=300, bbox_inches="tight", pad_inches=0)

In [None]:
# Predict the held-out targets from the same column subset used for fitting.
X_test_selected = X_test.iloc[:, col_indices]
Y_pred = pipe.predict(X_test_selected)

In [None]:
# Collect the absolute errors, then the relative errors, then the mean of the
# relative errors, into one flat list and show it as the cell output.
y_true = Y_test.values
errs = mean_absolute_errors(y_true, Y_pred)
errs.extend(mean_relative_errors(y_true, Y_pred))
avg_rel_error = np.mean(mean_relative_errors(y_true, Y_pred))
errs.append(avg_rel_error)
errs

In [None]:
# Persist the test-set predictions alongside the saved targets.
# np.save appends the ".npy" extension to the given path.
y_pred_path = "data/predictions/nn/1c_best/Y_pred_2"
np.save(y_pred_path, Y_pred, allow_pickle=False)

In [None]:
# Accumulator for the error table: the following cell appends one row of
# metrics to this list every time it is executed.
# NOTE(review): the table cell further down assigns two row labels
# ("Full Dataset", "Reduced Target Range"), so this list presumably has to
# hold two rows before the table is built, while a single top-to-bottom run
# appends only one -- confirm how the first row is meant to be produced.
errors = []

In [None]:
# Reload the stored targets and predictions. Both names are rebound to plain
# NumPy arrays here.
Y_test = np.load("data/predictions/nn/1c_best/Y_test_2.npy", allow_pickle=False)
Y_pred = np.load("data/predictions/nn/1c_best/Y_pred_2.npy", allow_pickle=False)

# One table row: absolute errors, relative errors, then the mean of the
# relative errors.
errs = [
    *mean_absolute_errors(Y_test, Y_pred),
    *mean_relative_errors(Y_test, Y_pred),
    np.mean(mean_relative_errors(Y_test, Y_pred)),
]
errors.append(errs)

In [None]:
# Turn every collected error row into LaTeX-ready strings:
#   column 0      -> rounded to an integer with thousands separators
#   column 1      -> three decimal places
#   other columns -> value * 100 with two decimals and an escaped percent sign
# Every row after the first is additionally wrapped in \green{...}.
#
# Raw strings are used for the LaTeX fragments: "\g" and "\%" are not valid
# Python escape sequences, so non-raw literals trigger an invalid-escape
# warning. The resulting text is unchanged.
mod_errors = []
for i, row in enumerate(errors):
    new_row = []
    for j, error in enumerate(row):
        if j == 0:
            text = f"{error:,.0f}"
        elif j == 1:
            text = f"{error:.3f}"
        else:
            text = rf"{error * 100:.2f}\%"

        # Only the first row stays uncoloured.
        if i > 0:
            text = r"\green{" + text + "}"
        new_row.append(text)

    mod_errors.append(new_row)

In [None]:
# Row labels of the LaTeX table, one per row of formatted errors.
row_labels = ["Full Dataset", "Reduced Target Range"]

# `mod_errors` is built from a list that is filled by re-running an earlier
# cell, so it can easily hold the wrong number of rows. Check this up front
# and explain the problem instead of failing inside pandas with an opaque
# length-mismatch error (the exception type, ValueError, is unchanged).
if len(mod_errors) != len(row_labels):
    raise ValueError(
        f"Expected {len(row_labels)} rows of error metrics "
        f"({', '.join(row_labels)}) but got {len(mod_errors)}; "
        "collect one row per label before building the table."
    )

df_errors = pd.DataFrame(mod_errors, dtype=str)
df_errors.index = row_labels

# Write the table as LaTeX. `escape=False` keeps the LaTeX markup contained in
# the cells and headers intact.
df_errors.to_latex(
    buf=tab_dir + "/1c_best_errors_2.tex",
    header=[
        r"MAE ($M_w$)",
        "MAE ($PDI$)",
        "MRE ($M_w$)",
        "MRE ($PDI$)",
        "Avg. MRE",
    ],
    column_format="lrrrrr",
    index=True,
    escape=False,
    bold_rows=True,
    caption=r"Mean absolute error (MAE), mean relative error (MRE) and the averaged MRE across all targets (Avg. MRE) of the best performing unimodal models with and without restricting the target ranges by $M_w \geq$ 1,287,000 and $PDI \geq$ 2 (100,000 training and 20,000 testing instances, using only the first 50 features each for $G'$ and $G''$, unimodal dataset)",
    label="tab:1c_best_errors",
    position="htb",
)

In [None]:
# Axis labels for the two targets.
labels = ["$M_w$ [$g/mol$]", "$PDI$"]

# 8 x 2.8 inch figure (width x height).
fig, ax = plt.subplots(figsize=(8, 2.8))

# `Y_test` is a DataFrame straight after the train/test split but a plain
# NumPy array once it has been reloaded from disk, and arrays have no
# `.values` attribute. `np.asarray` accepts both.
abs_error_plot(np.asarray(Y_test), Y_pred, labels)
plt.subplots_adjust(wspace=0.18)

plt.savefig(
    fig_dir + "/nn_1c_abs_errors.png",
    dpi=300,
    bbox_inches="tight",
    pad_inches=0,
)

In [None]:
# Axis labels for the two targets.
labels = ["$M_w$ [$g/mol$]", "$PDI$"]

# 8 x 2.7 inch figure (width x height).
fig, ax = plt.subplots(figsize=(8, 2.7))

# `Y_test` is a DataFrame straight after the train/test split but a plain
# NumPy array once it has been reloaded from disk, and arrays have no
# `.values` attribute. `np.asarray` accepts both.
rel_error_plot(np.asarray(Y_test), Y_pred, labels)
plt.subplots_adjust(wspace=0.25)

plt.savefig(
    fig_dir + "/nn_1c_rel_errors.png",
    dpi=300,
    bbox_inches="tight",
    pad_inches=0,
)

In [None]:
# Mean of the relative errors. `np.asarray` is used instead of `.values`
# because `Y_test` is a plain NumPy array once it has been reloaded from disk
# (arrays have no `.values` attribute); it also still accepts a DataFrame.
np.mean(mean_relative_errors(np.asarray(Y_test), Y_pred))

In [None]:
# Read the test grid from disk and cast every column to 32-bit floats.
test_grid_path = "data/test_grid_1c.csv"
df_test = pd.read_csv(test_grid_path).astype(np.float32)

# Display the frame as the cell output.
df_test

In [None]:
# The pipeline was fitted on the 100 columns that `col_indices` picks out of
# the last 140 columns, so the grid has to be reduced in the same way.
# Passing all 140 columns does not match the number of features the fitted
# scaler and network expect.
grid_features = df_test.iloc[:, -140:].iloc[:, col_indices]
Y_pred = pipe.predict(grid_features)

In [None]:
# 7.5 x 5 inch figure (width x height).
fig, ax = plt.subplots(figsize=(7.5, 5))

# True grid points.
plt.scatter(
    df_test["M_W"],
    df_test["PDI"],
    s=10,
    color=plt.cm.tab10(0),
    label="True values",
)
# Model predictions, drawn on top of the true values (higher zorder).
plt.scatter(
    Y_pred[:, 0],
    Y_pred[:, 1],
    s=10,
    color=plt.cm.tab10(1),
    zorder=10,
    label="Predictions",
)

plt.xlabel("$M_w$ [$g/mol$]")
plt.ylabel("$PDI$")

# Legend placed outside the axes, to the right of the plot.
leg = plt.legend(
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    frameon=False,
    scatteryoffsets=[0.5],
    labelcolor=label_color,
    labelspacing=1.2,
    handletextpad=0.1,
)

# Enlarge the legend markers. `Legend.legendHandles` was renamed to
# `legend_handles` (deprecated in Matplotlib 3.7 and later removed), so
# prefer the new attribute and fall back to the old one on older releases.
legend_markers = getattr(leg, "legend_handles", None)
if legend_markers is None:
    legend_markers = leg.legendHandles
for legobj in legend_markers:
    legobj.set_sizes([40])

plt.savefig(
    fig_dir + "/preds_vs_true_vals_1c.png",
    dpi=300,
    bbox_inches="tight",
    pad_inches=0,
)