In [1]:
import pandas as pd
import numpy as np
import torch
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from skorch import NeuralNetRegressor
from utils.neural_nets import NN_Module_1C
from utils.metrics import mean_absolute_errors, mean_relative_errors
from utils.params import tab_dir

# Seed every random number generator used in this notebook (PyTorch CPU and
# CUDA, Python's `random`, NumPy) with the same value so runs are repeatable.
for seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    random.seed,
    np.random.seed,
):
    seed_fn(42)

In [2]:
# Load the flow data as float32 and keep a 50,000-row random subsample.
# The subsample is seeded explicitly (instead of relying on the global NumPy
# RNG state) and `df` is rebuilt from the CSV in a single expression, so
# re-running this cell on its own always yields the same rows.
df = (
    pd.read_csv("data/flow_data_1c.csv")
    .astype(np.float32)
    .sample(n=50000, random_state=42)
)
df.shape

(50000, 142)

In [3]:
# Inputs are the last 140 columns, targets are the first two columns.
features = df.iloc[:, -140:]
targets = df.iloc[:, :2]

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    features, targets, random_state=42, test_size=0.2
)

print(*(part.shape for part in (X_train, Y_train, X_test, Y_test)))

(40000, 140) (40000, 2)


In [4]:
%%time
# Decision tree model. The inputs are min-max scaled by the pipeline's first
# step; the targets are min-max scaled by TransformedTargetRegressor, which
# also maps predictions back to the original target scale.
dtr = DecisionTreeRegressor(random_state=42)
tt = TransformedTargetRegressor(transformer=MinMaxScaler(), regressor=dtr)
pipe = Pipeline(steps=[("scale", MinMaxScaler()), ("tt", tt)])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

Wall time: 5.73 s


In [5]:
def errors_overview(Y_test, Y_pred):
    """Collect the error metrics for one model into a single flat list.

    Parameters
    ----------
    Y_test, Y_pred
        True and predicted targets; both are passed unchanged to
        ``mean_absolute_errors`` and ``mean_relative_errors`` (imported from
        ``utils.metrics``). Presumably each helper returns one value per
        target column -- confirm against ``utils.metrics``.

    Returns
    -------
    list
        The absolute errors, followed by the relative errors, followed by the
        mean of the relative errors.
    """
    # Compute the relative errors once and reuse them for both the per-target
    # entries and their average.
    relative_errors = mean_relative_errors(Y_test, Y_pred)

    # Copy into a fresh list so the helper's return value is never mutated
    # (and so any iterable return type is accepted).
    errors = list(mean_absolute_errors(Y_test, Y_pred))
    errors.extend(relative_errors)
    errors.append(np.mean(relative_errors))

    return errors

# Start the per-model results list with the scores of the current Y_pred
# (the predictions of the most recently run model cell).
metrics = []
metrics.append(errors_overview(Y_test.values, Y_pred))

In [6]:
%%time
# Random forest model, trained on all available cores (n_jobs=-1). Inputs and
# targets are min-max scaled exactly as in the other model cells.
rfr = RandomForestRegressor(n_jobs=-1, random_state=42)
tt = TransformedTargetRegressor(transformer=MinMaxScaler(), regressor=rfr)
pipe = Pipeline(steps=[("scale", MinMaxScaler()), ("tt", tt)])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Record this model's scores as the next row of the results list.
metrics.append(errors_overview(Y_test.values, Y_pred))

Wall time: 1min 4s


In [7]:
%%time
# Neural network model wrapped by skorch so it can be used like a
# scikit-learn regressor. It trains on the GPU when one is available.
net = NeuralNetRegressor(
    module=NN_Module_1C,
    module__num_targets=2,  # forwarded to NN_Module_1C as num_targets
    criterion=torch.nn.MSELoss,
    optimizer=torch.optim.AdamW,
    lr=1e-5,
    max_epochs=300,
    device="cuda" if torch.cuda.is_available() else "cpu",
    verbose=0,
)

# Same scaling set-up as the other model cells: min-max scaled inputs and
# min-max scaled targets.
tt = TransformedTargetRegressor(transformer=MinMaxScaler(), regressor=net)
pipe = Pipeline(steps=[("scale", MinMaxScaler()), ("tt", tt)])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Record this model's scores as the next row of the results list.
metrics.append(errors_overview(Y_test.values, Y_pred))

Wall time: 8min 30s


In [20]:
# Numeric scores: one row per model, one column per metric. The columns keep
# pandas' default integer labels 0-4, in the order errors_overview emits them.
scores = pd.DataFrame(
    metrics, index=["Decision Tree", "Random Forest", "Neural Network"]
)

# One text formatter per metric column: thousands-separated integer, three
# decimals, then three percentages. Raw strings keep the LaTeX backslash
# literal ("\%" is an invalid escape sequence in an ordinary string literal).
cell_formats = {
    0: lambda v: f"{v:,.0f}",
    1: lambda v: f"{v:.3f}",
    2: lambda v: rf"{v * 100:.2f}\%",
    3: lambda v: rf"{v * 100:.2f}\%",
    4: lambda v: rf"{v * 100:.2f}\%",
}


def _render_column(values, fmt):
    r"""Format one metric column and wrap its lowest value(s) in \green{...}.

    The minimum is taken on the numeric values, once per column, so the
    comparison never goes through a float -> str -> float round trip.
    """
    best = values.min()
    rendered = []
    for value in values:
        text = fmt(float(value))
        rendered.append(r"\green{" + text + "}" if value == best else text)
    return rendered


# String-valued table with the same index and integer column labels as
# `scores`, ready to be exported with escape=False.
metrics_df = pd.DataFrame(
    {col: _render_column(scores[col], fmt) for col, fmt in cell_formats.items()},
    index=scores.index,
)

metrics_df

Unnamed: 0,0,1,2,3,4
Decision Tree,36896,0.050,0.91\%,3.03\%,1.97\%
Random Forest,13635,0.025,\green{0.33\%},1.86\%,1.09\%
Neural Network,"\green{12,621}",\green{0.024},0.42\%,\green{1.14\%},\green{0.78\%}


In [25]:
# Column titles for the five metric columns of metrics_df.
header_labels = [
    r"MAE ($M_w$)",
    "MAE ($PDI$)",
    "MRE ($M_w$)",
    "MRE ($PDI$)",
    "Avg. MRE",
]
table_caption = r"Mean absolute error (MAE), mean relative error (MRE) and the averaged MRE across all targets (Avg. MRE) for decision tree, random forest and neural network model predictions (unimodal dataset)"

# Write the table to the project's table directory. escape=False because the
# cells already contain LaTeX markup that must reach the .tex file unchanged.
metrics_df.to_latex(
    buf=tab_dir + "/model_selection_1c.tex",
    index=True,
    header=header_labels,
    column_format="lrrrrr",
    bold_rows=True,
    escape=False,
    caption=table_caption,
    label="tab:model_selection_1c",
    position="htb",
)