In [1]:
# Show the current working directory, then move two levels up.
# NOTE(review): presumably this makes the relative "./data/..." path and the
# `MLP.*` imports used by later cells resolve - confirm against the repo layout.
# NOTE(review): `%cd ../..` is relative, so executing this cell a second time in
# the same kernel moves up two more levels; run it once per kernel session.
%pwd
%cd ../..

/Users/aflamant/Documents/courses/2024-2025/mémoire/03-code/memoire/MLP


In [2]:
RANDOM_STATE = 42
import numpy as np
import re
import ast
import torch
from torch import nn
from torch.utils.data import random_split, DataLoader, Subset
from torch.nn import functional as F
from sklearn.model_selection import KFold

import torchmetrics.regression as R

from MLP.dataset import TenBarsCantileverTrussSingleEADataset

import mlflow
import matplotlib.pyplot as plt
from MLP.models.architecture import MultiLayerPerceptron
from MLP.models.processing import StandardScaler

# Seed both random number generators used by this notebook: NumPy picks the
# dataset subsample, PyTorch drives DataLoader shuffling and random_split.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Pick the fastest available backend: CUDA, then Apple MPS, then the CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

# 1. Load the data


In [None]:
# Open the cantilever dataset stored on disk.
data_path = "./data/dataset/cantilever/data.hdf5"
_ds = TenBarsCantileverTrussSingleEADataset(data_path)

# Draw 50000 distinct sample indices with the NumPy global RNG and keep only
# those samples for the hyperparameter study.
subset_indices = np.random.choice(np.arange(len(_ds)), 50000, replace=False)
ds = _ds[subset_indices]

# Input and target widths are read from the first sample.
in_dim = len(ds[0][0])
out_dim = len(ds[0][1])

print(
    f"Dataset size: {len(ds)}\n"
    f"  Sample dimension: {in_dim}\n"
    f"  Target dimension: {out_dim}"
)

# 2. Training and Validation routine

In [None]:
def train(model, train_ds, val_ds, lr, n_epochs, batch_size, verbose=True, plot=False):
    """Fit ``model`` on ``train_ds``, validate on ``val_ds`` every epoch and log to MLflow.

    Inputs and targets are standardised with scalers fitted on the training
    batches only. The optimised loss is the MSE on the standardised targets;
    MAPE, R2 and D2 are computed on targets mapped back to their original
    scale. For every epoch the mean of the per-batch values is logged to the
    active MLflow run; after the last epoch the model is logged as an artifact
    and the run is tagged ``run_type=kfold``.

    Relies on notebook globals: ``device``, ``in_dim``, ``out_dim`` and the
    metric callables ``MAPE``, ``R2`` and ``D2``.
    NOTE(review): in the part of the notebook reviewed, ``MAPE`` and ``R2`` are
    only assigned in a later cell and no definition of ``D2`` is visible - make
    sure all three exist before calling this function.
    NOTE(review): the metric callables receive ``(target, prediction)`` in that
    order; confirm this matches the signature of the callables actually used.

    Args:
        model: Module to train; it is moved to ``device``.
        train_ds: Training dataset; each item is a 5-tuple whose first two
            entries are the input and the target.
        val_ds: Validation dataset with the same item layout.
        lr: Learning rate passed to Adam.
        n_epochs: Number of passes over ``train_ds``.
        batch_size: Batch size for both loaders.
        verbose: Print a summary line every 25 epochs.
        plot: Draw the loss / MAPE / R2 / D2 curves once training is done.

    Returns:
        dict mapping ``train_losses``, ``val_losses``, ``train_MSE``,
        ``val_MSE``, ``train_MAPE``, ``val_MAPE``, ``train_R2``, ``val_R2``,
        ``train_D2`` and ``val_D2`` to lists holding one value per epoch.
    """
    model = model.to(device)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    # Shuffling has no benefit for evaluation; fixed batches keep the
    # per-batch averaged validation metrics comparable between epochs.
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # Fit the scalers on the training data only.
    x_scaler = StandardScaler(in_dim).to(device)
    y_scaler = StandardScaler(out_dim).to(device)
    for x, y, _, _, _ in train_dl:
        x_scaler.partial_fit(x.to(device))
        y_scaler.partial_fit(y.to(device))

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    metric_names = ('loss', 'mse', 'mape', 'r2', 'd2')
    splits = ('train', 'val')
    # One list per "<split>_<metric>" key, one entry per epoch.
    history = {f"{split}_{name}": [] for split in splits for name in metric_names}

    def record_batch(store, split, loss, y_pred, y):
        """Append the metrics of one batch to ``store`` under the ``split`` prefix."""
        y_unscaled = y_scaler.inverse_transform(y).cpu().detach()
        y_pred_unscaled = y_scaler.inverse_transform(y_pred).cpu().detach()
        store[f"{split}_loss"].append(loss.item())
        store[f"{split}_mse"].append(F.mse_loss(y_pred, y).item())
        store[f"{split}_mape"].append(MAPE(y_unscaled, y_pred_unscaled))
        store[f"{split}_d2"].append(D2(y_unscaled, y_pred_unscaled))
        store[f"{split}_r2"].append(R2(y_unscaled, y_pred_unscaled))

    def describe(means, split):
        """Format the epoch means of one split for the progress line."""
        return (f"Loss: {means[split + '_loss']:1.4f}   "
                f"MSE: {means[split + '_mse']:1.4f}   "
                f"MAPE: {means[split + '_mape']:1.4f}   "
                f"R2: {means[split + '_r2']: 1.4f}   "
                f"D2: {means[split + '_d2']: 1.4f}")

    for epoch in range(n_epochs):
        batch_values = {key: [] for key in history}

        model.train()
        for x, y, _, _, _ in train_dl:
            x, y = x.to(device), y.to(device)
            x = x_scaler.transform(x)
            y = y_scaler.transform(y)

            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            record_batch(batch_values, 'train', loss, y_pred, y)

        model.eval()
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for x, y, _, _, _ in val_dl:
                x, y = x.to(device), y.to(device)
                x = x_scaler.transform(x)
                y = y_scaler.transform(y)

                y_pred = model(x)
                loss = criterion(y_pred, y)

                record_batch(batch_values, 'val', loss, y_pred, y)

        # Epoch value = mean of the per-batch values.
        epoch_means = {key: np.mean(values) for key, values in batch_values.items()}
        mlflow.log_metrics(epoch_means, step=epoch)
        for key, value in epoch_means.items():
            history[key].append(value)

        if verbose and (epoch + 1) % 25 == 0:
            print(f"[Epoch] {epoch + 1:{len(str(n_epochs))}d}/{n_epochs:d}  "
                  f"TRAIN   {describe(epoch_means, 'train')}"
                  f"  ||  "
                  f"VALIDATION   {describe(epoch_means, 'val')}")

    if plot:
        fig, axs = plt.subplots(1, 4, figsize=(24, 8))
        # (panel title, metric name, use a plain log scale)
        panels = (
            ("Loss", 'loss', True),
            ("MAPE", 'mape', True),
            ("R2", 'r2', False),
            ("D2", 'd2', False),
        )
        for ax, (title, name, log_scale) in zip(axs, panels):
            ax.set_title(title)
            ax.plot(history[f"train_{name}"], label='Training')
            ax.plot(history[f"val_{name}"], label='Validation')
            if log_scale:
                ax.set_yscale('log')
            else:
                # Exponential axis: stretches the region close to 1.
                ax.set_yscale('function', functions=(lambda v: 10 ** v, lambda v: np.log10(v)))
                ax.set_ylim(0, 1.0)
            # Label and legend go on the panel that was just drawn.
            ax.set_xlabel("Epoch")
            ax.legend()

    # `x` is the last (already standardised) batch that went through the loops
    # above; it doubles as the input example for the logged model.
    with torch.no_grad():
        example_output = model(x)
    example_input = x.cpu().detach().numpy()
    signature = mlflow.models.infer_signature(example_input, example_output.cpu().detach().numpy())
    mlflow.pytorch.log_model(
        pytorch_model=model,
        input_example=example_input,
        artifact_path='model',
        signature=signature,
    )

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("run_type", "kfold")

    return {
        'train_losses': history['train_loss'],
        'val_losses': history['val_loss'],
        'train_MSE': history['train_mse'],
        'val_MSE': history['val_mse'],
        'train_MAPE': history['train_mape'],
        'val_MAPE': history['val_mape'],
        'train_R2': history['train_r2'],
        'val_R2': history['val_r2'],
        'train_D2': history['train_d2'],
        'val_D2': history['val_d2'],
    }

# 3. Hyperparameter tuning
Using the 10-bar cantilever dataset, we want to predict the EA of the bars, which is assumed to be a single value common to all bars.
The model is an MLP; the hyperparameters to tune are:
- Activation function
- Learning rate
- Number of layers
- Number of neurons per layer

## Capacity training
Capacity is taken to be the number of trainable parameters. It is a function of the number of hidden layers and the number of neurons per layer.
The more capacity the model has, the more accurate it can be, but the higher the risk of overfitting.

In [None]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("MLP_Cantilever_Capacity_50K")

n_neurons_values = [25, 30, 35, 40]
n_layers_values = [2, 3, 4]
# Single source of truth: the rate given to train() is also the one logged.
learning_rate = 4e-4

for n_neurons in n_neurons_values:
    for n_layers in n_layers_values:
        # Seeded folds: reproducible, and every architecture sees the same splits.
        kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
        for fold, (train_idx, val_idx) in enumerate(kfold.split(ds)):
            with mlflow.start_run(run_name=f"{n_neurons}x{n_layers}_[{fold}]"):
                train_ds, val_ds = Subset(ds, train_idx), Subset(ds, val_idx)
                model = MultiLayerPerceptron(in_dim, out_dim, n_neurons, n_layers, nn.ReLU)

                # Capacity = number of trainable parameters.
                n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
                mlflow.log_params({'n_neurons': n_neurons, 'n_layers': n_layers, 'lr': learning_rate,
                                   'activation': 'ReLU', "capacity": n_params})

                results = train(model=model,
                                train_ds=train_ds,
                                val_ds=val_ds,
                                batch_size=2048,
                                lr=learning_rate,
                                n_epochs=1000,
                                verbose=True)


We then post-process the runs of all folds to extract the average metrics.

In [None]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("MLP_Cantilever_Capacity")

experiment = mlflow.get_experiment_by_name("MLP_Cantilever_Capacity")
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
# Ignore runs carrying a `health` tag; the column is only present when at
# least one run has that tag.
if 'tags.health' in runs_df.columns:
    runs_df = runs_df[runs_df['tags.health'].isnull()]
all_run_names = runs_df["tags.mlflow.runName"].dropna().tolist()

client = mlflow.tracking.MlflowClient()

# Logged metric key -> reducer selecting the best value of its curve
# (lowest for losses/errors, highest for scores).
best_reducers = {
    f"{split}_{suffix}": reducer
    for suffix, reducer in (('loss', np.min), ('mse', np.min), ('mape', np.min),
                            ('r2', np.max), ('d2', np.max))
    for split in ('train', 'val')
}

n_neurons_values = [10, 15, 20, 25, 30, 35, 40]
n_layers_values = [1, 2, 3, 4, 5]
for n_neurons in n_neurons_values:
    for n_layers in n_layers_values:
        # Fold runs are named "<n_neurons>x<n_layers>_[<fold>]".
        fold_name = re.compile(rf"{n_neurons}x{n_layers}_\[\d*\]")
        run_names = sorted(name for name in all_run_names if fold_name.match(name))
        kfold_runs = runs_df[runs_df["tags.mlflow.runName"].isin(run_names)]

        if kfold_runs.shape[0] == 0:
            continue

        # Parameter count: input layer + (n_layers - 1) hidden layers + output layer.
        # NOTE(review): the output layer is counted with a single bias, which
        # presumes out_dim == 1 - confirm against MultiLayerPerceptron.
        capacity = (((in_dim * n_neurons) + n_neurons)
                    + (n_layers - 1) * ((n_neurons * n_neurons) + n_neurons)
                    + ((n_neurons * out_dim) + 1))

        # metric key -> list of per-epoch curves, one curve per fold
        fold_curves = {key: [] for key in best_reducers}

        for _, run in kfold_runs.iterrows():
            run_id = run["run_id"]

            # Per-epoch history of every metric for this fold only.
            run_curves = {
                key: [m.value for m in client.get_metric_history(run_id, key)]
                for key in best_reducers
            }
            for key, curve in run_curves.items():
                fold_curves[key].append(curve)

            # Post process fold results
            client.log_param(run_id, "capacity", capacity)

            # Best values of THIS fold (not of all folds collected so far).
            mlflow.log_metrics(
                {f"best_{key}": best_reducers[key](curve) for key, curve in run_curves.items()},
                run_id=run_id)

            client.set_tag(run_id, "run_type", "KFold")

        # Aggregate fold results: epoch-wise mean over the folds.
        mean_curves = {key: np.mean(curves, axis=0) for key, curves in fold_curves.items()}

        with mlflow.start_run(run_name=f"{n_neurons}x{n_layers}"):
            mlflow.log_params({'n_neurons': n_neurons, 'n_layers': n_layers, 'lr': 1e-4,
                               'activation': 'ReLU', 'capacity': capacity})
            mlflow.log_metrics(
                {f"best_{key}": best_reducers[key](curve) for key, curve in mean_curves.items()})

            # Averaged curves use the same metric keys as the fold runs
            # ("train_loss", not "train loss") so they can be compared directly.
            for step in range(len(mean_curves['train_loss'])):
                mlflow.log_metrics(
                    {key: curve[step] for key, curve in mean_curves.items()}, step=step)

            mlflow.set_tag("run_type", "KFold average")

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Set up MLflow tracking
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("MLP_Cantilever_Activation_Function_50K")

# Hyperparameter values
n_neurons_values = [30, 35, 40]
n_layers_values = [3, 4]
activations_values = [nn.LeakyReLU, nn.LeakyReLU, nn.LeakyReLU]
activation_params_values = [{"negative_slope": 1.}, {"negative_slope": 5e-1}, {"negative_slope": 2.5e-1}]
# Single source of truth: the rate given to train() is also the one logged.
learning_rate = 4e-4


def routine(run_name, train_idx, val_idx, n_neurons, n_layers, activation, activation_params, lr=4e-4):
    """Train one fold of one configuration inside its own MLflow run.

    Every value that varies between jobs is received as an argument, so a
    queued job does not depend on the state of the submitting loop.
    NOTE(review): several of these run concurrently; this relies on MLflow
    tracking the active run per thread - confirm for the installed version.
    """
    with mlflow.start_run(run_name=run_name):
        train_ds, val_ds = Subset(ds, train_idx), Subset(ds, val_idx)
        model = MultiLayerPerceptron(in_dim, out_dim, n_neurons, n_layers, activation,
                                     activation_params)

        # Capacity = number of trainable parameters.
        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

        # Log experiment parameters
        mlflow.log_params({
            'n_neurons': n_neurons, 'n_layers': n_layers, 'lr': lr,
            'activation': activation.__name__, "capacity": n_params
        })

        train(model=model, train_ds=train_ds, val_ds=val_ds,
              batch_size=2048, lr=lr, n_epochs=1_000, verbose=False)


# Thread pool for parallel execution
with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers as needed
    futures = []

    for activation, activation_params in zip(activations_values, activation_params_values):
        for n_neurons in n_neurons_values:
            for n_layers in n_layers_values:
                # Seeded folds: reproducible, and every configuration sees the same splits.
                kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
                for fold, (train_idx, val_idx) in enumerate(kfold.split(ds)):
                    # Run name: "<Activation>[<params>]_<n_neurons>x<n_layers>_[<fold>]"
                    param_str = f"[{activation_params}]" if activation_params is not None else "[]"
                    run_name = f"{activation.__name__}{param_str}_{n_neurons}x{n_layers}_[{fold}]"

                    # Submit training job to thread pool
                    futures.append(executor.submit(
                        routine, run_name=run_name, train_idx=train_idx, val_idx=val_idx,
                        n_neurons=n_neurons, n_layers=n_layers,
                        activation=activation, activation_params=activation_params,
                        lr=learning_rate))

    # Ensure all threads complete execution (re-raises any worker exception)
    for future in futures:
        future.result()

In [None]:
# Scratch check: show the part of `run_name` before its first "[".
# `run_name` is not defined in this cell; it is whatever value a previously
# executed cell left in the kernel.
run_name[:run_name.index("[")]

In [None]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("MLP_Cantilever_Activation_Function_50K")
experiment = mlflow.get_experiment_by_name("MLP_Cantilever_Activation_Function_50K")
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Ignore runs carrying a `health` tag; the column is only present when at
# least one run has that tag.
if 'tags.health' in runs_df.columns:
    runs_df = runs_df[runs_df['tags.health'].isnull()]
# Averaged runs written by an earlier execution of this cell share the name
# prefix of their folds; leave them out so they are not averaged in again.
if 'tags.run_type' in runs_df.columns:
    runs_df = runs_df[runs_df['tags.run_type'] != "KFold average"]
all_run_names = runs_df["tags.mlflow.runName"].dropna().tolist()

client = mlflow.tracking.MlflowClient()

# Logged metric key -> reducer selecting the best value of its curve
# (lowest for losses/errors, highest for scores).
best_reducers = {
    f"{split}_{suffix}": reducer
    for suffix, reducer in (('loss', np.min), ('mse', np.min), ('mape', np.min),
                            ('r2', np.max), ('d2', np.max))
    for split in ('train', 'val')
}


def activation_params_text(run_name):
    """Return the "{...}" part of a run name, or "" when the name has none."""
    try:
        return run_name[run_name.index('{'):run_name.index('}') + 1]
    except ValueError:
        return ""


# Group runs: one group per (activation, activation parameters) pair.
run_names_grouped = []
activations_values = [a.__name__ for a in [nn.LeakyReLU, nn.Sigmoid, nn.ReLU, nn.Tanh]]
for activation in activations_values:
    run_names = sorted(name for name in all_run_names if name.startswith(activation))

    # Group on the exact parameter text so that runs without parameters do
    # not absorb the runs that have some.
    names_by_params = {}
    for run_name in run_names:
        names_by_params.setdefault(activation_params_text(run_name), []).append(run_name)
    run_names_grouped.extend(names_by_params.values())

# Update
for run_names in run_names_grouped:
    kfold_runs = runs_df[runs_df["tags.mlflow.runName"].isin(run_names)]
    if kfold_runs.shape[0] == 0:
        continue

    # metric key -> list of per-epoch curves, one curve per fold
    fold_curves = {key: [] for key in best_reducers}

    for _, run in kfold_runs.iterrows():
        run_id = run["run_id"]

        # Per-epoch history of every metric for this fold only.
        run_curves = {
            key: [m.value for m in client.get_metric_history(run_id, key)]
            for key in best_reducers
        }
        for key, curve in run_curves.items():
            fold_curves[key].append(curve)

        # Post process fold results: recover the activation parameters from
        # the run name and store them as run parameters.
        run_name = run["tags.mlflow.runName"]
        try:
            params = ast.literal_eval(activation_params_text(run_name))
        except (ValueError, SyntaxError):
            # No (parsable) parameter dict in the run name.
            params = dict()

        for k, v in params.items():
            client.log_param(run_id, k, v)

        # Best values of THIS fold (not of all folds collected so far).
        mlflow.log_metrics(
            {f"best_{key}": best_reducers[key](curve) for key, curve in run_curves.items()},
            run_id=run_id)

        client.set_tag(run_id, "run_type", "KFold")

    # Aggregate fold results: epoch-wise mean over the folds.
    mean_curves = {key: np.mean(curves, axis=0) for key, curves in fold_curves.items()}

    # `run_name` and `params` still hold the values of the last fold; every
    # run of the group shares the same activation name and parameters.
    with mlflow.start_run(run_name=run_name[:run_name.index("]") + 1]):
        # NOTE(review): the sweep cell passes lr=4e-4 to train(); confirm
        # which value should be recorded here.
        mlflow.log_params({'lr': 1e-4,
                           'activation': run_name[:run_name.index("[")]})
        for k, v in params.items():
            mlflow.log_param(k, v)

        mlflow.log_metrics(
            {f"best_{key}": best_reducers[key](curve) for key, curve in mean_curves.items()})

        # Averaged curves use the same metric keys as the fold runs
        # ("train_loss", not "train loss") so they can be compared directly.
        for step in range(len(mean_curves['train_loss'])):
            mlflow.log_metrics(
                {key: curve[step] for key, curve in mean_curves.items()}, step=step)

        mlflow.set_tag("run_type", "KFold average")


In [None]:
from concurrent.futures import ThreadPoolExecutor

# Set up MLflow tracking
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("MLP_Cantilever_Learning_Rate_50K")

# Hyperparameter values
n_neurons_values = [30, 35, 40]
n_layers_values = [3, 4]
activations_values = [nn.Tanh, nn.LeakyReLU]
activation_params_values = [{}, {"negative_slope": 1.e-3}]

lrs = np.logspace(-6, -3, 4)


def routine(run_name, train_idx, val_idx, n_neurons, n_layers, activation, activation_params, lr=4e-4):
    """Train one fold of one configuration inside its own MLflow run.

    ``lr`` is received as an argument on purpose: jobs are queued and start
    long after they were submitted, so reading the loop variable from the
    enclosing scope would pick up whatever value it holds at that moment
    instead of the value the job was created for.
    NOTE(review): several of these run concurrently; this relies on MLflow
    tracking the active run per thread - confirm for the installed version.
    """
    with mlflow.start_run(run_name=run_name):
        train_ds, val_ds = Subset(ds, train_idx), Subset(ds, val_idx)
        model = MultiLayerPerceptron(in_dim, out_dim, n_neurons, n_layers, activation,
                                     activation_params)

        # Capacity = number of trainable parameters.
        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

        # Log experiment parameters
        mlflow.log_params({
            'n_neurons': n_neurons, 'n_layers': n_layers, 'lr': lr,
            'activation': activation.__name__, "capacity": n_params
        })

        for k, v in activation_params.items():
            mlflow.log_param(k, v)

        train(model=model, train_ds=train_ds, val_ds=val_ds,
              batch_size=2048, lr=lr, n_epochs=1_000, verbose=False)


# Thread pool for parallel execution
with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers as needed
    futures = []

    for lr in lrs:
        for activation, activation_params in zip(activations_values, activation_params_values):
            for n_neurons in n_neurons_values:
                for n_layers in n_layers_values:
                    # Seeded folds: reproducible, and every configuration sees the same splits.
                    kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
                    for fold, (train_idx, val_idx) in enumerate(kfold.split(ds)):
                        # Run name: "<lr>_<Activation>[<params>]_<n_neurons>x<n_layers>_[<fold>]"
                        param_str = f"[{activation_params}]"
                        run_name = f"{lr}_{activation.__name__}{param_str}_{n_neurons}x{n_layers}_[{fold}]"

                        # Submit training job to thread pool, binding this
                        # iteration's learning rate to the job.
                        futures.append(executor.submit(
                            routine, run_name=run_name, train_idx=train_idx, val_idx=val_idx,
                            n_neurons=n_neurons, n_layers=n_layers,
                            activation=activation, activation_params=activation_params,
                            lr=lr))

    # Ensure all threads complete execution (re-raises any worker exception)
    for future in futures:
        future.result()

In [None]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("MLP_Cantilever_Learning_Rate_50K")
experiment = mlflow.get_experiment_by_name("MLP_Cantilever_Learning_Rate_50K")
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Ignore runs carrying a `health` tag; the column is only present when at
# least one run has that tag.
if 'tags.health' in runs_df.columns:
    runs_df = runs_df[runs_df['tags.health'].isnull()]
# Averaged runs written by an earlier execution of this cell are named after
# the learning rate alone and would land in the group of their own folds;
# leave them out so they are not averaged in again.
if 'tags.run_type' in runs_df.columns:
    runs_df = runs_df[runs_df['tags.run_type'] != "KFold average"]
all_run_names = runs_df["tags.mlflow.runName"].dropna().tolist()

client = mlflow.tracking.MlflowClient()

# Logged metric key -> reducer selecting the best value of its curve
# (lowest for losses/errors, highest for scores).
best_reducers = {
    f"{split}_{suffix}": reducer
    for suffix, reducer in (('loss', np.min), ('mse', np.min), ('mape', np.min),
                            ('r2', np.max), ('d2', np.max))
    for split in ('train', 'val')
}

# Group runs by the learning-rate prefix of their name ("<lr>_...").
names_by_lr = {}
for run_name in all_run_names:
    names_by_lr.setdefault(run_name.split('_')[0], []).append(run_name)
run_names_grouped = list(names_by_lr.values())

# Update
for run_names in run_names_grouped:
    kfold_runs = runs_df[runs_df["tags.mlflow.runName"].isin(run_names)]
    if kfold_runs.shape[0] == 0:
        continue

    # metric key -> list of per-epoch curves, one curve per fold
    fold_curves = {key: [] for key in best_reducers}

    for _, run in kfold_runs.iterrows():
        run_id = run["run_id"]

        # Per-epoch history of every metric for this fold only.
        run_curves = {
            key: [m.value for m in client.get_metric_history(run_id, key)]
            for key in best_reducers
        }
        for key, curve in run_curves.items():
            fold_curves[key].append(curve)

        # Best values of THIS fold (not of all folds collected so far).
        mlflow.log_metrics(
            {f"best_{key}": best_reducers[key](curve) for key, curve in run_curves.items()},
            run_id=run_id)

        client.set_tag(run_id, "run_type", "KFold")

    # Aggregate fold results: epoch-wise mean over the folds.
    mean_curves = {key: np.mean(curves, axis=0) for key, curves in fold_curves.items()}

    # Every run of the group shares the prefix; parse it back into a number.
    lr = ast.literal_eval(kfold_runs.iloc[0]["tags.mlflow.runName"].split("_")[0])
    with mlflow.start_run(run_name=str(lr)):
        mlflow.log_params({'lr': lr})

        mlflow.log_metrics(
            {f"best_{key}": best_reducers[key](curve) for key, curve in mean_curves.items()})

        # Averaged curves use the same metric keys as the fold runs
        # ("train_loss", not "train loss") so they can be compared directly.
        for step in range(len(mean_curves['train_loss'])):
            mlflow.log_metrics(
                {key: curve[step] for key, curve in mean_curves.items()}, step=step)

        mlflow.set_tag("run_type", "KFold average")


# 4. Training the foundation model
We will train our model on the full dataset, keeping 25,000 samples aside for validation, to create a foundation model that has learnt the specifics of the problem.

In [None]:
data_path = "./data/dataset/cantilever/data.hdf5"
_ds = TenBarsCantileverTrussSingleEADataset(data_path)

ds = _ds
# Hold out 25,000 samples for validation. The split is driven by an explicitly
# seeded generator: only NumPy is seeded at the top of the notebook, so without
# it the torch-based partition would differ on every kernel restart.
train_ds, val_ds = random_split(
    ds,
    (len(ds) - 25_000, 25_000),
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)

# Each sample is indexed as (features, target, ...); the network dimensions
# are read from the first sample.
in_dim = len(ds[0][0])
out_dim = len(ds[0][1])

print(f"Dataset size: {len(ds)}")
print(f"  Sample dimension: {in_dim}")
print(f"  Target dimension: {out_dim}")
print()
print(f"Train dataset size: {len(train_ds)}")
print()
print(f"Validation dataset size: {len(val_ds)}")

In [None]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("MLP_Cantilever_final")

# Regression metrics, applied per batch to the unscaled predictions/targets.
CCC = R.ConcordanceCorrCoef().to(device)
R2 = R.R2Score(multioutput='uniform_average').to(device)
MAPE = R.MeanAbsolutePercentageError().to(device)
MSE = R.MeanSquaredError().to(device)

print("Starting training")

best_loss = np.inf
best_val_loss = np.inf
with mlflow.start_run():
    # Hyper-parameters of the foundation model.
    LR = 1e-4
    N_NEURONS = 40
    N_LAYERS = 3
    N_EPOCHS = 2_500
    BATCH_SIZE = 2048
    ACTIVATION = nn.Tanh
    ACTIVATION_PARAMS = {}

    device = torch.device(
        'cuda' if torch.cuda.is_available()
        else 'mps' if torch.backends.mps.is_available()
        else 'cpu'
    )

    model = MultiLayerPerceptron(in_dim, out_dim, N_NEURONS, N_LAYERS, ACTIVATION,
                                 ACTIVATION_PARAMS).to(device)

    # "capacity" is the number of trainable scalar parameters.
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    n_params = sum(np.prod(p.size()) for p in model_parameters)
    mlflow.log_params({
        'n_neurons': N_NEURONS, 'n_layers': N_LAYERS, 'lr': LR,
        'activation': ACTIVATION.__name__, "capacity": n_params, "n_epochs": N_EPOCHS,
    })

    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True)

    # Fit both scalers on the training split only, one batch at a time.
    x_scaler = StandardScaler(in_dim).to(device)
    y_scaler = StandardScaler(out_dim).to(device)
    for x, y, _, _, _ in train_dl:
        x_scaler.partial_fit(x.to(device))
        y_scaler.partial_fit(y.to(device))

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    # Per-epoch histories (mean over the batches of each epoch).
    train_losses = []
    val_losses = []
    train_MSE = []
    val_MSE = []
    train_MAPE = []
    val_MAPE = []
    train_R2 = []
    val_R2 = []
    train_CCC = []
    val_CCC = []

    for epoch in range(N_EPOCHS):
        # ---- Training pass ----
        model.train()
        train_loss_epoch = []
        train_MSE_epoch = []
        train_MAPE_epoch = []
        train_R2_epoch = []
        train_CCC_epoch = []

        for batch in train_dl:
            x, y, _, _, _ = batch
            x, y = x.to(device), y.to(device)

            # The loss is computed in the standardised space.
            x = x_scaler.transform(x)
            y = y_scaler.transform(y)

            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            # Metrics are reported in the original (unscaled) target space.
            y_unscaled = y_scaler.inverse_transform(y).cpu().detach()
            y_pred_unscaled = y_scaler.inverse_transform(y_pred).cpu().detach()

            train_loss_epoch.append(loss.item())
            train_MSE_epoch.append(MSE(y_pred_unscaled, y_unscaled).item())
            train_MAPE_epoch.append(MAPE(y_pred_unscaled, y_unscaled).item())
            train_CCC_epoch.append(CCC(y_pred_unscaled, y_unscaled).item())
            train_R2_epoch.append(R2(y_pred_unscaled, y_unscaled).item())

        # ---- Validation pass ----
        model.eval()
        val_loss_epoch = []
        val_MSE_epoch = []
        val_MAPE_epoch = []
        val_R2_epoch = []
        val_CCC_epoch = []
        # No parameter update happens here, so autograd bookkeeping is disabled
        # to avoid building a graph for every validation batch.
        with torch.no_grad():
            for batch in val_dl:
                x, y, _, _, _ = batch
                x, y = x.to(device), y.to(device)

                x = x_scaler.transform(x)
                y = y_scaler.transform(y)

                y_pred = model(x)
                loss = criterion(y_pred, y)

                y_unscaled = y_scaler.inverse_transform(y).cpu().detach()
                y_pred_unscaled = y_scaler.inverse_transform(y_pred).cpu().detach()

                val_loss_epoch.append(loss.item())
                val_MSE_epoch.append(MSE(y_pred_unscaled, y_unscaled).item())
                val_MAPE_epoch.append(MAPE(y_pred_unscaled, y_unscaled).item())
                val_CCC_epoch.append(CCC(y_pred_unscaled, y_unscaled).item())
                val_R2_epoch.append(R2(y_pred_unscaled, y_unscaled).item())

        mean_train_loss = np.mean(train_loss_epoch)
        mean_train_MSE = np.mean(train_MSE_epoch)
        mean_train_MAPE = np.mean(train_MAPE_epoch)
        mean_train_R2 = np.mean(train_R2_epoch)
        mean_train_CCC = np.mean(train_CCC_epoch)

        mean_val_loss = np.mean(val_loss_epoch)
        mean_val_MSE = np.mean(val_MSE_epoch)
        mean_val_MAPE = np.mean(val_MAPE_epoch)
        mean_val_R2 = np.mean(val_R2_epoch)
        mean_val_CCC = np.mean(val_CCC_epoch)

        # Logging
        mlflow.log_metrics({
            "train_loss": mean_train_loss,
            "train_mse": mean_train_MSE,
            "train_mape": mean_train_MAPE,
            "train_r2": mean_train_R2,
            "train_ccc": mean_train_CCC,

            "val_loss": mean_val_loss,
            "val_mse": mean_val_MSE,
            "val_mape": mean_val_MAPE,
            "val_r2": mean_val_R2,
            "val_ccc": mean_val_CCC,
        }, step=epoch)

        train_losses.append(mean_train_loss)
        val_losses.append(mean_val_loss)
        train_MSE.append(mean_train_MSE)
        val_MSE.append(mean_val_MSE)
        train_MAPE.append(mean_train_MAPE)
        val_MAPE.append(mean_val_MAPE)
        train_R2.append(mean_train_R2)
        val_R2.append(mean_val_R2)
        train_CCC.append(mean_train_CCC)
        val_CCC.append(mean_val_CCC)

        # Console progress report every 100 epochs.
        if (epoch + 1) % 100 == 0:
            print(f"[Epoch] {epoch + 1:{len(str(N_EPOCHS))}d}/{N_EPOCHS:d}", end='  ')
            print(f"TRAIN", end='   ')
            print(f"Loss: {mean_train_loss:1.4f}", end='   ')
            print(f"MSE: {mean_train_MSE:1.4e}", end='   ')
            print(f"MAPE: {mean_train_MAPE:1.4f}", end='   ')
            print(f"R2: {mean_train_R2: 1.4f}", end='   ')
            print(f"CCC: {mean_train_CCC: 1.4f}", end='')
            print(" ## ", end='')
            print(f"VALIDATION", end='   ')
            print(f"Loss: {mean_val_loss:1.4f}", end='   ')
            print(f"MSE: {mean_val_MSE:1.4e}", end='   ')
            print(f"MAPE: {mean_val_MAPE:1.4f}", end='   ')
            print(f"R2: {mean_val_R2: 1.4f}", end='   ')
            print(f"CCC: {mean_val_CCC: 1.4f}")

    # `x` and `y_pred` below are left over from the last validation batch of the
    # final epoch (`x` is already standardised); they only serve as examples
    # from which MLflow infers the input/output signatures.
    signature = mlflow.models.infer_signature(x.cpu().detach().numpy(), model(x).cpu().detach().numpy())

    # Log all models
    mlflow.pytorch.log_model(
        pytorch_model=model,
        input_example=x.cpu().detach().numpy(),
        artifact_path='model',
        signature=signature,
    )

    signature = mlflow.models.infer_signature(x.cpu().detach().numpy(), x_scaler.transform(x).cpu().detach().numpy())
    mlflow.pytorch.log_model(
        pytorch_model=x_scaler,
        artifact_path='x_scaler',
        signature=signature,
    )

    signature = mlflow.models.infer_signature(y_pred.cpu().detach().numpy(),
                                              y_scaler.transform(y_pred).cpu().detach().numpy())
    mlflow.pytorch.log_model(
        pytorch_model=y_scaler,
        artifact_path='y_scaler',
        signature=signature,
    )

# 5. Prediction on real data
As *real* data, we will use data perturbed by a shared multiplicative noise:
$$\varepsilon \sim \mathcal N \left( \mu = 1, \sigma = 0.0025 \right)$$

With this choice, $\hat x = x \cdot \varepsilon$ has a 95% chance of lying within ±0.5% of the true value ($2\sigma = 0.005$), which is the same order of magnitude as the error observed with HBM sensors.

This noise will be applied to a set of data from which a subset will be extracted for fine-tuning. The noisy features are:
- Displacements
- Bar strains
- Bar forces

In [3]:
_ds = TenBarsCantileverTrussSingleEADataset("./data/dataset/real_cantilever/data.hdf5")

# Sample 750 is kept apart as the single reference structure; samples 0..749
# form the working set.
ref = _ds[750]
ds = Subset(_ds, np.arange(750))

# Every 5th working sample is a fine-tuning candidate; all remaining working
# samples are used for validation. The validation indices are taken from
# 0..749 only: the previous upper bound of 751 also placed sample 750 (the
# reference structure) in the validation set.
train_ds = Subset(_ds, np.arange(0, 750, 5))
val_ds = Subset(_ds, np.setdiff1d(np.arange(750), np.arange(0, 750, 5)))

In [4]:
class Model(nn.Module):
    """End-to-end predictor chaining an input scaler, a network and an output scaler.

    `forward` applies `x_scaler.transform`, then `model`, then
    `y_scaler.inverse_transform`, and returns the result of that last call.
    """

    def __init__(self, x_scaler, model, y_scaler):
        """Store the three stages as attributes, in pipeline order."""
        super().__init__()
        self.x_scaler, self.model, self.y_scaler = x_scaler, model, y_scaler

    def forward(self, x):
        """Return the prediction for `x` after scaling, inference and un-scaling."""
        scaled_input = self.x_scaler.transform(x)
        scaled_output = self.model(scaled_input)
        return self.y_scaler.inverse_transform(scaled_output)

In [5]:
# Regression metrics from torchmetrics, each moved to the active device.
MSE = R.MeanSquaredError().to(device)
MAPE = R.MeanAbsolutePercentageError().to(device)
R2 = R.R2Score(multioutput='uniform_average').to(device)
CCC = R.ConcordanceCorrCoef().to(device)

## a. Non fine-tuned prediction
Scores using the foundation model for prediction

In [6]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")

# MLflow run whose artifacts are loaded below: the network and its two scalers.
artifact_dir = 'runs:/32ba2bcaf225416e82c55ae552dbb628'

# Assemble the end-to-end predictor; the artifacts are fetched in the order
# x_scaler, model, y_scaler.
model = Model(
    x_scaler=mlflow.pytorch.load_model(f"{artifact_dir}/x_scaler"),
    model=mlflow.pytorch.load_model(f"{artifact_dir}/model"),
    y_scaler=mlflow.pytorch.load_model(f"{artifact_dir}/y_scaler"),
)

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
# Prediction of the non-fine-tuned model for the features (ref[0]) of the reference sample.
y_pred_init = model(ref[0].to(device))

In [8]:
# MAPE between that prediction (reshaped to a 1-element tensor) and the reference target ref[1].
error_init = MAPE(y_pred_init.reshape(1).to(device), ref[1].to(device))

In [9]:
# Report prediction and target scaled by 1e-6 (displayed as MN), and the MAPE as a percentage.
print(
    f"Prediction: {y_pred_init.item() * 1e-6:.3f} MN",
    f"Expected: {ref[1].to(device).item() * 1e-6:.3f} MN",
    f"MAPE: {error_init * 100:.3f}%",
    sep="\n",
)

Prediction: 2517.327 MN
Expected: 300.000 MN
MAPE: 739.109%


## b. Fine-tuned model
We will fine-tune the foundation model using the subset of real data as input.


### I. Experiment on the size of fine-tuning set
These experiments will help us define how many real examples are needed for *sufficient* fine-tuning.

##### Without PINN

In [11]:
def finetune_no_pinn(train_ds, val_ds, train_size, n_epoch=100, verbose=False):
    """Fine-tune the logged foundation model on `train_ds` and track the run in MLflow.

    The network and both scalers are reloaded from the MLflow run referenced by
    `artifact_dir`, so every call starts from the same foundation weights. Only
    the network parameters are handed to the optimizer. For each epoch the
    function logs the mean training/validation loss (MSE in the standardised
    space), the mean training/validation MAPE (in the original target space) and
    the MAPE obtained on the reference sample.

    Reads the notebook-level names `device` and `ref`.

    Args:
        train_ds: Dataset used for fine-tuning; samples unpack as (x, y, _, _, _).
        val_ds: Dataset used for validation, evaluated as a single batch.
        train_size: Number of fine-tuning samples; logged as a run parameter and
            used to cap the batch size at ``min(8, train_size)``.
        n_epoch: Number of fine-tuning epochs.
        verbose: When True, print a progress line every 100 epochs.

    Returns:
        None. All results are written to the MLflow experiment
        "MLP_Cantilever_finetuning".
    """
    mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
    mlflow.set_experiment("MLP_Cantilever_finetuning")

    artifact_dir = 'runs:/32ba2bcaf225416e82c55ae552dbb628'
    x_scaler = mlflow.pytorch.load_model(f"{artifact_dir}/x_scaler")
    model = mlflow.pytorch.load_model(f"{artifact_dir}/model")
    y_scaler = mlflow.pytorch.load_model(f"{artifact_dir}/y_scaler")

    MAPE = R.MeanAbsolutePercentageError().to(device)

    with mlflow.start_run():
        LR = 1e-4
        # N_NEURONS / N_LAYERS / ACTIVATION are only logged as run parameters;
        # the architecture itself comes from the loaded artifact.
        N_NEURONS = 40
        N_LAYERS = 3
        N_EPOCHS = n_epoch
        BATCH_SIZE = min(8, train_size)
        ACTIVATION = nn.Tanh

        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        n_params = sum(np.prod(p.size()) for p in model_parameters)
        mlflow.log_params({
            'n_neurons': N_NEURONS, 'n_layers': N_LAYERS, 'lr': LR,
            'activation': ACTIVATION.__name__, "capacity": n_params, "n_epochs": N_EPOCHS,
            'FEI': False, 'train_size': train_size
        })

        train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        val_dl = DataLoader(val_ds, batch_size=len(val_ds), shuffle=True,)

        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=LR)

        for epoch in range(N_EPOCHS):
            # ---- Training pass ----
            model.train()
            train_loss_epoch = []
            train_MAPE_epoch = []

            for batch in train_dl:
                x, y, _, _, _ = batch
                x, y = x.to(device), y.to(device)

                # The loss is computed in the standardised space of the foundation model.
                x = x_scaler.transform(x)
                y = y_scaler.transform(y)

                optimizer.zero_grad()
                y_pred = model(x)
                loss = criterion(y_pred, y)
                loss.backward()
                optimizer.step()

                # MAPE is reported in the original (unscaled) target space.
                y_unscaled = y_scaler.inverse_transform(y).cpu().detach()
                y_pred_unscaled = y_scaler.inverse_transform(y_pred).cpu().detach()

                train_loss_epoch.append(loss.item())
                train_MAPE_epoch.append(MAPE(y_pred_unscaled, y_unscaled).item())

            # ---- Validation pass and reference score ----
            model.eval()
            val_loss_epoch = []
            val_MAPE_epoch = []
            # Nothing is optimised below, so autograd bookkeeping is disabled.
            with torch.no_grad():
                for batch in val_dl:
                    x, y, _, _, _ = batch
                    x, y = x.to(device), y.to(device)

                    x = x_scaler.transform(x)
                    y = y_scaler.transform(y)

                    y_pred = model(x)
                    loss = criterion(y_pred, y)

                    y_unscaled = y_scaler.inverse_transform(y).cpu().detach()
                    y_pred_unscaled = y_scaler.inverse_transform(y_pred).cpu().detach()

                    val_loss_epoch.append(loss.item())
                    val_MAPE_epoch.append(MAPE(y_pred_unscaled, y_unscaled).item())

                # Reference structure score. The raw reference features must go
                # through the same forward scaling as the training inputs
                # (`transform`, not `inverse_transform`) before reaching the network.
                x, y, _, _, _ = ref
                x = x_scaler.transform(x.to(device))
                y_pred = y_scaler.inverse_transform(model(x))

                # Convert to a plain float so MLflow receives a number, not a tensor.
                reference_MAPE = MAPE(y_pred, y.to(device)).item()

            mean_train_loss = np.mean(train_loss_epoch)
            mean_train_MAPE = np.mean(train_MAPE_epoch)

            mean_val_loss = np.mean(val_loss_epoch)
            mean_val_MAPE = np.mean(val_MAPE_epoch)

            # Logging
            mlflow.log_metrics({
                "train_loss": mean_train_loss,
                "train_mape": mean_train_MAPE,

                "val_loss": mean_val_loss,
                "val_mape": mean_val_MAPE,

                "reference_MAPE": reference_MAPE,
            }, step=epoch)

            if (epoch + 1) % 100 == 0 and verbose:
                # One complete, newline-terminated line per report.
                print(f"[Epoch] {epoch + 1:{len(str(N_EPOCHS))}d}/{N_EPOCHS:d}", end='  ')
                print(f"TRAIN", end='   ')
                print(f"Loss: {mean_train_loss:1.4f}", end='   ')
                print(f"MAPE: {mean_train_MAPE:1.4f}", end='')
                print(" ## ", end='')
                print(f"VALIDATION", end='   ')
                print(f"Loss: {mean_val_loss:1.4f}", end='   ')
                print(f"MAPE: {mean_val_MAPE:1.4f}")

In [None]:
import logging

print("Start")
# Raise the MLflow logger threshold to WARNING for the duration of the sweep.
logging.getLogger("mlflow").setLevel(logging.WARNING)

# One fine-tuning run per training-set size, from a single sample up to all of
# train_ds; the samples of each subset are spread over train_ds with linspace.
n_available = len(train_ds)
for i in range(n_available):
    train_size = i + 1
    idx = np.linspace(0, n_available - 1, train_size, dtype=int)
    finetune_no_pinn(Subset(train_ds, idx), val_ds, train_size, n_epoch=100, verbose=False)

##### With PINN