In [None]:
# NOTE(review): `tests` is not referenced anywhere else in the visible notebook, and
# `imghdr` was removed from the standard library in Python 3.13 -- this looks like an
# accidental IDE auto-import; confirm and drop it.
from imghdr import tests
# Show the current working directory, then move two levels up (presumably to the
# project root, so that `./data/...` and the `dataset` / `models` imports resolve).
# NOTE(review): the relative `%cd` is not idempotent -- re-running this cell climbs
# two more levels each time.
%pwd
%cd ../..

In [None]:
# Shared seed: passed to np.random.seed and used as the base random_state of the
# KFold splitters in the hyperparameter-tuning cell.
RANDOM_STATE = 42
import numpy as np

import os
import copy
import torch
from torch import nn
from torch.utils.data import random_split, DataLoader, Subset
from torch.nn import functional as F
from sklearn.model_selection import KFold
from sklearn.metrics import (d2_absolute_error_score as D2,
                             r2_score as R2,
                             mean_absolute_percentage_error as MAPE)

from dataset import TenBarsCantileverTrussSingleEADataset

import mlflow

from models.architecture import MultiLayerPerceptron
from models.processing import StandardScaler

# Seed every RNG the notebook draws from. NumPy alone is not enough: random_split,
# DataLoader(shuffle=True) and the model weight initialisation all use torch's RNG.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Pick the best available backend: CUDA first, then Apple-silicon MPS, else CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

# 1. Load the data


In [None]:
data_path = "./data/dataset/cantilever/data.hdf5"
_ds = TenBarsCantileverTrussSingleEADataset(data_path)

# Load the first sample once; its first two entries are the model input and the
# regression target, whose lengths fix the network's input/output widths.
first_sample = _ds[0]
in_dim = len(first_sample[0])
out_dim = len(first_sample[1])

print(f"Dataset size: {len(_ds)}")
print(f"  Sample dimension: {in_dim}")
print(f"  Target dimension: {out_dim}")

# 2. Training and Validation routine

In [None]:
# Sanity check of the argument order: scikit-learn's MAPE divides the absolute
# error by |y_true|, so this evaluates to 0.1 / 1.1 ~= 0.0909 (not 0.1).
MAPE(y_true=[1.1, 1.1], y_pred=[1, 1])

In [11]:
def _run_epoch(model, loader, x_scaler, y_scaler, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the batch-averaged metrics.

    When ``optimizer`` is given, the model is put in training mode and updated
    after every batch. Otherwise the model is put in eval mode and the pass runs
    with autograd disabled, so no graph is built during validation.

    Returns:
        dict with keys ``loss`` (criterion on the scaled targets) and ``MSE``,
        ``MAPE``, ``R2``, ``D2`` (computed on the unscaled targets). Every value
        is the mean of the per-batch values.
    """
    training = optimizer is not None
    model.train(training)
    records = {"loss": [], "MSE": [], "MAPE": [], "R2": [], "D2": []}

    with torch.set_grad_enabled(training):
        for x, y, _, _, _ in loader:
            x = x_scaler.transform(x.to(device))
            y = y_scaler.transform(y.to(device))

            if training:
                optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)  # (input, target), same order in both phases
            if training:
                loss.backward()
                optimizer.step()

            # Metrics never need gradients; compute them in the original units.
            with torch.no_grad():
                y_true_raw = y_scaler.inverse_transform(y)
                y_pred_raw = y_scaler.inverse_transform(y_pred)
                batch_mse = F.mse_loss(y_pred_raw, y_true_raw).item()
            y_true_np = y_true_raw.cpu().numpy()
            y_pred_np = y_pred_raw.cpu().numpy()

            records["loss"].append(loss.item())
            records["MSE"].append(batch_mse)
            records["MAPE"].append(MAPE(y_true_np, y_pred_np))
            records["R2"].append(R2(y_true_np, y_pred_np))
            records["D2"].append(D2(y_true_np, y_pred_np))

    return {name: np.mean(values) for name, values in records.items()}


def _format_metrics(metrics):
    """Render one epoch's metrics in the fixed layout used by the training log."""
    return (f"Loss: {metrics['loss']:1.4f},  "
            f"MSE: {metrics['MSE']:1.4e},  "
            f"MAPE: {metrics['MAPE']:1.4f},  "
            f"R2: {metrics['R2']:1.4f},  "
            f"D2: {metrics['D2']:1.4f}")


def train(model, train_ds, val_ds, lr, n_epochs, batch_size, verbose=True):
    """Train ``model`` in place with Adam on an MSE loss over standardised data.

    Input and target scalers are fitted on ``train_ds`` only and then applied
    to both the training and the validation batches. After each epoch the
    batch-averaged loss, MSE, MAPE, R2 and D2 of both sets are printed when
    ``verbose`` is true.

    Relies on the notebook-level globals ``device``, ``in_dim`` and ``out_dim``.

    Args:
        model: network to optimise; it is moved to ``device``.
        train_ds: dataset whose items unpack as ``(x, y, _, _, _)``.
        val_ds: held-out dataset with the same item layout.
        lr: Adam learning rate.
        n_epochs: number of passes over ``train_ds``.
        batch_size: batch size for both loaders.
        verbose: print one summary line per epoch.
    """
    model = model.to(device)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    # No shuffling for validation: the metrics are averaged per batch, so a fixed
    # batch composition keeps them comparable from one epoch to the next.
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # Fit the scalers on the training data only, to avoid leaking validation statistics.
    x_scaler = StandardScaler(in_dim).to(device)
    y_scaler = StandardScaler(out_dim).to(device)
    for x, y, _, _, _ in train_dl:
        x_scaler.partial_fit(x.to(device))
        y_scaler.partial_fit(y.to(device))

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    epoch_width = len(str(n_epochs))
    for epoch in range(n_epochs):
        train_metrics = _run_epoch(model, train_dl, x_scaler, y_scaler, criterion, optimizer)
        val_metrics = _run_epoch(model, val_dl, x_scaler, y_scaler, criterion)

        # MLflow logging is currently disabled; to re-enable:
        # for name, value in train_metrics.items():
        #     mlflow.log_metric(f"train {name}", value, step=epoch)
        # for name, value in val_metrics.items():
        #     mlflow.log_metric(f"val {name}", value, step=epoch)

        if verbose:
            print(f"[Epoch {epoch + 1:{epoch_width}d}/{n_epochs:d}]  "
                  f"TRAIN   {_format_metrics(train_metrics)}"
                  f"  ||  "
                  f"VALIDATION   {_format_metrics(val_metrics)}")

In [None]:
# Seed the split explicitly so that re-running this cell yields the same 80/20 partition.
split_rng = torch.Generator().manual_seed(RANDOM_STATE)
ds_1, ds_2 = random_split(_ds, (.8, .2), generator=split_rng)

# NOTE(review): the two size arguments are passed here as (40, 3), whereas the
# tuning cell passes (n_layers, n_neurons) in the same positions -- confirm against
# MultiLayerPerceptron's signature which order is intended.
train(model=MultiLayerPerceptron(in_dim, out_dim, 40, 3, nn.ReLU),
      train_ds=ds_1,
      val_ds=ds_2,
      lr=4e-4,
      n_epochs=100,
      batch_size=2048,
      verbose=True)

 # 2. Hyperparameter tuning
Using the 10-bar cantilever dataset, we want to predict the EA of the bars, which is assumed to be a single value common to all bars.
The model is an MLP; the hyperparameters to tune are:
- Activation function
- Learning rate
- Number of layers
- Number of neurons per layer

In [None]:
# Model capacity tuning
n_neurons_values = [10, 15, 20, 25, 30]
n_layers_values = [2, 3, 4, 5]

outer_cv = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
outer_configs = []
outer_scores = []
for outer_fold_id, (train_idx, test_idx) in enumerate(outer_cv.split(_ds)):
    train_ds, test_ds = Subset(_ds, train_idx), Subset(_ds, test_idx)

    inner_cv = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE + outer_fold_id)
    inner_scores = {}
    for n_neurons in n_neurons_values:
        for n_layers in n_layers_values:
            inner_scores.setdefault((n_layers, n_neurons), [])
            for inner_fold_id, (train_idx, val_idx) in inner_cv.split(train_ds):
                train_ds, val_ds = Subset(train_ds, train_idx), Subset(train_ds, val_idx)

                model = MultiLayerPerceptron(in_dim, out_dim,
                                             n_layers, n_neurons,
                                             nn.ReLU)

                train(model, train_ds, val_ds)

                inner_scores[(n_layers, n_neurons)].append(score)

    best_score = np.inf
    best_config = None
    for config, scores in inner_scores.items():
        if scores < best_score:
            best_score = scores
            best_config = config

    model = MultiLayerPerceptron(in_dim, out_dim,
                                 best_config[0], best_config[1],
                                 nn.ReLU)

    train(model, train_ds, test_ds)
    score = validate(model, test_ds)

    outer_configs.append(best_config)
    outer_scores.append(score)

# 3. Training the foundation model
We will train our model with the whole dataset to create a foundation model that will have learnt all the specifics of the problem.

# 4. Prediction on real data
As *real* data, we will use data corrupted by a shared multiplicative noise:
$$\varepsilon \sim \mathcal N \left( \mu = 1, \sigma = 0.0025 \right)$$

With this choice, $\hat x = x \cdot \varepsilon$ has a 95% chance of being within $\pm 0.5\%$ of the true value (i.e. within $2\sigma$), which is the same order of magnitude as the error observed with HBM sensors.

This noise will be applied to a set of data from which a subset will be extracted for fine-tuning.

## a. Non fine-tuned prediction
Scores using the foundation model for prediction

## b. Fine-tuned model
We will fine-tune the foundation model using the subset of real data as input.


### I. Experiment on the size of fine-tuning set
These experiments will help us determine how many real examples are needed for *sufficient* fine-tuning.

##### Without PINN

##### With PINN