# Damage Size Regression on Synthetic Data - Baseline
In this notebook, we conduct experiments whose results can be compared to a baseline. We repeat the experiment several times to verify that the previously established cross-validation setup also gives reliable results with this approach.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berni-lehner/structural_health_monitoring/blob/main/notebooks/synthetic_regression_AA_BASE.ipynb)

In [1]:
# Notebook-wide setup: rich output formatting, automatic module reloading
# and inline matplotlib figures.
%load_ext rich
%load_ext autoreload
# mode 2: reload imported modules before each cell runs, so edits to the
# helper modules are picked up without restarting the kernel
%autoreload 2
%matplotlib inline

# NOTE: from here on `print` is rich's print, shadowing the built-in one
from rich import print

# seed passed to the stochastic steps below (resampling, experiment splits)
random_state = 42

## Prerequisites

In [2]:
import sys
import os
from pathlib import Path
import pandas as pd

# initialize environment, paths, etc...
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # uncomment in case you want to start from scratch in Colab
    ! rm -rf /content/structural_health_monitoring  
    ! git clone https://github.com/berni-lehner/structural_health_monitoring.git
    % pip install -r /content/structural_health_monitoring/requirements.txt

    # this is Colab specific, and only a manual "Run all" will cause the
    # installation to be finished
    if pd.__version__ != "1.4.2":
        print("Stopping RUNTIME! Please run again to finish installation.")
        os.kill(os.getpid(), 9)

    DATA_PATH = Path(r"/content/structural_health_monitoring/data")
    sys.path.insert(0, "/content/structural_health_monitoring/src")
else:
    DATA_PATH = Path(r"../data/")
    sys.path.insert(0, "../src")


sys.path.insert(0, str(DATA_PATH))
SYN_DATA_PATH = Path(DATA_PATH, "synthetic")
REAL_DATA_PATH = Path(DATA_PATH, "real_world")
RESULT_PATH = Path(DATA_PATH, "results")
if not os.path.isdir(RESULT_PATH):
    os.makedirs(RESULT_PATH)

try:
    from zippeekiyay import namelist
except ImportError or ModuleNotFoundError:
    print("installing zippee-ki-yay...")
    % pip install git+https://github.com/berni-lehner/zippee-ki-yay.git

    from zippeekiyay import namelist

CALIBRATION_FILE = Path(REAL_DATA_PATH, "calibration.pkl")

In [3]:
# Intel(R) Extension for Scikit-learn for speed-up
# NOTE(review): the patch presumably has to be applied before the
# scikit-learn imports in the later cells to take effect — keep this cell
# ahead of them (TODO confirm against the sklearnex documentation).
from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
import time
import glob
import sklearn
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets

# ...and download raw data if not already downloaded
from data_utils import init_data

init_data(syn_data_path=SYN_DATA_PATH)

# plotting helpers (each name listed once; `init_plotting` used to be
# imported twice in this list)
from plot_utils import (
    init_plotting,
    plot_metrics,
    plot_cv_indices,
    plot_embedding_targets,
    plot_classwise_dist,
    plot_classwise_kde,
)

# configure fonts, etc...
init_plotting()

from scoring_utils import get_synth_regression_scoring, SHM_Scoring
from cv_utils import (
    AnomalyShuffleSplit,
    RepeatedAnomalyShuffleSplit,
    CreateAnomalyData,
    dump_cv,
)
from bench_utils import classification_benchmark, repeat_experiment, extract_metrics
from data_utils import load_syn_reg_data, FEATURE_LIST
from shm_experiments import conduct_aa_reg_experiment

time passed: 6.75 s
downloading synthetic data successful: True


In [5]:
# Pickle file inside RESULT_PATH that receives the experiment results.
result_file = Path(RESULT_PATH) / "synthetic_regression_AA_hopfield.pkl"

## Data Loading

In [6]:
# Load the synthetic regression data set: X holds the features, y the
# "y_radius" target column.
# NOTE(review): `min_radius` presumably filters out small damage radii and
# `cache` presumably reuses a previously stored copy — confirm in data_utils.
X, y = load_syn_reg_data(
    data_path=SYN_DATA_PATH,
    target_col=["y_radius"],
    min_radius=1.9,
    cache=True,
)

In [7]:
# sanity check: dimensions of the loaded feature matrix
X.shape

[1m([0m[1;36m28302[0m, [1;36m31[0m[1m)[0m

In [8]:
# number of samples per distinct target value (class balance check)
cnt = Counter(y)
cnt


[1;35mCounter[0m[1m([0m[1m{[0m
    [1;36m0.0[0m: [1;36m990[0m,
    [1;36m2.2[0m: [1;36m979[0m,
    [1;36m2.5[0m: [1;36m979[0m,
    [1;36m2.8[0m: [1;36m978[0m,
    [1;36m3.1[0m: [1;36m986[0m,
    [1;36m3.4[0m: [1;36m972[0m,
    [1;36m3.8[0m: [1;36m973[0m,
    [1;36m4.1[0m: [1;36m976[0m,
    [1;36m4.4[0m: [1;36m979[0m,
    [1;36m4.7[0m: [1;36m976[0m,
    [1;36m5.0[0m: [1;36m977[0m,
    [1;36m6.7[0m: [1;36m981[0m,
    [1;36m8.3[0m: [1;36m972[0m,
    [1;36m10.0[0m: [1;36m973[0m,
    [1;36m12.0[0m: [1;36m969[0m,
    [1;36m14.0[0m: [1;36m978[0m,
    [1;36m16.0[0m: [1;36m980[0m,
    [1;36m18.0[0m: [1;36m976[0m,
    [1;36m20.0[0m: [1;36m974[0m,
    [1;36m22.0[0m: [1;36m975[0m,
    [1;36m24.0[0m: [1;36m980[0m,
    [1;36m26.0[0m: [1;36m973[0m,
    [1;36m28.0[0m: [1;36m983[0m,
    [1;36m30.0[0m: [1;36m977[0m,
    [1;36m32.0[0m: [1;36m967[0m,
    [1;36m34.0[0m: [1;36m971[0m,
    [1;36m36

## A-A Experiments

In [9]:
from imblearn.under_sampling import ClusterCentroids
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder

# Undersample the data set to a fixed number of cluster centroids per
# distinct target value.
samples_per_class = 40
classes, counts = np.unique(y, return_counts=True)
n_classes = len(classes)
# the sampler is fitted on the ordinal codes 0..n_classes-1 produced by the
# encoder below, so those codes are the keys of the sampling strategy
strgy = {class_idx: samples_per_class for class_idx in range(n_classes)}

sampler = ClusterCentroids(
    # seed the clustering explicitly so the resampling is reproducible even
    # if the sampler does not forward its own random_state to a
    # user-supplied estimator
    estimator=KMeans(n_init="auto", random_state=random_state),
    sampling_strategy=strgy,
    random_state=random_state,
)

# the continuous radius values are mapped to integer class codes for
# resampling and mapped back to radii afterwards
enc = OrdinalEncoder(dtype=np.int32)

y_cls = enc.fit_transform(y.reshape(-1, 1))
X_sampled, y_recls = sampler.fit_resample(X, y_cls)
# kept as a column of shape (n_samples, 1): it is used as-is as the
# projection pattern of the Hopfield layer further down
y_sampled = enc.inverse_transform(y_recls.reshape(-1, 1))


X_sampled.shape, y_sampled.shape

[1m([0m[1m([0m[1;36m1160[0m, [1;36m31[0m[1m)[0m, [1m([0m[1;36m1160[0m, [1;36m1[0m[1m)[0m[1m)[0m

In [12]:
import torch
from imblearn.pipeline import Pipeline
from skorch import NeuralNetRegressor
from hflayers import Hopfield, HopfieldLayer

In [30]:
beta = 1.0

# do not project layer input
static_pattern_flags = dict(
    state_pattern_as_static=True,
    stored_pattern_as_static=True,
    pattern_projection_as_static=True,
)

# do not pre-process layer input
normalization_flags = dict(
    normalize_stored_pattern=False,
    normalize_stored_pattern_affine=False,
    normalize_state_pattern=False,
    normalize_state_pattern_affine=False,
    normalize_pattern_projection=False,
    normalize_pattern_projection_affine=False,
)

# Hopfield layer that neither projects, normalizes nor post-processes its
# inputs; `beta` is passed as the scaling factor.
hopfield = Hopfield(
    scaling=beta,
    **static_pattern_flags,
    **normalization_flags,
    # do not post-process layer output
    disable_out_projection=True,
)

# Wrap the Hopfield module as a scikit-learn compatible regressor.
# NOTE(review): the layer is configured with static patterns and without an
# output projection, so it presumably has no trainable parameters — confirm
# that skorch can actually fit this module before relying on the pipeline.
estimator = NeuralNetRegressor(hopfield)

pipeline = Pipeline(
    [
        # Every step must be a (name, transformer) tuple; the bare string
        # ("to_tensor") that used to be here made the step list malformed.
        # "passthrough" keeps the named slot as a no-op.
        # TODO: replace with the actual array-to-tensor conversion.
        ("to_tensor", "passthrough"),
        ("estimator", estimator),
    ]
)

# The layer is called with a tuple (stored_pattern, state_pattern,
# pattern_projection): the undersampled centroids are the stored patterns,
# the full data set provides the state patterns (queries), and the centroid
# radii are the values that get retrieved. A leading batch axis of size 1 is
# added to each array.
# All three tensors are cast to float32 so their dtypes agree (previously
# only the projection was cast).
stored_pattern = torch.from_numpy(X_sampled[np.newaxis]).float()
state_pattern = torch.from_numpy(X[np.newaxis]).float()
pattern_projection = torch.from_numpy(y_sampled[np.newaxis]).float()

print(stored_pattern.shape, state_pattern.shape, pattern_projection.shape)
hopfield((stored_pattern, state_pattern, pattern_projection))


[1;35mtensor[0m[1m([0m[1m[[0m[1m[[0m[1m[[0m[1;36m34.0021[0m[1m][0m,
         [1m[[0m[1;36m34.0019[0m[1m][0m,
         [1m[[0m[1;36m34.0016[0m[1m][0m,
         [33m...[0m,
         [1m[[0m[1;36m40.0000[0m[1m][0m,
         [1m[[0m[1;36m40.0000[0m[1m][0m,
         [1m[[0m[1;36m40.0000[0m[1m][0m[1m][0m[1m][0m[1m)[0m

In [14]:
# Scorers evaluated for every split of the experiment.
scoring = get_synth_regression_scoring()

# Split configuration handed to the experiment helper.
split_settings = dict(
    n_repeats=5,
    n_splits=32,
    test_size=0.1,
)

# Run the A-A regression experiment with the pipeline on the full data set.
results = conduct_aa_reg_experiment(
    X=X,
    y=y,
    estimator=pipeline,
    name="BASE",
    scoring=scoring,
    random_state=random_state,
    **split_settings,
)


All the 32 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "/home/huberc/sw/mambaforge/envs/quick/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/huberc/sw/mambaforge/envs/quick/lib/python3.9/site-packages/imblearn/pipeline.py", line 297, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/home/huberc/sw/mambaforge/envs/quick/lib/python3.9/site-packages/skorch/regressor.py", line 82, in fit
    return super(NeuralNetRegressor, self).fit(X, y, **fit_params)
  File "/home/huberc/sw/mambaforge/envs/quick/lib/python3.9/site-packages/skorch/net.py", line 1319, in fit
   

## Store results for later

In [15]:
# persist the experiment results at `result_file` so they can be reloaded
# later without re-running the experiment
results.to_pickle(result_file)

## Plot some results for overview

In [16]:
# Overview plot for a small selection of the recorded test metrics.
# NOTE(review): the names presumably encode the radius range each MSE score
# is computed on — confirm in scoring_utils.
overview_metrics = ["test_mse_0.0_40.0", "test_mse_2.8_40.0", "test_mse_5.0_40.0"]
overview_results = extract_metrics(results, overview_metrics)

fig = plot_metrics(overview_results)
plt.show()

In [17]:
# Plot every metric listed in SHM_Scoring.SYNTH_MSE_RESULTS.
shm_scoring = SHM_Scoring()

synth_mse_results = extract_metrics(results, shm_scoring.SYNTH_MSE_RESULTS)
fig = plot_metrics(synth_mse_results)

# drop the legend from the current axes before showing the figure
legend = plt.gca().get_legend()
legend.remove()

plt.show()