In [1]:
import pandas as pd
import pytorch_lightning as pl
import seaborn as sns
import matplotlib.pyplot as plt
from fccd.data.modules import CDDDataModule

from fccd.data.datasets import TabularDataset
from fccd.util import plot_prediction_vs_truth, find_lr, plot_prediction_vs_truth_sklearn, limit_psd

from fccd.util import collect_dataloader
%load_ext autoreload
%autoreload 2

# Analyzing the minimal encoding size
This notebook analyzes the minimal encoding size for the hidden state of an LSTM whose parameters are encoded in its hidden state.

We analyze the information content of the parameters using PCA.

### Load data

In [2]:
# Read the two processed CSV files; the first column of each file is its index.
data_125, data_1000 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../data/processed/cu_125.csv",
        "../data/processed/cu_1000.csv",
    )
)

# Stack both frames row-wise and replace the per-file indices with one
# fresh 0..n-1 index.
data = pd.concat([data_125, data_1000], ignore_index=True)

# Rescale both columns by a factor of 1000.
# NOTE(review): presumably a unit conversion — the units are not stated here; confirm.
data = data.assign(
    stress=data["stress"] * 1000,
    psd=data["psd"] * 1000,
)

In [3]:
# Limit PSD to max(stress)
# limit_psd is a helper imported from fccd.util; its result is what gets passed
# to the data module below.
# NOTE(review): the variable name suggests the result is already split in some
# way — confirm against limit_psd's implementation.
split_data = limit_psd(data)

In [4]:
# MinMaxScaler is not imported in any earlier cell, so this cell failed with a
# NameError on a fresh kernel (Restart & Run All).
# NOTE(review): assumes the scikit-learn scaler is the intended one — confirm.
from sklearn.preprocessing import MinMaxScaler

# Data module with "stress" as the prediction target, grouped by "id" and
# ordered by the "t" column. The scaler *class* (not an instance) is passed
# as `transform`, exactly as in the original call.
stress_dm = CDDDataModule(
    split_data,
    target="stress",
    psd="psd",
    group="id",
    drop_cols=["strain", "time_ns", "dislocation"],
    time="t",
    batch_size=200,
    categoricals=["material", "euler_angles"],
    num_workers=4,
    transform=MinMaxScaler,
    split_dataset=True,
)
# setup() must run before the train/val/test datasets are requested below.
stress_dm.setup()

### Create Dataset from lightning datamodule

In [5]:
# Fetch the train / validation / test datasets from the data module,
# in that order.
train_data, val_data, test_data = (
    stress_dm.train_dataset(),
    stress_dm.val_dataset(),
    stress_dm.test_dataset(),
)

In [6]:
# Convert each split into a TabularDataset, keeping the train / val / test order.
psd_train_data, psd_val_data, psd_test_data = map(
    TabularDataset.from_cdd_dataset,
    (train_data, val_data, test_data),
)

In [7]:
# Unpack features (x) and targets (y) for each split.
# Fix: the validation and test arrays were previously unpacked from
# psd_train_data as well, so all three pairs pointed at the training data.
x_train, y_train = psd_train_data.dataset
x_val, y_val = psd_val_data.dataset
x_test, y_test = psd_test_data.dataset

### Conduct PCA

In [8]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps 10 components on the training features only;
# fit() returns the fitted estimator, so the call can be chained.
pca = PCA(n_components=10).fit(x_train)

# Fraction of total variance explained by each of the first five components.
pca.explained_variance_ratio_[:5]

array([8.12669001e-01, 1.03855052e-01, 7.08747628e-02, 1.25177180e-02,
       5.69375130e-05])

We can observe that a single component describes over 80% of all variation in the data. The second covers about 10%, the third about 7%, and the fourth about 1.3%.

The next big leap in explained variance is from the 4th to the 5th component: the 5th component explains less than 0.01% (about 0.006%). We can therefore conclude that it might make sense to use a minimal layer size of 4 in an encoding model.
