In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from scipy.stats import pearsonr
import pyro
import pyro.distributions as dist
from pyro.nn import PyroModule, PyroSample
from pyro.infer import Predictive
from pyro.infer import MCMC, NUTS

In [2]:
# Read the creep dataset from the CSV next to the notebook and display it.
dataset_path = 'Ni_superalloys_dataset.csv'
creep_df = pd.read_csv(dataset_path)
creep_df

Unnamed: 0,Ni,Al,Co,Cr,Mo,Re,Ru,Ta,W,Ti,Nb,T,log_stress,log_creep_life
0,62.80,5.6,9.0,6.5,0.6,3.0,0.0,6.5,6.0,0.0,0.0,950,2.267172,3.276554
1,59.30,5.8,5.8,2.9,3.9,4.9,6.0,5.6,5.8,0.0,0.0,1100,2.136721,3.026370
2,59.80,5.6,5.6,4.6,2.4,6.4,5.0,5.6,5.0,0.0,0.0,1000,2.389166,3.009026
3,59.30,5.8,5.8,2.9,3.9,4.9,6.0,5.6,5.8,0.0,0.0,1000,2.389166,2.969556
4,61.68,6.0,9.0,3.5,1.5,4.0,0.0,8.0,6.0,0.2,0.0,1100,2.079181,2.957607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1100,2.322219,1.155336
149,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1070,2.447158,1.089905
150,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1100,2.352183,0.991226
151,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1100,2.342423,0.968483


In [3]:
class BNN(PyroModule):
    """Fully connected Bayesian neural network with a Gaussian likelihood.

    The network is a stack of linear layers with ``tanh`` activations. The
    weight and bias of every layer are declared as ``PyroSample`` sites with
    zero-mean Normal priors, so inference is over all network parameters plus
    the noise site ``"sigma"``.

    Args:
        in_dim: number of input features.
        out_dim: number of network outputs.
        hid_dim: width of every hidden layer.
        n_hid_layers: number of hidden layers.
        prior_scale: standard deviation of the bias priors; weight priors use
            ``prior_scale * sqrt(2 / fan_in)`` where ``fan_in`` is the layer's
            input size.
    """

    def __init__(self, in_dim=13, out_dim=1, hid_dim=10, n_hid_layers=5, prior_scale=5.):
        super().__init__()

        assert in_dim > 0 and out_dim > 0 and hid_dim > 0 and n_hid_layers > 0  # make sure the dimensions are valid

        self.activation = nn.Tanh()  # could also be ReLU or LeakyReLU

        # Sizes of consecutive layers: input, n_hid_layers hidden layers, output
        self.layer_sizes = [in_dim] + n_hid_layers * [hid_dim] + [out_dim]
        layer_list = [
            PyroModule[nn.Linear](n_in, n_out)
            for n_in, n_out in zip(self.layer_sizes[:-1], self.layer_sizes[1:])
        ]
        self.layers = PyroModule[torch.nn.ModuleList](layer_list)

        # Replace the deterministic parameters of every layer by prior distributions
        for layer_idx, layer in enumerate(self.layers):
            n_in = self.layer_sizes[layer_idx]
            n_out = self.layer_sizes[layer_idx + 1]
            layer.weight = PyroSample(
                dist.Normal(0., prior_scale * np.sqrt(2 / n_in)).expand([n_out, n_in]).to_event(2))
            layer.bias = PyroSample(dist.Normal(0., prior_scale).expand([n_out]).to_event(1))

    def forward(self, x, y=None):
        """Run the network and register the likelihood site ``"obs"``.

        Args:
            x: input tensor whose first axis indexes samples and whose last
                axis holds the ``in_dim`` features.
            y: optional observed targets; when given, ``"obs"`` is conditioned
                on them, otherwise it is sampled.

        Returns:
            The network output ``mu`` with its trailing axis squeezed when that
            axis has size one.
        """
        x = self.activation(self.layers[0](x))  # input --> hidden
        for layer in self.layers[1:-1]:
            x = self.activation(layer(x))  # hidden --> hidden

        # Squeeze only the trailing output axis: a bare squeeze() would also drop
        # the batch axis when a single sample is passed in.
        mu = self.layers[-1](x).squeeze(-1)  # hidden --> output
        sigma = pyro.sample("sigma", dist.Gamma(.5, 1))  # infer the response noise

        with pyro.plate("data", x.shape[0]):
            # NOTE: the scale handed to Normal is sigma * sigma, not sigma itself
            pyro.sample("obs", dist.Normal(mu, sigma * sigma), obs=y)
        return mu

In [4]:
# Seed shared by the data splits, and the fraction of samples held out for testing.
rm_state = 123
test_size = 0.2

# Select features and target by column name instead of by position. Positional
# slices silently shift by one if the CSV carries an extra leading index column
# (pandas names such a column "Unnamed: 0"), which would turn the row index into
# a feature and log_stress into the target.
feature_cols = ['Ni', 'Al', 'Co', 'Cr', 'Mo', 'Re', 'Ru', 'Ta', 'W', 'Ti', 'Nb', 'T', 'log_stress']
target_col = 'log_creep_life'

X, X_test, y, y_test = train_test_split(
    creep_df[feature_cols].to_numpy(),
    creep_df[target_col].to_numpy(),
    shuffle=True,
    test_size=test_size,
    random_state=rm_state,
)

# Fit the min-max scaling on the non-test samples only, then apply it to the test set.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [5]:
# Row positions into X / y, split alongside the data so that the initial
# labelled set (idx_train) and the remaining pool (idx_pool) can be tracked.
idx = np.arange(len(y))

train_ratio = 0.1
X_train, _, y_train, _, idx_train, idx_pool = train_test_split(
    X,
    y,
    idx,
    train_size=train_ratio,
    shuffle=True,
    random_state=rm_state,
)

In [6]:
# Per-iteration test metrics for the variance-based acquisition strategy.
pcc_variance, r2_variance, rmse_variance, mae_variance = [], [], [], []
num_training_data = []

# Containers for the random-acquisition baseline.
pcc_random, r2_random = [], []
num_iteration = []

# Both strategies start from the same initial labelled set and the same pool.
X_train_var = X_train_ran = X_train
y_train_var = y_train_ran = y_train
idx_pool_var = idx_pool_ran = idx_pool
idx_train_var = idx_train_ran = idx_train

In [7]:
n_iter = 14
n_query = 8  # number of pool points moved into the training set per iteration
test_list = []

# Seed the random number generators so the MCMC runs below are repeatable.
pyro.set_seed(rm_state)

for i in range(n_iter):
    print(f"Performing iteration : {i}")

    if i != 0:
        # Take the pool points with the highest predictive standard deviation,
        # as estimated by the model fitted in the previous iteration.
        n_take = min(n_query, len(idx_pool_var))
        if n_take == 0:
            print("Pool exhausted; stopping early")
            break
        q_points_var = np.argpartition(y_pred_unc_pool_var, -n_take)[-n_take:]
        # indices of those points in idx_pool
        idx_pool_train_var = idx_pool_var[q_points_var]

        idx_train_var = np.append(idx_train_var, idx_pool_train_var)
        idx_pool_var = np.delete(idx_pool_var, q_points_var)

        X_train_var = X[idx_train_var]
        y_train_var = y[idx_train_var]

    print(f"Number of training data with variance: {len(idx_train_var)}")
    print(f"Number of pooling data with variance: {len(idx_pool_var)}")

    num_training_data.append(len(idx_train_var))

    # Fit a fresh BNN on the current labelled set with NUTS.
    model = BNN(hid_dim=10, n_hid_layers=3, prior_scale=1)
    nuts_kernel = NUTS(model, jit_compile=False)
    mcmc = MCMC(nuts_kernel, num_samples=100)
    mcmc.run(torch.Tensor(X_train_var), torch.Tensor(y_train_var))
    predictive_var = Predictive(model=model, posterior_samples=mcmc.get_samples())
    preds_var = predictive_var(torch.Tensor(X_test))

    # mean and standard deviation of the test dataset (over posterior samples)
    obs_test_var = preds_var['obs'].T.detach().numpy()
    y_pred_test_var = obs_test_var.mean(axis=1)
    y_std_test_var = obs_test_var.std(axis=1)

    # Predictive uncertainty on the remaining pool drives the next acquisition.
    if len(idx_pool_var) > 0:
        preds_pool_var = predictive_var(torch.Tensor(X[idx_pool_var]))
        y_pred_unc_pool_var = preds_pool_var['obs'].T.detach().numpy().std(axis=1)
    else:
        y_pred_unc_pool_var = np.empty(0)

    # Compute every metric once, then both report and record it.
    pcc_test = pearsonr(np.squeeze(y_test), np.squeeze(y_pred_test_var))[0]
    r2_test = r2_score(np.squeeze(y_test), np.squeeze(y_pred_test_var))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_var))
    mae_test = np.mean(abs(y_test - y_pred_test_var))

    print('PCC_test variance', pcc_test)
    print('R2_test variance', r2_test)
    print('RMSE', rmse_test)
    print('MAE', mae_test)

    pcc_variance.append(pcc_test)
    r2_variance.append(r2_test)
    rmse_variance.append(rmse_test)
    mae_variance.append(mae_test)

Performing iteration : 0
Number of training data with variance: 12
Number of pooling data with variance: 110


Sample: 100%|██████████| 200/200 [01:05,  3.04it/s, step size=2.10e-02, acc. prob=0.928]


PCC_test variance 0.15087347084500985
R2_test variance -0.6523343664829939
RMSE 0.449117333535391
MAE 0.320401441339328
Performing iteration : 1
Number of training data with variance: 20
Number of pooling data with variance: 102


Sample: 100%|██████████| 200/200 [03:38,  1.09s/it, step size=3.98e-03, acc. prob=0.871]


PCC_test variance 0.7269042843332076
R2_test variance 0.36285350545149175
RMSE 0.27888843035326594
MAE 0.23471373697908796
Performing iteration : 2
Number of training data with variance: 28
Number of pooling data with variance: 94


Sample: 100%|██████████| 200/200 [04:21,  1.31s/it, step size=4.76e-03, acc. prob=0.945]


PCC_test variance 0.8399681560106896
R2_test variance 0.6777273899614484
RMSE 0.198345589228227
MAE 0.16356079853184052
Performing iteration : 3
Number of training data with variance: 36
Number of pooling data with variance: 86


Sample: 100%|██████████| 200/200 [04:16,  1.28s/it, step size=4.05e-03, acc. prob=0.967]


PCC_test variance 0.8763395930574672
R2_test variance 0.7155948445897347
RMSE 0.18632862591502772
MAE 0.1363743350864311
Performing iteration : 4
Number of training data with variance: 44
Number of pooling data with variance: 78


Sample: 100%|██████████| 200/200 [04:21,  1.31s/it, step size=3.12e-03, acc. prob=0.957]


PCC_test variance 0.8984341760956256
R2_test variance 0.774817751958591
RMSE 0.1657974795797325
MAE 0.13213508726598644
Performing iteration : 5
Number of training data with variance: 52
Number of pooling data with variance: 70


Sample: 100%|██████████| 200/200 [04:19,  1.30s/it, step size=3.66e-03, acc. prob=0.958]


PCC_test variance 0.8969721937777837
R2_test variance 0.7885231128672816
RMSE 0.16067277833817833
MAE 0.12648269168954449
Performing iteration : 6
Number of training data with variance: 60
Number of pooling data with variance: 62


Sample: 100%|██████████| 200/200 [04:35,  1.38s/it, step size=3.26e-03, acc. prob=0.947]


PCC_test variance 0.8985690659325073
R2_test variance 0.8042111623185682
RMSE 0.15459833525959396
MAE 0.1279352098624478
Performing iteration : 7
Number of training data with variance: 68
Number of pooling data with variance: 54


Sample: 100%|██████████| 200/200 [04:38,  1.39s/it, step size=3.86e-03, acc. prob=0.885]


PCC_test variance 0.8895249412818225
R2_test variance 0.791140015014119
RMSE 0.15967556817800502
MAE 0.12752473965449965
Performing iteration : 8
Number of training data with variance: 76
Number of pooling data with variance: 46


Sample: 100%|██████████| 200/200 [04:15,  1.28s/it, step size=5.92e-03, acc. prob=0.787]


PCC_test variance 0.8906754696959529
R2_test variance 0.7811368305729007
RMSE 0.16345461650372542
MAE 0.12444410901349891
Performing iteration : 9
Number of training data with variance: 84
Number of pooling data with variance: 38


Sample: 100%|██████████| 200/200 [04:34,  1.37s/it, step size=2.98e-03, acc. prob=0.974]


PCC_test variance 0.8824515150320309
R2_test variance 0.7735695234770201
RMSE 0.16625636817966952
MAE 0.13475790099490925
Performing iteration : 10
Number of training data with variance: 92
Number of pooling data with variance: 30


Sample: 100%|██████████| 200/200 [04:38,  1.39s/it, step size=3.02e-03, acc. prob=0.970]


PCC_test variance 0.8922371794388899
R2_test variance 0.791016611093634
RMSE 0.1597227329841045
MAE 0.12601140737967637
Performing iteration : 11
Number of training data with variance: 100
Number of pooling data with variance: 22


Sample: 100%|██████████| 200/200 [04:31,  1.36s/it, step size=3.31e-03, acc. prob=0.932]


PCC_test variance 0.9065974084022204
R2_test variance 0.819433340588416
RMSE 0.1484668966345859
MAE 0.11557288339749315
Performing iteration : 12
Number of training data with variance: 108
Number of pooling data with variance: 14


Sample: 100%|██████████| 200/200 [04:51,  1.46s/it, step size=2.52e-03, acc. prob=0.953]


PCC_test variance 0.9069714021178755
R2_test variance 0.8155329268968359
RMSE 0.15006184350976418
MAE 0.11262209083108378
Performing iteration : 13
Number of training data with variance: 116
Number of pooling data with variance: 6


Sample: 100%|██████████| 200/200 [04:44,  1.42s/it, step size=2.43e-03, acc. prob=0.938]

PCC_test variance 0.9133069143382854
R2_test variance 0.8273152543551953
RMSE 0.14519037822216643
MAE 0.10009469985552619





'\n    # random sampling\n    if i != 0:\n        # select 10 random data points\n        q_points_ran = np.random.choice(np.arange(len(idx_pool_ran)), size=10)\n        # indices of those points in idx_pool\n        idx_pool_train_ran = idx_pool_ran[q_points_ran]\n\n        idx_train_ran = np.append(idx_train_ran, idx_pool_train_ran)\n        idx_pool_ran = np.delete(idx_pool_ran, q_points_ran)\n        X_train_ran = X[idx_train_ran]\n        y_train_ran = y[idx_train_ran]\n\n    print(f"Number of training data with random: {len(idx_train_ran)}")\n    print(f"Number of training data with random: {len(idx_pool_ran)}")\n\n\n    model = BNN(hid_dim=10, n_hid_layers=3, prior_scale=1)\n    nuts_kernel = NUTS(model, jit_compile=False)\n    mcmc = MCMC(nuts_kernel, num_samples=50)\n    mcmc.run(torch.Tensor(X_train_ran), torch.Tensor(y_train_ran))\n    predictive_ran = Predictive(model=model, posterior_samples=mcmc.get_samples())\n    preds_ran = predictive_ran(torch.Tensor(X_test))\n    # m

In [8]:
import pickle

# Persist the learning curves of the variance-based strategy.
# NOTE: the key 'rsme' (sic, for RMSE) is kept unchanged because any code that
# reads this file may look the values up under that exact name.
with open('AL_MCMC_Ni.pkl', 'wb') as f:
    pickle.dump({'train_numbs':num_training_data, 'pcc':pcc_variance,'r2':r2_variance, 'rsme': rmse_variance, 'mae': mae_variance}, f)

# Read the file back; the context manager closes the handle afterwards.
# pickle.load can execute arbitrary code, so only load files you wrote yourself.
with open('AL_MCMC_Ni.pkl', 'rb') as pkl_file:
    test_ALGPR = pickle.load(pkl_file)