In [1]:
from pathlib import Path
import pandas as pd
import sys
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, ConstantKernel as C,
                                              Matern, WhiteKernel, DotProduct)
from sklearn.metrics import pairwise_distances, mean_squared_error,r2_score, mean_absolute_percentage_error
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from pathlib import Path
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [2]:
# Read the SS316 creep dataset from the CSV that sits next to the notebook
# and show it (pandas truncates the rendering of long frames).
dataset_csv = 'SS316_PI_dataset.csv'
creep_df = pd.read_csv(dataset_csv)
creep_df

Unnamed: 0,Material,C,Si,Mn,P,S,Ni,Cr,Mo,Cu,...,B,N,Nb+Ta,Elongation,Reduction of Area,Stress,Temperature,Stacking Fault Energy,Predicted Creep Life,Log Creep Life
0,0.0,0.060,0.59,1.69,0.024,0.017,13.32,16.73,2.38,0.07,...,0.0010,0.0300,0.020,55.0,61,235,650,66.761225,1.438188,1.465383
1,1.0,0.050,0.52,1.51,0.021,0.010,13.21,16.42,2.34,0.14,...,0.0005,0.0340,0.010,68.0,78,157,700,66.602514,1.428500,1.537819
2,7.0,0.060,0.52,1.60,0.025,0.007,13.30,16.70,2.25,0.24,...,0.0008,0.0318,0.010,72.0,77,235,650,65.227932,1.471625,1.588832
3,18.0,0.012,0.56,0.81,0.024,0.004,10.67,16.39,2.11,0.27,...,0.0010,0.0810,0.001,106.0,80,177,700,58.700152,1.575937,1.591065
4,8.0,0.060,0.52,1.58,0.025,0.007,13.60,16.60,2.31,0.26,...,0.0007,0.0224,0.010,75.0,83,157,700,66.624772,1.566875,1.604226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,7.0,0.060,0.52,1.60,0.025,0.007,13.30,16.70,2.25,0.24,...,0.0008,0.0318,0.010,41.0,55,118,600,65.227932,5.178750,5.152857
613,6.0,0.070,0.61,1.65,0.025,0.007,13.60,16.60,2.33,0.26,...,0.0011,0.0250,0.010,26.0,39,108,600,66.647083,5.289000,5.184004
614,2.0,0.050,0.71,1.52,0.022,0.013,13.50,17.50,2.28,0.17,...,0.0013,0.0350,0.020,28.0,37,108,600,64.234884,5.247063,5.240086
615,6.0,0.070,0.61,1.65,0.025,0.007,13.60,16.60,2.33,0.26,...,0.0011,0.0250,0.010,25.0,23,41,700,66.647083,5.353813,5.250884


In [3]:
# Remove the two columns that are not wanted downstream.
# `creep_df` is rebound to its own output, so without errors='ignore' a second
# run of this cell would raise KeyError (the columns are already gone).
# Ignoring missing labels keeps the cell idempotent.
columns_to_drop = ['Stacking Fault Energy', 'Predicted Creep Life']
creep_df = creep_df.drop(columns=columns_to_drop, errors='ignore')
creep_df.head()

Unnamed: 0,Material,C,Si,Mn,P,S,Ni,Cr,Mo,Cu,Ti,Al,B,N,Nb+Ta,Elongation,Reduction of Area,Stress,Temperature,Log Creep Life
0,0.0,0.06,0.59,1.69,0.024,0.017,13.32,16.73,2.38,0.07,0.011,0.015,0.001,0.03,0.02,55.0,61,235,650,1.465383
1,1.0,0.05,0.52,1.51,0.021,0.01,13.21,16.42,2.34,0.14,0.011,0.018,0.0005,0.034,0.01,68.0,78,157,700,1.537819
2,7.0,0.06,0.52,1.6,0.025,0.007,13.3,16.7,2.25,0.24,0.06,0.02,0.0008,0.0318,0.01,72.0,77,235,650,1.588832
3,18.0,0.012,0.56,0.81,0.024,0.004,10.67,16.39,2.11,0.27,0.0006,0.011,0.001,0.081,0.001,106.0,80,177,700,1.591065
4,8.0,0.06,0.52,1.58,0.025,0.007,13.6,16.6,2.31,0.26,0.029,0.021,0.0007,0.0224,0.01,75.0,83,157,700,1.604226


In [4]:
# Seed shared by the random splits, and the fraction of rows held out for testing.
rm_state = 123
test_size = 0.6

# Positional selection: columns 0-18 are the inputs, column 19 is the target.
features = creep_df.iloc[:, :19].to_numpy()
target = creep_df.iloc[:, 19].to_numpy()

X, X_test, y, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    shuffle=True,
    random_state=rm_state,
)

In [5]:
# Shapes of the non-test and held-out arrays, in the order X, X_test, y, y_test.
for split_array in (X, X_test, y, y_test):
    print(split_array.shape)

(246, 19)
(371, 19)
(246,)
(371,)


In [6]:
# One integer position per row of the non-test split (0 .. len(y) - 1).
idx = np.arange(y.shape[0])

In [7]:
# Number of row positions available for the labelled set and the pool.
idx.shape[0]

246

In [8]:
# Fraction of the non-test rows that start out in the training set.
train_ratio = 0.1

# Split the row positions once, then gather the matching rows. The shuffle
# depends only on the sample count and the seed, so this selects the same rows
# as splitting X, y and idx together in a single call.
idx_train, idx_pool = train_test_split(
    idx,
    train_size=train_ratio,
    shuffle=True,
    random_state=rm_state,
)
X_train = X[idx_train]
y_train = y[idx_train]

In [9]:
# Shapes of the initial training inputs and targets.
for train_array in (X_train, y_train):
    print(train_array.shape)

(24, 19)
(24,)


In [10]:
# Sizes of the initial training index set and of the remaining pool.
for index_array in (idx_train, idx_pool):
    print(index_array.shape)

(24,)
(222,)


In [11]:
# Active-learning experiment: in every round after the first, move the pool
# points with the largest predictive standard deviation into the training set,
# refit the Gaussian process and score it on the fixed test set.
n_iter = 20      # number of fit/evaluate rounds
batch_size = 10  # pool points moved into the training set per round

# Test-set metrics, one entry per round, for the variance-based strategy.
pcc_variance = []
r2_variance = []
rmse_variance = []
mae_variance = []

# Presumably meant for a random-sampling baseline; nothing in this cell fills them.
pcc_random = []
r2_random = []
num_training_data = []

kernel = C(1.0) * Matern(length_scale=1.0) + WhiteKernel(noise_level=1.0) + C(1.0) * DotProduct(sigma_0=1.0)
# random_state pins the starting points drawn for the optimizer restarts, so
# repeated runs of this cell give the same fitted hyperparameters and metrics.
# NOTE(review): the recorded run emitted an L-BFGS line-search warning that
# recommends scaling the data; the features are passed unscaled here.
model = GaussianProcessRegressor(
    kernel=kernel,
    n_restarts_optimizer=8,
    normalize_y=True,
    random_state=rm_state,
)

# Both strategies start from the same initial training set and pool.
X_train_var = X_train
X_train_ran = X_train
y_train_var = y_train
y_train_ran = y_train
idx_pool_var = idx_pool
idx_pool_ran = idx_pool
idx_train_var = idx_train
idx_train_ran = idx_train

for i in range(n_iter):
    print(f"Performing iteration : {i}")

    if i != 0:
        # Never ask for more points than the pool still holds.
        n_query = min(batch_size, len(idx_pool_var))
        if n_query == 0:
            print("Pool exhausted; stopping early.")
            break

        # Positions (within the current pool) of the n_query largest uncertainties.
        q_points_var = np.argpartition(y_pred_unc_pool_var, -n_query)[-n_query:]
        # Translate pool positions into row positions of X / y.
        idx_pool_train_var = idx_pool_var[q_points_var]

        idx_train_var = np.append(idx_train_var, idx_pool_train_var)
        idx_pool_var = np.delete(idx_pool_var, q_points_var)
        X_train_var = X[idx_train_var]
        y_train_var = y[idx_train_var]

    print(f"Number of training data with variance: {len(idx_train_var)}")
    print(f"Number of pooling data with variance: {len(idx_pool_var)}")

    num_training_data.append(len(idx_train_var))

    model.fit(X_train_var, y_train_var)

    y_pred_test_var, y_pred_unc_test_var = model.predict(X_test, return_std=True)
    # Training-set predictions are not used further in this cell.
    y_pred_train_var, y_pred_unc_train_var = model.predict(X_train_var, return_std=True)
    if len(idx_pool_var) > 0:
        # The pool uncertainties drive the selection in the next round.
        y_pred_pool_var, y_pred_unc_pool_var = model.predict(X[idx_pool_var], return_std=True)

    # Compute each test metric once, then both report and store it.
    pcc_test_var = pearsonr(y_test, y_pred_test_var)[0]
    r2_test_var = r2_score(y_test, y_pred_test_var)
    rmse_test_var = np.sqrt(mean_squared_error(y_test, y_pred_test_var))
    mae_test_var = np.mean(np.abs(y_test - y_pred_test_var))

    print('PCC_test', pcc_test_var)
    print('R2_test', r2_test_var)
    print('RMSE', rmse_test_var)
    print('MAE', mae_test_var)

    pcc_variance.append(pcc_test_var)
    r2_variance.append(r2_test_var)
    rmse_variance.append(rmse_test_var)
    mae_variance.append(mae_test_var)

Performing iteration : 0
Number of training data with variance: 24
Number of pooling data with variance: 222
PCC_test 0.9395604536808678
R2_test 0.8655798775522651
RMSE 0.32964144431916387
MAE 0.22859493677007148
Performing iteration : 1
Number of training data with variance: 34
Number of pooling data with variance: 212
PCC_test 0.9615041689707732
R2_test 0.9186822997465037
RMSE 0.2563906650772444
MAE 0.19583827896080452
Performing iteration : 2
Number of training data with variance: 44
Number of pooling data with variance: 202




PCC_test 0.9695595752300191
R2_test 0.9389162076935736
RMSE 0.22221463644590797
MAE 0.1739707954165439
Performing iteration : 3
Number of training data with variance: 54
Number of pooling data with variance: 192
PCC_test 0.971018861894877
R2_test 0.9374314917313973
RMSE 0.22489902124858385
MAE 0.1758269026768051
Performing iteration : 4
Number of training data with variance: 64
Number of pooling data with variance: 182
PCC_test 0.9716010434778848
R2_test 0.9421861643190511
RMSE 0.21618499986940945
MAE 0.16964274955842842
Performing iteration : 5
Number of training data with variance: 74
Number of pooling data with variance: 172
PCC_test 0.9712040308547487
R2_test 0.9423427513327903
RMSE 0.21589203611897323
MAE 0.17000558533962234
Performing iteration : 6
Number of training data with variance: 84
Number of pooling data with variance: 162
PCC_test 0.9722381544658099
R2_test 0.9444779873968254
RMSE 0.2118567302380544
MAE 0.16605242776143572
Performing iteration : 7
Number of training data



PCC_test 0.9797628115055691
R2_test 0.9596550507561598
RMSE 0.1805943885368601
MAE 0.14217124632437522
Performing iteration : 8
Number of training data with variance: 104
Number of pooling data with variance: 142
PCC_test 0.9810091569259153
R2_test 0.9623686523435646
RMSE 0.1744152885454217
MAE 0.13837187670997142
Performing iteration : 9
Number of training data with variance: 114
Number of pooling data with variance: 132




PCC_test 0.9815951961814949
R2_test 0.9634086739266976
RMSE 0.17198823486843134
MAE 0.1342575990462814
Performing iteration : 10
Number of training data with variance: 124
Number of pooling data with variance: 122
PCC_test 0.981806983173938
R2_test 0.9638779918075026
RMSE 0.1708817206475321
MAE 0.1329344208196893
Performing iteration : 11
Number of training data with variance: 134
Number of pooling data with variance: 112
PCC_test 0.981675709450143
R2_test 0.9636593321881168
RMSE 0.17139814482344556
MAE 0.1333061963592806
Performing iteration : 12
Number of training data with variance: 144
Number of pooling data with variance: 102
PCC_test 0.983606352441132
R2_test 0.9673573585670935
RMSE 0.16244349028373264
MAE 0.12479544766585303
Performing iteration : 13
Number of training data with variance: 154
Number of pooling data with variance: 92
PCC_test 0.982833322778944
R2_test 0.96556784607485
RMSE 0.1668367651361164
MAE 0.12857288203229245
Performing iteration : 14
Number of training dat



PCC_test 0.9829316561978741
R2_test 0.9656508614355709
RMSE 0.16663552340741697
MAE 0.1283743613843284
Performing iteration : 18
Number of training data with variance: 204
Number of pooling data with variance: 42




PCC_test 0.984146855382046
R2_test 0.9679227084404542
RMSE 0.1610306369541619
MAE 0.12369416964303956
Performing iteration : 19
Number of training data with variance: 214
Number of pooling data with variance: 32
PCC_test 0.9848482965156922
R2_test 0.9692528041840186
RMSE 0.1576566963607119
MAE 0.12111893426055499


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


'\n    # random sampling\n    if i != 0:\n        # select 10 random data points\n        q_points_ran = np.random.choice(np.arange(len(idx_pool_ran)), size=10)\n        # indices of those points in idx_pool\n        idx_pool_train_ran = idx_pool_ran[q_points_ran]\n\n        idx_train_ran = np.append(idx_train_ran, idx_pool_train_ran)\n        idx_pool_ran = np.delete(idx_pool_ran, q_points_ran)\n        X_train_ran = X[idx_train_ran]\n        y_train_ran = y[idx_train_ran]\n\n    print(f"Number of training data with random: {len(idx_train_ran)}")\n    print(f"Number of pooling data with random: {len(idx_pool_ran)}")\n\n    model.fit(X_train_ran, y_train_ran)\n\n    y_pred_test_ran, y_pred_unc_test_ran = model.predict(X_test, return_std=True)\n    y_pred_train_ran, y_pred_unc_train_ran = model.predict(X_train_ran, return_std=True)\n    y_pred_pool_ran, y_pred_unc_pool_ran = model.predict(X[idx_pool_ran], return_std=True)\n\n    print(\'PCC_test\', pearsonr(y_test, y_pred_test_ran)[0])\

In [None]:
import pickle

# Persist the per-round learning-curve metrics of the variance strategy.
# NOTE(review): the key 'rsme' looks like a typo for 'rmse'; it is left as-is
# so that anything already reading AL_GPR.pkl keeps working.
al_results = {'train_numbs':num_training_data, 'pcc':pcc_variance,'r2':r2_variance, 'rsme': rmse_variance, 'mae': mae_variance}
with open('AL_GPR.pkl', 'wb') as f:
    # The context manager closes the file; no explicit close() is needed.
    pickle.dump(al_results, f)

# Read the file back as a sanity check. The handle is opened in a `with` block
# so it is closed again instead of staying open for the life of the kernel.
# pickle.load must only be used on trusted files; this one was written just above.
with open('AL_GPR.pkl', 'rb') as pkl_file:
    test_ALGPR = pickle.load(pkl_file)