In [1]:
from pathlib import Path
import pandas as pd
import sys
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, ConstantKernel as C,
                                              Matern, WhiteKernel, DotProduct)
from sklearn.metrics import pairwise_distances, mean_squared_error,r2_score, mean_absolute_percentage_error
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from pathlib import Path
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [2]:
# Load the Ni-superalloy creep dataset from a CSV file in the working directory.
# Per the recorded output, the columns are element contents (Ni ... Nb), T,
# log_stress and log_creep_life.
# NOTE(review): units (presumably wt.% / degrees C / log10) are not shown here -- confirm.
creep_df = pd.read_csv('Ni_superalloys_dataset.csv')
# Bare last expression: rich display of the (pandas-truncated) frame.
creep_df

Unnamed: 0,Ni,Al,Co,Cr,Mo,Re,Ru,Ta,W,Ti,Nb,T,log_stress,log_creep_life
0,62.80,5.6,9.0,6.5,0.6,3.0,0.0,6.5,6.0,0.0,0.0,950,2.267172,3.276554
1,59.30,5.8,5.8,2.9,3.9,4.9,6.0,5.6,5.8,0.0,0.0,1100,2.136721,3.026370
2,59.80,5.6,5.6,4.6,2.4,6.4,5.0,5.6,5.0,0.0,0.0,1000,2.389166,3.009026
3,59.30,5.8,5.8,2.9,3.9,4.9,6.0,5.6,5.8,0.0,0.0,1000,2.389166,2.969556
4,61.68,6.0,9.0,3.5,1.5,4.0,0.0,8.0,6.0,0.2,0.0,1100,2.079181,2.957607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1100,2.322219,1.155336
149,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1070,2.447158,1.089905
150,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1100,2.352183,0.991226
151,61.00,5.6,9.0,4.3,2.0,2.0,0.0,7.5,8.0,0.0,0.5,1100,2.342423,0.968483


In [3]:
# Seed shared by every random split in this notebook, and the held-out fraction.
rm_state = 123
test_size = 0.2

# Positional split: the first 13 columns are the model inputs and column 13 is
# the regression target.
# NOTE(review): this relies on the column order of the CSV -- confirm that
# column 13 is really `log_creep_life` and that no index-like column sits in
# the first 13.
X, X_test, y, y_test = train_test_split(
    creep_df.iloc[:, :13].to_numpy(),
    creep_df.iloc[:, 13].to_numpy(),
    test_size=test_size,
    shuffle=True,
    random_state=rm_state,
)

In [4]:
# Sanity check: shapes of the learning set and the held-out test set, one per line.
print(X.shape, X_test.shape, y.shape, y_test.shape, sep='\n')

(122, 13)
(31, 13)
(122,)
(31,)


In [5]:
# Row positions into X / y; these are what the labelled and pool sets are tracked by.
idx = np.arange(y.shape[0])

In [6]:
# Number of rows available for active learning (labelled seed set + pool).
idx.shape[0]

122

In [7]:
# Fraction of the learning set that starts out labelled; the rest forms the pool.
train_ratio = 0.1

# Splitting `idx` alongside X and y records which rows went to the seed set
# (idx_train) and which stay in the pool (idx_pool). The pool's X / y halves
# are discarded because they are re-derived from `idx_pool` later.
X_train, _, y_train, _, idx_train, idx_pool = train_test_split(
    X,
    y,
    idx,
    train_size=train_ratio,
    shuffle=True,
    random_state=rm_state,
)

In [8]:
# Shapes of the initial labelled seed set.
print(X_train.shape, y_train.shape, sep='\n')

(12, 13)
(12,)


In [9]:
# Sizes of the labelled-index set and of the remaining pool.
print(idx_train.shape, idx_pool.shape, sep='\n')

(12,)
(110,)


In [10]:
# Pool-based active learning with a Gaussian-process regressor.
# Each iteration after the first moves the `batch_size` pool points with the
# largest predictive standard deviation into the training set, refits the
# model, and records test-set metrics.
n_iter = 14
batch_size = 8  # pool points queried per iteration

# Learning-curve records for the uncertainty-driven strategy (one entry per iteration).
pcc_variance = []
r2_variance = []
rmse_variance = []
mae_variance = []

# Kept for the random-sampling baseline, which is not run in this cell.
pcc_random = []
r2_random = []
num_training_data=[]

kernel = C(1.0) * Matern(length_scale=1.0) + WhiteKernel(noise_level=1.0) + C(1.0) * DotProduct(sigma_0=1.0)
# random_state seeds the optimizer restarts so that a re-run reproduces the
# same fitted hyperparameters (and therefore the same queried points).
# NOTE(review): the features are passed unscaled; if the optimizer reports
# convergence warnings, consider standardizing X before fitting.
model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=8, normalize_y=True,
                                 random_state=rm_state)

# Both strategies start from the same seed set and the same pool.
X_train_var = X_train
X_train_ran = X_train
y_train_var = y_train
y_train_ran = y_train
idx_pool_var = idx_pool
idx_pool_ran = idx_pool
idx_train_var = idx_train
idx_train_ran = idx_train

for i in range(n_iter):
    # Nothing left to query: stop instead of failing inside argpartition/predict.
    if i != 0 and len(idx_pool_var) == 0:
        print("Pool exhausted; stopping active learning early.")
        break

    print(f"Performing iteration : {i}")

    if i != 0:
        # Never ask for more points than the pool still holds.
        n_query = min(batch_size, len(idx_pool_var))
        # positions (within the current pool) of the points with the highest
        # predictive standard deviation from the previous fit
        q_points_var = np.argpartition(y_pred_unc_pool_var, -n_query)[-n_query:]
        # translate pool positions into row indices of X / y
        idx_pool_train_var = idx_pool_var[q_points_var]

        idx_train_var = np.append(idx_train_var, idx_pool_train_var)
        idx_pool_var = np.delete(idx_pool_var, q_points_var)
        X_train_var = X[idx_train_var]
        y_train_var = y[idx_train_var]

    print(f"Number of training data with variance: {len(idx_train_var)}")
    print(f"Number of pooling data with variance: {len(idx_pool_var)}")

    num_training_data.append(len(idx_train_var))

    model.fit(X_train_var, y_train_var)

    y_pred_test_var, y_pred_unc_test_var = model.predict(X_test, return_std=True)
    y_pred_train_var, y_pred_unc_train_var = model.predict(X_train_var, return_std=True)
    if len(idx_pool_var) > 0:
        # Pool uncertainties drive the query selection of the next iteration.
        y_pred_pool_var, y_pred_unc_pool_var = model.predict(X[idx_pool_var], return_std=True)
    else:
        # predict() rejects an empty input, so represent "no pool" explicitly.
        y_pred_pool_var = np.empty(0)
        y_pred_unc_pool_var = np.empty(0)

    # Compute each test metric once, then both report and record it.
    pcc = pearsonr(y_test, y_pred_test_var)[0]
    r2 = r2_score(y_test, y_pred_test_var)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test_var))
    mae = np.mean(np.abs(y_test - y_pred_test_var))

    print('PCC_test', pcc)
    print('R2_test', r2)
    print('RMSE', rmse)
    print('MAE', mae)

    pcc_variance.append(pcc)
    r2_variance.append(r2)
    rmse_variance.append(rmse)
    mae_variance.append(mae)

Performing iteration : 0
Number of training data with variance: 12
Number of pooling data with variance: 110




PCC_test 0.6012646403900639
R2_test 0.3501394088704497
RMSE 0.2816572596811668
MAE 0.23360257206986065
Performing iteration : 1
Number of training data with variance: 20
Number of pooling data with variance: 102




PCC_test 0.5903865932947947
R2_test 0.2590766371358345
RMSE 0.30074436139652044
MAE 0.24903333301806238
Performing iteration : 2
Number of training data with variance: 28
Number of pooling data with variance: 94




PCC_test 0.5900775783160672
R2_test 0.24614146155916627
RMSE 0.30335822745312224
MAE 0.25133583361974177
Performing iteration : 3
Number of training data with variance: 36
Number of pooling data with variance: 86




PCC_test 0.6060044935723822
R2_test 0.3480214238400927
RMSE 0.2821158661900375
MAE 0.23629526419515937
Performing iteration : 4
Number of training data with variance: 44
Number of pooling data with variance: 78




PCC_test 0.5960977081719245
R2_test 0.29879078417767324
RMSE 0.2925732773478854
MAE 0.24888719425872818
Performing iteration : 5
Number of training data with variance: 52
Number of pooling data with variance: 70


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


PCC_test 0.6341052404226921
R2_test 0.37909097546669124
RMSE 0.27531180580076486
MAE 0.2306244060687582
Performing iteration : 6
Number of training data with variance: 60
Number of pooling data with variance: 62




PCC_test 0.5865656911122495
R2_test 0.28261838945581397
RMSE 0.2959279385515636
MAE 0.2521248262213996
Performing iteration : 7
Number of training data with variance: 68
Number of pooling data with variance: 54




PCC_test 0.6347974832835757
R2_test 0.34569660569130667
RMSE 0.2826184014557662
MAE 0.24341334128920059
Performing iteration : 8
Number of training data with variance: 76
Number of pooling data with variance: 46
PCC_test 0.8856762576242541
R2_test 0.7776282142300228
RMSE 0.16475958555052872
MAE 0.1228159680375857
Performing iteration : 9
Number of training data with variance: 84
Number of pooling data with variance: 38


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


PCC_test 0.8870500623486225
R2_test 0.7802950436644583
RMSE 0.16376865268380458
MAE 0.12150169373444587
Performing iteration : 10
Number of training data with variance: 92
Number of pooling data with variance: 30


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


PCC_test 0.898131418677083
R2_test 0.7984076946522185
RMSE 0.1568728637046842
MAE 0.11818948644300474
Performing iteration : 11
Number of training data with variance: 100
Number of pooling data with variance: 22
PCC_test 0.897670474642784
R2_test 0.8007321091608068
RMSE 0.15596584804198887
MAE 0.11580165053032838
Performing iteration : 12
Number of training data with variance: 108
Number of pooling data with variance: 14


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


PCC_test 0.9014338757524981
R2_test 0.8076342696962814
RMSE 0.15324090275295166
MAE 0.11112980296067064
Performing iteration : 13
Number of training data with variance: 116
Number of pooling data with variance: 6


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


PCC_test 0.898053571698364
R2_test 0.8048225904608896
RMSE 0.15435674925283474
MAE 0.11494586097850441


'\n    # random sampling\n    if i != 0:\n        # select 10 random data points\n        q_points_ran = np.random.choice(np.arange(len(idx_pool_ran)), size=10)\n        # indices of those points in idx_pool\n        idx_pool_train_ran = idx_pool_ran[q_points_ran]\n\n        idx_train_ran = np.append(idx_train_ran, idx_pool_train_ran)\n        idx_pool_ran = np.delete(idx_pool_ran, q_points_ran)\n        X_train_ran = X[idx_train_ran]\n        y_train_ran = y[idx_train_ran]\n\n    print(f"Number of training data with random: {len(idx_train_ran)}")\n    print(f"Number of pooling data with random: {len(idx_pool_ran)}")\n\n    model.fit(X_train_ran, y_train_ran)\n\n    y_pred_test_ran, y_pred_unc_test_ran = model.predict(X_test, return_std=True)\n    y_pred_train_ran, y_pred_unc_train_ran = model.predict(X_train_ran, return_std=True)\n    y_pred_pool_ran, y_pred_unc_pool_ran = model.predict(X[idx_pool_ran], return_std=True)\n\n    print(\'PCC_test\', pearsonr(y_test, y_pred_test_ran)[0])\

In [11]:
import pickle

# Persist the learning-curve metrics of the uncertainty-driven run.
# NOTE(review): the key 'rsme' is a misspelling of 'rmse'; it is kept unchanged
# so that anything already reading this pickle keeps working.
with open('AL_GPR_Ni.pkl', 'wb') as f:
    pickle.dump({'train_numbs':num_training_data, 'pcc':pcc_variance,'r2':r2_variance, 'rsme': rmse_variance, 'mae': mae_variance}, f)

# Read the file back as a round-trip check. The context manager closes the
# handle. pickle.load executes arbitrary code, so only use it on trusted files
# such as the one written just above.
with open('AL_GPR_Ni.pkl', 'rb') as pkl_file:
    test_ALGPR = pickle.load(pkl_file)