## Batch evaluate ML-based properties for the composition datasets used for CALPHAD

**Author:** Y.X. Wu


In [None]:
# Parameters
# Run identifier; it is appended to the saved-model directory to form the
# folder that models are loaded from (see `model_path_bo` below).
# NOTE(review): presumably the name of the notebook that trained the models — confirm.
notebook_fname = "NN_full_RepeatedKFold_v3_BO_test"

In [None]:
import sys
# Add the neighbouring project directory to the module search path.
# NOTE(review): presumably this is where the `utils` package imported later in
# this notebook lives — confirm.
sys.path.append("../CCA_representation_ML/")

In [None]:
import os
import numpy as np
import pandas as pd

print(f'cpu_count: {os.cpu_count()}')

# Data path: folder holding the cleaned datasets.
data_path = '../CCA_representation_ML/01_Dataset_Cleaned/'

# Sanity-check the data folder by probing for one dataset file inside it.
# The messages name the exact file that was probed, so a "not found" warning
# points at the thing that is actually missing.
dataset_file = data_path + 'LiteratureDataset_Corrosion_YW_v3_processed.xlsx'
if os.path.isfile(dataset_file):
    print(f"File '{dataset_file}' found.")
else:
    print(f"Warning: File '{dataset_file}' not found!")

# Model path: root folder of saved models, plus the sub-folder for this run.
model_path = '../CCA_representation_ML/04_Model_Saved/'
model_path_bo = f'{model_path}{notebook_fname}/'

# Use GPU or not: '-1' hides every CUDA device from this process, so work
# that would otherwise go to a GPU runs on the CPU instead.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
print('not using GPU')

### Check the model and scalers

- Show the model.h5 files in this directory
- Load scalers


In [None]:
import pickle
from utils.postprocessing_evalutation import display_saved_models

# Show what is stored in the saved-model folder for this run.
display_saved_models(model_path_bo)

# Restore the pickled scalers dictionary kept next to the cleaned datasets.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
with open(f'{data_path}scalers.pkl', 'rb') as f:
    scalers = pickle.load(f)

# Blank separator line, then the loaded scalers for visual inspection.
print("")
print(scalers)

## Predict based on new data inputs


### Coordinates for PVD alloy representation


In [None]:
# Read the wafer-coordinate table from the data folder.
df_PVD_x_y = pd.read_excel(f'{data_path}PVD_x_y.xlsx')

# Extract the "x" and "y" columns as float arrays.
coord_x, coord_y = (
    df_PVD_x_y[column].to_numpy(dtype=float) for column in ("x", "y")
)

# Labels derived from the DataFrame index, shifted by one
# (1-based when the index is the default 0..n-1 range).
index_PVD_x_y = 1 + df_PVD_x_y.index.values

### Setting up compositional input for new alloys


In [None]:
from itertools import combinations

# Select which composition set to evaluate. Because of the if/elif order,
# Flag_Calc_KW takes precedence when both flags are True.
Flag_Calc_KW = False
Flag_Calc_All = True

if Flag_Calc_KW:
    # Two explicitly listed five-element systems and the labels used in
    # their input/output file names.
    data_path_compo = './v6_A-B-C-D-E_Sputtering_ML_Exp/'

    compo_A_B_C_D_E_list = [['Ni', 'Cr', 'Co', 'V', 'Fe'],
                            ['Ni', 'Cr', 'Mo', 'Ti', 'Fe']]
    compo_A_B_C_D_E_fname_list = ['KW99', 'KW131']


elif Flag_Calc_All:
    data_path_compo = './v6_A-B-C-D-E_Sputtering_ML_All/'

    # Candidate elements for the D and E slots (kept as singleton sets).
    set_D_E = [{'Co'}, {'V'}, {'Mn'}, {'Mo'}, {'Cu'},
               {'Nb'}, {'W'}, {'Ti'}, {'Al'}, {'Si'}, {'Ta'}]

    # Every unordered pair of candidates, in input order.
    compo_D_E_list = list(combinations(set_D_E, 2))

    compo_A_B_C_D_E_list = []
    compo_A_B_C_D_E_fname_list = []
    for compo_D_E in compo_D_E_list:
        # Flatten the pair of singleton sets into ['D', 'E']. A separate name
        # is used so the list being iterated is never rebound inside the loop
        # and still holds all pairs afterwards.
        elems_D_E = [item for sublist in compo_D_E for item in sublist]

        # Fe-Cr-Ni base plus the current D/E pair; the joined form doubles
        # as the label used in file names.
        compo_A_B_C_D_E = ['Fe', 'Cr', 'Ni'] + elems_D_E
        compo_A_B_C_D_E_merge = '_'.join(compo_A_B_C_D_E)

        compo_A_B_C_D_E_list.append(compo_A_B_C_D_E)
        compo_A_B_C_D_E_fname_list.append(compo_A_B_C_D_E_merge)

    # Quick visual check of the first generated system.
    print(compo_A_B_C_D_E_list[0])
    print(compo_A_B_C_D_E_fname_list[0])

else:
    # Without a selected mode none of the names used downstream would exist;
    # fail here with a clear message instead of a later NameError.
    raise ValueError(
        "Neither Flag_Calc_KW nor Flag_Calc_All is set; "
        "no compositions to evaluate.")

### Make predictions for new alloys

`read_new_data_feature_calc` can calculate the engineered features


In [None]:
from utils.postprocessing_prediction import read_new_data_feature_calc, predict_bootstrap_NNH_NNC, plot_prediction_uncertainty, plot_prediction_uncertainty_AVG
from tqdm import tqdm

# Settings shared by every alloy system. They are immutable, so they are
# defined once here rather than on every loop iteration.
NNH_model_name = 'NNH_model_RepeatedKFold_{}.h5'
NNC_model_name = 'NNC_model_RepeatedKFold_{}.h5'
k_folds, n_CVrepeats, mc_repeat = 6, 2, 100

# For each alloy system: read its composition table, run the NNH/NNC model
# predictions, append the aggregated mean/std columns, and write the result to
# a sibling "*_ML.xlsx" file.
for compo_A_B_C_D_E, compo_A_B_C_D_E_fname in tqdm(zip(compo_A_B_C_D_E_list, compo_A_B_C_D_E_fname_list), desc='Processing', total=len(compo_A_B_C_D_E_list)):

    print(compo_A_B_C_D_E)
    print(compo_A_B_C_D_E_fname)
    vars_ele, KW_name = compo_A_B_C_D_E, compo_A_B_C_D_E_fname

    file_name_input = f'{data_path_compo}v6_{KW_name}_SSS_FCC_byCompo_wt_pct.xlsx'
    df_new_wt = pd.read_excel(file_name_input)

    # The feature list and the C_testing array are rebuilt for every call so
    # that no mutable object is shared between iterations.
    compo_new, HC_specific_features, C_specific_testing = read_new_data_feature_calc(
        df_new_wt, vars_ele,
        specific_features_sel_column=['delta_a', 'Tm', 'sigma_Tm',
                                      'Hmix', 'sigma_Hmix', 'sigma_elec_nega', 'VEC', 'sigma_VEC'],
        C_testing=np.array([25, 1, 7, 0.333]))

    (H1_new_pred_stack, H1_new_pred_mean, H1_new_pred_std,
     C2_new_pred_stack, C2_new_pred_mean, C2_new_pred_std) = predict_bootstrap_NNH_NNC(
        model_path_bo, NNH_model_name, NNC_model_name,
        compo_new, HC_specific_features, C_specific_testing,
        scalers, k_folds, n_CVrepeats, mc_repeat)

    # Merge each prediction stack along its first axis once, then take the
    # mean and std over that axis.
    # NOTE(review): presumably each stack holds one array of predictions per
    # saved model — confirm against predict_bootstrap_NNH_NNC.
    H1_new_pred_all = np.concatenate(H1_new_pred_stack, axis=0)
    C2_new_pred_all = np.concatenate(C2_new_pred_stack, axis=0)

    H1_new_pred_KFold_mean = np.mean(H1_new_pred_all, axis=0).reshape(-1)
    H1_new_pred_KFold_std = np.std(H1_new_pred_all, axis=0).reshape(-1)
    C2_new_pred_KFold_mean = np.mean(C2_new_pred_all, axis=0).reshape(-1)
    C2_new_pred_KFold_std = np.std(C2_new_pred_all, axis=0).reshape(-1)

    # Attach the aggregated predictions as new columns of the input table.
    df_new_wt['H1_new_pred_KFold_mean'] = H1_new_pred_KFold_mean
    df_new_wt['H1_new_pred_KFold_std'] = H1_new_pred_KFold_std
    df_new_wt['C2_new_pred_KFold_mean'] = C2_new_pred_KFold_mean
    df_new_wt['C2_new_pred_KFold_std'] = C2_new_pred_KFold_std

    # Preview the first rows, then save next to the input file with an "_ML"
    # suffix. `display` is supplied by the notebook (IPython) environment.
    file_name_output = f'{data_path_compo}v6_{KW_name}_SSS_FCC_byCompo_wt_pct_ML.xlsx'
    display(df_new_wt.head(3))
    df_new_wt.to_excel(file_name_output, index=False)

    # Wafer-coordinate plots are produced only in Flag_Calc_KW mode.
    if Flag_Calc_KW:
        # NNH predictions
        plot_prediction_uncertainty(data_path_compo, coord_x, coord_y, index_PVD_x_y, H1_new_pred_mean, H1_new_pred_std,
                                    pred_label='Hardness', unc_label='Hardness uncertainty',
                                    title='NNH_RepeatedKFold_prediction_uncertainty_eachFold_' + KW_name,
                                    vmin1=100, vmax1=300,
                                    vmin2=25, vmax2=100)

        # NNC predictions
        plot_prediction_uncertainty(data_path_compo, coord_x, coord_y, index_PVD_x_y, C2_new_pred_mean, C2_new_pred_std,
                                    pred_label='Pitting potential (mV)', unc_label='Pitting potential uncertainty (mV)',
                                    title='NNC_RepeatedKFold_prediction_uncertainty_eachFold_' + KW_name,
                                    vmin1=0, vmax1=900,
                                    vmin2=50, vmax2=150)

        # NNH_NNC_AVG predictions
        plot_prediction_uncertainty_AVG(data_path_compo, coord_x, coord_y, index_PVD_x_y, H1_new_pred_stack, C2_new_pred_stack,
                                        title='NNH_NNC_RepeatedKFold_prediction_uncertainty_AVG_' + KW_name)