In [4]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from GPR_SOAP import SOAP_GPR

In [5]:
# Define path to xyz files and path to SOAP files

XYZ_directory = '/home/alex/Pt_NMR/data/structures/'
SOAP_directory = '/home/alex/Pt_NMR/data/representations/SOAP/'

# Set up SOAP_GPR class

SOAP_GPR = SOAP_GPR(SOAP_parameters=[2.0, 2, 2], SOAP_directory=SOAP_directory,
                    XYZ_directory=XYZ_directory, XYZ_base='st_', central_atom='Pt')

In [6]:
# Define path to and name of labels

target_name = 'Experimental'
target_path = '/home/alex/Pt_NMR/data/labels/final_data_corrected'

# Use predict function for regression and obtaining CV prediction errors

errors_std = SOAP_GPR.predict(mode='write', regressor='GPR', kernel_degree=2, target_path=target_path,
             target_name=target_name, alpha=1e-3, normalization=True)

print(errors_std)

Species present in dataset: ['F', 'Ge', 'O', 'I', 'N', 'Br', 'C', 'Te', 'Sn', 'Sb', 'H', 'P', 'As', 'Cl', 'S', 'Si', 'Se', 'Pt']
(298, 1998)
--------------------------------
Cross-validated error values:
--------------------------------
RMSE (4-fold CV):
192.62068761033655 [ppm]
58.254485840789705 [ppm]  (STDEV)


MAE (4-fold CV):
112.27321217473667 [ppm]
17.429657558526074 [ppm],  (STDEV)
--------------------------------
(112.27321217473667, 17.429657558526074, 192.62068761033655, 58.254485840789705)


In [None]:
# How to create heatmap of n_max, l_max
# (np and sns come from the import cell at the top of the notebook)

# Grid of SOAP basis sizes to scan; the tick labels below are derived from these
# lists so that the plot can never disagree with the loops.
n_max_values = list(range(1, 9))
l_max_values = list(range(1, 9))

# nl_matrix[i, j] holds element [0] of predict()'s return value for
# n_max_values[i] (row) and l_max_values[j] (column). In the recorded output of
# the earlier predict() call, element [0] is the cross-validated MAE in ppm.
nl_matrix = np.zeros((len(n_max_values), len(l_max_values)))

for i, n_max in enumerate(n_max_values):
    for j, l_max in enumerate(l_max_values):

        SOAP_ML = SOAP_GPR(SOAP_parameters=[6.0, n_max, l_max], SOAP_directory=SOAP_directory,
                           XYZ_directory=XYZ_directory, XYZ_base='st_', central_atom='Pt')

        nl_matrix[i, j] = SOAP_ML.predict(mode='read', regressor='GPR', kernel_degree=2, target_path=target_path,
                                          target_name=target_name, alpha=1e-3, normalization=True)[0]


ax = sns.heatmap(nl_matrix, cmap="hot", annot=True, fmt=".1f",
                 xticklabels=[str(l_max) for l_max in l_max_values],
                 yticklabels=[str(n_max) for n_max in n_max_values])

# Rows are indexed by n_max and columns by l_max (see the assignment above);
# label the axes so the figure can be read on its own.
ax.set_xlabel('l_max')
ax.set_ylabel('n_max');

In [None]:
# Create learning curves for r_cut dependency
# (np and plt come from the import cell at the top of the notebook)

rcut_list = [2.0, 3.0, 4.0, 5.0, 6.0]
# One fixed color per cutoff radius, in the same order as rcut_list
color_list = ['orange', 'r', 'purple', 'b', 'k']

# SOAP basis sizes used for every cutoff; also embedded in the output file name
n_max, l_max = 8, 8

mean_MAE_rcut_list = []
train_sizes_rcut_list = []

for rcut in rcut_list:
    SOAP_ML = SOAP_GPR(SOAP_parameters=[rcut, n_max, l_max], SOAP_directory=SOAP_directory, XYZ_directory=XYZ_directory,
                       XYZ_base='st_', central_atom='Pt')

    errors_std = SOAP_ML.predict(mode='read', regressor='GPR', kernel_degree=1, target_path=target_path,
                                 target_name=target_name, alpha=10.0, normalization=False, lc=True)

    print(errors_std)

    # With lc=True, element [4] is used as the training-set sizes and element [5]
    # as the error values, one row per training-set size.
    train_sizes = errors_std[4]
    lc_MAE = errors_std[5]

    # Average the absolute error values of each row.
    # NOTE(review): abs() presumably guards against negated scores — confirm in GPR_SOAP.
    mean_MAE_list = [np.mean(np.abs(row)) for row in lc_MAE]

    print(mean_MAE_list)
    print(train_sizes)
    mean_MAE_rcut_list.append(mean_MAE_list)
    # Keep the train sizes of *this* cutoff instead of reusing whatever the last
    # loop iteration happened to leave behind.
    train_sizes_rcut_list.append(train_sizes)


fig, ax = plt.subplots()

for rcut, color, train_sizes_rcut, mean_MAE_list_rcut in zip(rcut_list, color_list,
                                                            train_sizes_rcut_list, mean_MAE_rcut_list):
    # Plot the curve; only the scatter carries the legend entry
    ax.plot(train_sizes_rcut, mean_MAE_list_rcut, color=color)
    ax.scatter(train_sizes_rcut, mean_MAE_list_rcut, color=color, label=f'r={rcut}')

# Add labels and legend
ax.set_xlabel('Number of samples in training set')
ax.set_ylabel('Mean Absolute Error [ppm]')
ax.legend()
ax.grid()

# The figure contains all cutoffs; the file name is tagged with the largest (last)
# one, which is the same name the leaked loop variable used to produce.
fig.savefig(f'/home/alex/ML/SOAP_GPR_NMR/final_dataset/figures/degree_dependency/lc_rcut{rcut_list[-1]}_n{n_max}_l{l_max}_unnormalized_final.svg', format='svg', dpi=700, bbox_inches='tight')
plt.show()


In [None]:
def exhaustive_search(rcut_list, nmax_list, lmax_list, alpha_list):
    """Grid-search SOAP / GPR hyperparameters and report the best combination.

    For every combination of (rcut, nmax, lmax, alpha) a ``SOAP_GPR`` object is
    built and ``predict`` is called with ``mode='read'``, a degree-5 kernel and
    ``normalization=False``. The values returned by ``predict`` are written to one
    text file per combination, and element ``[0]`` of that result is collected as
    the error to minimise (named MAE here). Combinations that raise are reported
    and skipped.

    Args:
        rcut_list: cutoff radii to try.
        nmax_list: n_max values to try.
        lmax_list: l_max values to try.
        alpha_list: regularisation values passed as ``alpha`` to ``predict``.

    Returns:
        tuple: ``(best_params, min_error)`` where ``best_params`` is
        ``[rcut, nmax, lmax, alpha]`` of the combination with the smallest
        collected error and ``min_error`` is that error.

    Raises:
        ValueError: if no combination could be evaluated (empty input lists or
            every combination failed).
    """
    # NOTE: these paths are local to the function and differ from the
    # notebook-level SOAP_directory / XYZ_directory / target_path variables.
    SOAP_directory = '/home/alex/ML/SOAP_GPR_NMR/final_dataset/SOAPs/'

    XYZ_directory = '/home/alex/ML/SOAP_GPR_NMR/final_dataset/xyz_files_final_set/'
    XYZ_base = 'st_'

    target_name = 'Experimental'
    target_path = '/home/alex/ML/SOAP_GPR_NMR/final_dataset/Pt_II_complexes_final'

    output_directory = '/home/alex/ML/SOAP_GPR_NMR/final_dataset/prediction_errors/polynomial_kernel_unnormalized/'
    kernel_degree = 5

    mae_list = []
    parameter_combination_list = []

    # itertools.product iterates with the right-most list varying fastest, i.e. in
    # the same order as four nested loops over rcut, nmax, lmax, alpha.
    for rcut, nmax, lmax, alpha in itertools.product(rcut_list, nmax_list, lmax_list, alpha_list):
        try:
            SOAP_ML = SOAP_GPR(SOAP_parameters=[rcut, nmax, lmax], SOAP_directory=SOAP_directory,
                               XYZ_directory=XYZ_directory, XYZ_base=XYZ_base, central_atom='Pt')

            errors_std = SOAP_ML.predict(mode='read', regressor='GPR', kernel_degree=kernel_degree,
                                         target_path=target_path, target_name=target_name,
                                         alpha=alpha, normalization=False)

            mae_list.append(errors_std[0])
            parameter_combination_list.append([rcut, nmax, lmax, alpha])

            # NOTE: int(rcut) truncates, so cutoffs that share an integer part
            # (e.g. 2.0 and 2.5) are written to the same file.
            np.savetxt(f'{output_directory}rcut{int(rcut)}_nmax{nmax}_lmax{lmax}_alpha{alpha}_degree{kernel_degree}_unnormalized.txt',
                       np.array(errors_std), delimiter=',')

        except Exception as e:
            # Best effort: one failing combination must not abort the whole search,
            # but say which combination failed so it can be investigated.
            print(f'Skipping rcut={rcut}, nmax={nmax}, lmax={lmax}, alpha={alpha}: {e!r}')

    if not mae_list:
        # Fail early with a clear message instead of writing an empty summary file
        # and letting min() raise on an empty sequence.
        raise ValueError('exhaustive_search: no parameter combination could be evaluated')

    # One row per evaluated combination: rcut, nmax, lmax, alpha, error
    mae_parameters_combined = [params + [mae] for params, mae in zip(parameter_combination_list, mae_list)]
    np.savetxt(f'{output_directory}mae_parameters_combined_unnormalized_degree{kernel_degree}.txt',
               np.array(mae_parameters_combined), delimiter=',')

    best_params, min_error = min(zip(parameter_combination_list, mae_list), key=lambda pair: pair[1])

    print(best_params, min_error)

    return best_params, min_error