In [1]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from regression import SparseKRR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from split import cv_split
from errors import mae, rmse

# Utilities
import h5py
import json

In [None]:
# Read the integer indices that define the train and test partitions
train_idxs, test_idxs = [
    np.loadtxt(idx_path, dtype=int)
    for idx_path in ('../Processed_Data/DEEM_10k/train_idxs',
                     '../Processed_Data/DEEM_10k/test_idxs')
]

In [None]:
# Load SOAP cutoffs
# Reads the SOAP hyperparameter file and keeps its 'interaction_cutoff' entry,
# which the following cells iterate over (one data directory per cutoff).
# Path fixed: the directory is 'Processed_Data' (was misspelled 'Processeed_Data'),
# matching every other path used in this notebook.
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)

cutoffs = soap_hyperparameters['interaction_cutoff']

In [None]:
# Load representative SOAPs
# For every interaction cutoff, read the per-structure Si counts and the FPS
# representative indices from that cutoff's data directory and build the
# representative SOAPs from the HDF5 file. Results are stored in a dict keyed
# by the cutoff rendered as a string.
representative_soaps = {}

for cutoff in cutoffs:
    # The paths must be f-strings: without the prefix the literal text
    # '{cutoff}' / '{work_dir}' was used as the path and as the dict key.
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    n_Si = np.loadtxt(f'{work_dir}/n_Si.dat', dtype=int)

    # Cumulative counts (last one dropped) give the boundaries between structures
    split_idxs = np.cumsum(n_Si)[0:-1]
    representative_idxs = np.loadtxt(f'{work_dir}/FPS_representatives.idxs', dtype=int)
    soaps_file = f'{work_dir}/soaps.hdf5'

    # NOTE(review): build_representatives_from_hdf5 is not imported in the
    # import cell of this notebook - add the import from the module that defines it.
    representative_soaps[f'{cutoff}'] = build_representatives_from_hdf5(soaps_file, representative_idxs, split_idxs)

In [None]:
# Cross validation splitting
# Shuffle a copy of the training indices and cut it into k equally sized folds;
# the folds are also written to disk, one column per fold.
# NOTE(review): the shuffle is unseeded, so the folds (and the saved cv.idxs)
# change on every run - seed the RNG if reproducible folds are required.
k = 5
cv_idxs = train_idxs.copy()
np.random.shuffle(cv_idxs)

# np.split requires an exact division. Fail here with a clear message instead
# of printing and continuing, which left cv_idxs as an unsplit array and made
# the grid-search cell fail later with an unrelated error.
if len(cv_idxs) % k != 0:
    raise ValueError('Error: number of points in the training set must be divisible by the number of folds')

cv_idxs = np.split(cv_idxs, k)  # list of k index arrays
np.savetxt('../Processed_Data/DEEM_10k/cv.idxs', np.stack(cv_idxs, axis=1), fmt='%d')

In [None]:
# Names of the structure properties used as regression targets
property_names = [
    'volumes',
    'energies',
]

In [None]:
# Hyperparameter grids, each with 11 logarithmically spaced values:
# gamma is looped over when building the kernel, sigma and reg are passed to SparseKRR
n_grid_points = 11
gamma = np.logspace(-3, 2, num=n_grid_points)  # 1e-3 ... 1e2
sigma = np.logspace(-3, 2, num=n_grid_points)  # 1e-3 ... 1e2
reg = np.logspace(-4, 1, num=n_grid_points)    # 1e-4 ... 1e1

In [None]:
# Field specification for one row of the hyperparameter scan: the three scalar
# hyperparameters followed by one length-k array per error measure (one entry per fold)
hyperparameter_fields = [('gamma', 'f8'), ('sigma', 'f8'), ('reg', 'f8')]
fold_error_fields = [(error_name, 'f8', (k,))
                     for error_name in ('mae_train', 'mae_validate',
                                        'rmse_train', 'rmse_validate')]
dt_list = hyperparameter_fields + fold_error_fields

# Store the field specification as JSON alongside the models
with open('../Processed_Data/DEEM_10k/Models/optimization_dtype.json', 'w') as f:
    json.dump(dt_list, f)

# Structured dtype built from the specification, and the list that collects scan results
dt = np.dtype(dt_list)
errors_list = []

# DEEM_10k

In [None]:
# Load the per-structure values of every target property, keyed by property name.
# The file name must be an f-string: without the prefix every iteration tried
# to read the literal file 'structure_{pn}.dat'.
deem_10k_structure_properties = {}
for pn in property_names:
    deem_10k_structure_properties[pn] = np.loadtxt(f'../Processed_Data/DEEM_10k/structure_{pn}.dat')

In [None]:
# Build base kernels
# KMM is the kernel between the representative SOAPs and themselves;
# KNM is the kernel between `soaps` and the representative SOAPs.
# NOTE(review): `kernel_type`, `base_kernel_parameters` and `soaps` are not
# defined in any cell above, so this cell raises NameError on a fresh kernel
# unless they are defined elsewhere - confirm and define them before this cell.
# NOTE(review): `representative_soaps` is built above as a dict keyed by
# cutoff; passing the whole dict to build_kernel looks unintended - confirm
# which entry (cutoff) should be used here.
KMM = build_kernel(representative_soaps, representative_soaps,
                                   kernel_type=kernel_type, **base_kernel_parameters)
KNM = build_kernel(soaps, representative_soaps,
                   kernel_type=kernel_type, **base_kernel_parameters)

In [None]:
# Training indices rendered as strings; passed as `datasets=` to the HDF5 loader below
datasets = list(map(str, train_idxs))

In [None]:
# Hyperparameter grid search with k-fold cross validation.
# For every SOAP cutoff and every target property, scan all (gamma, sigma, reg)
# combinations, fit a sparse KRR model on k-1 folds, evaluate MAE/RMSE on the
# training folds and on the held-out fold, and write one results table per
# (cutoff, property) pair.
for cutoff in cutoffs:
    # Read SOAPs
    # (f-string: the cutoff must be substituted into the path)
    # NOTE(review): load_structures_from_hdf5 is not imported in the import cell
    # of this notebook, and `deem_10k` is not used further down in this cell.
    deem_10k = load_structures_from_hdf5(f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5',
                                         datasets=datasets, concatenate=False)
    for pn in property_names:
        Y = deem_10k_structure_properties[pn]

        # Start a fresh result list for each (cutoff, property) scan. The list
        # is converted to an array at the end of the scan, so reusing it made
        # the next `.append` fail and would have mixed results between scans.
        errors_list = []

        for g in gamma:
            # Build kernel
            # FIXME(review): the two statements below are placeholders -
            # `kernel_parameter_values` is not defined anywhere in view and the
            # results are discarded. The gamma-dependent kernels (KMM, KNM)
            # presumably need to be built here from the base kernels; as
            # written these lines raise NameError and `g` has no effect.
            KMM**kernel_parameter_values
            KNM**kernel_parameter_values
            for s in sigma:
                for r in reg:
                    # Per-fold error accumulators for this hyperparameter triple
                    mae_train = np.zeros(k)
                    mae_validate = np.zeros(k)
                    rmse_train = np.zeros(k)
                    rmse_validate = np.zeros(k)
                    for kk in range(k):
                        # Hold out fold kk for validation, train on the remaining folds
                        idxs_train = cv_idxs.copy()
                        idxs_validate = idxs_train.pop(kk)
                        idxs_train = np.concatenate(idxs_train)

                        # Scale factor: variance of the training targets times
                        # the number of rows of KMM divided by its trace
                        delta = np.var(Y[idxs_train]) * KMM.shape[0] / np.trace(KMM)

                        # Initialize sparse KRR
                        # NOTE(review): fit receives delta-scaled kernels and
                        # targets while transform receives the unscaled KNM -
                        # confirm this is intended.
                        skrr = SparseKRR(sigma=s, reg=r, rcond=None)
                        skrr.fit(delta*KNM[idxs_train, :], delta*KMM, delta*Y[idxs_train])

                        Yp_train = skrr.transform(KNM[idxs_train, :])
                        Yp_validate = skrr.transform(KNM[idxs_validate, :])

                        # The error functions are imported as lowercase `mae`
                        # and `rmse`; the uppercase names were undefined.
                        mae_train[kk] = mae(Yp_train, Y[idxs_train])
                        mae_validate[kk] = mae(Yp_validate, Y[idxs_validate])

                        rmse_train[kk] = rmse(Yp_train, Y[idxs_train])
                        rmse_validate[kk] = rmse(Yp_validate, Y[idxs_validate])

                    # One structured record per (gamma, sigma, reg) combination
                    model = np.array([(g, s, r, mae_train, mae_validate, rmse_train, rmse_validate)],
                                     dtype=dt)
                    errors_list.append(model)

        errors_list = np.concatenate(errors_list)

        # Stack the arrays in a writable form
        columns = []
        header = []
        for name in dt.names:
            column = errors_list[name]
            if column.ndim == 1:
                # Scalar field: make it a single column
                column = np.reshape(column, (-1, 1))
                header.append(name)
            else:
                # Per-fold field: note how many columns it occupies
                n_cols = column.shape[1]
                header.append(f'{name}({n_cols})')
            columns.append(column)
        header = ' '.join(header)
        np.savetxt(f'../Processed_Data/DEEM_10k/Models/{cutoff}/{pn}_optimization.dat',
                   np.hstack(columns), header=header)