In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from kernels import gaussian_kernel
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import KernelCenterer
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import make_scorer

# Utilities
import h5py
from tempfile import mkdtemp
from shutil import rmtree
from tqdm.notebook import tqdm
from tools import load_json, save_json
import project_utils as utils
from copy import deepcopy

In [3]:
# Read the training-set indices and the cross-validation fold indices
# (presumably a 5-fold split, going by the file name -- confirm)
train_idxs, cv_idxs = (
    np.loadtxt(idxs_file, dtype=int)
    for idxs_file in (
        '../Processed_Data/DEEM_330k/train.idxs',
        '../Processed_Data/DEEM_330k/cv_5.idxs',
    )
)

In [4]:
# BEGIN TMP

In [5]:
# Temporary train/test split drawn at random from the DEEM structures
from numpy.random import default_rng

# Indices to exclude from the pool, plus the DEEM 10k indices
# (the latter are loaded here but not used in this cell)
idxs_delete = np.loadtxt('../Processed_Data/DEEM_330k/10kJmol_error.idxs', dtype=int)
deem_10k_idxs = np.loadtxt('../Processed_Data/DEEM_330k/deem_10k.idxs', dtype=int)

# Size of the full index range and of the two subsets
n_total = 331172
n_train = 10000
n_test = 250

# Fixed seed so that the shuffle below is reproducible
rng = default_rng(seed=11011)

# Candidate pool: every index in [0, n_total) except the excluded ones,
# shuffled in place
idxs = np.delete(np.arange(n_total), idxs_delete)
rng.shuffle(idxs)

# The first n_train shuffled indices form the train set,
# the following n_test indices form the test set
train_idxs = idxs[:n_train]
test_idxs = idxs[n_train:][:n_test]

# Cross-validation fold indices
cv_idxs = np.loadtxt('../Processed_Data/DEEM_330k/cv_5.idxs', dtype=int)

In [6]:
# END TMP

In [4]:
# Load SOAP cutoffs
# The JSON file holds the SOAP hyperparameters; only the 'interaction_cutoff'
# entry is used here. The later cells iterate over `cutoffs`, so it is
# expected to be a sequence with one value per SOAP representation.
soap_hyperparameters = load_json('../Processed_Data/soap_hyperparameters.json')
cutoffs = soap_hyperparameters['interaction_cutoff']

In [5]:
# Make directory in which to store models
# exist_ok=True makes this cell safe to re-run when the directory already exists
os.makedirs('../Processed_Data/Models', exist_ok=True)

In [6]:
# Names of the structure properties to regress on; each name selects
# the file structure_<name>.dat
property_names = ['volumes', 'energies']

# Map every property name to the array of values read from its data file
structure_properties = {
    pn: np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{pn}.dat')
    for pn in property_names
}

# Linear Ridge Regression

In [9]:
# Template keyword arguments for Ridge: empty, i.e. all defaults
ridge_parameters = {}

# Regularization strengths scanned in the cross-validation:
# 13 logarithmically spaced values from 1e-12 to 1
regularizations = np.logspace(-12, 0, 13)

# Grid for GridSearchCV; the key addresses `alpha` of the regressor wrapped
# in the 'ridge' pipeline step
parameter_grid = {'ridge__regressor__alpha': regularizations}

In [11]:
# Cross-validate the ridge regularization for every SOAP cutoff and property
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # Directory holding the SOAP vectors computed with this cutoff
    data_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    # Read SOAPs in training set.
    # The indices are handed to the loader in increasing order and the rows
    # are afterwards permuted back, so that (assuming the loader returns rows
    # in the order of `indices`) soaps[i] belongs to train_idxs[i].
    # NOTE(review): the original marks the sorting as temporary; presumably
    # the loader needs increasing indices when train_idxs is shuffled --
    # confirm before removing it.
    sort_idxs = np.argsort(train_idxs)
    rev_idxs = np.argsort(sort_idxs)
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_avg_nonorm.hdf5', indices=train_idxs[sort_idxs])
    soaps = soaps[rev_idxs]

    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):
        property_label = pn.capitalize()

        # Set working directory
        work_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LR/{property_label}'
        os.makedirs(work_dir, exist_ok=True)

        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]

        # Temporary directory used as the pipeline's transformer cache.
        # TODO: move this and rmtree outside the property loop?
        cache_dir = mkdtemp(dir=work_dir)
        try:
            # Cross validation pipeline: scale the features, then fit a ridge
            # model on a scaled target
            pipeline = Pipeline(
                [
                    ('norm_scaler', utils.NormScaler()),
                    ('ridge', TransformedTargetRegressor(
                        regressor=Ridge(**ridge_parameters),
                        transformer=utils.NormScaler()
                    ))
                ],
                memory=cache_dir
            )
            gscv = GridSearchCV(
                pipeline, parameter_grid,
                scoring=[
                    'neg_root_mean_squared_error',
                    'neg_mean_absolute_error'
                ],
                cv=utils.cv_generator(cv_idxs),
                refit=False, return_train_score=True, error_score='raise'
            )
            gscv.fit(soaps, y)
        finally:
            # error_score='raise' lets a failing fit propagate; always remove
            # the cache so that no temporary directory is left in work_dir
            rmtree(cache_dir)

        save_json(gscv.cv_results_, f'{work_dir}/cv_results.json', array_convert=True)
        # TODO: check the optimal doesn't give an ill conditioned warning or doesn't converge

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T





In [None]:
# BEGIN TMP

In [12]:
# Extract optimal parameters as a check
# NOTE(review): this temporary cell repeats the extraction cell that follows
# the END TMP marker; both write the same ridge_parameters_*.json files.

for cutoff in cutoffs:
    for pn in property_names:
        property_label = pn.capitalize()

        # Cross-validation results stored by the linear ridge grid search
        work_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LR/{property_label}'
        cv_results = load_json(f'{work_dir}/cv_results.json')

        # Each pair is (scoring name in cv_results, short label for the file name)
        for error, error_name in (
            ('neg_mean_absolute_error', 'mae'),
            ('neg_root_mean_squared_error', 'rmse'),
        ):

            # Position of the parameter set with the smallest rank number
            idx = np.argmin(cv_results[f'rank_test_{error}'])
            opt_parameters = utils.get_optimal_parameters(cv_results, error, **ridge_parameters)

            # Report the mean test score at that position and the parameters;
            # error[4:] drops the leading 'neg_' from the scoring name
            print(f'-----Optimal Parameters for {cutoff} {pn} {error[4:]}-----')
            print(f'{error} =', cv_results[f'mean_test_{error}'][idx])
            print(opt_parameters)
            print('')

            # Keep the optimal parameters next to the CV results for easy access
            save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}.json')

-----Optimal Parameters for 3.5 volumes mean_absolute_error-----
neg_mean_absolute_error = -2.6278993636644223
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -3.763467401447244
{'alpha': 0.001}

-----Optimal Parameters for 3.5 energies mean_absolute_error-----
neg_mean_absolute_error = -0.6811454025198304
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 energies root_mean_squared_error-----
neg_root_mean_squared_error = -0.9701972225669557
{'alpha': 0.0001}

-----Optimal Parameters for 6.0 volumes mean_absolute_error-----
neg_mean_absolute_error = -1.1184560899853364
{'alpha': 0.01}

-----Optimal Parameters for 6.0 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -1.7742455247921032
{'alpha': 0.01}

-----Optimal Parameters for 6.0 energies mean_absolute_error-----
neg_mean_absolute_error = -0.11596155677192237
{'alpha': 1e-07}

-----Optimal Parameters for 6.0 energies root_mean_squared_error-----

In [None]:
# END TMP

In [14]:
# Extract optimal parameters as a check:
# for every cutoff, property and error metric, print the best
# cross-validation score and store the corresponding ridge parameters

for cutoff in cutoffs:
    for pn in property_names:
        property_label = pn.capitalize()

        # Cross-validation results stored by the linear ridge grid search
        work_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LR/{property_label}'
        cv_results = load_json(f'{work_dir}/cv_results.json')

        # Each pair is (scoring name in cv_results, short label for the file name)
        for error, error_name in (
            ('neg_mean_absolute_error', 'mae'),
            ('neg_root_mean_squared_error', 'rmse'),
        ):

            # Position of the parameter set with the smallest rank number
            idx = np.argmin(cv_results[f'rank_test_{error}'])
            opt_parameters = utils.get_optimal_parameters(cv_results, error, **ridge_parameters)

            # Report the mean test score at that position and the parameters;
            # error[4:] drops the leading 'neg_' from the scoring name
            print(f'-----Optimal Parameters for {cutoff} {pn} {error[4:]}-----')
            print(f'{error} =', cv_results[f'mean_test_{error}'][idx])
            print(opt_parameters)
            print('')

            # Keep the optimal parameters next to the CV results for easy access
            save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}.json')

-----Optimal Parameters for 3.5 volumes mean_absolute_error-----
neg_mean_absolute_error = -2.549571216699533
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -3.6876161795028453
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 energies mean_absolute_error-----
neg_mean_absolute_error = -0.6690029851874398
{'alpha': 1e-05}

-----Optimal Parameters for 3.5 energies root_mean_squared_error-----
neg_root_mean_squared_error = -0.9511295920625198
{'alpha': 0.001}

-----Optimal Parameters for 6.0 volumes mean_absolute_error-----
neg_mean_absolute_error = -1.0842773569634572
{'alpha': 0.01}

-----Optimal Parameters for 6.0 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -1.7692268855224387
{'alpha': 0.01}

-----Optimal Parameters for 6.0 energies mean_absolute_error-----
neg_mean_absolute_error = -0.1200024862710221
{'alpha': 1e-07}

-----Optimal Parameters for 6.0 energies root_mean_squared_error-----
n

# Kernel Ridge Regression

In [7]:
# Template keyword arguments for KernelRidge: defaults, except that the
# kernel matrix is supplied precomputed
kernel_ridge_parameters = {'kernel': 'precomputed'}

# Base-10 logarithms of the Gaussian kernel gammas: -3, -2, ..., 3
# (for a quick trial run, restrict this to a single value such as -2.0)
log_gammas = np.linspace(-3, 3, 7)

# Regularization strengths: 13 logarithmically spaced values from 1e-12 to 1
# (for a quick trial run, restrict this to e.g. 1.0E-3 and 1.0E-2)
regularizations = np.logspace(-12, 0, 13)

# Part of the search grid shared by all cutoffs. The kernel file names,
# which stand in for gamma in the "hacked" pipeline, depend on the cutoff
# and are therefore added to a copy of this grid inside the cutoff loop.
parameter_grid_base = {'ridge__regressor__alpha': regularizations}

In [8]:
# Precompute one Gaussian kernel per gamma for every SOAP cutoff
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # Input (SOAP) and output (kernel) locations for this cutoff
    # NOTE(review): the SOAPs are read from DEEM_10k here, whereas the other
    # cells read from DEEM_330k and select rows with train_idxs -- confirm
    # that the kernel rows line up with the targets used in the grid search.
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    model_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian'
    os.makedirs(model_dir, exist_ok=True)

    # Load the SOAPs and reduce every loaded entry to its mean over axis 0
    # (presumably the average over the environments of a structure)
    soaps = np.array([
        np.mean(soap, axis=0)
        for soap in utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5')
    ])

    # Build a "superkernel" over all loaded structures, so that the
    # cross-validation can pick its train/test parts by indexing instead of
    # computing new kernels for every CV set. One kernel per gamma is still
    # required. The files are named by log10(gamma) to keep the names short.
    for log_gamma in tqdm(log_gammas, desc='Gamma', leave=False):
        gamma = 10 ** log_gamma
        K = gaussian_kernel(soaps, soaps, gamma=gamma)

        # Store the kernel together with the gamma it was computed with
        utils.save_hdf5(
            f'{model_dir}/gaussian_kernel_{log_gamma}.hdf5',
            K,
            attrs={'gamma': gamma, 'log_gamma': log_gamma},
            chunks=(100, 100),
        )

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Gamma', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Gamma', max=1.0, style=ProgressStyle(description_width='i…




In [9]:
# Stand-in CV indices: a single row 0, 1, ..., n-1, where n is the length of
# the last axis of cv_idxs. They allow the precomputed kernels to be
# loaded once and accessed by indexing instead of being recomputed.
dummy_cv_idxs = np.arange(cv_idxs.shape[-1])[np.newaxis, :]

In [11]:
# Optimize hyperparameters

# Grid-search the kernel gamma and the regularization for every cutoff
# and property, using the precomputed Gaussian kernels
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # Directory containing the precomputed kernels for this cutoff
    model_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian'

    # Gamma enters the grid as the name of the file that stores the kernel
    # computed with that gamma; copy the base grid so it stays unmodified
    parameter_grid = deepcopy(parameter_grid_base)
    parameter_grid['kernel_loader__filename'] = \
        [f'{model_dir}/gaussian_kernel_{log_gamma}.hdf5' for log_gamma in log_gammas]

    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):
        property_label = pn.capitalize()

        # Set working directory
        work_dir = f'{model_dir}/KRR/{property_label}'
        os.makedirs(work_dir, exist_ok=True)

        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]

        # Temporary directory used as the pipeline's transformer cache
        cache_dir = mkdtemp(dir=work_dir)
        try:
            # "Hacked" pipeline: instead of recomputing the expensive kernels
            # at each CV iteration, the pipeline is fitted on sample indices,
            # and the file holding the kernel for a particular gamma is a
            # hyperparameter of KernelLoader (which, per the original notes,
            # loads the kernel at initialization).

            # NOTE: the custom kernel can't simply be passed as a callable
            # to KernelRidge, as it necessarily operates on 2D arrays of the
            # features for all environments in a given structure, whereas the
            # callable must operate on pairs of samples.
            # The KernelLoader/KernelConstructor is used instead.
            pipeline = Pipeline(
                [
                    ('kernel_loader', utils.KernelLoader()),
                    ('kernel_norm_scaler', utils.KernelNormScaler()),
                    ('ridge', TransformedTargetRegressor(
                        regressor=KernelRidge(**kernel_ridge_parameters),
                        transformer=utils.SampleSelector(X=y, model=utils.NormScaler()),
                        check_inverse=False
                    ))
                ],
                memory=cache_dir
            )

            # "Hacked" CV: since the kernels are accessed via indices, the
            # search is fitted on cv_idxs, and cv_generator is used to pick
            # out the correct folds. The scorers receive y explicitly because
            # the "targets" seen by the search are indices, not property values.
            gscv = GridSearchCV(
                pipeline, parameter_grid,
                scoring=dict(
                    neg_mean_absolute_error=make_scorer(
                        utils.score_by_index,
                        greater_is_better=False,
                        y=y, scorer=mean_absolute_error
                    ),
                    neg_root_mean_squared_error=make_scorer(
                        utils.score_by_index,
                        greater_is_better=False,
                        y=y, scorer=mean_squared_error,
                        squared=False
                    )
                ),
                cv=utils.cv_generator(dummy_cv_idxs),
                refit=False, return_train_score=True, error_score='raise'
            )
            gscv.fit(cv_idxs.T, cv_idxs.T)
        finally:
            # error_score='raise' lets a failing fit propagate; always remove
            # the cache so that no temporary directory is left in work_dir
            rmtree(cache_dir)

        save_json(gscv.cv_results_, f'{work_dir}/cv_results.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.960045460337438
2000 6.539265800221978
8000 0.007166189849750411
8000 0.009392341815921012


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.942079096059853
2000 6.43008450991217
8000 0.007181076673697766
8000 0.009410821805014848


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.810728029428967
2000 6.299220004190456
8000 0.007256557225164462
8000 0.00949681243112259


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.805670124528469
2000 6.356051363793797
8000 0.007248508979858779
8000 0.009465932131919501


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.9633533124433455
2000 6.472562724901512
8000 0.0071902838969953465
8000 0.009417239885575432
2000 4.95826557284866
2000 6.537110212283552
8000 0.07061094346583352
8000 0.09254999552762669
2000 4.939960350179164
2000 6.4279299923430635
8000 0.07075838101886131
8000 0.09273302237765223
2000 4.808946647991857
2000 6.297193222363458
8000 0.0715009391270776
8000 0.09357841727293076
2000 4.803984392860917
2000 6.354536766719237
8000 0.07142147281407321
8000 0.09327289632744941
2000 4.961639122401193
2000 6.470757126592184
8000 0.07084730371581265
8000 0.09279416533548115


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.402729097865448
2000 5.285862854565544
8000 0.006534903048649085
8000 0.007825077597155774


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.393761885876767
2000 5.258442847327382
8000 0.006548715298039497
8000 0.007859924695296434


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.507892878420929
2000 5.371464495784244
8000 0.006525489873679817
8000 0.007820004358709193


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.4605204733422354
2000 5.346656771817301
8000 0.006541468455573067
8000 0.007822420139648787


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.474940789064357
2000 5.346606667668517
8000 0.006523312979016055
8000 0.007825119702256559
2000 4.4016397013740205
2000 5.284119726077778
8000 0.06439958844936496
8000 0.07710759582966766
2000 4.392841109567889
2000 5.256945454264224
8000 0.06453424778184468
8000 0.07744878262945074
2000 4.506844367297403
2000 5.369948960334716
8000 0.06430455626680691
8000 0.07705408233542767
2000 4.45940358680704
2000 5.344893658102849
8000 0.06446240443279772
8000 0.07707976299419332
2000 4.473880183240371
2000 5.345075052320698
8000 0.06428354117143477
8000 0.07710586630137903


HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 5.0295373215385535
2000 6.578949014165565
8000 0.0071237610072918435
8000 0.009371106637919811


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.866968397693262
2000 6.381358290763396
8000 0.007177524667246514
8000 0.009461465820280283


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.720749395931691
2000 6.181133799609303
8000 0.007248295469694294
8000 0.009529677971386725


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.854658380693583
2000 6.472832879565431
8000 0.007211631196156998
8000 0.009424475490826839


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.960818163028237
2000 6.456212171494541
8000 0.007148915733277307
8000 0.009427737666256233
2000 5.027518496865081
2000 6.576726872018838
8000 0.07019206842427306
8000 0.09234030411041187
2000 4.865244064571327
2000 6.379471386984439
8000 0.07072064166630189
8000 0.09322789631353146
2000 4.7190857474305865
2000 6.179348431313144
8000 0.0714185761465637
8000 0.09390258293775754
2000 4.852723462190191
2000 6.47086154223445
8000 0.07105672457290399
8000 0.09286492236226893
2000 4.958976149809162
2000 6.454169674809251
8000 0.07044073261087204
8000 0.0928980121254763


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.402480373070501
2000 5.264167600257628
8000 0.006593520372734929
8000 0.007901001921687845


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.416118717740299
2000 5.2731037753995915
8000 0.006588975990578547
8000 0.007900375624650988


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.515232739267379
2000 5.424421802057503
8000 0.006555688084134317
8000 0.00783352437125129


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.496065842492748
2000 5.345421404588377
8000 0.006576645948995292
8000 0.007861375018359547


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


2000 4.486501334937917
2000 5.392068834042245
8000 0.006550265980625226
8000 0.007850006534127258
2000 4.401412512842515
2000 5.262705825404646
8000 0.06497304630108897
8000 0.0778512897570538
2000 4.41522173747049
2000 5.271624863803656
8000 0.06492891516984674
8000 0.07784542723723505
2000 4.514112441602189
2000 5.42264727172415
8000 0.0646010844020409
8000 0.07718730290933713
2000 4.494938557694049
2000 5.343808076466818
8000 0.06480752593875196
8000 0.07746236419478778
2000 4.485312168294686
2000 5.390281419199412
8000 0.0645489751580883
8000 0.07735134825282045



In [55]:
# Sanity check on the Gaussian-kernel KRR grid search: for each property and
# each error metric, report the best cross-validated score together with the
# parameters selected for it, and write those parameters to a JSON file
# next to the CV results.

# (scorer name as used in the cv_results keys, short tag used in the output file name)
error_metrics = (
    ('neg_mean_absolute_error', 'mae'),
    ('neg_root_mean_squared_error', 'rmse'),
)

for cutoff in cutoffs:

    # Only the 3.5 cutoff is processed; every other cutoff is skipped
    if cutoff == 3.5:

        for pn in property_names:

            # Directory names use the capitalized property name
            property_label = pn.capitalize()
            work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR/{property_label}'

            # CV results previously written to disk for this cutoff/property
            cv_results = load_json(f'{work_dir}/cv_results.json')

            for error, error_name in error_metrics:

                # Position of the smallest rank value for this metric
                # (assuming the GridSearchCV `cv_results_` layout, rank 1 is the best candidate)
                idx = np.argmin(cv_results[f'rank_test_{error}'])

                # Project helper; presumably selects the parameter set of the
                # top-ranked candidate -- confirm against project_utils
                opt_parameters = utils.get_optimal_parameters(
                    cv_results, error, **kernel_ridge_parameters
                )

                # Report the score and parameters; error[4:] strips the 'neg_' prefix
                print(f'-----Optimal Parameters for {cutoff} {pn} {error[4:]}-----')
                best_score = cv_results[f'mean_test_{error}'][idx]
                print(f'{error} =', best_score)
                print(opt_parameters)
                print()

                # Persist the selected parameters so later steps can load them directly
                save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}.json')

-----Optimal Parameters for 3.5 volumes mean_absolute_error-----
neg_mean_absolute_error = -50.46811285596522
{'kernel': 'precomputed', 'filename': '../Processed_Data/Models/3.5/Kernel_Models/Gaussian/gaussian_kernel_1.0.hdf5', 'alpha': 1e-12}

-----Optimal Parameters for 3.5 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -50.836986007724846
{'kernel': 'precomputed', 'filename': '../Processed_Data/Models/3.5/Kernel_Models/Gaussian/gaussian_kernel_1.0.hdf5', 'alpha': 1e-12}

-----Optimal Parameters for 3.5 energies mean_absolute_error-----
neg_mean_absolute_error = -12395.5157030061
{'kernel': 'precomputed', 'filename': '../Processed_Data/Models/3.5/Kernel_Models/Gaussian/gaussian_kernel_1.0.hdf5', 'alpha': 1e-12}

-----Optimal Parameters for 3.5 energies root_mean_squared_error-----
neg_root_mean_squared_error = -12395.516743897222
{'kernel': 'precomputed', 'filename': '../Processed_Data/Models/3.5/Kernel_Models/Gaussian/gaussian_kernel_1.0.hdf5', 'alpha': 1e-12}

-