In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from kernels import gaussian_kernel
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import KernelCenterer
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

# Utilities
from tempfile import mkdtemp
from shutil import rmtree
from tqdm.notebook import tqdm
from tools import load_json, save_json
import project_utils as utils

In [3]:
# Read the structure indices used for training and for cross-validation
train_idxs = np.loadtxt(
    '../Processed_Data/DEEM_330k/train.idxs',
    dtype=int,
)
cv_idxs = np.loadtxt(
    '../Processed_Data/DEEM_330k/cv_5.idxs',
    dtype=int,
)

In [4]:
# Read the SOAP hyperparameters; the interaction cutoffs drive every loop below
soap_hyperparameters = load_json(
    '../Processed_Data/soap_hyperparameters.json'
)
cutoffs = soap_hyperparameters['interaction_cutoff']

In [5]:
# Ensure the output directory for the fitted models exists
# (no error if it is already there)
os.makedirs(
    '../Processed_Data/Models',
    exist_ok=True,
)

In [6]:
# Names of the per-structure properties to regress on
property_names = ['volumes', 'energies']

# Map each property name to the array read from its data file
structure_properties = {}
for pn in property_names:
    structure_properties.update(
        {pn: np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{pn}.dat')}
    )

# Linear Ridge Regression

In [7]:
# Template keyword arguments for Ridge: empty, so every sklearn default is kept
ridge_parameters = {}

# Regularization strengths scanned during cross-validation:
# 13 log-spaced values from 1e-12 to 1. The grid key addresses the `alpha`
# of the regressor wrapped inside the pipeline step named 'ridge'.
regularizations = np.logspace(start=-12, stop=0, num=13)
parameter_grid = {'ridge__regressor__alpha': regularizations}

In [13]:
# Cross-validate the ridge regularization for every (cutoff, property) pair.
# For each pair, the grid-search results are written to
# <work_dir>/cv_results.json; no model is refit (refit=False).

# Loop over cutoffs
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # Set data directory
    data_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    # Read SOAPs in training set
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_avg_nonorm.hdf5', indices=train_idxs)

    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):
        property_label = pn.capitalize()

        # Set working directory
        work_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LR/{property_label}'
        os.makedirs(work_dir, exist_ok=True)

        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]

        # Cross validation pipeline.
        # The temporary directory backs the Pipeline transformer cache; it is
        # removed in the `finally` clause so that a failing fit (errors are
        # re-raised because error_score='raise') does not leave it behind.
        cache_dir = mkdtemp()
        try:
            pipeline = Pipeline(
                [
                    ('norm_scaler', utils.NormScaler()),
                    ('ridge', TransformedTargetRegressor(
                        regressor=Ridge(**ridge_parameters),
                        transformer=utils.NormScaler()
                    ))
                ],
                memory=cache_dir
            )
            gscv = GridSearchCV(
                pipeline, parameter_grid,
                scoring=[
                    'neg_root_mean_squared_error',
                    'neg_mean_absolute_error'
                ],
                cv=utils.cv_generator(cv_idxs),
                refit=False, return_train_score=True, error_score='raise'
            )
            gscv.fit(soaps, y)
        finally:
            rmtree(cache_dir)

        # Persist the full CV table for the parameter-extraction step
        save_json(gscv.cv_results_, f'{work_dir}/cv_results.json', array_convert=True)
        # TODO: check the optimal doesn't give an ill conditioned warning or doesn't converge

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T





In [14]:
# Report the best-ranked ridge parameters for every cutoff, property and
# error metric, and store them next to the CV results for easy access

# sklearn scorer name -> short label used in the output file name
error_labels = {
    'neg_mean_absolute_error': 'mae',
    'neg_root_mean_squared_error': 'rmse',
}

for cutoff in cutoffs:
    for pn in property_names:
        property_label = pn.capitalize()

        # Results written by the cross-validation step
        work_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LR/{property_label}'
        cv_results = load_json(f'{work_dir}/cv_results.json')

        for error, error_name in error_labels.items():

            # Position of the best (lowest) rank for this metric
            ranks = cv_results[f'rank_test_{error}']
            idx = np.argmin(ranks)
            opt_parameters = utils.get_optimal_parameters(cv_results, error, **ridge_parameters)

            # Show the score and parameters; error[4:] drops the 'neg_' prefix
            print(f'-----Optimal Parameters for {cutoff} {pn} {error[4:]}-----')
            print(f'{error} =', cv_results[f'mean_test_{error}'][idx])
            print(opt_parameters)
            print('')

            # One JSON file per error metric
            save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}.json')

-----Optimal Parameters for 3.5 volumes mean_absolute_error-----
neg_mean_absolute_error = -2.549571216699533
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -3.6876161795028453
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 energies mean_absolute_error-----
neg_mean_absolute_error = -0.6690029851874398
{'alpha': 1e-05}

-----Optimal Parameters for 3.5 energies root_mean_squared_error-----
neg_root_mean_squared_error = -0.9511295920625198
{'alpha': 0.001}

-----Optimal Parameters for 6.0 volumes mean_absolute_error-----
neg_mean_absolute_error = -1.0842773569634572
{'alpha': 0.01}

-----Optimal Parameters for 6.0 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -1.7692268855224387
{'alpha': 0.01}

-----Optimal Parameters for 6.0 energies mean_absolute_error-----
neg_mean_absolute_error = -0.1200024862710221
{'alpha': 1e-07}

-----Optimal Parameters for 6.0 energies root_mean_squared_error-----
n

# Kernel Ridge Regression

In [None]:
# Template keyword arguments for KernelRidge: the kernel is passed precomputed,
# everything else stays at the sklearn defaults
kernel_ridge_parameters = {'kernel': 'precomputed'}

# Cross-validation grid: 7 log-spaced Gaussian-kernel gammas (1e-3 .. 1e3)
# and 13 log-spaced regularizations (1e-12 .. 1). The keys address the
# pipeline steps named 'ridge' and 'kernel_constructor'.
gammas = np.logspace(start=-3, stop=3, num=7)
regularizations = np.logspace(start=-12, stop=0, num=13)
parameter_grid = {
    'ridge__regressor__alpha': regularizations,
    'kernel_constructor__gamma': gammas,
}

In [None]:
# Optimize hyperparameters
#
# Grid-search the Gaussian kernel gamma and the ridge regularization for
# every (cutoff, property) pair. The search results are not persisted yet
# (see the TODOs at the end of the inner loop).

# Loop over cutoffs
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=False):

    # Set data directory
    # NOTE(review): this reads from DEEM_10k, whereas train_idxs, cv_idxs and
    # structure_properties are loaded from DEEM_330k -- confirm which dataset
    # is intended before running.
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Set working directory (create missing parent directories as well)
    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'
    os.makedirs(work_dir, exist_ok=True)

    # Read SOAPs in training set
    #soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5') # Use this
    #soaps = np.array([np.mean(soap, axis=0) for soap in soaps]) # Use this
    # Dataset names are the zero-padded positions 0000, 0001, ... (one per
    # training index)
    str_idxs = [str(i).zfill(4) for i in range(0, len(train_idxs))]
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5', datasets=str_idxs, concatenate=True)

    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):

        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]

        # Temporary directory backing the Pipeline transformer cache; removed
        # in the `finally` clause so a failing fit (error_score='raise')
        # does not leave it behind.
        cache_dir = mkdtemp()
        try:
            # NOTE: can't just use the custom kernel as a callable
            # to KernelRidge, as it necessarily operates on 2D arrays
            # of the features for all environments in a given structure,
            # whereas the callable must operate on pairs of samples.
            # The KernelConstructor is used instead
            pipeline = Pipeline(
                [
                    ('kernel_constructor', utils.KernelConstructor()),
                    ('kernel_norm_scaler', utils.KernelNormScaler()),
                    ('ridge', TransformedTargetRegressor(
                        regressor=KernelRidge(**kernel_ridge_parameters),
                        transformer=utils.NormScaler()
                    ))
                ],
                memory=cache_dir
            )
            # The grid keys ('kernel_constructor__gamma',
            # 'ridge__regressor__alpha') address steps of `pipeline`, so the
            # pipeline itself is the estimator being searched.
            gscv = GridSearchCV(
                pipeline, parameter_grid,
                scoring=[
                    'neg_root_mean_squared_error',
                    'neg_mean_absolute_error'
                ],
                cv=utils.cv_generator(cv_idxs),
                refit=False, return_train_score=True, error_score='raise'
            )
            gscv.fit(soaps, y)
            # TODO: save CV run in JSON
            # TODO: save kernels
        finally:
            rmtree(cache_dir)

In [None]:
# Extract optimal parameters
#
# For every cutoff and property, read a table of optimization errors from
# `<work_dir>/<property>_optimization.dat` and print the validation error
# together with the gamma, sigma and regularization of one selected entry.
#
# NOTE(review): this cell looks unfinished and is not expected to run as is:
#   * `dt` (the dtype passed to np.loadtxt) is not defined in any cell
#     visible in this notebook;
#   * `idx` is never assigned here -- selecting the minimum-error entry is
#     still a TODO below;
#   * no visible cell writes `<property>_optimization.dat`.

# Loop over cutoffs
for cutoff in cutoffs:
    
    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'
    
    # Loop over properties
    for pn in property_names:
        # Presumably a structured array with fields 'gamma', 'sigma', 'reg',
        # 'mae_validate' and 'rmse_validate' (inferred from the field lookups
        # below) -- TODO confirm once `dt` is defined
        errors_list = np.loadtxt(f'{work_dir}/{pn}_optimization.dat', dtype=dt)
        
        # Loop over error types
        for error in ('mae', 'rmse'):
        
            # TODO: Extract set of parameters corresponding to the minimum error
            # (this step must define `idx`, which the prints below rely on)

            # Print error and parameters
            print(f'-----Optimal Parameters for {cutoff} {pn} {error.upper()}-----')
            print('Avg. Error =', np.mean(errors_list[f'{error}_validate'][idx]))
            print('Gamma =', errors_list[idx]['gamma'])
            print('Sigma =', errors_list[idx]['sigma'])
            print('Regularization =', errors_list[idx]['reg'])
            print('')

            # TODO: Save optimal parameters for easy access
            # (the sketch below needs `json` imported and `opt_params` defined)
            #with open(f'{work_dir}/{pn}_{error}_parameters.json', 'w') as f:
            #    json.dump(opt_params, f)