In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from kernels import gaussian_kernel
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import KernelCenterer
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

# Utilities
from tempfile import mkdtemp
from shutil import rmtree
from tqdm.notebook import tqdm
from tools import load_json, save_json
import project_utils as utils

In [3]:
# Load the integer indices of the training structures (`train_idxs`) and the
# indices later handed to `utils.cv_generator` for cross-validation (`cv_idxs`)
train_idxs, cv_idxs = (
    np.loadtxt(idx_file, dtype=int)
    for idx_file in (
        '../Processed_Data/DEEM_330k/train.idxs',
        '../Processed_Data/DEEM_330k/cv_5.idxs',
    )
)

In [4]:
# Read the SOAP hyperparameter file and keep its interaction cutoffs,
# which the optimization cells below iterate over
soap_hyperparameters = load_json(
    '../Processed_Data/soap_hyperparameters.json'
)
cutoffs = soap_hyperparameters['interaction_cutoff']

In [5]:
# Make the directory in which to store models.
# `makedirs(..., exist_ok=True)` also creates a missing parent directory and
# does not fail if the directory already exists, so the cell can be re-run
# safely without a separate (racy) existence check.
os.makedirs('../Processed_Data/Models', exist_ok=True)

In [6]:
# Names of the structure properties to load (and later regress)
property_names = ['volumes', 'energies']

# Map every property name to the array read from its `structure_<name>.dat` file
structure_properties = {
    pn: np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{pn}.dat')
    for pn in property_names
}

# Linear Ridge Regression

In [7]:
# Candidate regularization strengths: 13 log-spaced values from 1e-12 to 1
regularizations = np.logspace(start=-12, stop=0, num=13)

# Grid for GridSearchCV; the key addresses the `alpha` of the regressor
# wrapped inside the pipeline step named 'ridge'
parameter_grid = {'ridge__regressor__alpha': regularizations}

In [8]:
# Grid-search the ridge regularization for every cutoff and property.
# NOTE: the loop is currently restricted to the 6.0 cutoff; switch to the
# commented-out line below to run over all `cutoffs`.
#for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
for cutoff in tqdm([6.0], desc='Cutoff', leave=True):
    
    # Set data directory
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # Set working directory. `work_dir` is nested several levels below the
    # Models directory, so use makedirs: os.mkdir raises when an intermediate
    # directory does not exist yet.
    work_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LR'
    os.makedirs(work_dir, exist_ok=True)
    
    # Read SOAPs in training set
    #soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5') # Use this
    #soaps = np.array([np.mean(soap, axis=0) for soap in soaps]) # Use this
    # Dataset names are zero-padded 4-digit strings '0000', '0001', ...
    # NOTE(review): this assumes the HDF5 datasets are stored in the same order
    # as `train_idxs`, so that row i of `soaps` matches y[i] -- TODO confirm
    str_idxs = [str(i).zfill(4) for i in range(0, len(train_idxs))]
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_avg_nonorm.hdf5', datasets=str_idxs, concatenate=True)
    
    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):
        
        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]
        
        # Temporary directory used by the pipeline to cache fitted transformers
        cache_dir = mkdtemp()
        try:
            # Scale the features, then fit a ridge regressor whose targets are
            # transformed by a second NormScaler
            pipeline = Pipeline(
                [
                    ('norm_scaler', utils.NormScaler()), 
                    ('ridge', TransformedTargetRegressor(
                        regressor=Ridge(), 
                        transformer=utils.NormScaler()
                    ))
                ],
                memory=cache_dir
            )
            # refit=False: only the CV scores are wanted, no final model
            gscv = GridSearchCV(
                pipeline, parameter_grid, 
                scoring=[
                    'neg_root_mean_squared_error', 
                    'neg_mean_absolute_error'
                ],
                cv=utils.cv_generator(cv_idxs),
                refit=False, return_train_score=True, error_score='raise'
            )
            gscv.fit(soaps, y)
            # TODO: save CV run in JSON
        finally:
            # Always remove the cache, even when the fit raises
            # (error_score='raise' propagates any failure)
            rmtree(cache_dir)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=1.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T





In [12]:
# Extract optimal parameters
# For every cutoff and property, read the saved optimization results and print
# the parameters at row `idx` for each error type.
# NOTE(review): this cell sits in the linear-regression section but reads from
# the Kernel_Models/Gaussian/KRR directory -- confirm which models it targets.
# NOTE(review): neither `dt` nor `idx` is assigned in any visible cell, so on a
# fresh kernel this cell raises NameError; the selection of `idx` is still a TODO.

# Loop over cutoffs
for cutoff in cutoffs:
    
    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'
    
    # Loop over properties
    for pn in property_names:
        # `dt` is expected to describe the columns of the results file;
        # the fields read below are '<error>_validate', 'gamma', 'sigma', 'reg'
        errors_list = np.loadtxt(f'{work_dir}/{pn}_optimization.dat', dtype=dt)
        
        # Loop over error types
        for error in ('mae', 'rmse'):
        
            # TODO: Extract set of parameters corresponding to the minimum error

            # Print error and parameters
            print(f'-----Optimal Parameters for {cutoff} {pn} {error.upper()}-----')
            print('Avg. Error =', np.mean(errors_list[f'{error}_validate'][idx]))
            print('Gamma =', errors_list[idx]['gamma'])
            print('Sigma =', errors_list[idx]['sigma'])
            print('Regularization =', errors_list[idx]['reg'])
            print('')

            # TODO: Save optimal parameters for easy access
            #with open(f'{work_dir}/{pn}_{error}_parameters.json', 'w') as f:
            #    json.dump(opt_params, f)

-----Optimal Parameters for 3.5 volumes MAE-----
Avg. Error = 2.5304993519468213
Gamma = 10.0
Sigma = 0.1
Regularization = 0.001

-----Optimal Parameters for 3.5 volumes RMSE-----
Avg. Error = 3.654212004317217
Gamma = 10.0
Sigma = 0.1
Regularization = 0.001

-----Optimal Parameters for 3.5 energies MAE-----
Avg. Error = 0.689774447323908
Gamma = 10.0
Sigma = 0.01
Regularization = 0.0001

-----Optimal Parameters for 3.5 energies RMSE-----
Avg. Error = 0.9967709902316922
Gamma = 10.0
Sigma = 0.01
Regularization = 0.0001

-----Optimal Parameters for 6.0 volumes MAE-----
Avg. Error = 1.0099737440112349
Gamma = 10.0
Sigma = 1.0
Regularization = 0.0001

-----Optimal Parameters for 6.0 volumes RMSE-----
Avg. Error = 1.7406761608394494
Gamma = 1.0
Sigma = 0.01
Regularization = 0.01

-----Optimal Parameters for 6.0 energies MAE-----
Avg. Error = 0.47347945338816533
Gamma = 1.0
Sigma = 0.001
Regularization = 0.001

-----Optimal Parameters for 6.0 energies RMSE-----
Avg. Error = 0.73099440698692

# Kernel Ridge Regression

In [5]:
# Candidate Gaussian-kernel gammas: 7 log-spaced values from 1e-3 to 1e3
gammas = np.logspace(start=-3, stop=3, num=7)

# Candidate regularization strengths: 13 log-spaced values from 1e-12 to 1
regularizations = np.logspace(start=-12, stop=0, num=13)

# Grid for GridSearchCV; keys address the pipeline steps
# 'ridge' (wrapped regressor's alpha) and 'kernel_constructor' (gamma)
parameter_grid = {
    'ridge__regressor__alpha': regularizations,
    'kernel_constructor__gamma': gammas,
}

In [None]:
# Optimize hyperparameters
# Grid-search the kernel gamma and the ridge regularization of a kernel ridge
# regression for every cutoff and property.

# Loop over cutoffs
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=False):
    
    # Set data directory
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # Set working directory. `work_dir` is nested several levels below the
    # Models directory, so use makedirs: os.mkdir raises when an intermediate
    # directory does not exist yet.
    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'
    os.makedirs(work_dir, exist_ok=True)
    
    # Read SOAPs in training set
    #soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5') # Use this
    #soaps = np.array([np.mean(soap, axis=0) for soap in soaps]) # Use this
    # Dataset names are zero-padded 4-digit strings '0000', '0001', ...
    # NOTE(review): this assumes the HDF5 datasets are stored in the same order
    # as `train_idxs` -- TODO confirm. Also confirm that concatenate=True is
    # intended here, given that the kernel is described below as operating on
    # the per-structure environment arrays.
    str_idxs = [str(i).zfill(4) for i in range(0, len(train_idxs))]
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5', datasets=str_idxs, concatenate=True)
    
    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):
        
        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]
        
        # Temporary directory used by the pipeline to cache fitted transformers
        cache_dir = mkdtemp()
        try:
            # NOTE: can't just use the custom kernel as a callable
            # to KernelRidge, as it necessarily operates on 2D arrays
            # of the features for all environments in a given structure,
            # whereas the callable must operate on pairs of samples.
            # The KernelConstructor is used instead
            pipeline = Pipeline(
                [
                    ('kernel_constructor', utils.KernelConstructor()),
                    ('kernel_norm_scaler', utils.KernelNormScaler()), 
                    ('ridge', TransformedTargetRegressor(
                        regressor=KernelRidge(kernel='precomputed'),
                        transformer=utils.NormScaler()
                    ))
                ],
                memory=cache_dir
            )
            # Search over the pipeline itself so that the step-qualified keys
            # of `parameter_grid` resolve; refit=False because only the CV
            # scores are wanted, no final model
            gscv = GridSearchCV(
                pipeline, parameter_grid, 
                scoring=[
                    'neg_root_mean_squared_error', 
                    'neg_mean_absolute_error'
                ],
                cv=utils.cv_generator(cv_idxs),
                refit=False, return_train_score=True, error_score='raise'
            )
            gscv.fit(soaps, y)
            # TODO: save CV run in JSON
        finally:
            # Always remove the cache, even when the fit raises
            # (error_score='raise' propagates any failure)
            rmtree(cache_dir)

In [None]:
# Extract optimal parameters
# For every cutoff, property and error type, pick the parameter set with the
# lowest average validation error from the saved optimization results and
# print it.
# NOTE(review): `dt` (the structured dtype describing the columns of the
# results file) is not defined in any visible cell; it must be defined before
# this cell can run on a fresh kernel.

# Loop over cutoffs
for cutoff in cutoffs:
    
    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'
    
    # Loop over properties
    for pn in property_names:
        # atleast_1d: loadtxt returns a 0-d array for a single-row file,
        # which could not be indexed by row below
        errors_list = np.atleast_1d(
            np.loadtxt(f'{work_dir}/{pn}_optimization.dat', dtype=dt)
        )
        
        # Loop over error types
        for error in ('mae', 'rmse'):
        
            # Extract the set of parameters corresponding to the minimum error:
            # average each row's validation errors (presumably one value per
            # CV fold -- verify against `dt`) and take the row with the
            # smallest average. The reshape makes this work whether the field
            # holds one value or several values per row.
            validation_errors = np.asarray(errors_list[f'{error}_validate'], dtype=float)
            avg_errors = validation_errors.reshape(len(errors_list), -1).mean(axis=1)
            idx = np.argmin(avg_errors)

            # Print error and parameters
            print(f'-----Optimal Parameters for {cutoff} {pn} {error.upper()}-----')
            print('Avg. Error =', np.mean(errors_list[f'{error}_validate'][idx]))
            print('Gamma =', errors_list[idx]['gamma'])
            print('Sigma =', errors_list[idx]['sigma'])
            print('Regularization =', errors_list[idx]['reg'])
            print('')

            # TODO: Save optimal parameters for easy access
            #with open(f'{work_dir}/{pn}_{error}_parameters.json', 'w') as f:
            #    json.dump(opt_params, f)