In [1]:
# Enable autoreload so that edits to imported helper modules are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from kernels import gaussian_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import KernelCenterer
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

# Utilities
from tqdm.notebook import tqdm
from tools import load_json, save_json
import project_utils as utils

In [3]:
# Read the training-set indices and the cross-validation indices as integer arrays
train_idxs, cv_idxs = [
    np.loadtxt(idxs_file, dtype=int)
    for idxs_file in (
        '../Processed_Data/DEEM_330k/train.idxs',
        '../Processed_Data/DEEM_330k/cv_5.idxs',
    )
]

In [4]:
# Read the SOAP hyperparameters from JSON and keep the interaction cutoffs,
# which the per-cutoff loops iterate over
soap_hyperparameters = load_json('../Processed_Data/soap_hyperparameters.json')
cutoffs = soap_hyperparameters['interaction_cutoff']

In [5]:
# Make directory in which to store models.
# makedirs(exist_ok=True) replaces the check-then-create pattern: it is a no-op
# when the directory already exists and also creates missing parent directories.
os.makedirs('../Processed_Data/Models', exist_ok=True)

In [6]:
# Names of the structure properties to load
property_names = ['volumes', 'energies']

# Map each property name to the array read from its structure_<name>.dat file
structure_properties = {
    pn: np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{pn}.dat')
    for pn in property_names
}

# Linear Ridge Regression

In [11]:
# Regularization strengths to scan: 13 log-spaced values from 1e-12 to 1
regularizations = np.logspace(start=-12, stop=0, num=13)

# Grid keyed by the pipeline path <step>__<attribute>__<parameter>
parameter_grid = {'ridge__regressor__alpha': regularizations}

In [12]:
# Cross-validated grid search over the ridge regularization,
# run for every cutoff and every structure property

# Loop over cutoffs
#for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
for cutoff in tqdm([6.0], desc='Cutoff', leave=True):

    # Set data directory
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Set working directory. The path sits several levels below Models/, so the
    # intermediate directories are created as well; os.mkdir would raise
    # FileNotFoundError when Models/{cutoff}/Linear_Models does not exist yet.
    work_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LR'
    os.makedirs(work_dir, exist_ok=True)

    # Read SOAPs in training set
    #soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5') # Use this
    #soaps = np.array([np.mean(soap, axis=0) for soap in soaps]) # Use this
    # NOTE(review): the datasets are selected by position (0 .. len(train_idxs) - 1,
    # zero-padded to 4 digits) while `y` below is selected with `train_idxs` itself;
    # confirm the HDF5 file is already ordered like the training set.
    str_idxs = [str(i).zfill(4) for i in range(0, len(train_idxs))]
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_avg_nonorm.hdf5', datasets=str_idxs, concatenate=True)

    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):

        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]

        # utils.NormScaler is applied to the features as the first pipeline step
        # and to the targets through TransformedTargetRegressor
        pipeline = Pipeline([
            ('norm_scaler', utils.NormScaler()),
            ('ridge', TransformedTargetRegressor(
                regressor=Ridge(), transformer=utils.NormScaler()
            ))
        ])

        # Score every regularization on the splits produced by utils.cv_generator;
        # refit=False because only the CV scores are wanted here
        gscv = GridSearchCV(
            pipeline, parameter_grid,
            scoring=[
                'neg_root_mean_squared_error',
                'neg_mean_absolute_error'
            ],
            cv=utils.cv_generator(cv_idxs),
            refit=False, return_train_score=True, error_score='raise'
        )
        gscv.fit(soaps, y)
        # TODO: save CV run in JSON

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=1.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


{'mean_fit_time': array([2.00327945, 1.99795218, 2.00836487, 2.0116116 , 2.01227717,
       2.01439157, 2.01660132, 2.02133436, 2.01460571, 2.02139516,
       2.00432792, 2.00883031, 1.99679728]), 'std_fit_time': array([0.00572379, 0.00389244, 0.00872219, 0.00491316, 0.00484051,
       0.00679552, 0.00702277, 0.01124612, 0.00565531, 0.00865333,
       0.0094388 , 0.00437262, 0.00562902]), 'mean_score_time': array([0.04058428, 0.04024801, 0.04025259, 0.0403316 , 0.04025154,
       0.04030414, 0.04023361, 0.04024367, 0.04024777, 0.04028468,
       0.04020991, 0.04032917, 0.04021859]), 'std_score_time': array([4.38547484e-04, 5.94091260e-05, 4.65859410e-05, 1.01771841e-04,
       2.46688467e-05, 1.56481973e-04, 3.57577002e-05, 7.16661661e-05,
       7.54046165e-05, 2.54487236e-05, 2.61298931e-05, 1.08976373e-04,
       3.90028312e-05]), 'param_ridge__regressor__alpha': masked_array(data=[1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05,
                   0.0001, 0.001, 0.01, 0.1, 1

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


{'mean_fit_time': array([2.00068684, 2.00220275, 1.99353952, 2.02016001, 2.02467017,
       2.03357196, 2.02679992, 2.02221699, 2.03311119, 2.02664042,
       2.02839999, 2.03681245, 2.03781376]), 'std_fit_time': array([0.01355119, 0.00516144, 0.0121887 , 0.02318428, 0.00821208,
       0.00460534, 0.00764017, 0.0048264 , 0.0091274 , 0.00519288,
       0.00636621, 0.00644318, 0.00537034]), 'mean_score_time': array([0.04025626, 0.04028239, 0.04046698, 0.04105659, 0.04112196,
       0.04107704, 0.04112334, 0.04115205, 0.04106584, 0.04114799,
       0.04108377, 0.04114232, 0.04111609]), 'std_score_time': array([2.21496429e-04, 6.27907093e-05, 1.47900820e-04, 4.13290183e-04,
       6.99091086e-05, 1.36888446e-04, 1.01706068e-04, 5.45742992e-05,
       7.65346069e-05, 1.24369278e-04, 7.40584140e-05, 7.34864077e-05,
       1.57054887e-04]), 'param_ridge__regressor__alpha': masked_array(data=[1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05,
                   0.0001, 0.001, 0.01, 0.1, 1

In [12]:
# Extract optimal parameters

# NOTE(review): this cell reads the Gaussian KRR optimization files although it
# sits in the linear ridge regression part of the notebook -- confirm it is
# meant to be here.

# Loop over cutoffs
for cutoff in cutoffs:

    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'

    # Loop over properties
    for pn in property_names:
        # NOTE(review): `dt` (the structured dtype describing the columns of the
        # optimization file) is not defined in the visible part of the notebook;
        # it must be defined before this cell runs.
        # atleast_1d keeps a file with a single row indexable by row.
        errors_list = np.atleast_1d(
            np.loadtxt(f'{work_dir}/{pn}_optimization.dat', dtype=dt)
        )

        # Loop over error types
        for error in ('mae', 'rmse'):

            # Extract set of parameters corresponding to the minimum error:
            # average the validation errors of each row (flattening any per-fold
            # axis) and take the row with the smallest average. `idx` was
            # previously never assigned, so the prints below raised NameError.
            validation_errors = errors_list[f'{error}_validate']
            avg_errors = validation_errors.reshape(len(errors_list), -1).mean(axis=1)
            idx = np.argmin(avg_errors)

            # Print error and parameters
            print(f'-----Optimal Parameters for {cutoff} {pn} {error.upper()}-----')
            print('Avg. Error =', np.mean(errors_list[f'{error}_validate'][idx]))
            print('Gamma =', errors_list[idx]['gamma'])
            print('Sigma =', errors_list[idx]['sigma'])
            print('Regularization =', errors_list[idx]['reg'])
            print('')

            # TODO: Save optimal parameters for easy access
            #with open(f'{work_dir}/{pn}_{error}_parameters.json', 'w') as f:
            #    json.dump(opt_params, f)

-----Optimal Parameters for 3.5 volumes MAE-----
Avg. Error = 2.5304993519468213
Gamma = 10.0
Sigma = 0.1
Regularization = 0.001

-----Optimal Parameters for 3.5 volumes RMSE-----
Avg. Error = 3.654212004317217
Gamma = 10.0
Sigma = 0.1
Regularization = 0.001

-----Optimal Parameters for 3.5 energies MAE-----
Avg. Error = 0.689774447323908
Gamma = 10.0
Sigma = 0.01
Regularization = 0.0001

-----Optimal Parameters for 3.5 energies RMSE-----
Avg. Error = 0.9967709902316922
Gamma = 10.0
Sigma = 0.01
Regularization = 0.0001

-----Optimal Parameters for 6.0 volumes MAE-----
Avg. Error = 1.0099737440112349
Gamma = 10.0
Sigma = 1.0
Regularization = 0.0001

-----Optimal Parameters for 6.0 volumes RMSE-----
Avg. Error = 1.7406761608394494
Gamma = 1.0
Sigma = 0.01
Regularization = 0.01

-----Optimal Parameters for 6.0 energies MAE-----
Avg. Error = 0.47347945338816533
Gamma = 1.0
Sigma = 0.001
Regularization = 0.001

-----Optimal Parameters for 6.0 energies RMSE-----
Avg. Error = 0.73099440698692

# Kernel Ridge Regression

In [5]:
# Values to scan: 7 log-spaced Gaussian-kernel gammas (1e-3 .. 1e3)
# and 13 log-spaced regularizations (1e-12 .. 1)
gammas = np.logspace(start=-3, stop=3, num=7)
regularizations = np.logspace(start=-12, stop=0, num=13)

# Grid keyed by the pipeline paths of the parameters being tuned
parameter_grid = {
    'ridge__regressor__alpha': regularizations,
    'kernel_constructor__gamma': gammas,
}

In [None]:
# Optimize hyperparameters:
# cross-validated grid search over the kernel gamma and the ridge regularization,
# run for every cutoff and every structure property

# Loop over cutoffs
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=False):

    # Set data directory
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Set working directory. The path sits several levels below Models/, so the
    # intermediate directories are created as well; os.mkdir would raise
    # FileNotFoundError when the parent directories do not exist yet.
    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'
    os.makedirs(work_dir, exist_ok=True)

    # Read SOAPs in training set
    # (load_hdf5 is only available through the project_utils module imported as `utils`)
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5', datasets=None, concatenate=False)

    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):

        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]

        # Step names must match the prefixes used in parameter_grid
        # ('kernel_constructor__gamma', 'ridge__regressor__alpha').
        # Each step tuple needs a trailing comma: without it Python tries to
        # call the first tuple with the second one and raises TypeError.
        pipeline = Pipeline([
            ('kernel_constructor', utils.KernelConstructor()),
            ('kernel_norm_scaler', utils.KernelNormScaler()),
            ('ridge', TransformedTargetRegressor(
                regressor=KernelRidge(kernel='precomputed'),
                transformer=utils.NormScaler()
            ))
        ])

        # Search over the whole pipeline so the kernel gamma is tuned too;
        # refit=False because only the CV scores are wanted here
        gscv = GridSearchCV(
            pipeline, parameter_grid,
            scoring=[
                'neg_root_mean_squared_error',
                'neg_mean_absolute_error'
            ],
            cv=utils.cv_generator(cv_idxs),
            refit=False, return_train_score=True, error_score='raise'
        )
        gscv.fit(soaps, y)
        # TODO: save CV run in JSON

In [None]:
# Extract optimal parameters

# Loop over cutoffs
for cutoff in cutoffs:

    work_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KRR'

    # Loop over properties
    for pn in property_names:
        # NOTE(review): `dt` (the structured dtype describing the columns of the
        # optimization file) is not defined in the visible part of the notebook;
        # it must be defined before this cell runs.
        # atleast_1d keeps a file with a single row indexable by row.
        errors_list = np.atleast_1d(
            np.loadtxt(f'{work_dir}/{pn}_optimization.dat', dtype=dt)
        )

        # Loop over error types
        for error in ('mae', 'rmse'):

            # Extract set of parameters corresponding to the minimum error:
            # average the validation errors of each row (flattening any per-fold
            # axis) and take the row with the smallest average. `idx` was
            # previously never assigned, so the prints below raised NameError.
            validation_errors = errors_list[f'{error}_validate']
            avg_errors = validation_errors.reshape(len(errors_list), -1).mean(axis=1)
            idx = np.argmin(avg_errors)

            # Print error and parameters
            print(f'-----Optimal Parameters for {cutoff} {pn} {error.upper()}-----')
            print('Avg. Error =', np.mean(errors_list[f'{error}_validate'][idx]))
            print('Gamma =', errors_list[idx]['gamma'])
            print('Sigma =', errors_list[idx]['sigma'])
            print('Regularization =', errors_list[idx]['reg'])
            print('')

            # TODO: Save optimal parameters for easy access
            #with open(f'{work_dir}/{pn}_{error}_parameters.json', 'w') as f:
            #    json.dump(opt_params, f)