In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell is executed, so edits to the local helper modules
# (e.g. project_utils, tools, kernels) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from kernels import gaussian_kernel
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import KernelCenterer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import make_scorer

# Utilities
import h5py
from tempfile import mkdtemp
from shutil import rmtree
from tqdm.auto import tqdm
from tools import load_json, save_json
import project_utils as utils
from copy import deepcopy

In [3]:
# Load the indices of the DEEM_330k structures used to train the ridge models.
# Only the training indices are read here; no test set is loaded in this cell.
train_idxs = np.loadtxt('../Processed_Data/DEEM_330k/ridge_train.idxs', dtype=int)

In [4]:
# Load SOAP cutoffs: read the SOAP hyperparameter file and keep its
# 'interaction_cutoff' entry. Every model-building loop below iterates over
# these cutoffs and uses each value as a directory name.
soap_hyperparameters = load_json('../Processed_Data/soap_hyperparameters.json')   
cutoffs = soap_hyperparameters['interaction_cutoff']

In [5]:
# Make directory in which to store models.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs('../Processed_Data/Models', exist_ok=True)

In [6]:
# Set property names for loading.
# Each name selects a `structure_<name>.dat` file here and, capitalized,
# names the per-property model subdirectory in the loops further down.
property_names = ['volumes', 'energies']

# Load structure properties for the DEEM_330k set into a dict keyed by
# property name. The arrays are later indexed with structure indices
# (train_idxs / train_idxs_random) to pick out the training targets.
structure_properties = {}
for pn in property_names:
    structure_properties[pn] = np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{pn}.dat')

# Linear Ridge Regression

In [7]:
# Number of folds handed to KFold in the linear ridge grid searches
n_splits = 5

# Keyword arguments forwarded to Ridge(); an empty dict means that
# every Ridge setting is left at its scikit-learn default
ridge_parameters = {}

# Candidate regularization strengths: 11 log-spaced values, 1e-10 ... 1
regularizations = np.logspace(-10, 0, 11)

# The grid key addresses `alpha` of the regressor wrapped by the
# pipeline step named 'ridge'
parameter_grid = {'ridge__regressor__alpha': regularizations}

In [8]:
# Grid search over the ridge regularization, once per cutoff and property
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # SOAP features of the training structures at this cutoff
    data_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'
    soaps = utils.load_hdf5(
        f'{data_dir}/soaps_power_full_avg_nonorm.hdf5',
        indices=train_idxs
    )

    for pn in tqdm(property_names, desc='Property', leave=False):

        # Per-property output directory (capitalized property name)
        property_label = pn.capitalize()
        work_dir = f'../Processed_Data/Models/{cutoff}/LRR/{property_label}'
        os.makedirs(work_dir, exist_ok=True)

        # Targets restricted to the training structures
        y = structure_properties[pn][train_idxs]

        # Model: a scaler on the features followed by a ridge regressor whose
        # targets pass through their own scaler (TransformedTargetRegressor)
        pipeline = Pipeline([
            ('norm_scaler', utils.StandardNormScaler()),
            ('ridge', TransformedTargetRegressor(
                regressor=Ridge(**ridge_parameters),
                transformer=utils.StandardNormScaler()
            )),
        ])

        # Shuffled K-fold CV with a fixed seed, scored with both RMSE and MAE.
        # refit=False: only the CV table is wanted, no final model is trained.
        gscv = GridSearchCV(
            estimator=pipeline,
            param_grid=parameter_grid,
            scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error'],
            cv=KFold(n_splits=n_splits, shuffle=True, random_state=0),
            refit=False,
            return_train_score=True,
            error_score='raise',
            n_jobs=4,
        )
        gscv.fit(soaps, y)

        # Store the complete CV table; it is read back when the optimal
        # parameters are extracted
        save_json(gscv.cv_results_, f'{work_dir}/cv_results.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…




In [9]:
# Report the best grid point for every cutoff / property / error metric
# and store the corresponding parameters next to the CV results
for cutoff in cutoffs:
    for pn in property_names:
        property_label = pn.capitalize()

        # CV table written by the linear ridge grid search
        work_dir = f'../Processed_Data/Models/{cutoff}/LRR/{property_label}'
        cv_results = load_json(f'{work_dir}/cv_results.json')

        # (scikit-learn scorer name, short tag used in the output file name)
        for error, error_name in (
            ('neg_mean_absolute_error', 'mae'),
            ('neg_root_mean_squared_error', 'rmse'),
        ):

            # Ranks start at 1 for the best score, so the smallest rank
            # marks the winning grid point
            idx = np.argmin(cv_results[f'rank_test_{error}'])
            opt_parameters = utils.get_optimal_parameters(
                cv_results, error, **ridge_parameters
            )

            # Header uses error[4:], i.e. the scorer name without 'neg_'
            print(f'-----Optimal Parameters for {cutoff} {pn} {error[4:]}-----')
            print(f'{error} =', cv_results[f'mean_test_{error}'][idx])
            print(opt_parameters)
            print('')

            # One parameter file per error metric for easy access later
            save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}.json')

-----Optimal Parameters for 3.5 volumes mean_absolute_error-----
neg_mean_absolute_error = -2.552061285701326
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -3.7051810263578226
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 energies mean_absolute_error-----
neg_mean_absolute_error = -0.6715277769635979
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 energies root_mean_squared_error-----
neg_root_mean_squared_error = -0.9367011619628013
{'alpha': 0.001}

-----Optimal Parameters for 6.0 volumes mean_absolute_error-----
neg_mean_absolute_error = -1.0821631613521194
{'alpha': 0.1}

-----Optimal Parameters for 6.0 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -1.7742008005202385
{'alpha': 0.01}

-----Optimal Parameters for 6.0 energies mean_absolute_error-----
neg_mean_absolute_error = -0.2011229827081546
{'alpha': 1e-06}

-----Optimal Parameters for 6.0 energies root_mean_squared_error-----
n

# Linear Ridge Regression (random train set)

In [10]:
# Training indices for the "random" DEEM_330k training set
# (only training indices are read here; no test set is loaded in this cell)
train_idxs_random = np.loadtxt('../Processed_Data/DEEM_330k/ridge_train_random.idxs', dtype=int)
# Permutation that puts the indices in ascending order; the SOAP file is
# read with the sorted indices in the grid-search cell
sort_idxs = np.argsort(train_idxs_random)
# Inverse of that permutation: indexing the sorted data with rev_idxs
# restores the original ordering of train_idxs_random
rev_idxs = np.argsort(sort_idxs)

In [11]:
# Same grid search as for the standard training set, but on the random
# DEEM training set; results are written to cv_results_random.json
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    data_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    # The file is read with the indices in ascending order; rev_idxs then
    # puts the rows back into the order of train_idxs_random so that they
    # line up with the targets selected below
    soaps = utils.load_hdf5(
        f'{data_dir}/soaps_power_full_avg_nonorm.hdf5',
        indices=train_idxs_random[sort_idxs]
    )
    soaps = soaps[rev_idxs]

    for pn in tqdm(property_names, desc='Property', leave=False):

        # Per-property output directory (shared with the non-random run)
        property_label = pn.capitalize()
        work_dir = f'../Processed_Data/Models/{cutoff}/LRR/{property_label}'
        os.makedirs(work_dir, exist_ok=True)

        # Targets of the random training structures
        y = structure_properties[pn][train_idxs_random]

        # Feature scaler followed by a ridge regressor on scaled targets
        pipeline = Pipeline([
            ('norm_scaler', utils.StandardNormScaler()),
            ('ridge', TransformedTargetRegressor(
                regressor=Ridge(**ridge_parameters),
                transformer=utils.StandardNormScaler()
            )),
        ])

        # Shuffled, seeded K-fold CV scored with RMSE and MAE; no refit
        gscv = GridSearchCV(
            estimator=pipeline,
            param_grid=parameter_grid,
            scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error'],
            cv=KFold(n_splits=n_splits, shuffle=True, random_state=0),
            refit=False,
            return_train_score=True,
            error_score='raise',
            n_jobs=4,
        )
        gscv.fit(soaps, y)

        # Keep the random-set CV table separate from the standard one
        save_json(gscv.cv_results_, f'{work_dir}/cv_results_random.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…




In [12]:
# Report and store the best grid point of the random-training-set search
# for every cutoff / property / error metric
for cutoff in cutoffs:
    for pn in property_names:
        property_label = pn.capitalize()

        # CV table written by the random-set grid search
        work_dir = f'../Processed_Data/Models/{cutoff}/LRR/{property_label}'
        cv_results = load_json(f'{work_dir}/cv_results_random.json')

        # (scikit-learn scorer name, short tag used in the output file name)
        for error, error_name in (
            ('neg_mean_absolute_error', 'mae'),
            ('neg_root_mean_squared_error', 'rmse'),
        ):

            # Smallest rank (1 = best) identifies the winning grid point
            idx = np.argmin(cv_results[f'rank_test_{error}'])
            opt_parameters = utils.get_optimal_parameters(
                cv_results, error, **ridge_parameters
            )

            # Header uses error[4:], i.e. the scorer name without 'neg_'
            print(f'-----Optimal Parameters for {cutoff} {pn} {error[4:]}-----')
            print(f'{error} =', cv_results[f'mean_test_{error}'][idx])
            print(opt_parameters)
            print('')

            # The '_random' suffix keeps these apart from the standard-set files
            save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}_random.json')

-----Optimal Parameters for 3.5 volumes mean_absolute_error-----
neg_mean_absolute_error = -2.6405177225997285
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -3.7671040988835145
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 energies mean_absolute_error-----
neg_mean_absolute_error = -0.6800422758359846
{'alpha': 0.0001}

-----Optimal Parameters for 3.5 energies root_mean_squared_error-----
neg_root_mean_squared_error = -0.9654442919931505
{'alpha': 0.001}

-----Optimal Parameters for 6.0 volumes mean_absolute_error-----
neg_mean_absolute_error = -1.118881417444896
{'alpha': 0.1}

-----Optimal Parameters for 6.0 volumes root_mean_squared_error-----
neg_root_mean_squared_error = -1.7833032039956365
{'alpha': 0.01}

-----Optimal Parameters for 6.0 energies mean_absolute_error-----
neg_mean_absolute_error = -0.199422690774243
{'alpha': 1e-07}

-----Optimal Parameters for 6.0 energies root_mean_squared_error-----
ne

# Linear Ridge Regression IZA compositions

In [13]:
# Training indices for the IZA_230 set (read from the svm_train.idxs file;
# only training indices are loaded here)
train_idxs_iza = np.loadtxt('../Processed_Data/IZA_230/svm_train.idxs', dtype=int)
# Permutation that puts the indices in ascending order; the SOAP file is
# read with the sorted indices in the grid-search cell
sort_idxs_iza = np.argsort(train_idxs_iza)
# Inverse permutation: restores the original ordering of train_idxs_iza
rev_idxs_iza = np.argsort(sort_idxs_iza)

In [14]:
# Load compositions: column index 2 of the compositions file, restricted to
# the IZA training structures (same ordering as train_idxs_iza)
iza_compositions = np.loadtxt('../Raw_Data/IZA_230/cantons_compositions.dat', usecols=2)[train_idxs_iza]
# Name used in the printed headers, and its capitalized form used as the
# model subdirectory for the IZA composition models.
# NOTE: the loops over property_names in other cells also assign
# property_label, so re-run this cell before the IZA cells if one of those
# loops was executed in between.
property_name = 'composition'
property_label = property_name.capitalize()

In [15]:
# Number of folds handed to KFold in the IZA composition grid search
n_splits_iza = 2

# Keyword arguments forwarded to Ridge(); an empty dict means that
# every Ridge setting is left at its scikit-learn default
ridge_parameters_iza = {}

# Candidate regularization strengths: 16 log-spaced values, 1e-10 ... 1e5
regularizations_iza = np.logspace(-10, 5, 16)

# The grid key addresses `alpha` of the regressor wrapped by the
# pipeline step named 'ridge'
parameter_grid_iza = {'ridge__regressor__alpha': regularizations_iza}

In [16]:
# Grid search over the ridge regularization for the IZA compositions,
# once per cutoff
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    data_dir = f'../Processed_Data/IZA_230/Data/{cutoff}'

    # The file is read with the indices in ascending order; rev_idxs_iza then
    # puts the rows back into the order of train_idxs_iza so that they line
    # up with iza_compositions
    soaps = utils.load_hdf5(
        f'{data_dir}/soaps_power_full_avg_nonorm.hdf5',
        indices=train_idxs_iza[sort_idxs_iza]
    )
    soaps = soaps[rev_idxs_iza]

    # Output directory; property_label comes from the cell that loads
    # the compositions
    work_dir = f'../Processed_Data/Models/{cutoff}/LRR/{property_label}'
    os.makedirs(work_dir, exist_ok=True)

    # Feature scaler followed by a ridge regressor on scaled targets
    pipeline = Pipeline([
        ('norm_scaler', utils.StandardNormScaler()),
        ('ridge', TransformedTargetRegressor(
            regressor=Ridge(**ridge_parameters_iza),
            transformer=utils.StandardNormScaler()
        )),
    ])

    # Shuffled, seeded K-fold CV scored with RMSE and MAE; no refit
    gscv = GridSearchCV(
        estimator=pipeline,
        param_grid=parameter_grid_iza,
        scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error'],
        cv=KFold(n_splits=n_splits_iza, shuffle=True, random_state=0),
        refit=False,
        return_train_score=True,
        error_score='raise',
        n_jobs=4,
    )
    gscv.fit(soaps, iza_compositions)

    # Store the complete CV table for the extraction cell
    save_json(gscv.cv_results_, f'{work_dir}/cv_results.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…




In [17]:
# Report and store the best grid point of the IZA composition search
# for every cutoff / error metric
for cutoff in cutoffs:

    # CV table written by the IZA grid search
    work_dir = f'../Processed_Data/Models/{cutoff}/LRR/{property_label}'
    cv_results = load_json(f'{work_dir}/cv_results.json')

    # (scikit-learn scorer name, short tag used in the output file name)
    for error, error_name in (
        ('neg_mean_absolute_error', 'mae'),
        ('neg_root_mean_squared_error', 'rmse'),
    ):

        # Smallest rank (1 = best) identifies the winning grid point
        idx = np.argmin(cv_results[f'rank_test_{error}'])
        opt_parameters = utils.get_optimal_parameters(
            cv_results, error, **ridge_parameters_iza
        )

        # Header uses error[4:], i.e. the scorer name without 'neg_'
        print(f'-----Optimal Parameters for {cutoff} {property_name} {error[4:]}-----')
        print(f'{error} =', cv_results[f'mean_test_{error}'][idx])
        print(opt_parameters)
        print('')

        # One parameter file per error metric for easy access later
        save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}.json')

-----Optimal Parameters for 3.5 composition mean_absolute_error-----
neg_mean_absolute_error = -0.09230897966963178
{'alpha': 0.1}

-----Optimal Parameters for 3.5 composition root_mean_squared_error-----
neg_root_mean_squared_error = -0.1127996556075398
{'alpha': 0.1}

-----Optimal Parameters for 6.0 composition mean_absolute_error-----
neg_mean_absolute_error = -0.0876635821591121
{'alpha': 1.0}

-----Optimal Parameters for 6.0 composition root_mean_squared_error-----
neg_root_mean_squared_error = -0.10591209191457866
{'alpha': 1.0}



# Kernel Ridge Regression

In [None]:
# KernelRidge keyword arguments: the kernels are computed ahead of time,
# so the regressor is told to expect a precomputed kernel matrix
kernel_ridge_parameters = {'kernel': 'precomputed'}

# log10 of the Gaussian kernel gammas to scan: -3, -2, ..., 3
log_gammas = np.linspace(-3, 3, 7)

# Regularization strengths to scan: 11 log-spaced values, 1e-10 ... 1
regularizations = np.logspace(-10, 0, 11)

# Part of the parameter grid that is the same for every cutoff.
# The kernel file names (one per gamma) depend on the cutoff and are
# added to a copy of this grid inside the optimization loop.
parameter_grid_base = {'ridge__regressor__alpha': regularizations}

# Positions 0 .. len(train_idxs)-1; the grid search is fitted on these
# positions instead of on the features themselves
cv_idxs = np.arange(len(train_idxs), dtype=int)

In [None]:
# Precompute one Gaussian kernel per (cutoff, gamma) and store it on disk,
# so that the grid search does not have to rebuild a kernel for every CV fold
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # Input (SOAP) and output (kernel) locations for this cutoff
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    model_dir = f'../Processed_Data/Models/{cutoff}/KRR'
    os.makedirs(model_dir, exist_ok=True)

    # Load the SOAPs, then reduce the entry of every training structure
    # to its mean over axis 0.
    # NOTE(review): the SOAPs are read from DEEM_10k while train_idxs is
    # loaded from DEEM_330k/ridge_train.idxs -- confirm that both refer to
    # the same structure numbering.
    soaps = utils.load_hdf5(f'{data_dir}/soaps_power_full_nonorm.hdf5')
    soaps = np.array([np.mean(soaps[i], axis=0) for i in train_idxs])

    # One train-vs-train kernel per gamma. The file is named after log10(gamma)
    # (not gamma itself) to keep the file names short; the grid search
    # rebuilds exactly these names from log_gammas.
    for log_gamma in tqdm(log_gammas, desc='Gamma', leave=False):
        gamma = 10 ** log_gamma
        K = gaussian_kernel(soaps, soaps, gamma=gamma)
        utils.save_hdf5(
            f'{model_dir}/gaussian_kernel_{log_gamma}.hdf5',
            K,
            attrs={'gamma': gamma, 'log_gamma': log_gamma},
            chunks=(100, 100)
        )

In [None]:
# Optimize hyperparameters (regularization and kernel gamma) of the
# kernel ridge models, once per cutoff and property

# Loop over cutoffs
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    
    # Directory holding the precomputed kernels for this cutoff
    model_dir = f'../Processed_Data/Models/{cutoff}/KRR'
    
    # Cutoff-specific grid: the shared regularizations plus one kernel file
    # per gamma (same file names as used when the kernels were saved).
    # NOTE: this rebinds the notebook-level `parameter_grid` that the linear
    # ridge cells also use; re-run the linear ridge configuration cell
    # before re-running any linear ridge grid search after this cell.
    parameter_grid = deepcopy(parameter_grid_base)
    parameter_grid['kernel_loader__filename'] = \
        [f'{model_dir}/gaussian_kernel_{log_gamma}.hdf5' for log_gamma in log_gammas]
          
    # Loop over properties
    for pn in tqdm(property_names, desc='Property', leave=False):
        property_label = pn.capitalize()
        
        # Set working directory
        work_dir = f'{model_dir}/{property_label}'
        os.makedirs(work_dir, exist_ok=True)
                
        # Load the property values (just from the train set)
        y = structure_properties[pn][train_idxs]
        
        # "Hacked" pipeline: instead of recomputing the expensive kernels
        # at each CV iteration, we will fit instead with a set of indices
        # and use the filename containing the kernel at a particular
        # gamma as a hyperparameter for KernelLoader, which will
        # load the kernel at initialization.
                
        # NOTE: can't just use the custom kernel as a callable
        # to KernelRidge, as the custom kernel necessarily operates on 2D arrays
        # of the features for all environments in a given structure,
        # whereas the callable must operate on pairs of samples.
        # The KernelLoader/KernelConstructor is used instead

        # Temporary directory for the pipeline's transformer cache.
        # The try/finally guarantees that it is removed even when the grid
        # search fails (error_score='raise' turns any failed fit into an
        # exception), so aborted runs do not leave cache directories behind.
        cache_dir = mkdtemp()
        try:
            pipeline = Pipeline(
                [
                    ('kernel_loader', utils.KernelLoader()),
                    ('kernel_norm_scaler', utils.KernelNormScaler()), # TODO: make sure the kernel scaling is the same as linear in the case of linear kernel
                    ('ridge', TransformedTargetRegressor(
                        regressor=KernelRidge(**kernel_ridge_parameters),
                        transformer=utils.SampleSelector(X=y, model=utils.StandardNormScaler()),
                        check_inverse=False
                    ))
                ],
                memory=cache_dir
            )
            
            # "Hacked" CV: since we want to access the kernels via indices,
            # we fit the CV with cv_indices, and the cv_generator 
            # is used to pick out the correct folds.
            # The scorers receive the true targets through `y=y` because the
            # grid search itself only sees indices; greater_is_better=False
            # makes make_scorer negate the errors, matching the 'neg_*' keys.
            gscv = GridSearchCV(
                pipeline, parameter_grid, 
                scoring=dict(
                    neg_mean_absolute_error=make_scorer(
                        utils.score_by_index,
                        greater_is_better=False,
                        y=y, scorer=mean_absolute_error
                    ),
                    neg_root_mean_squared_error=make_scorer(
                        utils.score_by_index,
                        greater_is_better=False,
                        y=y, scorer=mean_squared_error,
                        squared=False
                    )
                ),
                cv=KFold(n_splits=5, shuffle=True, random_state=0),
                refit=False, return_train_score=True, error_score='raise'
            )
            gscv.fit(cv_idxs, cv_idxs)
            save_json(gscv.cv_results_, f'{work_dir}/cv_results.json', array_convert=True)
        finally:
            rmtree(cache_dir)

In [None]:
# Extract optimal parameters as a check:
# report and store the best kernel ridge grid point for every
# cutoff / property / error metric

# Loop over cutoffs
for cutoff in cutoffs:
    
    # Loop over properties
    for pn in property_names:
        property_label = pn.capitalize()
        
        # Read the CV table from the directory the kernel ridge grid search
        # writes to (`Models/{cutoff}/KRR/{property_label}`)
        work_dir = f'../Processed_Data/Models/{cutoff}/KRR/{property_label}'
        cv_results = load_json(f'{work_dir}/cv_results.json')
        
        # Loop over error types:
        # (scorer name in the CV table, short tag used in the output file name)
        for error, error_name in zip(
            ['neg_mean_absolute_error', 'neg_root_mean_squared_error'],
            ['mae', 'rmse']
        ):
        
            # Smallest rank (1 = best) identifies the winning grid point
            idx = np.argmin(cv_results[f'rank_test_{error}'])
            opt_parameters = utils.get_optimal_parameters(cv_results, error, **kernel_ridge_parameters)

            # Print error and parameters
            # (error[4:] is the scorer name without the 'neg_' prefix)
            print(f'-----Optimal Parameters for {cutoff} {pn} {error[4:]}-----')
            print(f'{error} =', cv_results[f'mean_test_{error}'][idx])
            print(opt_parameters)
            print('')
            
            # Save optimal parameters for easy access
            save_json(opt_parameters, f'{work_dir}/ridge_parameters_{error_name}.json')