In [1]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from kernels import gaussian_kernel
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import KernelCenterer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

# Utilities
import h5py
from tempfile import mkdtemp
from shutil import rmtree
from tqdm.notebook import tqdm
from tools import load_json, save_json
import project_utils as utils

In [2]:
# TODO: table generator

# Initial setup

In [3]:
# Read the SOAP hyperparameters and extract the interaction cutoffs
soap_hyperparameters_file = '../Processed_Data/soap_hyperparameters.json'
soap_hyperparameters = load_json(soap_hyperparameters_file)
cutoffs = soap_hyperparameters['interaction_cutoff']

In [4]:
# BEGIN TMP

In [12]:
# Random train/test split of the DEEM structures
from numpy.random import default_rng

# Indices removed from the pool before splitting
idxs_delete = np.loadtxt('../Processed_Data/DEEM_330k/10kJmol_error.idxs', dtype=int)

# Stored 10k index file (loaded here, not used within this cell)
deem_10k_idxs = np.loadtxt('../Processed_Data/DEEM_330k/deem_10k.idxs', dtype=int)

# Pool size and split sizes
n_total, n_train, n_test = 331172, 10000, 250

# Fixed seed, so the shuffle below is reproducible
rng = default_rng(seed=11011)

# Shuffle (in place) all indices that are not flagged for deletion
idxs = np.delete(np.arange(n_total), idxs_delete)
rng.shuffle(idxs)

# The first n_train shuffled indices form the train set,
# the n_test indices that follow form the test set
deem_train_idxs = idxs[:n_train]
deem_test_idxs = idxs[n_train:][:n_test]

# Permutation that puts the train indices in increasing order
idxs_sort = deem_train_idxs.argsort()

In [6]:
# END TMP

In [7]:
# IZA train and test set indices, read from the stored index files
iza_train_idxs, iza_test_idxs = (
    np.loadtxt(idxs_file, dtype=int)
    for idxs_file in (
        '../Processed_Data/IZA_226/train.idxs',
        '../Processed_Data/IZA_226/test.idxs',
    )
)

In [8]:
# Names of the structure properties to load
property_names = ['volumes', 'energies']

# Map each property name to the values stored in its DEEM data file
structure_properties = {
    pn: np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{pn}.dat')
    for pn in property_names
}

# Model setup

In [9]:
# Dataset labels as they appear in the directory names
deem_name, iza_name = 'DEEM_330k', 'IZA_226'

# Per-dataset data directories
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'

# Root directory under which the model parameter files are read and written
model_dir = '../Processed_Data/Models'

# Linear Ridge Regression

In [10]:
# Number of DEEM structures passed to the model per prediction batch
batch_size = 100_000

In [14]:
# Linear ridge regression: for every SOAP cutoff and structure property, fit
# a model on the DEEM training set, predict the property for all DEEM and IZA
# structures, and save the predictions together with the fitted model.

# Training indices in increasing order (presumably because h5py requires
# index lists to be increasing). The same ordering is applied to the targets
# and to the SOAP rows so that they stay aligned.
# NOTE: these indices come from the temporary random DEEM split.
deem_train_idxs_sorted = deem_train_idxs[idxs_sort]

# Loop over cutoffs
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # Load the IZA SOAPs for this cutoff
    iza_soaps = utils.load_hdf5(
        f'{iza_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5',
        datasets=None, concatenate=True
    )

    # Open the DEEM SOAPs in a context manager so that the HDF5 handle is
    # closed at the end of every cutoff iteration, even if an error occurs
    deem_file = f'{deem_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    with h5py.File(deem_file, 'r') as f:
        deem_330k_dataset = f['0']

        # Prepare batches for LR (ceiling division, so that a final
        # partial batch is included)
        n_samples_330k = deem_330k_dataset.len()
        n_batches = n_samples_330k // batch_size
        if n_samples_330k % batch_size > 0:
            n_batches += 1

        # Loop over properties
        for pn in tqdm(property_names, desc='Property', leave=False):
            property_label = pn.capitalize()

            # Load the property values (just from the train set)
            y = structure_properties[pn][deem_train_idxs_sorted]

            # Predictions are written below the data directories,
            # model parameters below the model directory
            output_dir = f'Linear_Models/LR/{property_label}'
            parameter_dir = f'{model_dir}/{cutoff}/{output_dir}'
            os.makedirs(f'{deem_dir}/{cutoff}/{output_dir}', exist_ok=True)
            os.makedirs(f'{iza_dir}/{cutoff}/{output_dir}', exist_ok=True)

            ridge_parameters = load_json(f'{parameter_dir}/ridge_parameters_mae.json')

            # Temporary directory for the pipeline cache; it is removed in
            # the `finally` clause so that it does not outlive a failed run
            cache_dir = mkdtemp()
            try:
                # Regression pipeline
                pipeline = Pipeline(
                    [
                        ('norm_scaler', utils.NormScaler()),
                        ('ridge', TransformedTargetRegressor(
                            regressor=Ridge(**ridge_parameters),
                            transformer=utils.NormScaler()
                        ))
                    ],
                    memory=cache_dir
                )
                pipeline.fit(deem_330k_dataset[deem_train_idxs_sorted], y)

                iza_properties = pipeline.predict(iza_soaps)

                # Do DEEM 330k predictions in batches
                deem_properties = np.zeros(n_samples_330k)

                for i in tqdm(range(0, n_batches), desc='Batch', leave=False):
                    # The stop of the last slice may exceed the number of
                    # samples; slicing clips it to the available rows
                    batch_slice = slice(i * batch_size, (i + 1) * batch_size)

                    deem_330k_batch = deem_330k_dataset[batch_slice]
                    deem_properties[batch_slice] = pipeline.predict(deem_330k_batch)

                np.savetxt(f'{deem_dir}/{cutoff}/{output_dir}/lr_structure_properties.dat', deem_properties)
                np.savetxt(f'{iza_dir}/{cutoff}/{output_dir}/lr_structure_properties.dat', iza_properties)

                # Save the LR model and the scalers.
                # TransformedTargetRegressor fits clones that it stores as
                # `regressor_` and `transformer_`; the `regressor` and
                # `transformer` attributes remain the unfitted prototypes,
                # so the fitted attributes are the ones saved here.
                # NOTE(review): assumes save_json(array_convert=True) can
                # serialize the fitted attributes - confirm.
                ridge_step = pipeline.named_steps['ridge']
                save_json(pipeline.named_steps['norm_scaler'].__dict__, f'{parameter_dir}/norm_scaler.json', array_convert=True)
                save_json(ridge_step.regressor_.__dict__, f'{parameter_dir}/ridge_regressor.json', array_convert=True)
                save_json(ridge_step.transformer_.__dict__, f'{parameter_dir}/ridge_transformer.json', array_convert=True)
            finally:
                rmtree(cache_dir)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…




# Kernel Ridge Regression