In [1]:
# System
import os
import sys

# Maths
import numpy as np

# ML
from kernels import gaussian_kernel
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import KernelCenterer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

# Utilities
import h5py
from tempfile import mkdtemp
from shutil import rmtree
from tqdm.auto import tqdm
from tools import load_json, save_json
import project_utils as utils

# Initial setup

In [2]:
# Read the SOAP hyperparameters and extract the interaction cutoffs,
# which are iterated over by the regression loops below
soap_hyperparameters = load_json('../Processed_Data/soap_hyperparameters.json')
cutoffs = soap_hyperparameters['interaction_cutoff']

In [3]:
# Number of Deem structures read from HDF5 and predicted per batch
batch_size = 100_000

# Model setup

In [4]:
# Names of the two structure datasets
deem_name = 'DEEM_330k'
iza_name = 'IZA_230'

# Directory from which ridge parameters are read and to which fitted models are saved
model_dir = '../Processed_Data/Models'

# Per-dataset data directories (SOAP files are read from, and predictions written to, these)
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'

# Linear ridge regression of molar volumes and energies

In [3]:
# Load the train set indices for Deem (only the train indices are read here)
deem_train_idxs = np.loadtxt(
    '../Processed_Data/DEEM_330k/ridge_train.idxs',
    dtype=int,
)

In [4]:
# Names of the structure properties to regress; each one maps to a
# `structure_<name>.dat` file in the Deem data directory
property_names = ['volumes', 'energies']

# Property name -> array of values read from the corresponding file
structure_properties = {
    pn: np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{pn}.dat')
    for pn in property_names
}

## Deem 10k train set (from ZAP1)

In [7]:
# Fit one linear ridge regression (LRR) model per SOAP cutoff and structure
# property on the Deem train set, then predict that property for all IZA and
# all Deem 330k structures and save the predictions and the fitted model.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # IZA SOAPs are loaded through the project helper
    iza_file = f'{iza_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    iza_soaps = utils.load_hdf5(iza_file)

    # Deem SOAPs are read directly from the HDF5 file. The `with` block
    # guarantees the file is closed even if fitting, predicting or saving raises.
    deem_file = f'{deem_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    with h5py.File(deem_file, 'r') as f:
        deem_330k_dataset = f['0']

        # Prepare batches for LR (ceiling division, so a partial last batch is counted)
        n_samples_330k = deem_330k_dataset.len()
        n_batches = (n_samples_330k + batch_size - 1) // batch_size

        # Loop over properties
        for pn in tqdm(property_names, desc='Property', leave=False):
            property_label = pn.capitalize()

            # Load the property values (just from the train set)
            y = structure_properties[pn][deem_train_idxs]

            # Output locations for the predictions and the model parameters
            output_dir = f'LRR/{property_label}'
            parameter_dir = f'{model_dir}/{cutoff}/{output_dir}'
            deem_output_dir = f'{deem_dir}/{cutoff}/{output_dir}'
            iza_output_dir = f'{iza_dir}/{cutoff}/{output_dir}'
            os.makedirs(deem_output_dir, exist_ok=True)
            os.makedirs(iza_output_dir, exist_ok=True)

            # Ridge keyword arguments are read from an existing JSON file
            ridge_parameters = load_json(f'{parameter_dir}/ridge_parameters_mae.json')

            # Regression pipeline: scale the features, then ridge-regress a scaled target
            pipeline = Pipeline(
                [
                    ('norm_scaler', utils.StandardNormScaler()),
                    ('ridge', TransformedTargetRegressor(
                        regressor=Ridge(**ridge_parameters),
                        transformer=utils.StandardNormScaler()
                    ))
                ],
            )

            # NOTE(review): h5py index-list selection expects increasing indices;
            # this assumes deem_train_idxs is stored sorted -- confirm.
            pipeline.fit(deem_330k_dataset[deem_train_idxs], y)

            iza_properties = pipeline.predict(iza_soaps)

            # Do DEEM 330k predictions in batches
            deem_properties = np.zeros(n_samples_330k)

            for i in tqdm(range(n_batches), desc='Batch', leave=False):
                # The final slice may run past the end; slicing clips it
                batch_slice = slice(i * batch_size, (i + 1) * batch_size)

                deem_330k_batch = deem_330k_dataset[batch_slice]
                deem_properties[batch_slice] = pipeline.predict(deem_330k_batch)

            np.savetxt(f'{deem_output_dir}/lr_structure_properties.dat', deem_properties)
            np.savetxt(f'{iza_output_dir}/lr_structure_properties.dat', iza_properties)

            # Save the LR model and the scalers (fitted attributes dumped as JSON)
            fitted_ridge = pipeline.named_steps['ridge']
            save_json(pipeline.named_steps['norm_scaler'].__dict__, f'{parameter_dir}/norm_scaler.json', array_convert=True)
            save_json(fitted_ridge.regressor_.__dict__, f'{parameter_dir}/ridge_regressor.json', array_convert=True)
            save_json(fitted_ridge.transformer_.__dict__, f'{parameter_dir}/ridge_transformer.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…




## Random train set

In [8]:
# Load the indices of the random Deem train set
deem_train_idxs_random = np.loadtxt(
    '../Processed_Data/DEEM_330k/ridge_train_random.idxs', dtype=int
)

# Sort the indices, keeping both the sorting permutation and its inverse
# (the inverse restores the order in which the indices were stored)
sort_idxs = deem_train_idxs_random.argsort()
rev_idxs = sort_idxs.argsort()
deem_train_idxs_random = deem_train_idxs_random[sort_idxs]

In [9]:
# Fit one linear ridge regression (LRR) model per SOAP cutoff and structure
# property on the random Deem train set, then predict that property for all
# IZA and all Deem 330k structures and save the predictions and fitted model.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # IZA SOAPs are loaded through the project helper
    iza_file = f'{iza_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    iza_soaps = utils.load_hdf5(iza_file)

    # Deem SOAPs are read directly from the HDF5 file. The `with` block
    # guarantees the file is closed even if fitting, predicting or saving raises.
    deem_file = f'{deem_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    with h5py.File(deem_file, 'r') as f:
        deem_330k_dataset = f['0']

        # Prepare batches for LR (ceiling division, so a partial last batch is counted)
        n_samples_330k = deem_330k_dataset.len()
        n_batches = (n_samples_330k + batch_size - 1) // batch_size

        # Loop over properties
        for pn in tqdm(property_names, desc='Property', leave=False):
            property_label = pn.capitalize()

            # Load the property values (just from the train set)
            y = structure_properties[pn][deem_train_idxs_random]

            # Output locations for the predictions and the model parameters
            output_dir = f'LRR/{property_label}'
            parameter_dir = f'{model_dir}/{cutoff}/{output_dir}'
            deem_output_dir = f'{deem_dir}/{cutoff}/{output_dir}'
            iza_output_dir = f'{iza_dir}/{cutoff}/{output_dir}'
            os.makedirs(deem_output_dir, exist_ok=True)
            os.makedirs(iza_output_dir, exist_ok=True)

            # Ridge keyword arguments are read from an existing JSON file
            ridge_parameters = load_json(f'{parameter_dir}/ridge_parameters_mae_random.json')

            # Regression pipeline: scale the features, then ridge-regress a scaled target
            pipeline = Pipeline(
                [
                    ('norm_scaler', utils.StandardNormScaler()),
                    ('ridge', TransformedTargetRegressor(
                        regressor=Ridge(**ridge_parameters),
                        transformer=utils.StandardNormScaler()
                    ))
                ],
            )
            pipeline.fit(deem_330k_dataset[deem_train_idxs_random], y)

            iza_properties = pipeline.predict(iza_soaps)

            # Do DEEM 330k predictions in batches
            deem_properties = np.zeros(n_samples_330k)

            for i in tqdm(range(n_batches), desc='Batch', leave=False):
                # The final slice may run past the end; slicing clips it
                batch_slice = slice(i * batch_size, (i + 1) * batch_size)

                deem_330k_batch = deem_330k_dataset[batch_slice]
                deem_properties[batch_slice] = pipeline.predict(deem_330k_batch)

            np.savetxt(f'{deem_output_dir}/lr_structure_properties_random.dat', deem_properties)
            np.savetxt(f'{iza_output_dir}/lr_structure_properties_random.dat', iza_properties)

            # Save the LR model and the scalers (fitted attributes dumped as JSON)
            fitted_ridge = pipeline.named_steps['ridge']
            save_json(pipeline.named_steps['norm_scaler'].__dict__, f'{parameter_dir}/norm_scaler_random.json', array_convert=True)
            save_json(fitted_ridge.regressor_.__dict__, f'{parameter_dir}/ridge_regressor_random.json', array_convert=True)
            save_json(fitted_ridge.transformer_.__dict__, f'{parameter_dir}/ridge_transformer_random.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Property', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…




# Linear ridge regression of IZA compositions

In [5]:
# Load cantons (integer values from column index 1 of the file)
iza_cantons = np.loadtxt(
    '../Raw_Data/IZA_230/cantons_compositions.dat',
    usecols=1,
    dtype=int,
)

In [6]:
# Load the IZA train set indices used for the composition regression
iza_train_idxs_composition = np.loadtxt(
    '../Processed_Data/IZA_230/svm_train.idxs', dtype=int
)
# Optional filter on canton, currently disabled:
# iza_train_idxs_composition = iza_train_idxs_composition[iza_cantons[iza_train_idxs_composition] != 3]

# Sort the indices, keeping both the sorting permutation and its inverse
# (the inverse restores the order in which the indices were stored)
iza_sort_idxs = iza_train_idxs_composition.argsort()
iza_rev_idxs = iza_sort_idxs.argsort()
iza_train_idxs_composition = iza_train_idxs_composition[iza_sort_idxs]

In [7]:
# Load compositions (column index 2 of the cantons/compositions file)
iza_compositions = np.loadtxt(
    '../Raw_Data/IZA_230/cantons_compositions.dat',
    usecols=2,
)

# Property name and its capitalized label; the label names the LRR output directory
property_name = 'composition'
property_label = property_name.capitalize()

In [8]:
# Fit one linear ridge regression (LRR) model of the IZA compositions per SOAP
# cutoff, then predict the composition for all IZA and all Deem 330k
# structures and save the predictions and the fitted model.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # IZA SOAPs (used for training) are loaded through the project helper
    iza_file = f'{iza_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    iza_soaps = utils.load_hdf5(iza_file)

    # Output locations for the predictions and the model parameters
    output_dir = f'LRR/{property_label}'
    parameter_dir = f'{model_dir}/{cutoff}/{output_dir}'
    iza_output_dir = f'{iza_dir}/{cutoff}/{output_dir}'
    deem_output_dir = f'{deem_dir}/{cutoff}/{output_dir}'
    os.makedirs(iza_output_dir, exist_ok=True)
    os.makedirs(deem_output_dir, exist_ok=True)

    # Ridge keyword arguments are read from an existing JSON file
    ridge_parameters = load_json(f'{parameter_dir}/ridge_parameters_mae.json')

    # Regression pipeline: scale the features, then ridge-regress a scaled target
    pipeline = Pipeline(
        [
            ('norm_scaler', utils.StandardNormScaler()),
            ('ridge', TransformedTargetRegressor(
                regressor=Ridge(**ridge_parameters),
                transformer=utils.StandardNormScaler()
            ))
        ],
    )
    pipeline.fit(
        iza_soaps[iza_train_idxs_composition],
        iza_compositions[iza_train_idxs_composition]
    )

    predicted_iza_compositions = pipeline.predict(iza_soaps)

    # Deem SOAPs are read directly from the HDF5 file. The `with` block makes
    # sure the file is closed after each cutoff, including when a prediction raises.
    deem_file = f'{deem_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    with h5py.File(deem_file, 'r') as f:
        deem_330k_dataset = f['0']

        # Prepare batches for LR (ceiling division, so a partial last batch is counted)
        n_samples_330k = deem_330k_dataset.len()
        n_batches = (n_samples_330k + batch_size - 1) // batch_size

        # Do DEEM 330k predictions in batches
        predicted_deem_compositions = np.zeros(n_samples_330k)

        for i in tqdm(range(n_batches), desc='Batch', leave=False):
            # The final slice may run past the end; slicing clips it
            batch_slice = slice(i * batch_size, (i + 1) * batch_size)

            deem_330k_batch = deem_330k_dataset[batch_slice]
            predicted_deem_compositions[batch_slice] = pipeline.predict(deem_330k_batch)

    np.savetxt(f'{iza_output_dir}/lr_structure_properties.dat', predicted_iza_compositions)
    np.savetxt(f'{deem_output_dir}/lr_structure_properties.dat', predicted_deem_compositions)

    # Save the LR model and the scalers (fitted attributes dumped as JSON)
    fitted_ridge = pipeline.named_steps['ridge']
    save_json(pipeline.named_steps['norm_scaler'].__dict__, f'{parameter_dir}/norm_scaler.json', array_convert=True)
    save_json(fitted_ridge.regressor_.__dict__, f'{parameter_dir}/ridge_regressor.json', array_convert=True)
    save_json(fitted_ridge.transformer_.__dict__, f'{parameter_dir}/ridge_transformer.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=4.0, style=ProgressStyle(description_width='i…




# Kernel Ridge Regression