In [1]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from regression import SparseKRR
from kernels import build_kernel
from errors import MAE, RMSE

# Utilities
import h5py
import json
from tqdm.notebook import tqdm
from project_utils import load_structures_from_hdf5

In [None]:
# TODO: table generator

# Initial setup

In [2]:
# Read the structure indices of the train and test splits (in that order)
train_idxs, test_idxs = (
    np.loadtxt(path, dtype=int)
    for path in ('../Processed_Data/DEEM_10k/train.idxs',
                 '../Processed_Data/DEEM_10k/test.idxs')
)

# The two splits together make up the total number of structures
n_structures = sum(idxs.size for idxs in (train_idxs, test_idxs))

In [3]:
# Labels used to load structures from the HDF5 file: each train index is
# zero-padded to the width of the largest possible structure index
n_digits = len(f'{n_structures - 1}')
datasets = [f'{idx}'.zfill(n_digits) for idx in train_idxs]

In [4]:
# Read the SOAP hyperparameters (JSON) and keep the interaction cutoffs,
# which drive every per-cutoff loop below
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.loads(f.read())

cutoffs = soap_hyperparameters['interaction_cutoff']

# Functions

In [5]:
def transform_skrr_oos(cutoff, datasets, property_name,
                   soaps_file, ref_soaps_file,
                   rep_idxs_file, model_file,
                   skrr_file, work_dir='.'):
    """Predict a property for out-of-sample structures with a saved sparse KRR model.

    The saved model state is restored from JSON, a kernel is built between
    the new SOAPs and the representative SOAPs, and per-structure and
    per-environment predictions are written to text files in `work_dir`.

    Parameters
    ----------
    cutoff
        Accepted for call-site symmetry; not used inside this function.
    datasets
        Dataset labels passed to `load_structures_from_hdf5` when reading
        the reference SOAPs.
    property_name
        Name inserted into the output file names.
    soaps_file
        HDF5 file with the SOAPs of the structures to predict.
    ref_soaps_file
        HDF5 file with the reference SOAPs from which representatives are taken.
    rep_idxs_file
        Text file whose first column holds the row indices of the
        representatives within the concatenated reference SOAPs.
    model_file
        JSON file providing the 'kernel_type' and 'gamma' entries.
    skrr_file
        JSON file holding the attribute dictionary of a fitted SparseKRR.
    work_dir
        Directory receiving `predicted_structure_<property_name>.dat` and
        `predicted_environment_<property_name>.dat`.

    Returns
    -------
    None
        Results are only written to disk.
    """

    # SOAPs of the structures to predict, one entry per structure
    soaps = load_structures_from_hdf5(soaps_file, datasets=None, concatenate=False)

    # Stack the reference SOAPs and keep only the representative rows
    reference_soaps = load_structures_from_hdf5(ref_soaps_file, datasets=datasets, concatenate=True)
    selected_rows = np.loadtxt(rep_idxs_file, usecols=0, dtype=int)
    representative_soaps = reference_soaps[selected_rows, :]

    # Kernel hyperparameters are stored as JSON
    with open(model_file, 'r') as model_handle:
        kernel_parameters = json.load(model_handle)

    kernel_kwargs = dict(kernel=kernel_parameters['kernel_type'],
                         gamma=kernel_parameters['gamma'])

    # The fitted model state is stored as JSON as well; lists are restored
    # to numpy arrays, everything else is kept as loaded
    with open(skrr_file, 'r') as skrr_handle:
        saved_state = json.load(skrr_handle)

    skrr_state = {name: np.array(value) if isinstance(value, list) else value
                  for name, value in saved_state.items()}

    # Kernel between the new structures and the representatives
    KNM = build_kernel(soaps, representative_soaps, **kernel_kwargs)

    # Rebuild the model by swapping in the saved attribute dictionary
    skrr = SparseKRR()
    skrr.__dict__ = skrr_state

    # Per-structure predictions
    Yp_structures = skrr.transform(KNM)

    # Per-environment predictions, built one structure at a time so that
    # only a single structure's environment kernel exists at once
    Yp_environments = np.concatenate([
        skrr.transform(build_kernel(soap, representative_soaps, **kernel_kwargs))
        for soap in tqdm(soaps)
    ])

    np.savetxt(f'{work_dir}/predicted_structure_{property_name}.dat', Yp_structures)
    np.savetxt(f'{work_dir}/predicted_environment_{property_name}.dat', Yp_environments)

# DEEM_10k

In [6]:
# Names of the structure properties to model
property_names = ['volumes', 'energies']

# Map each property name to its per-structure values read from disk
structure_properties = {
    pn: np.loadtxt(f'../Processed_Data/DEEM_10k/Data/structure_{pn}.dat')
    for pn in property_names
}

In [7]:
# For every SOAP cutoff and structure property: fit a sparse KRR model on the
# DEEM_10k train set, save the fitted model as JSON, and write per-structure
# and per-environment predictions for all DEEM_10k structures
for cutoff in cutoffs:

    # Set data directory (model parameters are read from and models saved to it)
    data_dir = f'../Processed_Data/Models/{cutoff}'

    # Set working directory; makedirs with exist_ok avoids the race between
    # an existence check and the creation, and also creates missing parents
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # Set SOAP files
    soaps_file = f'{work_dir}/soaps.hdf5'
    rep_idxs_file = f'{work_dir}/FPS_representatives.idxs'

    # Read SOAPs (one entry per structure)
    soaps = load_structures_from_hdf5(soaps_file, datasets=None, concatenate=False)

    # Build representative SOAPs: stack the train-set SOAPs, then select
    # the rows listed in the first column of the representatives file
    representative_idxs = np.loadtxt(rep_idxs_file, usecols=0, dtype=int)
    representative_soaps = np.vstack([soaps[i] for i in train_idxs])
    representative_soaps = representative_soaps[representative_idxs, :]

    # Loop over structure properties
    for pn, Y in structure_properties.items():

        # Load model parameters
        model_file = f'{data_dir}/{pn}_mae_parameters.json'

        # Load kernel parameters
        with open(model_file, 'r') as f:
            model_dict = json.load(f)

        kernel_type = model_dict['kernel_type']
        gamma = model_dict['gamma']

        # Build kernels: representatives vs. representatives (KMM) and
        # all structures vs. representatives (KNM)
        KMM = build_kernel(representative_soaps, representative_soaps,
                           kernel=kernel_type, gamma=gamma)
        KNM = build_kernel(soaps, representative_soaps,
                           kernel=kernel_type, gamma=gamma)

        # Center the property on the train-set mean and compute the scale
        # factor from the train-set variance and the mean diagonal of KMM
        Yc = Y - np.mean(Y[train_idxs])
        delta = np.var(Yc[train_idxs]) * KMM.shape[0] / np.trace(KMM)

        # Load regression parameters
        sigma = model_dict['sigma']
        reg = model_dict['reg']

        # Fit sparse KRR on the delta-scaled kernels and centered targets
        skrr = SparseKRR(sigma=sigma, reg=reg, rcond=None)
        skrr.fit(delta*KNM[train_idxs, :], delta*KMM, delta*Yc[train_idxs])

        # Serialize the model as JSON
        # Copy the dict so we can make the numpy arrays lists
        skrr_dict = skrr.__dict__.copy()

        # Convert arrays to lists (JSON cannot store numpy arrays)
        for k, v in skrr_dict.items():
            if isinstance(v, np.ndarray):
                skrr_dict[k] = v.tolist()

        # Save
        with open(f'{data_dir}/skrr_{pn}.json', 'w') as f:
            json.dump(skrr_dict, f)

        # Predict properties; the model was fit on centered targets, so the
        # train-set mean is not part of these predictions
        Yp_structures = np.zeros(len(soaps))
        Yp_structures[train_idxs] = skrr.transform(KNM[train_idxs, :])
        Yp_structures[test_idxs] = skrr.transform(KNM[test_idxs, :])

        # Iteratively predict environment properties to save memory
        Yp_environments = []
        for soap in tqdm(soaps):
            KNM_environments = build_kernel(soap, representative_soaps,
                                        kernel=kernel_type, gamma=gamma)
            Yp_environments.append(skrr.transform(KNM_environments))

        Yp_environments = np.concatenate(Yp_environments)

        # Save predicted properties
        np.savetxt(f'{work_dir}/predicted_structure_{pn}.dat', Yp_structures)
        np.savetxt(f'{work_dir}/predicted_environment_{pn}.dat', Yp_environments)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




# IZA_226 on DEEM_10k

In [8]:
# Predict the IZA_226 structure properties with the models fit on DEEM_10k,
# once per SOAP cutoff and property
for cutoff in cutoffs:

    # Set the working directory; makedirs with exist_ok avoids the race
    # between an existence check and the creation, and creates missing parents
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # Set the SOAP files: the IZA SOAPs to predict, plus the DEEM SOAPs and
    # representative indices that define the kernel's reference points
    soaps_file = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/soaps.hdf5'
    ref_soaps_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5'
    rep_idxs_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/FPS_representatives.idxs'

    # Compute SKRR predictions for each property
    for pn in structure_properties.keys():
        model_file = f'../Processed_Data/Models/{cutoff}/{pn}_mae_parameters.json'
        skrr_file = f'../Processed_Data/Models/{cutoff}/skrr_{pn}.json'
        transform_skrr_oos(cutoff, datasets, pn,
                           soaps_file, ref_soaps_file,
                           rep_idxs_file, model_file,
                           skrr_file, work_dir=work_dir)

HBox(children=(FloatProgress(value=0.0, max=226.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=226.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=226.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=226.0), HTML(value='')))




# COD_196 on DEEM_10k

In [9]:
# Predict the COD_196 structure properties with the models fit on DEEM_10k,
# once per SOAP cutoff and property
for cutoff in cutoffs:

    # Set the working directory; makedirs with exist_ok avoids the race
    # between an existence check and the creation, and creates missing parents
    work_dir = f'../Processed_Data/COD_196onDEEM_10k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # Set the SOAP files: the COD SOAPs to predict, plus the DEEM SOAPs and
    # representative indices that define the kernel's reference points
    soaps_file = f'../Processed_Data/COD_196onDEEM_10k/Data/{cutoff}/soaps.hdf5'
    ref_soaps_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5'
    rep_idxs_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/FPS_representatives.idxs'

    # Compute SKRR predictions for each property
    for pn in structure_properties.keys():
        model_file = f'../Processed_Data/Models/{cutoff}/{pn}_mae_parameters.json'
        skrr_file = f'../Processed_Data/Models/{cutoff}/skrr_{pn}.json'
        transform_skrr_oos(cutoff, datasets, pn,
                           soaps_file, ref_soaps_file,
                           rep_idxs_file, model_file,
                           skrr_file, work_dir=work_dir)

HBox(children=(FloatProgress(value=0.0, max=196.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=196.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=196.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=196.0), HTML(value='')))




# TESTING

In [10]:
# Train-set mean of each DEEM_10k property; the error cells below add it
# back onto the loaded predictions before comparing with the reference values
# NOTE(review): the fitting cell reads the properties from
# DEEM_10k/Data/structure_{pn}.dat, whereas this path has no Data/ component;
# confirm that both files exist and hold the same values
Y_mean = {}
for pn in ['volumes', 'energies']:
    Y_mean[pn] = np.loadtxt(f'../Processed_Data/DEEM_10k/structure_{pn}.dat')[train_idxs].mean()

## DEEM_10k

In [11]:
# Test-set errors of the DEEM_10k predictions for every cutoff and property
for cutoff in cutoffs:
    for pn in ['volumes', 'energies']:

        # Reference values, and predictions shifted by the train-set mean
        Y = np.loadtxt(f'../Processed_Data/DEEM_10k/structure_{pn}.dat')
        Yp = Y_mean[pn] + np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/predicted_structure_{pn}.dat')

        # Only the held-out structures enter the error metrics
        mae, rmse = (metric(Y[test_idxs], Yp[test_idxs]) for metric in (MAE, RMSE))

        print(f'{cutoff} {pn} MAE: {mae}',
              f'{cutoff} {pn} RMSE: {rmse}',
              sep='\n')

3.5 volumes MAE: 2.5997947727897315
3.5 volumes RMSE: 3.7752991352703043
3.5 energies MAE: 0.6679147247205498
3.5 energies RMSE: 0.9387783017460885
6.0 volumes MAE: 1.0495045073138882
6.0 volumes RMSE: 1.8395137747385406
6.0 energies MAE: 0.4620307674679215
6.0 energies RMSE: 0.6693558441871916


## IZA_226 on DEEM_10k

In [12]:
# Canton label of each entry, read from the second column of the cantons
# file, followed by the sorted set of distinct labels
cantons = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=(1,), dtype=int)
canton_labels = np.unique(cantons)
print(canton_labels)

[1 2 3 4]


In [13]:
# Errors of the IZA_226 predictions (models fit on DEEM_10k), overall and
# broken down by canton, for every cutoff and property
for cutoff in cutoffs:
    for pn in ['volumes', 'energies']:

        # Reference values, and predictions shifted by the DEEM train-set mean
        Y = np.loadtxt(f'../Processed_Data/IZA_226/structure_{pn}.dat')
        Yp = Y_mean[pn] + np.loadtxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/predicted_structure_{pn}.dat')

        # Errors over all structures
        mae, rmse = MAE(Y, Yp), RMSE(Y, Yp)

        # Errors restricted to the structures of each canton
        mae_cantons = [MAE(Y[cantons == canton], Yp[cantons == canton])
                       for canton in canton_labels]
        rmse_cantons = [RMSE(Y[cantons == canton], Yp[cantons == canton])
                        for canton in canton_labels]

        print(f'{cutoff} {pn} MAE: {mae} | {mae_cantons}',
              f'{cutoff} {pn} RMSE: {rmse} | {rmse_cantons}',
              sep='\n')

3.5 volumes MAE: 5.597071553718962 | [5.020883636526268, 5.133356486889813, 5.723347282333311, 73.75013726284374]
3.5 volumes RMSE: 8.605924587042685 | [6.134566648951043, 6.918039863286284, 7.805476815109476, 73.75013726284374]
3.5 energies MAE: 1.2295647615576781 | [1.373044032164554, 1.0360946547118863, 1.2324221456353692, 19.61684120822065]
3.5 energies RMSE: 2.171330635572147 | [1.9481079899780755, 1.5760536561678975, 1.897466617273481, 19.61684120822065]
6.0 volumes MAE: 2.1119540047946535 | [1.0388339319867168, 1.880866454355247, 2.306643039039767, 54.62498337801229]
6.0 volumes RMSE: 4.900791784046571 | [1.2504230461314574, 3.339528549606324, 3.889657173836812, 54.62498337801229]
6.0 energies MAE: 0.612571796556233 | [0.7003589840531017, 0.5632240906939902, 0.5117597908439594, 10.415656737794052]
6.0 energies RMSE: 1.057806574792901 | [1.1468966982608606, 0.7223079787003455, 0.7144406626110872, 10.415656737794052]


## COD_196 on DEEM_10k

In [14]:
# Errors of the COD_196 predictions (models fit on DEEM_10k) for every
# cutoff and property
for cutoff in cutoffs:
    for pn in ['volumes', 'energies']:

        # Reference values, and predictions shifted by the DEEM train-set mean
        Y = np.loadtxt(f'../Processed_Data/COD_196/structure_{pn}.dat')
        Yp = Y_mean[pn] + np.loadtxt(f'../Processed_Data/COD_196onDEEM_10k/Data/{cutoff}/predicted_structure_{pn}.dat')

        # All COD structures are out-of-sample, so none are excluded
        mae, rmse = (metric(Y, Yp) for metric in (MAE, RMSE))

        print(f'{cutoff} {pn} MAE: {mae}',
              f'{cutoff} {pn} RMSE: {rmse}',
              sep='\n')

3.5 volumes MAE: 8.806058367711302
3.5 volumes RMSE: 14.843398697350736
3.5 energies MAE: 36.22190421008153
3.5 energies RMSE: 108.65834586518969
6.0 volumes MAE: 3.223133406232656
6.0 volumes RMSE: 10.968619050371599
6.0 energies MAE: 29.63415327950305
6.0 energies RMSE: 95.48433053527935
