In [5]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from regression import SparseKRR
from kernels import build_kernel

# Utilities
import h5py
import json

# Initial setup

In [None]:
# Read the integer structure indices that define the train/test split
test_idxs = np.loadtxt('../Processed_Data/DEEM_10k/test.idxs', dtype=int)
train_idxs = np.loadtxt('../Processed_Data/DEEM_10k/train.idxs', dtype=int)

# Every structure is counted once: the sizes of the two index sets are summed
n_structures = sum(split.size for split in (train_idxs, test_idxs))

In [None]:
# HDF5 dataset labels are the training structure indices, zero-padded to a
# common width (the number of digits in the largest index, n_structures - 1)
n_digits = len(f'{n_structures - 1}')
datasets = []
for structure_idx in train_idxs:
    datasets.append(str(structure_idx).zfill(n_digits))

In [None]:
# Load SOAP cutoffs
# (path fixed: the directory is 'Processed_Data', as in every other cell,
# not 'Processeed_Data')
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)

# The cutoffs are iterated over below to locate the per-cutoff data directories
cutoffs = soap_hyperparameters['interaction_cutoff']

# Functions

In [None]:
def transform_skrr_oos(cutoff, datasets, property_name,
                       soaps_file, ref_soaps_file,
                       rep_idxs_file, model_file,
                       skrr_file, work_dir='.'):
    """Predict a property for out-of-sample structures with a saved sparse KRR model.

    The kernel settings are read from `model_file`, the fitted SparseKRR state
    from `skrr_file`, and the representative (sparse) points are selected from
    the reference SOAPs. Predictions are written to `work_dir` as
    `predicted_structure_<property_name>.dat` and
    `predicted_environment_<property_name>.dat`; nothing is returned.

    Parameters
    ----------
    cutoff
        Not used inside the function; kept so existing callers keep working.
    datasets
        Dataset labels passed to `load_structures_from_hdf5` when reading the
        reference SOAPs.
    property_name : str
        Name inserted into the output file names.
    soaps_file
        HDF5 file with the SOAPs of the structures to predict on.
    ref_soaps_file
        HDF5 file with the SOAPs the representative indices refer to.
    rep_idxs_file
        Text file whose first column holds the representative indices.
    model_file
        JSON file holding 'kernel_type' and the kernel 'gamma'.
    skrr_file
        JSON file holding the serialized `SparseKRR.__dict__`.
    work_dir : str
        Directory the prediction files are written to.

    Raises
    ------
    KeyError
        If `model_file` lacks 'kernel_type' or a 'gamma' entry.
    """

    # Read SOAPs (one entry per structure, not concatenated)
    # NOTE(review): `load_structures_from_hdf5` is not imported in the visible
    # cells of this notebook -- confirm it is imported from the utils toolbox.
    soaps = load_structures_from_hdf5(soaps_file, datasets=None, concatenate=False)

    # Load reference SOAPs and pick out the representative rows
    ref_soaps = load_structures_from_hdf5(ref_soaps_file, datasets=datasets, concatenate=True)
    representative_idxs = np.loadtxt(rep_idxs_file, usecols=0, dtype=int)
    representative_soaps = ref_soaps[representative_idxs, :]

    # Load the kernel parameters
    with open(model_file, 'r') as f:
        model_dict = json.load(f)

    kernel_type = model_dict['kernel_type']
    # The DEEM_10k cell reads 'kernel_parameters' from the same kind of file,
    # so fall back to a nested 'gamma' when there is no top-level one.
    # NOTE(review): confirm which layout the parameter files actually use.
    if 'gamma' in model_dict:
        gamma = model_dict['gamma']
    else:
        gamma = model_dict['kernel_parameters']['gamma']

    # Load the serialized reference model.
    # Fixed: this previously re-opened `model_file`, so the SparseKRR state
    # stored in `skrr_file` was never read.
    with open(skrr_file, 'r') as f:
        skrr_dict = json.load(f)

    # JSON stores arrays as lists; turn them back into arrays
    skrr_dict = {k: np.array(v) if isinstance(v, list) else v
                 for k, v in skrr_dict.items()}

    # Build kernels: one from the per-structure SOAP list, one from all
    # environments stacked row-wise.
    # Fixed: np.vstack takes no `axis` argument, and `gama` was a typo.
    KNM = build_kernel(soaps, representative_soaps,
                       kernel=kernel_type, gamma=gamma)
    KNM_environments = build_kernel(np.vstack(soaps), representative_soaps,
                                    kernel=kernel_type, gamma=gamma)

    # Restore the SKRR model from its saved attributes
    skrr = SparseKRR()
    skrr.__dict__ = skrr_dict

    # Predict based on the loaded SparseKRR object
    Yp_structures = skrr.transform(KNM)
    Yp_environments = skrr.transform(KNM_environments)

    np.savetxt(f'{work_dir}/predicted_structure_{property_name}.dat', Yp_structures)
    np.savetxt(f'{work_dir}/predicted_environment_{property_name}.dat', Yp_environments)

# DEEM_10k

In [None]:
# Names of the per-structure properties to model
property_names = ['volumes', 'energies']

# Map each property name to the array read from its structure_<name>.dat file
structure_properties = {
    pn: np.loadtxt(f'../Processed_Data/DEEM_10k/Data/structure_{pn}.dat')
    for pn in property_names
}

In [None]:
# For every SOAP cutoff: fit one sparse KRR model per structure property on the
# training structures, save the fitted model as JSON, and write predictions
# for all structures and all environments.
for cutoff in cutoffs:

    # Model parameters are read from, and fitted models written to, data_dir
    data_dir = f'../Processed_Data/DEEM_10k/Models/{cutoff}'

    # SOAPs are read from, and predictions written to, work_dir
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # Set SOAP files
    soaps_file = f'{work_dir}/soaps.hdf5'
    rep_idxs_file = f'{work_dir}/FPS_representatives.idxs'

    # Read SOAPs (one entry per structure, not concatenated)
    # NOTE(review): `load_structures_from_hdf5` is not imported in the visible
    # cells of this notebook -- confirm it is imported from the utils toolbox.
    soaps = load_structures_from_hdf5(soaps_file, datasets=None, concatenate=False)

    # All environments stacked row-wise; independent of the property, so it is
    # built once per cutoff instead of once per property
    environment_soaps = np.vstack(soaps)

    # Build representative SOAPs: the representative indices select rows of
    # the stacked training-structure SOAPs
    representative_idxs = np.loadtxt(rep_idxs_file, usecols=0, dtype=int)
    representative_soaps = np.vstack([soaps[i] for i in train_idxs])
    representative_soaps = representative_soaps[representative_idxs, :]

    # Loop over structure properties
    for pn, Y in structure_properties.items():

        # Load model parameters
        model_file = f'{data_dir}/{pn}_mae_parameters.json'
        with open(model_file, 'r') as f:
            model_dict = json.load(f)

        kernel_type = model_dict['kernel_type']

        # Fixed: `gamma` was passed to build_kernel but never defined here.
        # `transform_skrr_oos` reads a top-level 'gamma' from the same kind of
        # file while this cell read 'kernel_parameters'; accept either layout.
        # NOTE(review): confirm which layout the parameter files actually use.
        kernel_parameters = model_dict.get('kernel_parameters', {})
        if 'gamma' in model_dict:
            gamma = model_dict['gamma']
        else:
            gamma = kernel_parameters['gamma']

        # Build kernels
        KMM = build_kernel(representative_soaps, representative_soaps,
                           kernel=kernel_type, gamma=gamma)
        KNM = build_kernel(soaps, representative_soaps,
                           kernel=kernel_type, gamma=gamma)
        KNM_environments = build_kernel(environment_soaps, representative_soaps,
                                        kernel=kernel_type, gamma=gamma)

        # Centre the property on the training mean; delta is the training
        # variance divided by the mean diagonal element of KMM
        Yc = Y - np.mean(Y[train_idxs])
        delta = np.var(Yc[train_idxs]) * KMM.shape[0] / np.trace(KMM)

        # Load regression parameters
        sigma = model_dict['sigma']
        reg = model_dict['reg']

        # Fit sparse KRR on the training rows only
        skrr = SparseKRR(sigma=sigma, reg=reg, rcond=None)
        skrr.fit(delta*KNM[train_idxs, :], delta*KMM, delta*Yc[train_idxs])

        # Serialize the model: copy its attributes and turn arrays into lists
        # so that the dict is JSON-compatible
        skrr_dict = {k: v.tolist() if isinstance(v, np.ndarray) else v
                     for k, v in skrr.__dict__.items()}

        # Save
        with open(f'{data_dir}/skrr_{pn}.json', 'w') as f:
            json.dump(skrr_dict, f)

        # Predict properties; the model was fitted on centred targets and the
        # training mean is not added back here
        Yp_structures = np.zeros(len(soaps))
        Yp_structures[train_idxs] = skrr.transform(KNM[train_idxs, :])
        Yp_structures[test_idxs] = skrr.transform(KNM[test_idxs, :])
        Yp_environments = skrr.transform(KNM_environments)

        # Save predicted properties
        np.savetxt(f'{work_dir}/predicted_structure_{pn}.dat', Yp_structures)
        np.savetxt(f'{work_dir}/predicted_environment_{pn}.dat', Yp_environments)

# IZA_226 on DEEM_10k

In [None]:
# Apply the DEEM_10k models to the IZA_226 structures, one cutoff at a time
for cutoff in cutoffs:

    # Directory holding the IZA SOAPs and receiving the predictions
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # SOAPs to predict on, plus the DEEM_10k reference SOAPs and the
    # representative indices that refer to them
    soaps_file = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/soaps.hdf5'
    ref_soaps_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5'
    rep_idxs_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/FPS_representatives.idxs'

    # Compute SKRR predictions for each property
    for pn in structure_properties:
        # Fixed: the dataset directory is 'DEEM_10k' (it was misspelled
        # 'DEEM_10'), matching the skrr_file path and the cell that writes
        # these models
        model_file = f'../Processed_Data/DEEM_10k/Models/{cutoff}/{pn}_mae_parameters.json'
        skrr_file = f'../Processed_Data/DEEM_10k/Models/{cutoff}/skrr_{pn}.json'
        transform_skrr_oos(cutoff, datasets, pn,
                           soaps_file, ref_soaps_file,
                           rep_idxs_file, model_file,
                           skrr_file, work_dir=work_dir)

# COD_196 on DEEM_10k