In [4]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from decomposition import IterativeSparseKPCA, KPCA
from kernels import build_kernel, linear_kernel, gaussian_kernel

# Utilities
import h5py
import json
from project_utils import load_structures_from_hdf5

# Initial setup

In [None]:
# Load SOAP cutoffs
# The hyperparameter file lives under 'Processed_Data', the same root
# directory used by every other path in this notebook.
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)

# One entry per SOAP interaction cutoff; the per-cutoff loops in this
# notebook iterate over this sequence.
cutoffs = soap_hyperparameters['interaction_cutoff']

In [None]:
# Representative SOAP vectors for each cutoff, keyed by the cutoff formatted
# as a string (e.g. representative_soaps[f'{cutoff}']).
representative_soaps = {}

# NOTE(review): build_representatives_from_hdf5 is not imported in the visible
# import cell -- confirm which module defines it and import it there.
for cutoff in cutoffs:
    # f-strings are required here: without the prefix the literal text
    # '{cutoff}' / '{work_dir}' would be used as the path and as the dict key.
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Cumulative sums of n_Si (minus the final total) give the split points
    # between consecutive structures -- presumably n_Si holds the number of
    # Si environments per structure; verify against the data files.
    n_Si = np.loadtxt(f'{work_dir}/n_Si.dat', dtype=int)
    split_idxs = np.cumsum(n_Si)[0:-1]

    # Indices of the representative environments (file name suggests they
    # were chosen by farthest point sampling -- TODO confirm).
    representative_idxs = np.loadtxt(f'{work_dir}/FPS_representatives.idxs', dtype=int)

    soaps_file = f'{work_dir}/soaps.hdf5'
    representative_soaps[f'{cutoff}'] = build_representatives_from_hdf5(
        soaps_file, representative_idxs, split_idxs
    )

In [None]:
# Number of KPCA components: passed to IterativeSparseKPCA(n_kpca=...) and
# recorded in the attributes of the KPCA HDF5 output files.
n_kpca = 100

# Functions

In [None]:
def _load_iskpca(model_file):
    """Rebuild an IterativeSparseKPCA model from a JSON dump of its __dict__.

    Parameters
    ----------
    model_file : str
        Path to a JSON file containing the saved attribute dictionary of a
        fitted IterativeSparseKPCA object.

    Returns
    -------
    IterativeSparseKPCA
        A model whose attributes have been restored from the file.
    """
    with open(model_file, 'r') as f:
        state = json.load(f)

    model = IterativeSparseKPCA(n_kpca=n_kpca)

    # JSON has no array type, so arrays come back as (nested) lists; convert
    # them to NumPy arrays again before restoring the attributes.
    # NOTE(review): assumes every list-valued attribute was originally an
    # ndarray -- confirm against the IterativeSparseKPCA implementation.
    model.__dict__.update({
        key: np.asarray(value) if isinstance(value, list) else value
        for key, value in state.items()
    })
    return model


def transform_iskpca(iskpca_environments_file, iskpca_structures_file, kernel_file,
                     representative_soaps, work_dir='.'):
    """Project the SOAPs in `work_dir` with previously fitted sparse KPCA models.

    Reads `{work_dir}/soaps.hdf5`, builds the kernel between each structure
    and the representative SOAPs, transforms it with the environment-level
    and structure-level models, and writes one dataset per structure to
    `{work_dir}/kpca_environments.hdf5` and `{work_dir}/kpca_structures.hdf5`.
    The structure-level input is the kernel averaged over axis 0.

    Parameters
    ----------
    iskpca_environments_file : str
        JSON file holding the saved environment-level IterativeSparseKPCA.
    iskpca_structures_file : str
        JSON file holding the saved structure-level IterativeSparseKPCA.
    kernel_file : str
        JSON file with the keys 'kernel_type' and 'kernel_parameters'.
    representative_soaps
        Representative SOAP vectors passed as the second argument of
        `build_kernel`.
    work_dir : str, optional
        Directory containing `soaps.hdf5`; the outputs are written here.

    Notes
    -----
    Uses the notebook-level `n_kpca` for the model construction and for the
    'n_components' attribute stored in the output files.
    """

    # Read SOAPs
    soaps = load_structures_from_hdf5(f'{work_dir}/soaps.hdf5')

    # Load the kernel parameters
    with open(kernel_file, 'r') as f:
        kernel_dict = json.load(f)

    kernel_type = kernel_dict['kernel_type']
    kernel_parameters = kernel_dict['kernel_parameters']

    # Load the reference models; json.load alone yields plain dicts, which
    # have no transform() method, so the objects are rebuilt.
    iskpca_environments = _load_iskpca(iskpca_environments_file)
    iskpca_structures = _load_iskpca(iskpca_structures_file)

    # Initialize the KPCA output; the context managers close both files even
    # if the transformation raises.
    with h5py.File(f'{work_dir}/kpca_environments.hdf5', 'w') as g, \
            h5py.File(f'{work_dir}/kpca_structures.hdf5', 'w') as h:

        # Save kernel parameters to the HDF5 files. The component count is
        # stored as 'n_components', the same attribute name the DEEM_10k
        # cell writes, so all KPCA outputs share one schema.
        for file_obj in (g, h):
            file_obj.attrs['kernel_type'] = kernel_type
            file_obj.attrs['n_components'] = n_kpca
            for key, value in kernel_parameters.items():
                file_obj.attrs[key] = value

        # Transform the data and save, one dataset per structure index
        for sdx, soap in enumerate(soaps):
            KNMi = build_kernel(soap, representative_soaps,
                                kernel=kernel_type, **kernel_parameters)
            kpcai_environments = iskpca_environments.transform(KNMi)
            kpcai_structures = iskpca_structures.transform(np.mean(KNMi, axis=0))
            g.create_dataset(f'{sdx:d}', data=kpcai_environments, track_order=True)
            h.create_dataset(f'{sdx:d}', data=kpcai_structures, track_order=True)

# DEEM_10k

In [None]:
def _json_default(obj):
    """Convert NumPy arrays and scalars into JSON-serializable Python objects.

    Intended as the `default` hook of `json.dump`; raises TypeError for any
    other type, as that hook's protocol requires.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


# Fit the environment- and structure-level sparse KPCA models on DEEM_10k for
# every cutoff, save the models, and write the projected DEEM_10k data.
for cutoff in cutoffs:
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    model_dir = f'../Processed_Data/DEEM_10k/Models/{cutoff}'
    cutoff_representatives = representative_soaps[f'{cutoff}']

    # Initialize SOAPs and KPCAs
    deem_10k = load_structures_from_hdf5(f'{data_dir}/soaps.hdf5')

    # Load kernel parameters
    with open(f'{data_dir}/kernel_parameters.json', 'r') as f:
        kernel_dict = json.load(f)

    kernel_type = kernel_dict['kernel_type']
    kernel_parameters = kernel_dict['kernel_parameters']

    # Build representative kernel
    KMM = build_kernel(cutoff_representatives, cutoff_representatives,
                       kernel=kernel_type, **kernel_parameters)

    # Initialize sparse KPCA for environments
    iskpca_environments = IterativeSparseKPCA(n_kpca=n_kpca)
    iskpca_environments.initialize_fit(KMM)

    # Initialize sparse KPCA for structures (must be an assignment: the name
    # does not exist before this line)
    iskpca_structures = IterativeSparseKPCA(n_kpca=n_kpca)
    iskpca_structures.initialize_fit(KMM)

    # Fit the sparse KPCA one structure at a time; the structure-level model
    # is fed the kernel averaged over axis 0.
    for soap in deem_10k:
        KNMi = build_kernel(soap, cutoff_representatives,
                            kernel=kernel_type, **kernel_parameters)
        iskpca_environments.fit_batch(KNMi)
        iskpca_structures.fit_batch(np.mean(KNMi, axis=0))

    # Finalize the KPCA fitting
    iskpca_environments.finalize_fit()
    iskpca_structures.finalize_fit()

    # Save the models as JSON dumps of their attribute dictionaries; NumPy
    # arrays and scalars are converted by _json_default.
    os.makedirs(model_dir, exist_ok=True)
    with open(f'{model_dir}/iskpca_environments.json', 'w') as f:
        json.dump(iskpca_environments.__dict__, f, default=_json_default)

    with open(f'{model_dir}/iskpca_structures.json', 'w') as f:
        json.dump(iskpca_structures.__dict__, f, default=_json_default)

    # Initialize the KPCA output; the context managers close both files even
    # if the transformation raises.
    with h5py.File(f'{data_dir}/kpca_environments.hdf5', 'w') as g, \
            h5py.File(f'{data_dir}/kpca_structures.hdf5', 'w') as h:

        # Save kernel parameters to HDF5 files
        for file_obj in (g, h):
            file_obj.attrs['kernel_type'] = kernel_type
            file_obj.attrs['n_components'] = n_kpca
            for key, value in kernel_parameters.items():
                file_obj.attrs[key] = value

        # Transform the data and save, one dataset per structure index
        for sdx, soap in enumerate(deem_10k):
            KNMi = build_kernel(soap, cutoff_representatives,
                                kernel=kernel_type, **kernel_parameters)
            kpcai_environments = iskpca_environments.transform(KNMi)
            kpcai_structures = iskpca_structures.transform(np.mean(KNMi, axis=0))
            g.create_dataset('{:d}'.format(sdx), data=kpcai_environments, track_order=True)
            h.create_dataset('{:d}'.format(sdx), data=kpcai_structures, track_order=True)

# IZA_226 on DEEM_10k

In [None]:
# Project IZA_226 with the models and kernel parameters fitted on DEEM_10k.
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'

    # Reference files produced by the DEEM_10k section: the saved models
    # live under Models/, the kernel parameters under Data/.
    deem_model_dir = f'../Processed_Data/DEEM_10k/Models/{cutoff}'
    iskpca_environments_file = f'{deem_model_dir}/iskpca_environments.json'
    iskpca_structures_file = f'{deem_model_dir}/iskpca_structures.json'
    kernel_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/kernel_parameters.json'

    transform_iskpca(iskpca_environments_file, iskpca_structures_file, kernel_file,
                     representative_soaps[f'{cutoff}'], work_dir=work_dir)

# COD_196 on DEEM_10k