In [4]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from decomposition import IterativeSparseKPCA, KPCA
from kernels import build_kernel, linear_kernel, gaussian_kernel

# Utilities
import h5py
import json
from project_utils import load_structures_from_hdf5

# Initial setup

In [None]:
# TODO: load cutoffs, kernel type, gaussian widths, regularizations, and sigmas

In [None]:
# Cutoff values to process; each one has its own data directory under DEEM_10k/Data
cutoffs = (3.5, 6.0)

# Representatives for each cutoff, keyed by the cutoff formatted as a string (e.g. '3.5')
representative_soaps = {}

for cutoff in cutoffs:
    # Interpolate the cutoff into the path (the strings must be f-strings,
    # otherwise the literal text '{cutoff}' is used as the directory name)
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Integer counts read from n_Si.dat
    # (presumably the number of Si environments per structure -- TODO confirm)
    n_Si = np.loadtxt(f'{work_dir}/n_Si.dat', dtype=int)

    # Cumulative counts without the final total, passed on as split indices
    split_idxs = np.cumsum(n_Si)[0:-1]

    # Indices of the representatives stored in FPS_representatives.idxs
    representative_idxs = np.loadtxt(f'{work_dir}/FPS_representatives.idxs', dtype=int)
    soaps_file = f'{work_dir}/soaps.hdf5'

    # NOTE(review): build_representatives_from_hdf5 is not imported in the visible
    # import cell -- presumably it lives in project_utils; confirm and import it.
    representative_soaps[f'{cutoff}'] = build_representatives_from_hdf5(
        soaps_file, representative_idxs, split_idxs)

In [None]:
# TODO: put reusable code into functions within this notebook

# Functions

In [None]:
def transform_iskpca(iskpca_environments, iskpca_structures, representative_soaps, 
                     kernel_type, kernel_params, work_dir='.'):
    """Project the SOAPs found in `work_dir` with two KPCA models and save the results.

    For every entry loaded from ``{work_dir}/soaps.hdf5`` a kernel against
    `representative_soaps` is built. The full kernel is transformed with
    `iskpca_environments`, and its mean over axis 0 is transformed with
    `iskpca_structures`. Each result is written as one dataset, named by the
    entry's integer index, to ``{work_dir}/kpca_environments.hdf5`` and
    ``{work_dir}/kpca_structures.hdf5`` respectively (both opened in 'w' mode,
    so existing files are overwritten).

    Parameters
    ----------
    iskpca_environments : object with a ``transform`` method
        Model applied to the full kernel of each entry.
    iskpca_structures : object with a ``transform`` method
        Model applied to the axis-0 mean of the kernel of each entry.
    representative_soaps : array-like
        Second argument to ``build_kernel`` for every entry.
    kernel_type : passed to ``build_kernel`` as ``kernel``
    kernel_params : dict
        Extra keyword arguments unpacked into ``build_kernel``.
    work_dir : str, optional
        Directory holding ``soaps.hdf5`` and receiving the two output files.

    Returns
    -------
    None
    """

    # Read SOAPs
    soaps = load_structures_from_hdf5(f'{work_dir}/soaps.hdf5')

    # Initialize the KPCA output; the context managers close both files
    # even if building a kernel or transforming fails part-way through
    with h5py.File(f'{work_dir}/kpca_environments.hdf5', 'w') as g, \
            h5py.File(f'{work_dir}/kpca_structures.hdf5', 'w') as h:

        # TODO: save all the metadata to the file (number of components, kernel params, etc.)

        # Transform the data and save
        for sdx, soap in enumerate(soaps):
            KNMi = build_kernel(soap, representative_soaps,
                                kernel=kernel_type, **kernel_params)
            kpcai_environments = iskpca_environments.transform(KNMi)

            # Averaging over axis 0 collapses the kernel to a single row
            # (presumably one row per environment -- TODO confirm against build_kernel)
            kpcai_structures = iskpca_structures.transform(np.mean(KNMi, axis=0))

            # Dataset names are the integer index of the entry ('0', '1', ...)
            g.create_dataset('{:d}'.format(sdx), data=kpcai_environments, track_order=True)
            h.create_dataset('{:d}'.format(sdx), data=kpcai_structures, track_order=True)

# DEEM_10k

In [None]:
# Fit one pair of sparse KPCA models per cutoff on DEEM_10k, then project DEEM_10k itself.
# NOTE(review): kernel_type, kernel_parameters, and n_kpca are not defined in the
# visible cells; they must be set before this cell can run.
for cutoff in cutoffs:

    # Per-cutoff data directory and the representatives stored under the same key
    data_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    representatives = representative_soaps[f'{cutoff}']

    # Initialize SOAPs and KPCAs
    deem_10k = load_structures_from_hdf5(f'{data_dir}/soaps.hdf5')

    # Build representative kernel
    KMM = build_kernel(representatives, representatives,
                       kernel=kernel_type, **kernel_parameters)

    # Initialize sparse KPCA for environments
    iskpca_environments = IterativeSparseKPCA(n_kpca=n_kpca)
    iskpca_environments.initialize_fit(KMM)

    # Initialize sparse KPCA for structures
    # (this must be an assignment; the model is used below)
    iskpca_structures = IterativeSparseKPCA(n_kpca=n_kpca)
    iskpca_structures.initialize_fit(KMM)

    # Fit the sparse KPCA one entry at a time
    for soap in deem_10k:
        KNMi = build_kernel(soap, representatives,
                            kernel=kernel_type, **kernel_parameters)
        iskpca_environments.fit_batch(KNMi)

        # The structure model sees the axis-0 mean of the same kernel
        iskpca_structures.fit_batch(np.mean(KNMi, axis=0))

    # Finalize the KPCA fitting
    iskpca_environments.finalize_fit()
    iskpca_structures.finalize_fit()

    # TODO: pickle the models

    # Initialize the KPCA output; the context managers close both files
    # even if a kernel build or transform raises part-way through
    with h5py.File(f'{data_dir}/kpca_environments.hdf5', 'w') as g, \
            h5py.File(f'{data_dir}/kpca_structures.hdf5', 'w') as h:

        # TODO: save all the metadata to the files (number of components, kernel params, etc.)

        # Transform the data and save, one dataset per entry named by its index
        for sdx, soap in enumerate(deem_10k):
            KNMi = build_kernel(soap, representatives,
                                kernel=kernel_type, **kernel_parameters)
            kpcai_environments = iskpca_environments.transform(KNMi)
            kpcai_structures = iskpca_structures.transform(np.mean(KNMi, axis=0))
            g.create_dataset('{:d}'.format(sdx), data=kpcai_environments, track_order=True)
            h.create_dataset('{:d}'.format(sdx), data=kpcai_structures, track_order=True)

# IZA_226 on DEEM_10k

In [None]:
# Project the IZA_226 SOAPs for every cutoff with the models fitted on DEEM_10k
for cutoff in cutoffs:
    # Interpolate the cutoff into the path (f-string; a plain string would
    # point at a directory literally named '{cutoff}')
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'

    # TODO: unpickle the models
    # NOTE(review): while these are None, transform_iskpca fails on its first
    # call to .transform; load the fitted models before running this cell.
    iskpca_environments = None
    iskpca_structures = None

    # Representatives are looked up under the cutoff formatted as a string
    transform_iskpca(iskpca_environments, iskpca_structures, representative_soaps[f'{cutoff}'], 
                     kernel_type, kernel_parameters, work_dir=work_dir)

# COD_196 on DEEM_10k