In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# Utilities
import project_utils as utils
from tools import save_json
import h5py
from tqdm.auto import tqdm

# SOAP
from soap import librascal_soap
from soap import reshape_expansion, legendre_dvr

# Initial setup

In [3]:
# Make sure the root output directories exist before anything is written;
# exist_ok=True makes this cell safe to re-run
for root_dir in (
    '../Processed_Data/DEEM_330k',
    '../Processed_Data/IZA_230',
    '../Results/3.5',
    '../Results/6.0',
):
    os.makedirs(root_dir, exist_ok=True)

In [4]:
# SOAP cutoffs (angstrom); each cutoff-dependent step in this notebook
# loops over these values and passes them on as `interaction_cutoff`
cutoffs = (3.5, 6.0)

In [5]:
# Load DEEM 330k: read every frame (index=':') of the XYZ file with ase.io.read
deem_330k = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [6]:
# Get the unique species (atomic numbers) that occur anywhere in DEEM 330k.
# The per-frame unique numbers are gathered in a set rather than stacked into
# one array, so frames with different numbers of species (which would give a
# ragged list) cannot break the array construction.
# The values are converted to plain Python int b/c numpy ints
# muck up the librascal JSON I/O.
species_list = sorted(
    {
        int(atomic_number)
        for frame in deem_330k
        for atomic_number in np.unique(frame.get_atomic_numbers())
    }
)
n_species = len(species_list)
print(species_list)

[8, 14]


In [7]:
# Load DEEM 10k
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')

# Stride construction from the 330k set to get the 10k set:
# every 32nd structure, truncated to the first 10000 entries
deem_10k_idxs = np.arange(0, len(deem_330k), 32)[0:10000]

# Get 10k set from 330k set
deem_10k_from_330k = [deem_330k[i] for i in deem_10k_idxs]

# Check to make sure we have the correct structures. The result is still
# printed, but a mismatch now stops the notebook instead of letting the
# following cell save indices that do not reproduce the 10k set.
deem_10k_matches = deem_10k == deem_10k_from_330k
print(deem_10k_matches)
assert deem_10k_matches, (
    'The strided selection from DEEM 330k does not reproduce DEEM 10k'
)

True


In [8]:
# Save the 10k-in-330k indices as plain text, written as integers (fmt='%d')
np.savetxt('../Processed_Data/DEEM_330k/deem_10k.idxs', deem_10k_idxs, fmt='%d')

In [9]:
# Load IZA 230: read every frame (index=':') of the XYZ file with ase.io.read
iza_230 = read('../Raw_Data/GULP/IZA_230/IZA_230.xyz', index=':')

# Determining optimal basis set

## Hyperparameters for optimal basis set determination

In [10]:
# SOAP hyperparameters used to determine the optimal radial basis
# (the interaction cutoff is supplied separately, once per cutoff)
basis_hyperparameters = {
    'max_radial': 32,
    'max_angular': 9,
    'cutoff_smooth_width': 0.3,
    'gaussian_sigma_constant': 0.3,
    'gaussian_sigma_type': 'Constant',
    'radial_basis': 'DVR',
}

# Remaining keyword arguments forwarded to the SOAP calculation
basis_args = {
    'center_species': [14],
    'representation': 'SphericalExpansion',
}

In [11]:
# Write the basis hyperparameters to JSON for quick reference; the saved copy
# additionally records all cutoffs under 'interaction_cutoff', while the
# original dictionary is left untouched
basis_hyperparameters_copy = dict(basis_hyperparameters, interaction_cutoff=cutoffs)
save_json(basis_hyperparameters_copy, '../Processed_Data/basis_hyperparameters.json')

## Compute optimal basis via PCA
We compute the optimal basis from the full set of Deem frameworks, accumulating the covariance of the density expansion coefficients batch by batch.

In [27]:
# Batches for computing the density coefficient covariances:
# DEEM 330k is processed in slices of at most `batch_size` structures
batch_size = 100
n_deem = len(deem_330k)

# Ceiling division, so a trailing partial batch is counted as well
n_batches = -(-n_deem // batch_size)

In [28]:
# Shorthand aliases for the radial and angular expansion limits
n_max, l_max = (
    basis_hyperparameters[key] for key in ('max_radial', 'max_angular')
)

In [None]:
# For every cutoff: accumulate the covariance of the density expansion
# coefficients over all DEEM 330k batches, diagonalize it for each species
# and angular channel, and store the eigenvalues/eigenvectors in
# `basis_projectors.hdf5` inside the cutoff's working directory.
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    # Make required directories
    os.makedirs(work_dir, exist_ok=True)

    # Covariance for each species and angular channel
    # of shape (n_max, n_max)
    C = np.zeros((n_species, l_max + 1, n_max, n_max))

    n_centers = 0
    for i in tqdm(range(0, n_batches)):
        batch_frames = deem_330k[i * batch_size:(i + 1) * batch_size]

        # Compute expansion coefficients
        batch_soaps = librascal_soap(
            batch_frames,
            interaction_cutoff=cutoff,
            **basis_hyperparameters,
            **basis_args,
            average=False,
            concatenate=True,
            progress_bar=False
        )

        # With concatenate=True the length is presumably the number of
        # centers (environments) in this batch -- confirm in librascal_soap
        n_centers += len(batch_soaps)

        # Expand real-space density
        batch_soaps = reshape_expansion(
            batch_soaps, n_species, n_max, l_max, split_l=True
        )

        # Compute covariance of density coefficients for
        # each species and angular channel,
        # i: center atom index
        # a: species index
        # x: radial index 1
        # y: radial index 2
        # l: angular degree index
        # m: angular order index
        C += np.einsum('iaxlm,iaylm->alxy', batch_soaps, batch_soaps, optimize=True)

    # Turn the accumulated sums into an average over all centers
    C /= n_centers

    # np.linalg.eigh diagonalizes the matrices in the last two axes of a
    # stacked array, so all (species, l) blocks are handled in one call.
    # eigh returns eigenvalues in ascending order; flip the last axis so that
    # eigenvalues (v) and the matching eigenvector columns (U) are descending.
    # v: n_max eigenvalues for each covariance
    # U: n_max eigenvectors of length n_max for each covariance
    v, U = np.linalg.eigh(C)
    v = np.ascontiguousarray(np.flip(v, axis=-1))
    U = np.ascontiguousarray(np.flip(U, axis=-1))

    # Save eigenvalues and eigenvectors; the context manager closes the file
    # even if one of the writes fails
    with h5py.File(f'{work_dir}/basis_projectors.hdf5', 'w') as f:
        f.create_dataset('v', data=v)
        f.create_dataset('U', data=U)

        # Save basis hyperparameters
        for hyperparameter, value in basis_hyperparameters.items():
            f.attrs[hyperparameter] = value

HBox(children=(FloatProgress(value=0.0, max=3312.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3312.0), HTML(value='')))

# Compute spline-based SOAPs

## Hyperparameters for spline-based basis set

In [45]:
# Setup SOAP parameters: start from the basis hyperparameters, but use a
# smaller radial basis and switch normalization off
soap_hyperparameters = dict(
    basis_hyperparameters,
    max_radial=12,
    normalize=False
)

# Same arguments as for the basis determination, except for the representation
soap_args = dict(basis_args, representation='SphericalInvariants')

In [46]:
# Write the SOAP hyperparameters to JSON for quick reference; the saved copy
# additionally records all cutoffs under 'interaction_cutoff', while the
# original dictionary is left untouched
soap_hyperparameters_copy = dict(soap_hyperparameters, interaction_cutoff=cutoffs)
save_json(soap_hyperparameters_copy, '../Processed_Data/soap_hyperparameters.json')

In [47]:
# Build the spline arguments: for every cutoff and spectrum type, a nested
# dictionary with the spline accuracy and the per-species radial projection
# matrices taken from the stored eigenvectors U.

# Number of radial functions kept after the projection
n_radial_kept = soap_hyperparameters['max_radial']

# Angular channels of U used for each spectrum type: all channels for the
# power spectrum, only the first (l = 0) channel for the radial spectrum
angular_channels = {
    'PowerSpectrum': slice(None),
    'RadialSpectrum': slice(0, 1)
}

spline_args = {}
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    v, U = utils.load_hdf5(
        f'{work_dir}/basis_projectors.hdf5',
        datasets=['v', 'U']
    )

    spline_args[cutoff] = {}

    for spectrum, spectrum_slice in angular_channels.items():

        # Keep the first `n_radial_kept` columns of U for each selected
        # angular channel, then swap the last two axes so that the kept
        # columns come first; converted to nested lists for JSON output
        projection_matrix = {
            species: np.moveaxis(
                U[s, spectrum_slice, :, 0:n_radial_kept],
                1, 2
            ).tolist() for s, species in enumerate(species_list)
        }

        spline_args[cutoff][spectrum] = dict(
            optimization=dict(
                Spline=dict(accuracy=1.0E-8),
                RadialDimReduction=dict(
                    projection_matrices=projection_matrix
                )
            )
        )

In [48]:
# Save the spline arguments (all cutoffs and spectrum types) to JSON
save_json(spline_args, '../Processed_Data/soap_spline.json')

## DEEM 330k

In [49]:
# HDF5 chunk shapes for I/O, keyed by spectrum label
n_radial_soap = soap_hyperparameters['max_radial']
n_angular_soap = soap_hyperparameters['max_angular'] + 1

chunk_shape = dict(
    power=(100, n_radial_soap**2 * n_angular_soap),
    radial=(10000, n_radial_soap)
)

In [52]:
# DEEM 330k: unnormalized power and radial spectra with ALL components kept,
# computed with average=True (the original note says "average over
# structures"; the exact averaging is defined by librascal_soap)
spectrum_types = (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial'))

for cutoff in cutoffs:

    # The working directory only depends on the cutoff
    work_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    for spectrum, spectrum_label in spectrum_types:
        soap_path = f'{work_dir}/soaps_{spectrum_label}_full_avg_nonorm.hdf5'

        output_file = librascal_soap(
            deem_330k,
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            **soap_args,
            **spline_args[cutoff][spectrum],
            average=True,
            concatenate=True,  # for faster access in processing
            chunks=chunk_shape[spectrum_label],
            output=soap_path
        )

HBox(children=(FloatProgress(value=0.0, max=331172.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=331172.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=331172.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=331172.0), HTML(value='')))




## DEEM 10k

In [53]:
# DEEM 10k: unnormalized power and radial spectra with ALL components kept,
# without averaging (average=False)
spectrum_types = (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial'))

for cutoff in cutoffs:

    # The working directory only depends on the cutoff; make sure it exists
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    for spectrum, spectrum_label in spectrum_types:
        soap_path = f'{work_dir}/soaps_{spectrum_label}_full_nonorm.hdf5'

        output_file = librascal_soap(
            deem_10k,
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            **soap_args,
            **spline_args[cutoff][spectrum],
            average=False,
            concatenate=False,  # Need to be able to access the environments
            chunks=None,
            output=soap_path
        )

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




## IZA

In [54]:
# IZA 230: unnormalized power and radial spectra with ALL components kept,
# computed with average=True (the original note says "average over
# structures"; the exact averaging is defined by librascal_soap)
spectrum_types = (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial'))

for cutoff in cutoffs:

    # The working directory only depends on the cutoff; make sure it exists
    work_dir = f'../Processed_Data/IZA_230/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    for spectrum, spectrum_label in spectrum_types:
        soap_path = f'{work_dir}/soaps_{spectrum_label}_full_avg_nonorm.hdf5'

        output_file = librascal_soap(
            iza_230,
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            **soap_args,
            **spline_args[cutoff][spectrum],
            average=True,
            concatenate=True,  # For easy access
            chunks=None,
            output=soap_path
        )

HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))




In [55]:
# IZA 230: unnormalized power and radial spectra with ALL components kept,
# without averaging (average=False)
spectrum_types = (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial'))

for cutoff in cutoffs:

    # The working directory only depends on the cutoff; make sure it exists
    work_dir = f'../Processed_Data/IZA_230/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    for spectrum, spectrum_label in spectrum_types:
        soap_path = f'{work_dir}/soaps_{spectrum_label}_full_nonorm.hdf5'

        output_file = librascal_soap(
            iza_230,
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            **soap_args,
            **spline_args[cutoff][spectrum],
            average=False,
            concatenate=False,  # Need to be able to access the environments
            chunks=None,
            output=soap_path
        )

HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))


