In [1]:
# Automatically reload imported modules before each cell is executed, so that
# edits to the project helper modules take effect without a kernel restart
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# Utilities
import project_utils as utils
from tools import save_json
import h5py
from tqdm.auto import tqdm

# SOAP
from soap import librascal_soap

# Initial setup

In [3]:
# Interaction cutoffs (angstrom); each SOAP computation below is repeated
# once per cutoff
cutoffs = (3.5, 6.0)

# Keyword arguments shared by every `librascal_soap` call in this notebook
# (they are unpacked with `**soap_hyperparameters`); the interaction cutoff
# is deliberately kept out of this dictionary and passed per call
soap_hyperparameters = {
    'max_radial': 12,
    'max_angular': 9,
    'cutoff_smooth_width': 0.3,
    'gaussian_sigma_constant': 0.3,
    'cutoff_function_type': 'ShiftedCosine',
    'gaussian_sigma_type': 'Constant',
    'radial_basis': 'GTO',
    'expansion_by_species_method': 'environment wise',
    'global_species': None,
    'compute_gradients': False,
    'inversion_symmetry': True,
    'optimization_args': {},
    'cutoff_function_parameters': {},
    'coefficient_subselection': None,
}

In [4]:
# Save SOAP hyperparameters for quick reference. The cutoffs are stored with
# them under 'interaction_cutoff'; a new dictionary is built so that
# `soap_hyperparameters` itself does not gain that key.
soap_hyperparameters_copy = dict(soap_hyperparameters, interaction_cutoff=cutoffs)
save_json(soap_hyperparameters_copy, '../Processed_Data/soap_hyperparameters.json')

# DEEM 330k

In [8]:
# Chunk shapes handed to `librascal_soap` through its `chunks` argument for
# the DEEM 330k outputs, keyed by spectrum label.
# NOTE(review): presumably these are HDF5 (rows, columns) chunk sizes --
# confirm against `librascal_soap`.
chunk_shape = {
    'power': (
        100,
        soap_hyperparameters['max_radial']**2 * (soap_hyperparameters['max_angular'] + 1),
    ),
    'radial': (10000, soap_hyperparameters['max_radial']),
}

In [3]:
# Load DEEM 330k (index=':' selects every frame of the XYZ file)
deem_330k = read(
    '../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz',
    index=':',
)

In [25]:
# Compute unnormalized SOAPs for all DEEM 330k structures, retaining ALL
# components (`component_idxs=None`) and requesting averaged output
# (`average=True`).
# NOTE(review): `[14]` is the species list given to `librascal_soap`
# (atomic number 14 is Si) -- presumably the centre species; confirm.
for cutoff in cutoffs:

    # The output directory depends only on the cutoff, so build and create
    # it once per cutoff rather than once per spectrum type
    work_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # (soap_type passed to librascal_soap, label used in file names and
    # as the key into `chunk_shape`)
    for spectrum, spectrum_label in (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial')):
        output_file = librascal_soap(
            deem_330k, [14],
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            normalize=False,
            component_idxs=None,
            average=True,
            concatenate=True, # for faster access in processing
            chunks=chunk_shape[spectrum_label],
            output=f'{work_dir}/soaps_{spectrum_label}_full_avg_nonorm.hdf5'
        )

100%|██████████| 331172/331172 [33:05<00:00, 166.76it/s]
100%|██████████| 331172/331172 [10:34<00:00, 521.58it/s]
100%|██████████| 331172/331172 [1:36:19<00:00, 57.30it/s] 
100%|██████████| 331172/331172 [24:45<00:00, 222.98it/s]


# DEEM 10k

In [4]:
# Load DEEM 10k (index=':' selects every frame of the XYZ file)
deem_10k = read(
    '../Raw_Data/DEEM_10k/DEEM_10000.xyz',
    index=':',
)

In [5]:
# Stride construction from the 330k set to get the 10k set:
# every 32nd structure index, truncated to the first 10000 entries
deem_10k_idxs = np.arange(start=0, stop=len(deem_330k), step=32)[:10000]

In [6]:
# Rebuild the 10k set by picking the strided entries out of the 330k set
deem_10k_from_330k = [
    deem_330k[structure_idx]
    for structure_idx in deem_10k_idxs
]

In [7]:
# Check to make sure we have the correct structures: the 10k set read from
# file must equal the strided subset of the 330k set
structures_match = deem_10k == deem_10k_from_330k
print(structures_match)

# Printing alone lets the notebook carry on after a mismatch; stop here so
# the stride indices are never saved or reused for a set that differs
assert structures_match, (
    'DEEM 10k structures do not match the strided subset of DEEM 330k'
)

True


In [12]:
# Write the 330k -> 10k stride indices to a text file, one integer per line
np.savetxt(
    '../Processed_Data/DEEM_330k/deem_10k.idxs',
    deem_10k_idxs,
    fmt='%d',
)

In [22]:
# Compute unnormalized SOAPs for all DEEM 10k structures, retaining ALL
# components (`component_idxs=None`), without averaging or concatenation.
# NOTE(review): `[14]` is the species list given to `librascal_soap`
# (atomic number 14 is Si) -- presumably the centre species; confirm.
for cutoff in cutoffs:

    # The output directory depends only on the cutoff, so build and create
    # it once per cutoff rather than once per spectrum type
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # (soap_type passed to librascal_soap, label used in file names)
    for spectrum, spectrum_label in (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial')):
        output_file = librascal_soap(
            deem_10k, [14],
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            normalize=False,
            component_idxs=None,
            average=False,
            concatenate=False, # Need to be able to access the environments
            chunks=None,
            output=f'{work_dir}/soaps_{spectrum_label}_full_nonorm.hdf5'
        )

100%|██████████| 10000/10000 [02:04<00:00, 80.27it/s] 
100%|██████████| 10000/10000 [00:15<00:00, 628.48it/s]
100%|██████████| 10000/10000 [03:06<00:00, 53.73it/s]
100%|██████████| 10000/10000 [00:44<00:00, 226.20it/s]


# IZA

In [6]:
# Load IZA 230 (index=':' selects every frame of the XYZ file)
iza_230 = read(
    '../Raw_Data/GULP/IZA_230/IZA_230.xyz',
    index=':',
)

In [12]:
# Compute unnormalized SOAPs for all IZA structures, retaining ALL
# components (`component_idxs=None`) and requesting averaged output
# (`average=True`).
# NOTE(review): `[14]` is the species list given to `librascal_soap`
# (atomic number 14 is Si) -- presumably the centre species; confirm.
for cutoff in cutoffs:

    # The output directory depends only on the cutoff, so build and create
    # it once per cutoff rather than once per spectrum type
    work_dir = f'../Processed_Data/IZA_230/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # (soap_type passed to librascal_soap, label used in file names)
    for spectrum, spectrum_label in (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial')):
        output_file = librascal_soap(
            iza_230, [14],
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            normalize=False,
            component_idxs=None,
            average=True,
            concatenate=True, # For easy access
            chunks=None,
            output=f'{work_dir}/soaps_{spectrum_label}_full_avg_nonorm.hdf5'
        )

100%|██████████| 230/230 [00:01<00:00, 132.59it/s]
100%|██████████| 230/230 [00:00<00:00, 575.77it/s]
100%|██████████| 230/230 [00:04<00:00, 49.52it/s]
100%|██████████| 230/230 [00:01<00:00, 220.59it/s]


In [13]:
# Compute unnormalized SOAPs for all IZA structures, retaining ALL
# components (`component_idxs=None`), without averaging or concatenation.
# NOTE(review): `[14]` is the species list given to `librascal_soap`
# (atomic number 14 is Si) -- presumably the centre species; confirm.
for cutoff in cutoffs:

    # The output directory depends only on the cutoff, so build and create
    # it once per cutoff rather than once per spectrum type
    work_dir = f'../Processed_Data/IZA_230/Data/{cutoff}'
    os.makedirs(work_dir, exist_ok=True)

    # (soap_type passed to librascal_soap, label used in file names)
    for spectrum, spectrum_label in (('PowerSpectrum', 'power'), ('RadialSpectrum', 'radial')):
        output_file = librascal_soap(
            iza_230, [14],
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            normalize=False,
            component_idxs=None,
            average=False,
            concatenate=False, # Need to be able to access the environments
            chunks=None,
            output=f'{work_dir}/soaps_{spectrum_label}_full_nonorm.hdf5'
        )

100%|██████████| 230/230 [00:02<00:00, 99.59it/s] 
100%|██████████| 230/230 [00:00<00:00, 561.02it/s]
100%|██████████| 230/230 [00:04<00:00, 47.19it/s]
100%|██████████| 230/230 [00:01<00:00, 218.75it/s]
