In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# System
import json
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# Utilities
import project_utils as utils

# SOAP
from soap import librascal_soap

# Initial setup

In [5]:
# Create root directories.
# exist_ok=True makes the cell idempotent (safe under Restart & Run All) and
# avoids the check-then-create race of `if not os.path.exists(...): os.mkdir(...)`
os.makedirs('../Processed_Data', exist_ok=True)
os.makedirs('../Results', exist_ok=True)

In [3]:
# SOAP cutoffs (angstrom); every SOAP calculation below is repeated once per
# cutoff, with the value passed as `interaction_cutoff`
cutoffs = (3.5, 6.0)

In [4]:
# SOAP hyperparameters shared by every calculation in this notebook; they are
# forwarded to `librascal_soap` as keyword arguments. The interaction cutoff is
# not part of this dictionary because it is varied separately.
# TODO: define and save the whole set of parameters
soap_hyperparameters = {
    'max_radial': 12,
    'max_angular': 9,
    'cutoff_smooth_width': 0.3,
    'gaussian_sigma_constant': 0.3,
}

In [4]:
# Save SOAP hyperparameters for quick reference.
# The cutoffs are stored alongside them under 'interaction_cutoff'; a copy is
# used so that `soap_hyperparameters` itself can still be splatted into
# `librascal_soap` together with an explicit `interaction_cutoff`.
# The payload is assembled before the file is opened, so a failure while
# building it cannot leave a truncated JSON file behind.
soap_hyperparameters_copy = soap_hyperparameters.copy()
soap_hyperparameters_copy['interaction_cutoff'] = cutoffs

with open('../Processed_Data/soap_hyperparameters.json', 'w') as f:
    json.dump(soap_hyperparameters_copy, f)

# Functions

In [9]:
def extract_volumes_and_numbers(structures, work_dir='.'):
    """
        Extracts and saves number of Si and volumes per Si

        For every structure the number of Si atoms (atomic number 14) is
        counted and the cell volume is divided by that count.

        ---Arguments---
        structures: sequence of structures; each must provide
            `get_atomic_numbers()` and `cell.volume`
        work_dir: directory in which the output files are written
            (created if it does not exist)

        ---Outputs (files)---
        {work_dir}/n_Si.dat: number of Si atoms per structure (integers)
        {work_dir}/structure_volumes.dat: cell volume per Si atom
            for each structure

        ---Returns---
        None
    """

    # Make required directories (idempotent, no check-then-create race)
    os.makedirs(work_dir, exist_ok=True)

    n_Si = np.zeros(len(structures), dtype=int)
    volumes = np.zeros(len(structures))

    # Iterate over structures and fill the property arrays
    for sdx, structure in enumerate(structures):
        Z = structure.get_atomic_numbers()
        n_Si[sdx] = np.count_nonzero(Z == 14)

        # Normalize by the Si count of THIS structure (not the whole array).
        # NOTE: a structure without any Si is not guarded against here and
        # leads to a division by zero.
        volumes[sdx] = structure.cell.volume / n_Si[sdx]

    np.savetxt(f'{work_dir}/n_Si.dat', n_Si, fmt='%d')
    np.savetxt(f'{work_dir}/structure_volumes.dat', volumes, fmt='%.18e')

# DEEM 330k

In [16]:
# Chunk shapes handed to `librascal_soap` via its `chunks` argument, keyed by
# spectrum label. The second entry of each shape is derived from the SOAP
# hyperparameters: max_radial^2 * (max_angular + 1) for the power spectrum
# and max_radial for the radial spectrum.
chunk_shape = {
    'power': (
        100,
        soap_hyperparameters['max_radial']**2 * (soap_hyperparameters['max_angular'] + 1),
    ),
    'radial': (
        10000,
        soap_hyperparameters['max_radial'],
    ),
}

In [11]:
# Load DEEM 330k (all frames of the XYZ file, via index=':')
# TODO: rebuild from the CIFs?
deem_330k = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [29]:
# Extract volumes and number of Si
# (writes n_Si.dat and structure_volumes.dat into work_dir)
work_dir = '../Processed_Data/DEEM_330k/Data'
extract_volumes_and_numbers(deem_330k, work_dir=work_dir)

# Copy over the energies for easy access
# (usecols=2 reads only the third column of the GULP summary file;
# presumably one row per structure, in the same order as the XYZ file -- TODO confirm)
energies = np.loadtxt('../Raw_Data/GULP/DEEM_330k/optimization_summary_fix.dat', usecols=2)
np.savetxt(f'{work_dir}/structure_energies.dat', energies)

In [None]:
# Compute unnormalized SOAPs for all structures retaining ALL components, but average over structures
# One HDF5 file is written per (cutoff, spectrum) pair.
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    # Make required directories. Both spectra of a cutoff share the same
    # directory, so it is created once per cutoff; exist_ok=True keeps the
    # cell re-runnable without a separate existence check.
    os.makedirs(work_dir, exist_ok=True)

    for spectrum, spectrum_label in zip(('PowerSpectrum', 'RadialSpectrum'), ('power', 'radial')):
        output_file = librascal_soap(
            deem_330k, [14],
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            normalize=False,
            component_idxs=None,
            average=True,
            concatenate=True, # for faster access in processing
            chunks=chunk_shape[spectrum_label],
            output=f'{work_dir}/soaps_{spectrum_label}_full_avg_nonorm.hdf5'
        )

100%|██████████| 331172/331172 [33:33<00:00, 164.44it/s]
 96%|█████████▋| 319537/331172 [1:34:30<09:36, 20.20it/s] 

# Extract DEEM 10k SOAPs from DEEM 330k SOAPs

In [57]:
# Load DEEM 10k (all frames of the XYZ file, via index=':')
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')

In [59]:
# The 10k set is a strided subset of the 330k set: take every 32nd structure
# index and keep the first 10000 of them
deem_10k_idxs = np.arange(0, len(deem_330k), 32)[:10000]

In [60]:
# Check to make sure we have the correct structures:
# the structures picked from the 330k set by the strided indices must compare
# equal, element by element, to the separately loaded 10k set
print(deem_10k == [deem_330k[i] for i in deem_10k_idxs])

True


In [77]:
# Save the indices (positions of the 10k structures within the 330k set),
# one integer per line
np.savetxt('../Processed_Data/DEEM_330k/deem_10k.idxs', deem_10k_idxs, fmt='%d')

In [None]:
# Compute unnormalized SOAPs for all structures retaining ALL components,
# WITHOUT averaging, so that the individual environments stay accessible.
# One HDF5 file is written per (cutoff, spectrum) pair.
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Make required directories. Both spectra of a cutoff share the same
    # directory, so it is created once per cutoff; exist_ok=True keeps the
    # cell re-runnable without a separate existence check.
    os.makedirs(work_dir, exist_ok=True)

    for spectrum, spectrum_label in zip(('PowerSpectrum', 'RadialSpectrum'), ('power', 'radial')):
        output_file = librascal_soap(
            deem_10k, [14],
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            normalize=False,
            component_idxs=None,
            # Was average=True, which contradicts the note on `concatenate`,
            # the output name (no `_avg`, unlike the DEEM 330k files), and the
            # consistency check below, which averages each structure's SOAPs
            # itself before comparing against the averaged 330k SOAPs.
            average=False,
            concatenate=False, # Need to be able to access the environments
            chunks=chunk_shape[spectrum_label],
            output=f'{work_dir}/soaps_{spectrum_label}_full_nonorm.hdf5'
        )

In [78]:
# Check that we can pull the correct SOAPs with the indices
# For every cutoff and spectrum, the DEEM 10k SOAPs (reduced to one row per
# structure) are compared with the rows of the DEEM 330k file selected by
# `deem_10k_idxs`; one True/False is printed per (cutoff, spectrum) pair.
# NOTE(review): `load_hdf5` is not imported in any cell visible here -- confirm
# where it is defined (possibly `project_utils`) and import it at the top so
# this cell runs on a fresh kernel
for cutoff in cutoffs:
    for spectrum_label in ('power', 'radial'):
        soaps_10k = load_hdf5(
            f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps_{spectrum_label}_full_nonorm.hdf5',
            datasets=None, concatenate=False
        )
        # Reduce each loaded entry to a single row by averaging along axis 0
        # (presumably the environments of one structure -- TODO confirm),
        # then stack the rows into one 2D array
        soaps_10k = np.vstack([np.mean(soaps, axis=0) for soaps in soaps_10k])

        # Presumably `indices` restricts loading to the rows belonging to the
        # 10k structures -- TODO confirm against `load_hdf5`
        soaps_330k = load_hdf5(
            f'../Processed_Data/DEEM_330k/Data/{cutoff}/soaps_{spectrum_label}_full_avg_nonorm.hdf5',
            indices=deem_10k_idxs
        )

        # Tight tolerances: the two routes should agree up to floating-point noise
        print(np.allclose(soaps_10k, soaps_330k, rtol=1.0E-12, atol=1.0E-12))

True


# IZA

In [None]:
# Load the IZA structures (all frames of the XYZ file, via index=':')
iza_226 = read('../Raw_Data/GULP/IZA_226/IZA_226.xyz', index=':')

In [7]:
# Extract volumes and number of Si
# (writes n_Si.dat and structure_volumes.dat into work_dir)
work_dir = '../Processed_Data/IZA_226/Data'
extract_volumes_and_numbers(iza_226, work_dir=work_dir)

# Copy over the energies for easy access
# (usecols=2 reads only the third column of the GULP summary file;
# presumably one row per structure, in the same order as the XYZ file -- TODO confirm)
energies = np.loadtxt('../Raw_Data/GULP/IZA_226/optimization_summary_fix.dat', usecols=2)
np.savetxt(f'{work_dir}/structure_energies.dat', energies)

In [7]:
# Compute unnormalized SOAPs for all structures retaining ALL components, but average over structures
# One HDF5 file is written per (cutoff, spectrum) pair.
# NOTE(review): the output name has no `_avg` tag although average=True is
# passed (the DEEM 330k files use `_full_avg_nonorm`) -- confirm which of the
# two is intended
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/IZA_226/Data/{cutoff}'

    # Make required directories. Both spectra of a cutoff share the same
    # directory, so it is created once per cutoff; exist_ok=True keeps the
    # cell re-runnable without a separate existence check.
    os.makedirs(work_dir, exist_ok=True)

    for spectrum, spectrum_label in zip(('PowerSpectrum', 'RadialSpectrum'), ('power', 'radial')):
        output_file = librascal_soap(
            iza_226, [14],
            interaction_cutoff=cutoff,
            soap_type=spectrum,
            **soap_hyperparameters,
            normalize=False,
            component_idxs=None,
            average=True,
            concatenate=True, # For easy access
            chunks=chunk_shape[spectrum_label],
            output=f'{work_dir}/soaps_{spectrum_label}_full_nonorm.hdf5'
        )

100%|██████████| 226/226 [00:01<00:00, 122.86it/s]
100%|██████████| 226/226 [00:04<00:00, 45.47it/s]
