In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# Utilities
import h5py
import json
from selection import FPS, random_selection
from project_utils import load_structures_from_hdf5

# SOAP
from soap import quippy_soap, librascal_soap

# Initial setup

In [5]:
# Create root directories.
# `exist_ok=True` makes the cell safe to re-run and avoids the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
os.makedirs('../Processed_Data', exist_ok=True)
os.makedirs('../Results', exist_ok=True)

In [3]:
# SOAP cutoffs (angstrom); each `for cutoff in cutoffs` loop in this notebook
# runs once per value and passes it on as `interaction_cutoff`
cutoffs = (3.5, 6.0)

In [4]:
# SOAP hyperparameters forwarded to `librascal_soap` via `**soap_hyperparameters`
# (the interaction cutoff is supplied separately, once per entry of `cutoffs`)
# TODO: define and save the whole set of parameters
soap_hyperparameters = {
    'max_radial': 12,
    'max_angular': 9,
    'cutoff_smooth_width': 0.3,
    'gaussian_sigma_constant': 0.3,
}

In [4]:
# Dump the SOAP hyperparameters, extended with the cutoffs, to a JSON file
# so they can be looked up quickly later on
with open('../Processed_Data/soap_hyperparameters.json', 'w') as f:
    # Merge into a fresh dictionary so `soap_hyperparameters` itself is untouched
    soap_hyperparameters_copy = {**soap_hyperparameters,
                                 'interaction_cutoff': cutoffs}
    json.dump(soap_hyperparameters_copy, fp=f)

In [5]:
# Number of SOAP components to retain in the FPS (farthest point sampling) selection
n_components = 500

In [6]:
# Number of randomly selected structures to use to select the SOAP components
n_random = 2000

In [7]:
# Number of representative environments to select
n_representatives = 2000

In [8]:
# Fraction of the structures assigned to the training set
f_train = 0.7750

# Functions

In [9]:
def extract_structure_properties(structures, 
                                 properties=('Energy_per_Si',), 
                                 property_names=('structure_energies',),
                                 work_dir='.'):
    """
        Extracts and saves number of Si, volumes per Si, and other structure properties

        One text file `<work_dir>/<name>.dat` is written with `np.savetxt` for each
        quantity, holding one value per structure in iteration order:
        `n_Si.dat` (number of atoms with atomic number 14, integer format),
        `structure_volumes.dat` (`structure.cell.volume / n_Si`), and one file
        for every entry of `property_names`.

        ---Arguments---
        structures: iterable of structures providing `get_atomic_numbers()`,
            `cell.volume`, and an `info` mapping
        properties: keys looked up in `structure.info` for every structure
        property_names: output names (file stems), paired element-wise
            with `properties`
        work_dir: directory in which the files are written; created if missing

        ---Raises---
        ValueError: if `properties` and `property_names` differ in length,
            or if a property name clashes with `n_Si` or `structure_volumes`
    """

    # Materialize once so that any iterable (list, tuple, generator) is accepted
    properties = list(properties)
    property_names = list(property_names)

    # `zip` would silently drop the unmatched entries, so fail loudly instead
    if len(properties) != len(property_names):
        raise ValueError(
            f'properties ({len(properties)}) and property_names '
            f'({len(property_names)}) must have the same length'
        )

    # Dictionary of standard properties
    property_dict = dict(n_Si=[], structure_volumes=[])

    # A clashing name would mix two quantities in a single output file
    clashes = sorted(set(property_dict).intersection(property_names))
    if clashes:
        raise ValueError(f'property_names clash with built-in properties: {clashes}')

    # Make required directories (no error if the directory already exists)
    os.makedirs(work_dir, exist_ok=True)

    # Append extra properties to dictionary
    for pn in property_names:
        property_dict[pn] = []

    # Iterate over structures and fill the property dictionary
    for structure in structures:
        Z = structure.get_atomic_numbers()
        n_Si = np.count_nonzero(Z == 14)
        property_dict['n_Si'].append(n_Si)
        property_dict['structure_volumes'].append(structure.cell.volume / n_Si)

        for p, pn in zip(properties, property_names):
            property_dict[pn].append(structure.info[p])

    # Save properties: integer format for the Si counts, full precision otherwise
    for key, value in property_dict.items():
        fmt = '%d' if key == 'n_Si' else '%.18e'
        np.savetxt(f'{work_dir}/{key}.dat', np.asarray(value), fmt=fmt)

# DEEM_330k

In [16]:
# Chunk shapes passed as `chunks=` to `librascal_soap` in the cells below
# (presumably the HDF5 dataset chunking -- confirm in `soap.py`).
# Power spectrum: 100 rows by max_radial**2 * (max_angular + 1) columns
chunk_shape_power = (100, soap_hyperparameters['max_radial']**2 * (soap_hyperparameters['max_angular'] + 1))
# Radial spectrum: 10000 rows by max_radial columns
chunk_shape_radial = (10000, soap_hyperparameters['max_radial'])

In [11]:
# Load DEEM 330k (every frame of the XYZ file, via `index=':'`)
deem_330k = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [29]:
# TODO: change this to extract properties like the IZA calculations
# and get Deem energies from the GULP calculations
# Writes n_Si.dat, structure_volumes.dat, and structure_energies.dat
# (the latter read from `structure.info['Energy_per_Si']`) into the DEEM_330k directory
extract_structure_properties(deem_330k, 
                             properties=['Energy_per_Si'], 
                             property_names=['structure_energies'],
                             work_dir='../Processed_Data/DEEM_330k')

In [30]:
# Compute structure-averaged SOAPs for DEEM 330k, keeping only the
# components that were selected on DEEM 10k for the same cutoff
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_330konDEEM_10k/Data/{cutoff}'
    idxs_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Make required directories (safe to re-run; no check-then-create race)
    os.makedirs(work_dir, exist_ok=True)

    # Load DEEM_10k component indices (first column of the file, as integers)
    component_idxs = np.loadtxt(f'{idxs_dir}/FPS_components.idxs', usecols=0, dtype=int)

    output_file = librascal_soap(deem_330k, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 average=True,
                                 output=f'{work_dir}/soaps_avg.hdf5')

331172it [38:11, 144.51it/s]
331172it [1:49:48, 50.26it/s] 


In [None]:
# Compute unnormalized SOAPs for all structures retaining ALL components, but average over structures
# NOTE(review): the saved output of this cell stops at 96% for the second cutoff --
# confirm that the run completed before relying on the 6.0 output file
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    # Make required directories (safe to re-run; no check-then-create race)
    os.makedirs(work_dir, exist_ok=True)

    output_file = librascal_soap(deem_330k, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 normalize=False,
                                 component_idxs=None,
                                 average=True,
                                 concatenate=True, # for faster access in processing
                                 chunks=chunk_shape_power,
                                 output=f'{work_dir}/soaps_power_full_avg_nonorm.hdf5')

100%|██████████| 331172/331172 [33:33<00:00, 164.44it/s]
 96%|█████████▋| 319537/331172 [1:34:30<09:36, 20.20it/s] 

In [18]:
# Compute unnormalized SOAP radial spectra (2-body correlations) for all structures retaining ALL components, 
# but average over structures
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_330k/Data/{cutoff}'

    # Make required directories (safe to re-run; no check-then-create race)
    os.makedirs(work_dir, exist_ok=True)

    output_file = librascal_soap(deem_330k, [14],
                                 interaction_cutoff=cutoff,
                                 soap_type='RadialSpectrum',
                                 **soap_hyperparameters, # `max_angular` should automatically be set to 0
                                 normalize=False,
                                 component_idxs=None,
                                 average=True,
                                 concatenate=True, # for faster access in processing
                                 chunks=chunk_shape_radial,
                                 output=f'{work_dir}/soaps_radial_full_avg_nonorm.hdf5')

100%|██████████| 331172/331172 [10:44<00:00, 513.84it/s]
100%|██████████| 331172/331172 [25:44<00:00, 214.35it/s]


# Extract DEEM 10k SOAPs from DEEM 330k SOAPs

In [57]:
# Load DEEM 10k (every frame of the XYZ file, via `index=':'`)
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')

In [58]:
# Load DEEM 330k (same file as loaded in the DEEM_330k section above;
# repeated here so this section can be run on its own)
deem_330k = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [59]:
# Stride construction from the 330k set to get the 10k set:
# take every 32nd structure and keep the first 10000 of those
deem_10k_idxs = np.arange(0, len(deem_330k), 32)[0:10000]
# Number of digits in the size of the 330k set; used below to zero-pad the indices
n_digits_deem = len(str(len(deem_330k)))

In [60]:
# Check to make sure we have the correct structures:
# the strided selection from the 330k set must compare equal to the 10k set
print(deem_10k == [deem_330k[i] for i in deem_10k_idxs])

True


In [61]:
# Change to zero-padded string for dataset access in HDF5 file
# (this rebinds `deem_10k_idxs` from an integer array to an array of strings)
deem_10k_idxs = np.array([str(i).zfill(n_digits_deem) for i in deem_10k_idxs])

In [78]:
# Check that we can pull the correct SOAPs with the indices
# NOTE(review): both files read here are named `soaps.hdf5`, whereas the
# DEEM_330konDEEM_10k cell in this notebook writes `soaps_avg.hdf5` --
# confirm which file names actually exist on disk
soaps_10k = load_structures_from_hdf5('../Processed_Data/DEEM_10k/Data/6.0/soaps.hdf5',
                                      datasets=None, concatenate=False)
# Average over the first axis of each structure's array, then stack one row per structure
soaps_10k = np.vstack([np.mean(soaps, axis=0) for soaps in soaps_10k])

# Load only the datasets named by the zero-padded DEEM 10k indices
soaps_330k = load_structures_from_hdf5('../Processed_Data/DEEM_330konDEEM_10k/Data/6.0/soaps.hdf5',
                                      datasets=deem_10k_idxs, concatenate=True)

# The two sets of SOAPs must agree to within tight tolerances
print(np.allclose(soaps_10k, soaps_330k, rtol=1.0E-12, atol=1.0E-12))

True


In [77]:
# Save the indices (the zero-padded strings, one per line)
np.savetxt('../Processed_Data/DEEM_330konDEEM_10k/deem_10k.idxs', deem_10k_idxs, fmt='%s')

# IZA

In [7]:
# TODO: recompute IZA energies just like DEEM 330k and load energies accordingly
# NOTE(review): `iza` is not defined in any cell visible in this notebook;
# the IZA structures must be loaded before this cell can run on a fresh kernel
# Extract structure volumes (with no extra properties, only n_Si.dat and
# structure_volumes.dat are written)
extract_structure_properties(iza, 
                             properties=[], 
                             property_names=[],
                             work_dir='../Processed_Data/IZA_226')

# Extract energies separately: read column index 8 of the GULP energies file
# and save it alongside the other structure properties
iza_energies = np.loadtxt('../Raw_Data/GULP/IZA_226/Energies_IZA.dat', usecols=8)
np.savetxt('../Processed_Data/IZA_226/structure_energies.dat', iza_energies)

In [7]:
# Compute unnormalized SOAPs for all structures retaining ALL components, but average over structures
# NOTE(review): `iza` is not defined in any cell visible in this notebook;
# the IZA structures must be loaded before this cell can run on a fresh kernel
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/IZA_226/Data/{cutoff}'

    # Make required directories (safe to re-run; no check-then-create race)
    os.makedirs(work_dir, exist_ok=True)

    output_file = librascal_soap(iza, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 normalize=False,
                                 component_idxs=None,
                                 average=True,
                                 output=f'{work_dir}/soaps_power_full_avg_nonorm.hdf5')

100%|██████████| 226/226 [00:01<00:00, 122.86it/s]
100%|██████████| 226/226 [00:04<00:00, 45.47it/s]


In [8]:
# Compute unnormalized SOAP radial spectra (2-body correlations) for all structures retaining ALL components, 
# but average over structures
# NOTE(review): `iza` is not defined in any cell visible in this notebook;
# the IZA structures must be loaded before this cell can run on a fresh kernel
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/IZA_226/Data/{cutoff}'

    # Make required directories (safe to re-run; no check-then-create race)
    os.makedirs(work_dir, exist_ok=True)

    output_file = librascal_soap(iza, [14],
                                 interaction_cutoff=cutoff,
                                 soap_type='RadialSpectrum',
                                 **soap_hyperparameters, # `max_angular` should automatically be set to 0
                                 normalize=False,
                                 component_idxs=None,
                                 average=True,
                                 output=f'{work_dir}/soaps_radial_full_avg_nonorm.hdf5')

100%|██████████| 226/226 [00:00<00:00, 513.35it/s]
100%|██████████| 226/226 [00:01<00:00, 177.50it/s]
