In [1]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# Utilities
import h5py
import json
from selection import FPS, random_selection
from project_utils import load_structures_from_hdf5

# SOAP
from soap import quippy_soap, librascal_soap

# Initial setup

In [3]:
# Create root directories
# exist_ok=True replaces the check-then-create pattern, so the cell is safe
# to re-run and has no window between the existence check and the creation
for root_dir in ('../Processed_Data', '../Results'):
    os.makedirs(root_dir, exist_ok=True)

In [2]:
# SOAP cutoffs (angstrom); the per-cutoff loops in this notebook iterate over
# this tuple and use each value as the name of an output subdirectory
cutoffs = (3.5, 6.0)

In [3]:
# Setup SOAP parameters; these are unpacked as keyword arguments into every
# librascal_soap call, while the interaction cutoff is passed separately
# TODO: define and save the whole set of parameters
soap_hyperparameters = dict(max_radial=12,
                            max_angular=9,
                            cutoff_smooth_width=0.3,
                            gaussian_sigma_constant=0.3)

In [4]:
# Write the SOAP hyperparameters, together with the cutoffs, to a JSON file
# for quick reference
soap_hyperparameters_copy = {**soap_hyperparameters, 'interaction_cutoff': cutoffs}
with open('../Processed_Data/soap_hyperparameters.json', 'w') as f:
    json.dump(soap_hyperparameters_copy, f)

In [5]:
# Number of SOAP components (features) to retain via FPS
n_components = 500

In [6]:
# Number of randomly selected structures used to select the SOAP components
n_random = 2000

In [7]:
# Number of representative environments to select with FPS from the train set
n_representatives = 2000

In [8]:
# Fraction of structures assigned to the train set (the rest form the test set)
f_train = 0.7750

# Functions

In [2]:
def extract_structure_properties(structures,
                                 properties=('Energy_per_Si',),
                                 property_names=('structure_energies',),
                                 work_dir='.'):
    """
        Extracts and saves number of Si, volumes per Si, and other structure properties

        ---Arguments---
        structures: iterable of structures providing `get_atomic_numbers()`,
            `cell.volume`, and an `info` mapping
        properties: keys of `structure.info` to extract from every structure
        property_names: output names, one per entry of `properties`;
            each property is saved as `<work_dir>/<property_name>.dat`
        work_dir: directory in which the `.dat` files are written
            (created, including parents, if it does not exist)

        ---Raises---
        ValueError: if `properties` and `property_names` differ in length
    """

    # Materialize the inputs so they can be measured and iterated repeatedly
    properties = list(properties)
    property_names = list(property_names)

    # zip() would silently drop the unmatched entries and leave empty output
    # files behind, so reject mismatched inputs up front
    if len(properties) != len(property_names):
        raise ValueError(
            'properties and property_names must have the same length '
            f'(got {len(properties)} and {len(property_names)})'
        )

    # Make required directories
    os.makedirs(work_dir, exist_ok=True)

    # Dictionary of standard properties
    property_dict = dict(n_Si=[], structure_volumes=[])

    # Append extra properties to dictionary
    for pn in property_names:
        property_dict[pn] = []

    # Iterate over structures and fill the property dictionary
    for structure in structures:
        Z = structure.get_atomic_numbers()

        # Atomic number 14 is Si
        n_Si = np.count_nonzero(Z == 14)
        property_dict['n_Si'].append(n_Si)

        # Cell volume normalized by the number of Si atoms
        property_dict['structure_volumes'].append(structure.cell.volume / n_Si)

        for p, pn in zip(properties, property_names):
            property_dict[pn].append(structure.info[p])

    # Save properties: Si counts as integers, everything else as full-precision floats
    for key, value in property_dict.items():
        fmt = '%d' if key == 'n_Si' else '%.18e'
        np.savetxt(os.path.join(work_dir, f'{key}.dat'), np.asarray(value), fmt=fmt)

# DEEM_10k

In [4]:
# Load DEEM 10k (index=':' requests every frame in the file)
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')

In [5]:
# Extract structure properties: writes n_Si.dat, structure_volumes.dat, and
# structure_energies.dat (from the 'Energy_per_Si' info entry) to work_dir
extract_structure_properties(deem_10k, 
                             properties=['Energy_per_Si'], 
                             property_names=['structure_energies'],
                             work_dir='../Processed_Data/DEEM_10k')

In [12]:
# Select random structures from which to select SOAP components
# NOTE(review): no seed is set in this notebook before this call; unless
# random_selection seeds internally, the subset (and hence the FPS components)
# will differ between runs -- confirm against selection.random_selection
random_idxs = random_selection(len(deem_10k), n=n_random)
random_structures = [deem_10k[i] for i in random_idxs]

In [13]:
# Compute SOAPs on the randomly selected structures and select components with FPS
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # Make required directories (exist_ok makes the cell safe to re-run)
    os.makedirs(work_dir, exist_ok=True)
    
    # Compute SOAPs on the sample structures
    soaps = librascal_soap(random_structures, [14],
                           interaction_cutoff=cutoff,
                           **soap_hyperparameters)

    # Concatenate SOAPs
    soaps = np.vstack(soaps)

    # Compute FPS components and save; the SOAP matrix is transposed so that
    # FPS selects among the columns (components) instead of the rows.
    # Each saved row holds a component index and its FPS distance
    component_idxs, distances = FPS(soaps.T, n=n_components)
    np.savetxt(f'{work_dir}/FPS_components.idxs', 
               np.stack((component_idxs, distances), axis=1), fmt='%6d\t%.18e')
    
    # Delete the SOAPs so we aren't carrying them around
    del soaps

100%|██████████| 2000/2000 [00:13<00:00, 150.23it/s]
100%|██████████| 499/499 [01:45<00:00,  4.73it/s]
100%|██████████| 2000/2000 [00:36<00:00, 54.49it/s]
100%|██████████| 499/499 [01:45<00:00,  4.73it/s]


In [14]:
# Compute SOAPs for all structures retaining only the FPS components
# Requires FPS_components.idxs (and its directory) from the component
# selection cell; only the first column (the component indices) is read back
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    component_idxs = np.loadtxt(f'{work_dir}/FPS_components.idxs', usecols=0, dtype=int)
    
    # SOAPs are written to the HDF5 file given by `output`
    output_file = librascal_soap(deem_10k, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 output=f'{work_dir}/soaps.hdf5')

10000it [01:07, 149.19it/s]
10000it [03:12, 52.08it/s]


In [6]:
# Compute SOAPs for all structures retaining ALL components, but average over structures
# This computes normalized SOAPs, which seem to work, but using unnormalized SOAPs is probably more robust
# The output directory is not created here; it must already exist from the
# component selection cell
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # component_idxs=None keeps every component
    output_file = librascal_soap(deem_10k, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=None,
                                 average=True,
                                 output=f'{work_dir}/soaps_full_avg.hdf5')

100%|██████████| 10000/10000 [01:04<00:00, 156.02it/s]
100%|██████████| 10000/10000 [03:07<00:00, 53.32it/s]


In [15]:
# Split into train and test sets
# A fixed seed makes the shuffle, and therefore the saved index files,
# reproducible across kernel restarts
# NOTE: re-running this cell overwrites train.idxs and test.idxs; the FPS
# representatives are stored as indices relative to the train set, so they
# must be regenerated whenever the split changes
split_seed = 0
n_train = int(f_train*len(deem_10k))
idxs = np.arange(0, len(deem_10k))
np.random.RandomState(split_seed).shuffle(idxs)
train_idxs = idxs[0:n_train]
test_idxs = idxs[n_train:]

np.savetxt('../Processed_Data/DEEM_10k/train.idxs', train_idxs, fmt='%d')
np.savetxt('../Processed_Data/DEEM_10k/test.idxs', test_idxs, fmt='%d')

In [18]:
# Select representative environments from the train set with FPS
train_idxs = np.loadtxt('../Processed_Data/DEEM_10k/train.idxs', dtype=int)

# Dataset names are the structure indices, zero-padded to the width of the
# largest index
n_digits = len(str(len(deem_10k) - 1))
datasets = [f'{i:0{n_digits}d}' for i in train_idxs]

for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # Load the SOAPs of the train structures only
    soaps = load_structures_from_hdf5(f'{work_dir}/soaps.hdf5', datasets=datasets, concatenate=True)

    # FPS over the train set only, so the returned indices are relative to the train set
    representatives, distances = FPS(soaps, n=n_representatives)

    # Save one (index, FPS distance) pair per row
    representatives_table = np.stack((representatives, distances), axis=1)
    np.savetxt(f'{work_dir}/FPS_representatives.idxs', representatives_table, fmt='%6d\t%.18e')

    # Release the SOAPs before the next cutoff
    del soaps

100%|██████████| 1999/1999 [02:49<00:00, 11.82it/s]
100%|██████████| 1999/1999 [02:48<00:00, 11.89it/s]


In [None]:
# Build unique environments
# max_unique caps the number of FPS selections requested over all environments
max_unique = 75000 # "Safety" measure
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # Load SOAPs (datasets=None: no subset of structures is requested)
    soaps = load_structures_from_hdf5(f'{work_dir}/soaps.hdf5', datasets=None, concatenate=True)    
    
    # Get unique structures from FPS (indices are relative to the whole dataset)
    # Each saved row holds a selected index and its FPS distance
    unique, distances = FPS(soaps, n=max_unique)
    np.savetxt(f'{work_dir}/FPS_unique.idxs',
               np.stack((unique, distances), axis=1), fmt='%6d\t%.18e')
    
    # Delete the SOAPs so we aren't carrying them around
    del soaps

  1%|▏         | 944/74999 [01:43<2:14:41,  9.16it/s]

# IZA on DEEM_10k

In [4]:
# Load IZA structures (index=':' requests every frame in the file)
iza = read('../Raw_Data/GULP/IZA_226/IZA.xyz', index=':')

In [7]:
# Extract number of Si and volumes per Si (no extra properties are requested)
extract_structure_properties(iza, 
                             properties=[], 
                             property_names=[],
                             work_dir='../Processed_Data/IZA_226')

# Extract energies separately: they are read from column index 8 of the
# energies file instead of from the structures
# NOTE(review): presumably this column is the energy per Si, matching the
# DEEM structure_energies -- confirm against the file layout
iza_energies = np.loadtxt('../Raw_Data/GULP/IZA_226/Energies_IZA.dat', usecols=8)
np.savetxt('../Processed_Data/IZA_226/structure_energies.dat', iza_energies)

In [22]:
# Compute IZA SOAPs using DEEM components
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'
    idxs_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # Make required directories (exist_ok makes the cell safe to re-run)
    os.makedirs(work_dir, exist_ok=True)
    
    # Load DEEM_10k component indices (first column of the FPS file)
    component_idxs = np.loadtxt(f'{idxs_dir}/FPS_components.idxs', usecols=0, dtype=int)
    
    # Retain the same components as for DEEM_10k so the features are comparable
    output_file = librascal_soap(iza, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 output=f'{work_dir}/soaps.hdf5')

226it [00:02, 109.86it/s]
226it [00:05, 43.49it/s]


In [12]:
# Build unique environments
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'
    
    # Load SOAPs (datasets=None: no subset of structures is requested)
    soaps = load_structures_from_hdf5(f'{work_dir}/soaps.hdf5', datasets=None, concatenate=True)    
    
    # Get unique structures from FPS
    # NOTE(review): n=-1 presumably requests a selection over every
    # environment -- confirm against selection.FPS
    unique, distances = FPS(soaps, n=-1)
    np.savetxt(f'{work_dir}/FPS_unique.idxs',
               np.stack((unique, distances), axis=1), fmt='%6d\t%.18e')
    
    # Delete SOAPs so we aren't carrying them around
    del soaps

 68%|██████▊   | 10345/15278 [00:36<00:17, 287.23it/s]
 84%|████████▍ | 12841/15278 [00:40<00:07, 319.45it/s]


# IZA

In [5]:
# Compute SOAPs for all structures retaining ALL components, but average over structures
# This computes normalized SOAPs, which seem to work, but using unnormalized SOAPs is probably more robust
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/IZA_226/Data/{cutoff}'
    
    # Make required directories (exist_ok makes the cell safe to re-run)
    os.makedirs(work_dir, exist_ok=True)
    
    # component_idxs=None keeps every component
    output_file = librascal_soap(iza, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=None,
                                 average=True,
                                 output=f'{work_dir}/soaps_full_avg.hdf5')

100%|██████████| 226/226 [00:01<00:00, 124.56it/s]
100%|██████████| 226/226 [00:04<00:00, 46.95it/s]


# COD on DEEM_10k

In [8]:
# Load COD structures (index=':' requests every frame in the file)
cod = read('../Raw_Data/GULP/COD_196/COD.xyz', index=':')

In [9]:
# Extract number of Si and volumes per Si (no extra properties are requested)
extract_structure_properties(cod, 
                             properties=[], 
                             property_names=[],
                             work_dir='../Processed_Data/COD_196')

# Extract structure energies separately: they are read from column index 8
# of the energies file instead of from the structures
# NOTE(review): presumably this column is the energy per Si, matching the
# DEEM structure_energies -- confirm against the file layout
cod_energies = np.loadtxt('../Raw_Data/GULP/COD_196/Energies_COD.dat', usecols=8)
np.savetxt('../Processed_Data/COD_196/structure_energies.dat', cod_energies)

In [26]:
# Compute COD SOAPs using DEEM components
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/COD_196onDEEM_10k/Data/{cutoff}'
    idxs_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # Make required directories (exist_ok makes the cell safe to re-run)
    os.makedirs(work_dir, exist_ok=True)
    
    # Load DEEM_10k component indices (first column of the FPS file)
    component_idxs = np.loadtxt(f'{idxs_dir}/FPS_components.idxs', usecols=0, dtype=int)
    
    # Retain the same components as for DEEM_10k so the features are comparable
    output_file = librascal_soap(cod, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 output=f'{work_dir}/soaps.hdf5')

196it [00:00, 341.95it/s]
196it [00:01, 130.87it/s]


In [13]:
# Build unique environments
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/COD_196onDEEM_10k/Data/{cutoff}'
    
    # Load SOAPs (datasets=None: no subset of structures is requested)
    soaps = load_structures_from_hdf5(f'{work_dir}/soaps.hdf5', datasets=None, concatenate=True)    
    
    # Get unique structures from FPS
    # NOTE(review): n=-1 presumably requests a selection over every
    # environment -- confirm against selection.FPS
    unique, distances = FPS(soaps, n=-1)
    np.savetxt(f'{work_dir}/FPS_unique.idxs',
               np.stack((unique, distances), axis=1), fmt='%6d\t%.18e')
    
    # Delete SOAPs so we aren't carrying them around
    del soaps

 73%|███████▎  | 2611/3553 [00:01<00:00, 2307.60it/s]
 88%|████████▊ | 3127/3553 [00:01<00:00, 2290.17it/s]


# DEEM_330k

In [28]:
# Load DEEM 330k (index=':' requests every frame in the file)
deem_330k = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [29]:
# Extract structure properties: writes n_Si.dat, structure_volumes.dat, and
# structure_energies.dat (from the 'Energy_per_Si' info entry) to work_dir
extract_structure_properties(deem_330k, 
                             properties=['Energy_per_Si'], 
                             property_names=['structure_energies'],
                             work_dir='../Processed_Data/DEEM_330k')

In [30]:
# Compute structure-averaged DEEM 330k SOAPs using DEEM 10k components
for cutoff in cutoffs:
    work_dir = f'../Processed_Data/DEEM_330konDEEM_10k/Data/{cutoff}'
    idxs_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'
    
    # Make required directories (exist_ok makes the cell safe to re-run)
    os.makedirs(work_dir, exist_ok=True)
    
    # Load DEEM_10k component indices (first column of the FPS file)
    component_idxs = np.loadtxt(f'{idxs_dir}/FPS_components.idxs', usecols=0, dtype=int)
    
    # Unlike the IZA/COD-on-DEEM cells, average=True is passed here and the
    # output goes to soaps_avg.hdf5
    output_file = librascal_soap(deem_330k, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 average=True,
                                 output=f'{work_dir}/soaps_avg.hdf5')

331172it [38:11, 144.51it/s]
331172it [1:49:48, 50.26it/s] 


# Extract DEEM 10k SOAPs from DEEM 330k SOAPs

In [57]:
# Load DEEM 10k (re-read so this section can be run on its own)
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')

In [58]:
# Load DEEM 330k (re-read so this section can be run on its own)
deem_330k = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [59]:
# Stride construction from the 330k set to get the 10k set:
# every 32nd structure of the 330k set, truncated to the size of the 10k set
deem_stride = 32
deem_10k_idxs = np.arange(0, len(deem_330k), deem_stride)[0:len(deem_10k)]

# Zero-padding width for the HDF5 dataset names: the width of the largest
# index (N - 1), the same convention used for the DEEM_10k train datasets
n_digits_deem = len(str(len(deem_330k) - 1))

In [60]:
# Verify that the strided selection from the 330k set reproduces the 10k set
# (deem_10k_idxs must still hold integer indices at this point)
deem_10k_from_330k = [deem_330k[i] for i in deem_10k_idxs]
print(deem_10k == deem_10k_from_330k)

True


In [61]:
# Change to zero-padded string for dataset access in HDF5 file
# NOTE: this rebinds deem_10k_idxs from integer indices to strings, so any
# cell that indexes deem_330k with it must run before this one
deem_10k_idxs = np.array([str(i).zfill(n_digits_deem) for i in deem_10k_idxs])

In [78]:
# Check that we can pull the correct SOAPs with the indices
# Load the DEEM_10k SOAPs one array per structure and average each array
# over its first axis to obtain one vector per structure
soaps_10k_per_structure = load_structures_from_hdf5('../Processed_Data/DEEM_10k/Data/6.0/soaps.hdf5',
                                                    datasets=None, concatenate=False)
soaps_10k = np.vstack([np.mean(soaps, axis=0) for soaps in soaps_10k_per_structure])

# The DEEM_330k SOAPs are computed with average=True and written to
# soaps_avg.hdf5 (no soaps.hdf5 is produced for the 330k set in this notebook)
# deem_10k_idxs must already be the zero-padded dataset names here
soaps_330k = load_structures_from_hdf5('../Processed_Data/DEEM_330konDEEM_10k/Data/6.0/soaps_avg.hdf5',
                                       datasets=deem_10k_idxs, concatenate=True)

print(np.allclose(soaps_10k, soaps_330k, rtol=1.0E-12, atol=1.0E-12))

True


In [77]:
# Save the indices (written with fmt='%s', one entry per line; intended to
# run after deem_10k_idxs has been converted to zero-padded strings)
np.savetxt('../Processed_Data/DEEM_330konDEEM_10k/deem_10k.idxs', deem_10k_idxs, fmt='%s')