In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell is executed, so changes to the local helper modules
# are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# Utilities
import h5py
import json
from selection import FPS, random_selection
from project_utils import load_structures_from_hdf5, build_representatives_from_hdf5

# SOAP
from soap import quippy_soap, librascal_soap

# Initial setup

In [3]:
# Make sure the top-level output directories are present before anything is written
for root_dir in ('../Processed_Data', '../Results'):
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)

In [4]:
# SOAP hyperparameters shared by every SOAP calculation in this notebook
soap_hyperparameters = {
    'max_radial': 12,
    'max_angular': 9,
    'cutoff_smooth_width': 0.3,
    'gaussian_sigma_constant': 0.3,
}

# Interaction cutoffs (angstrom); each one gets its own data directory
cutoffs = (3.5, 6.0)

# Keep a JSON record of the hyperparameters (cutoffs included) for quick reference
soap_hyperparameters_copy = {**soap_hyperparameters, 'interaction_cutoff': cutoffs}
with open('../Processed_Data/soap_hyperparameters.json', 'w') as f:
    json.dump(soap_hyperparameters_copy, f)

In [None]:
# Number of SOAP components (features) to retain via farthest point sampling (FPS)
n_components = 500

In [6]:
# Number of randomly selected structures from which the SOAP components are selected
n_random = 2000

In [None]:
# Number of representative environments to select
n_representative = 2000

In [None]:
# Fraction of the structures assigned to the training set
# (the remainder forms the test set)
f_train = 0.7750

# Functions

In [None]:
def extract_structure_properties(structures, 
                                 properties=['Energy_per_Si'], 
                                 property_names=['structure_energies'],
                                 work_dir='.'):
    """
        Extracts and saves number of Si, volumes per Si, and other structure properties

        One file `<key>.dat` is written into `work_dir` for each of
        `n_Si`, `structure_volumes`, and every entry of `property_names`;
        each file holds one value per structure, in input order.

        ---Arguments---
        structures: iterable of structures providing `get_atomic_numbers()`,
            `cell.volume`, and an `info` dictionary
        properties: keys to look up in `structure.info`
        property_names: output names, paired one-to-one with `properties`
        work_dir: directory in which the `.dat` files are written
            (created, including missing parents, if it does not exist)
    """

    # makedirs also creates missing parents and tolerates an existing directory
    os.makedirs(work_dir, exist_ok=True)

    property_dict = dict(n_Si=[], structure_volumes=[])

    for pn in property_names:
        property_dict[pn] = []

    for structure in structures:
        Z = structure.get_atomic_numbers()

        # Si has atomic number 14
        n_Si = np.count_nonzero(Z == 14)
        property_dict['n_Si'].append(n_Si)

        # Cell volume normalized by the number of Si atoms
        property_dict['structure_volumes'].append(structure.cell.volume / n_Si)

        for p, pn in zip(properties, property_names):
            property_dict[pn].append(structure.info[p])

    # Iterate over (key, value) pairs and build the path with an f-string,
    # so that each property ends up in its own file inside work_dir
    for key, value in property_dict.items():
        np.savetxt(f'{work_dir}/{key}.dat', np.asarray(value))

In [None]:
def compute_soaps_and_select(structures, cutoff, soap_hyperparameters, n_components=0, work_dir='.'):
    """
        Computes SOAP and selects a set of representative components, which are then saved

        The selected component indices and the FPS distances are written
        as two columns to `<work_dir>/FPS_components.idxs`.

        ---Arguments---
        structures: structures for which the SOAP vectors are computed
        cutoff: interaction cutoff passed to `librascal_soap`
        soap_hyperparameters: dictionary of additional keyword arguments
            for `librascal_soap`
        n_components: number of components requested from FPS
        work_dir: output directory (created, including missing parents,
            if it does not exist)

        ---Returns---
        component_idxs: indices of the selected components, as returned by FPS
    """

    # makedirs also creates missing parents and tolerates an existing directory
    os.makedirs(work_dir, exist_ok=True)

    # Use the structures passed in by the caller
    # NOTE(review): [14] is presumably the list of center species (Si) -- confirm
    soaps = librascal_soap(structures, [14],
                           interaction_cutoff=cutoff,
                           **soap_hyperparameters)

    # Stack the per-structure SOAP arrays into a single 2D array
    soaps = np.vstack(soaps)

    # FPS runs on the transpose so that components, not rows, are selected
    component_idxs, distances = FPS(soaps.T, n=n_components)
    np.savetxt(f'{work_dir}/FPS_components.idxs', 
               np.stack((component_idxs, distances), axis=1), fmt='%6d\t%.18e')

    return component_idxs

In [None]:
def select_representatives_from_hdf5(work_dir, idxs=None, n_representatives=0):
    """
        Selects representative and unique SOAP vectors by FPS from a saved HDF5 file

        Reads `<work_dir>/soaps.hdf5` and writes the FPS indices and distances
        as two columns to `<work_dir>/FPS_unique.idxs` and, if `idxs` is given,
        to `<work_dir>/FPS_representatives.idxs`.

        ---Arguments---
        work_dir: directory containing `soaps.hdf5`; also receives the output files
        idxs: row indices restricting the representative selection;
            if None, no representatives are selected
        n_representatives: number of representatives requested from FPS
    """

    soaps = load_structures_from_hdf5(f'{work_dir}/soaps.hdf5', datasets=None, concatenate=True)

    if idxs is not None:
        # NOTE(review): the saved indices presumably refer to rows of
        # soaps[idxs, :] rather than rows of the full array -- confirm
        representatives, distances = FPS(soaps[idxs, :], n=n_representatives)   
        np.savetxt(f'{work_dir}/FPS_representatives.idxs', 
                   np.stack((representatives, distances), axis=1), fmt='%6d\t%.18e')

    # NOTE(review): n=-1 presumably asks FPS to order every row -- confirm
    unique, distances = FPS(soaps, n=-1)
    np.savetxt(f'{work_dir}/FPS_unique.idxs',
               np.stack((unique, distances), axis=1), fmt='%6d\t%.18e')

# DEEM_10k

In [8]:
# Load DEEM 10k; index=':' makes ASE return every frame in the file as a list
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')

In [None]:
# Save number of Si, volume per Si, and the 'Energy_per_Si' entry of each
# structure's info dictionary (under the name 'structure_energies')
extract_structure_properties(deem_10k, 
                             properties=['Energy_per_Si'], 
                             property_names=['structure_energies'],
                             work_dir='../Processed_Data/DEEM_10k')

In [11]:
# Select random structures from which to select SOAP components
# NOTE(review): no random seed is set in this notebook, so this subset
# presumably differs between runs -- confirm how random_selection draws indices
random_idxs = random_selection(len(deem_10k), n=n_random)
random_structures = [deem_10k[i] for i in random_idxs]

In [None]:
for cutoff in cutoffs:
    # f-string so that each cutoff gets its own directory
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # The per-cutoff directory is nested, so create missing parents as well
    os.makedirs(work_dir, exist_ok=True)

    # Select the SOAP components on the random subset of structures
    # (random_structures), not on the full dataset
    component_idxs = compute_soaps_and_select(random_structures, 
                                              cutoff, 
                                              soap_hyperparameters, 
                                              n_components=n_components, 
                                              work_dir=work_dir)

    # Compute the SOAPs of all structures, restricted to the selected
    # components, and write them to the HDF5 file in work_dir
    output_file = librascal_soap(deem_10k, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 output=f'{work_dir}/soaps.hdf5')

In [None]:
# Shuffle all structure indices, then cut them into train and test partitions
n_train = int(f_train * len(deem_10k))
idxs = np.arange(len(deem_10k))
np.random.shuffle(idxs)
train_idxs, test_idxs = idxs[:n_train], idxs[n_train:]

# Store both index sets so later cells can reload the same split
np.savetxt('../Processed_Data/DEEM_10k/train.idxs', train_idxs, fmt='%d')
np.savetxt('../Processed_Data/DEEM_10k/test.idxs', test_idxs, fmt='%d')

In [None]:
# Build representative and unique environments from train set
train_idxs = np.loadtxt('../Processed_Data/DEEM_10k/train.idxs', dtype=int)

for cutoff in cutoffs:
    # f-string so that the directory of the current cutoff is used
    work_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # n_representative is the constant defined in the setup cells
    select_representatives_from_hdf5(work_dir, 
                                     idxs=train_idxs, 
                                     n_representatives=n_representative)

# IZA on DEEM_10k

In [None]:
# Load IZA structures; index=':' makes ASE return every frame in the file as a list
iza = read('../Raw_Data/GULP/IZA_226/IZA.xyz', index=':')

In [None]:
# Only number of Si and volume per Si are extracted here: no entries of the
# structures' info dictionaries are requested (empty property lists)
extract_structure_properties(iza, 
                             properties=[], 
                             property_names=[],
                             work_dir='../Processed_Data/IZA_226')

# The IZA energies are read from column 8 (zero-based) of the GULP energies
# file and saved as structure_energies.dat in the IZA output directory
iza_energies = np.loadtxt('../Raw_Data/GULP/IZA_226/Energies_IZA.dat', usecols=8)
np.savetxt('../Processed_Data/IZA_226/structure_energies.dat', iza_energies)

In [None]:
for cutoff in cutoffs:
    # f-strings so that the directories of the current cutoff are used
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'
    idxs_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # The per-cutoff output directory is nested, so create missing parents as well
    os.makedirs(work_dir, exist_ok=True)

    # Load DEEM_10k component indices (first column of the FPS output file)
    component_idxs = np.loadtxt(f'{idxs_dir}/FPS_components.idxs', usecols=0, dtype=int)

    # Compute the IZA SOAPs on the DEEM_10k components, with the same
    # librascal_soap call as used for the other datasets in this notebook
    output_file = librascal_soap(iza, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 output=f'{work_dir}/soaps.hdf5')

In [None]:
# Select the unique IZA environments for each cutoff; with idxs=None no
# representatives are selected, only the FPS over all environments is run
for cutoff in cutoffs:
    # f-string so that the directory of the current cutoff is used
    work_dir = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}'
    select_representatives_from_hdf5(work_dir, idxs=None, n_representatives=None)

# COD on DEEM_10k

In [None]:
# Load COD structures; index=':' makes ASE return every frame in the file as a list
cod = read('../Raw_Data/GULP/COD_196/COD.xyz', index=':')

In [None]:
# Only number of Si and volume per Si are extracted here: no entries of the
# structures' info dictionaries are requested (empty property lists)
extract_structure_properties(cod, 
                             properties=[], 
                             property_names=[],
                             work_dir='../Processed_Data/COD_196')

# The COD energies are read from column 8 (zero-based) of the GULP energies
# file; save the COD energies (not the IZA ones) in the COD output directory
cod_energies = np.loadtxt('../Raw_Data/GULP/COD_196/Energies_COD.dat', usecols=8)
np.savetxt('../Processed_Data/COD_196/structure_energies.dat', cod_energies)

In [None]:
for cutoff in cutoffs:
    # f-strings so that the directories of the current cutoff are used
    work_dir = f'../Processed_Data/COD_196onDEEM_10k/Data/{cutoff}'
    idxs_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # The per-cutoff output directory is nested, so create missing parents as well
    os.makedirs(work_dir, exist_ok=True)

    # Load DEEM_10k component indices (first column of the FPS output file)
    component_idxs = np.loadtxt(f'{idxs_dir}/FPS_components.idxs', usecols=0, dtype=int)

    # Compute the COD SOAPs on the DEEM_10k components and write them to HDF5
    output_file = librascal_soap(cod, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 output=f'{work_dir}/soaps.hdf5')

In [None]:
# Select the unique COD environments for each cutoff; with idxs=None no
# representatives are selected, only the FPS over all environments is run
for cutoff in cutoffs:
    # f-string so that the directory of the current cutoff is used
    work_dir = f'../Processed_Data/COD_196onDEEM_10k/Data/{cutoff}'
    select_representatives_from_hdf5(work_dir, idxs=None, n_representatives=None)

# DEEM_330k

In [8]:
# Load DEEM 330k; index=':' makes ASE return every frame in the file as a list
deem_330k = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [None]:
# Save number of Si, volume per Si, and the 'Energy_per_Si' entry of each
# structure's info dictionary (under the name 'structure_energies')
extract_structure_properties(deem_330k, 
                             properties=['Energy_per_Si'], 
                             property_names=['structure_energies'],
                             work_dir='../Processed_Data/DEEM_330k')

In [None]:
for cutoff in cutoffs:
    # f-strings so that the directories of the current cutoff are used
    work_dir = f'../Processed_Data/DEEM_330konDEEM_10k/Data/{cutoff}'
    idxs_dir = f'../Processed_Data/DEEM_10k/Data/{cutoff}'

    # The per-cutoff output directory is nested, so create missing parents as well
    os.makedirs(work_dir, exist_ok=True)

    # Load DEEM_10k component indices (first column of the FPS output file)
    component_idxs = np.loadtxt(f'{idxs_dir}/FPS_components.idxs', usecols=0, dtype=int)

    # Compute the DEEM_330k SOAPs on the DEEM_10k components and write them to HDF5
    # NOTE(review): average=True presumably stores one averaged SOAP vector
    # per structure instead of one per environment -- confirm
    output_file = librascal_soap(deem_330k, [14],
                                 interaction_cutoff=cutoff,
                                 **soap_hyperparameters,
                                 component_idxs=component_idxs,
                                 average=True,
                                 output=f'{work_dir}/soaps.hdf5')