In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are reloaded automatically before each cell is executed
%load_ext autoreload
%autoreload 2

In [30]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read
from rascal.representations import SphericalInvariants
from rascal.neighbourlist.structure_manager import AtomsList
from rascal.neighbourlist.structure_manager import mask_center_atoms_by_species

# ML
from soap import librascal_soap
from soap import rrw_neighbors, make_tuples
from soap import reshape_soaps, compute_soap_density
from soap import extract_species_pair_groups

# Utilities
import h5py
import json
from tqdm.notebook import tqdm
import project_utils as utils

# Functions

In [3]:
def save_rrw(frames, cutoff, center_species, env_species, output):
    """Compute the rr'w neighbor data for every frame and store it in an HDF5 file.

    The output file gets two groups, 'rrw' and 'idxs'. Each group holds one
    dataset per frame, named by the zero-padded index of the frame in `frames`.

    Parameters
    ----------
    frames
        Sequence of structures; each one is passed to `rrw_neighbors`.
    cutoff
        Cutoff forwarded to `rrw_neighbors`.
    center_species
        Center species forwarded to `rrw_neighbors`.
    env_species
        Environment species forwarded to `rrw_neighbors`.
    output
        Name of the HDF5 file to write; it is opened in 'w' mode.
    """
    # Pad the dataset names to the width of the largest frame index
    n_digits = len(str(len(frames) - 1))

    # The context manager closes the file even if the computation or one of
    # the writes raises, so no open handle is left behind
    with h5py.File(output, 'w') as f:
        rrw_group = f.create_group('rrw')
        idxs_group = f.create_group('idxs')
        for fdx, frame in enumerate(frames):
            dataset_name = str(fdx).zfill(n_digits)
            rrw, idxs = rrw_neighbors(frame, center_species, env_species, cutoff, self_interaction=True)
            rrw_group.create_dataset(dataset_name, data=rrw)
            idxs_group.create_dataset(dataset_name, data=idxs)

# Compute atom-resolved density

In [4]:
# Read the SOAP hyperparameters from their JSON file
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.loads(f.read())

In [5]:
# Cutoffs to process, taken directly from the SOAP hyperparameters.
# NOTE(review): the previous comment said "a single cutoff", but this value is
# iterated over (`for cutoff in cutoffs`) in later cells, so it is expected
# to be a collection of cutoffs -- confirm against the JSON file
cutoffs = soap_hyperparameters['interaction_cutoff']

In [6]:
# Grid resolutions and chunk sizes for the density evaluation
# TODO: save grids for loading in the analysis notebook?
n_r_grid, n_p_grid = 50, 50
chunk_size_r, chunk_size_p = 10, 10

# Second grid: spans [-1, 1] and is shared by all cutoffs
p_grid = np.linspace(-1.0, 1.0, num=n_p_grid)

# Radial grid: one per cutoff, spanning [0, cutoff]
r_grid = {}
for cutoff in cutoffs:
    r_grid[cutoff] = np.linspace(0.0, cutoff, num=n_r_grid)

## DEEM 10k

## TODO: also load IZA, and average over both IZA and DEEM soaps in the train set used to build the full KSVC-KPCovR models

## TODO: compute soap densities, weight densities, and triplet indices

In [7]:
# Linear model setup: number of species and the names of the feature groups
# for each spectrum type.
# Only the individual groups are used here; the combined groups considered
# previously were
#   power:  'OO+OSi', 'OO+SiSi', 'OSi+SiSi', 'OO+OSi+SiSi'
#   radial: 'O+Si'
n_species = 2
group_names = dict(
    power=['OO', 'OSi', 'SiSi'],
    radial=['O', 'Si'],
)

In [9]:
# Directory holding the trained models
model_dir = '../Processed_Data/Models'

# Dataset names and their processed-data directories
deem_name, iza_name = 'DEEM_10k', 'IZA_226'
deem_dir, iza_dir = (f'../Processed_Data/{name}/Data' for name in (deem_name, iza_name))

In [10]:
# Load structures: index=':' requests every frame in each XYZ file, so both
# names hold a collection of structures (iza_226 is later modified with .pop)
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')
iza_226 = read('../Raw_Data/GULP/IZA_226/IZA.xyz', index=':')

In [11]:
# Read the IZA canton labels (second column of the file) as integers
cantons_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)

# Position of the first entry carrying canton label 4; this entry is removed
# from the IZA data in the following cells
RWY = np.flatnonzero(cantons_iza == 4)[0]

In [12]:
# Drop the RWY entry from the canton labels and count the remaining entries.
# NOTE: this cell is not idempotent -- it overwrites cantons_iza, so running it
# again without reloading the cantons removes a second entry
cantons_iza = np.delete(cantons_iza, RWY)
n_iza = len(cantons_iza)

In [13]:
# Remove the RWY structure from the loaded IZA frames (in place); the removed
# structure is displayed as the cell output.
# NOTE: this cell is not idempotent -- running it again pops another frame
iza_226.pop(RWY)

Atoms(symbols='O96Si48', pbc=True, cell=[[17.762, 0.0, 0.0], [2.4e-05, 17.762, 0.0], [2.4e-05, 2.4e-05, 17.762]])

In [14]:
# Load number of Si atoms in each structure
n_Si_deem = np.loadtxt('../Processed_Data/DEEM_10k/n_Si.dat', dtype=int)
n_Si_iza = np.loadtxt('../Processed_Data/IZA_226/n_Si.dat', dtype=int)
n_Si_iza = np.delete(n_Si_iza, RWY)

In [None]:
# TODO: finish this later
# Compute the rr'w "stencils" in the IZA and DEEM structures.
# We do the computation iteratively; otherwise, the matrices might get huge.
# for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    
#     iza_rrw_file = f'{iza_dir}/{cutoff}/stencils.hdf5'
#     for frame in iza_226[0:1]:
#         rrw, idxs = rrw_neighbors(frame, [14], [8, 14], cutoff, self_interaction=True)
#         print(len(rrw), len(rrw[0]), np.shape(rrw[0][2]))

    #save_rrw(iza_226, cutoff, [14], [8, 14], iza_rrw_file)
    
    #deem_rrw_file = f'{deem_dir}/{cutoff}/stencils.hdf5'
    #save_rrw(deem_10k, cutoff, [14], [8, 14], deem_rrw_file)

In [41]:
# Real-space densities of the dataset-averaged SOAP vectors and of the SVC
# weights, for every cutoff, spectrum type, and species pairing

def _si_weighted_average(soaps, n_Si):
    """Average the rows of `soaps`, weighting each row by its entry in `n_Si`."""
    return np.sum(soaps * n_Si[:, np.newaxis], axis=0) / np.sum(n_Si)


def _as_single_pair(coefficients):
    """Reshape `coefficients` with `reshape_soaps`, using 1 as its second argument."""
    return reshape_soaps(coefficients, 1,
                         soap_hyperparameters['max_radial'],
                         soap_hyperparameters['max_angular'])


def _density_on_grid(coefficients, cutoff):
    """Run `compute_soap_density` on `r_grid[cutoff]` and `p_grid` for one cutoff."""
    return compute_soap_density(soap_hyperparameters['max_radial'],
                                soap_hyperparameters['max_angular'],
                                cutoff,
                                coefficients, r_grid[cutoff], p_grid,
                                chunk_size_r=chunk_size_r, chunk_size_p=chunk_size_p)


for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    linear_dir = f'{model_dir}/{cutoff}/Linear_Models/LSVC-LPCovR'

    # TODO: make reshape_soaps and compute_density also work for the radial spectrum
    # (the loop would then run over ('power', 'radial'))
    for spectrum_type in tqdm(['power'], desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # SOAP files for both datasets
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        # TODO: what do we want to use to compute the SOAP density?
        # Just the train set? All IZA+DEEM structures?
        # (the train/test alternative would go through utils.load_soaps with
        # idxs_deem_train, idxs_deem_test, idxs_iza_train, idxs_iza_test,
        # idxs_iza_delete=[RWY], train_test_concatenate=True)

        # Average over the whole dataset instead of per structure
        soaps_deem = utils.load_structures_from_hdf5(deem_file, datasets=None, concatenate=True)
        soaps_deem = _si_weighted_average(soaps_deem, n_Si_deem)

        soaps_iza = utils.load_structures_from_hdf5(iza_file, datasets=None, concatenate=True)
        soaps_iza = _si_weighted_average(np.delete(soaps_iza, RWY, axis=0), n_Si_iza)

        # The averaged IZA and DEEM vectors should be 1D with equal length
        n_features = len(soaps_deem) if len(soaps_deem) == len(soaps_iza) else None
        if n_features is None:
            print('WARNING: number of IZA and DEEM features do not match')

        # Feature indices belonging to the individual species pairs
        feature_groups = extract_species_pair_groups(n_features, n_species,
                                                     spectrum_type=spectrum_type,
                                                     combinations=False)

        species_progress = tqdm(group_names[spectrum_type], desc='Species', leave=False)
        for species_pairing, feature_idxs in zip(species_progress, feature_groups):

            # Reshape the SOAPs of this species pairing for the density computation
            soaps_deem_species = _as_single_pair(soaps_deem[feature_idxs])
            soaps_iza_species = _as_single_pair(soaps_iza[feature_idxs])

            # Compute the densities
            soap_density_deem = _density_on_grid(soaps_deem_species, cutoff)
            soap_density_iza = _density_on_grid(soaps_iza_species, cutoff)

            # NOTE: to get average SOAP of IZA+DEEM combined, load these SOAPs,
            # multiply individually by the total number
            # of IZA or DEEM Si atoms, concatenate, and then sum and divide by the
            # combined total number of Si atoms in IZA and DEEM
            output_file_deem = f'{deem_dir}/{cutoff}/real_space_soaps_{spectrum_type}_full_avg_nonorm_{species_pairing}.hdf5'
            output_file_iza = f'{iza_dir}/{cutoff}/real_space_soaps_{spectrum_type}_full_avg_nonorm_{species_pairing}.hdf5'

            utils.save_hdf5(output_file_deem, soap_density_deem)
            utils.save_hdf5(output_file_iza, soap_density_iza)

            for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                # Locations of the SVC weights and of their real-space counterpart
                weights_dir = f'{linear_dir}/{n_cantons}-Class/{spectrum_name}/{species_pairing}'
                weights_file = f'{weights_dir}/svc_weights.dat'
                output_file_weights = f'{weights_dir}/real_space_svc_weights.hdf5'

                # Load the weights and reshape them for the density computation
                weights = np.loadtxt(weights_file)
                weights = _as_single_pair(weights)

                # Compute density and save
                # (save in HDF5 instead of text b/c array is more than 2D)
                # TODO: convert weight density to 1D array?
                weight_density = _density_on_grid(weights, cutoff)

                utils.save_hdf5(output_file_weights, weight_density)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Spectrum', max=1.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Species', max=3.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Spectrum', max=1.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Species', max=3.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…




## Sodalite

In [None]:
# Load structure: index=':' requests every frame in the file, so `sod` is a
# collection of frames (the first one is accessed as sod[0] further down)
sod = read('../Raw_Data/SOD/sodalite.xyz', index=':')

In [None]:
# Sodalite uses the same SOAP hyperparameters, except for a single 6.0 cutoff
# (shallow copy; the original dictionary is left untouched)
soap_hyperparameters_sod = {**soap_hyperparameters, 'interaction_cutoff': 6.0}

In [None]:
# Make a SphericalInvariants representation (for sodalite); the sodalite
# hyperparameter dictionary is expanded into keyword arguments.
# NOTE(review): `representation` is not used by any of the cells visible
# below -- confirm whether it is still needed
representation = SphericalInvariants(gaussian_sigma_type='Constant',
                                     **soap_hyperparameters_sod)

In [None]:
# Compute the unnormalized, averaged sodalite SOAPs and keep only the first entry
# TODO: do an average just like DEEM+IZA -- should be the same as a single environment
# as they are all equivalent, but do this for consistency
soaps_sod = librascal_soap(
    sod,
    [14],
    normalize=False,
    average=True,
    **soap_hyperparameters_sod,
)[0]

In [None]:
# rr'w neighbor data and indices for the first sodalite frame, using the same
# species lists ([14] and [8, 14]) as the IZA/DEEM stencil code; the 6.0 cutoff
# matches soap_hyperparameters_sod
rrw, idxs = rrw_neighbors(sod[0], [14], [8, 14], 6.0, self_interaction=True)

In [None]:
# TODO: set n_pairs -- it is not defined in the cells above, so this cell
# raises a NameError until a value is chosen

# Use the sodalite cutoff (the one the sodalite SOAPs were computed with) and a
# radial grid built for it. `soap_hyperparameters['interaction_cutoff']` is the
# collection of IZA/DEEM cutoffs and `r_grid` is a dictionary keyed by those
# cutoffs, so neither can be passed to the density computation directly
cutoff_sod = soap_hyperparameters_sod['interaction_cutoff']
r_grid_sod = np.linspace(0.0, cutoff_sod, n_r_grid)

# Reshape SOAPs for the density computation
# NOTE: overwrites soaps_sod, so recompute the SOAPs before re-running this cell
soaps_sod = reshape_soaps(soaps_sod, n_pairs,
                          soap_hyperparameters_sod['max_radial'],
                          soap_hyperparameters_sod['max_angular'])

# Compute density
density_sod = compute_soap_density(soap_hyperparameters_sod['max_radial'],
                                   soap_hyperparameters_sod['max_angular'],
                                   cutoff_sod,
                                   soaps_sod, r_grid_sod, p_grid,
                                   chunk_size_r=chunk_size_r, chunk_size_p=chunk_size_p)

# TODO: save density, rrw, and indices