In [1]:
# Enable the IPython autoreload extension in mode 2, so that edits to the
# imported project modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys

# Maths
import numpy as np
from scipy.spatial import ConvexHull, convex_hull_plot_2d

# Plotting
import matplotlib.pyplot as plt

# Atoms
from ase.io import read, write

# ML
from skcosmo.decomposition import PCovR
from sklearn.linear_model import Ridge
from gch_init import gch_init
from gch_run import gch_run

# Utilities
import h5py
import json
import subprocess
import glob
from copy import deepcopy
import project_utils as utils
from tools import save_json, load_json, recursive_array_convert

# SOAP
from soap import librascal_soap

# Model setup

In [3]:
# Model selection. These values are interpolated into the directory and file
# names used below, so they decide which precomputed model files are read.
# `cutoff` is additionally written into the SOAP hyperparameters as
# `interaction_cutoff`.
cutoff = 6.0
spectrum = 'power'
spectrum_name = spectrum.capitalize()
n_cantons = 4
group_name = 'OO+OSi+SiSi'
# File-name suffixes of the PCovR variants to process: '' is reported as
# "optimal mixing", '_<value>' as "mixing = <value>"
mixing_suffixes = ['', '_0.0', '_1.0']
#mixing_suffixes = ['']
# NOTE(review): `df_types` is not referenced in the visible cells; only
# `df_type` is used (in `model_data_dir`)
df_types = ['OvR', 'OvO']
df_type = 'OvR'

In [4]:
# Root directory of the stored models (PCovR regressor and scaler JSON files
# are read from below it, and the GCH output directory is created under it)
model_dir = '../Processed_Data/Models'

# Data set names and the directories holding their processed data
deem_name = 'DEEM_330k'
iza_name = 'IZA_230'
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'
# Sub-path identifying the model; it is appended after '<root>/<cutoff>/' for
# the model directory as well as for both data directories
model_data_dir = f'LPCovR/{df_type}/{n_cantons}-Class/{spectrum_name}/{group_name}'

In [5]:
# Output directory for the files written by the cells below (structures,
# energies, projections, rattled structures, hull distances and vertices)
gch_dir = f'{model_dir}/{cutoff}/GCH/{n_cantons}-Class/{spectrum_name}/{group_name}'
# gch_dir = f'{model_dir}/{cutoff}/GCHTMP/{n_cantons}-Class/{spectrum_name}/{group_name}'
# Create the directory (and any missing parents); no error if it already exists
os.makedirs(gch_dir, exist_ok=True)

In [6]:
# SOAP hyperparameters, with the interaction cutoff overridden by the cutoff
# selected for this notebook
soap_hyperparameters = load_json('../Processed_Data/soap_hyperparameters.json')
soap_hyperparameters['interaction_cutoff'] = cutoff

# Spline arguments are stored per cutoff (as a string key) and per spectrum type
soap_spline = load_json('../Processed_Data/soap_spline.json')
spline_args = soap_spline[str(cutoff)][spectrum_name + 'Spectrum']

# Remaining keyword arguments that are forwarded to the SOAP calculation
soap_args = load_json('../Processed_Data/soap_args.json')

In [7]:
# Train/test splits (structure indices) for IZA and Deem. The files live in
# the per-data-set processed directories, so the paths are built from
# `iza_name` / `deem_name` like the other Processed_Data paths in this notebook.

# Train sets
iza_train_idxs = np.loadtxt(f'../Processed_Data/{iza_name}/svm_train.idxs', dtype=int)
# Permutation that sorts the IZA train indices, and its inverse, which restores
# the original order of data that was loaded with the sorted indices
iza_sort_train_idxs = np.argsort(iza_train_idxs)
iza_unsort_train_idxs = np.argsort(iza_sort_train_idxs)

deem_train_idxs = np.loadtxt(f'../Processed_Data/{deem_name}/svm_train.idxs', dtype=int)

# Test sets (same layout as the train sets)
iza_test_idxs = np.loadtxt(f'../Processed_Data/{iza_name}/svm_test.idxs', dtype=int)
iza_sort_test_idxs = np.argsort(iza_test_idxs)
iza_unsort_test_idxs = np.argsort(iza_sort_test_idxs)

deem_test_idxs = np.loadtxt(f'../Processed_Data/{deem_name}/svm_test.idxs', dtype=int)

In [8]:
# Read every frame (index=':') of the IZA and Deem structure files. The
# train/test index arrays are used to index into these lists. The IZA file is
# the one stored under the GULP directory.
iza_frames = read('../Raw_Data/GULP/IZA_230/IZA_230.xyz', index=':')
deem_frames = read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')

In [9]:
# Write the test-set structures to a single extended-XYZ file:
# all IZA test frames first, followed by all Deem test frames
write(
    f'{gch_dir}/iza+deem.xyz',
    [
        *(iza_frames[i] for i in iza_test_idxs),
        *(deem_frames[i] for i in deem_test_idxs),
    ],
    format='extxyz',
)

In [10]:
# Deem: file columns 1 and 2 are read, so deem_energies[:, 0] is the reference
# energy and deem_energies[:, 1] the GULP energy.
# IZA: only file column 1 (the GULP energy) is read.
deem_energies = np.loadtxt(
    '../Raw_Data/GULP/DEEM_330k/optimization_summary.dat',
    usecols=(1, 2)
)
iza_energies = np.loadtxt(
    '../Raw_Data/GULP/IZA_230/optimization_summary_fix.dat',
    usecols=1
)

# Fit the scaler (mean only, no rescaling) on the GULP energies of the
# combined IZA + Deem train set
gulp_energies_train = np.concatenate(
    (iza_energies[iza_train_idxs], deem_energies[deem_train_idxs, 1])
)
energy_scaler = utils.StandardNormScaler(with_mean=True, with_scale=False)
energy_scaler.fit(gulp_energies_train)

# Transform the test-set GULP energies (IZA first, then Deem) and save them
gulp_energies_test = np.concatenate(
    (iza_energies[iza_test_idxs], deem_energies[deem_test_idxs, 1])
)
np.savetxt(
    f'{gch_dir}/energies_per_si.dat',
    energy_scaler.transform(gulp_energies_test)
)

In [11]:
# Write one projection file per PCovR variant for the test set, with the IZA
# rows first and the Deem rows after them
for mixing in mixing_suffixes:
    projection_file = f'pcovr_structure_projections{mixing}.hdf5'

    # The IZA rows are requested with sorted indices and then put back into
    # the order of `iza_test_idxs`
    iza_projections = utils.load_hdf5(
        f'{iza_dir}/{cutoff}/{model_data_dir}/{projection_file}',
        indices=iza_test_idxs[iza_sort_test_idxs]
    )[iza_unsort_test_idxs]

    # The Deem rows are requested with the test indices as they are
    deem_projections = utils.load_hdf5(
        f'{deem_dir}/{cutoff}/{model_data_dir}/{projection_file}',
        indices=deem_test_idxs
    )

    np.savetxt(
        f'{gch_dir}/T{mixing}.dat',
        np.concatenate((iza_projections, deem_projections))
    )

In [None]:
# Energy uncertainty: RMSE between the reference (column 0) and GULP (column 1)
# energies of the Deem frameworks in the train set
deem_train_energies = deem_energies[deem_train_idxs]
energy_residuals = deem_train_energies[:, 0] - deem_train_energies[:, 1]
energy_rmse = np.sqrt(np.mean(energy_residuals ** 2))
print(energy_rmse)

In [None]:
# Cell uncertainty, evaluated on the IZA frameworks of the train set.
# Reference structures: one file per framework, ordered by file name (ID),
# then restricted to the train set
ref_iza_paths = sorted(glob.glob('../Raw_Data/IZA_230/XYZ/*.xyz'))
ref_iza_xyz = [read(path) for path in ref_iza_paths]
ref_iza_xyz = [ref_iza_xyz[i] for i in iza_train_idxs]

# Matching frames from the IZA structure file loaded earlier
opt_iza_xyz = [iza_frames[i] for i in iza_train_idxs]

# Squared relative volume error of every frame with respect to its reference
# (a norm of the cell-vector difference would be an alternative measure)
cell_errors = np.array(
    [
        (ref.get_volume() - opt.get_volume()) ** 2 / ref.get_volume() ** 2
        for ref, opt in zip(ref_iza_xyz, opt_iza_xyz)
    ],
    dtype=float
)

# Root mean square of the relative volume errors
cell_rmse = np.sqrt(np.mean(cell_errors))

print(cell_rmse)

Constant-volume optimizations in GULP apparently do not optimize the cell at all, so the optimized structures will have a cell error of zero.

# Build a GCH based on PCovR projections

## Build GCH

In [None]:
# Global GCH parameters
# All of these except `mp` are passed positionally to `gch_init` below; `mp`
# is passed to `gch_run`. `ndim` is also used when building the standard
# convex hull, where the energy plus the first ndim-1 projection columns are used.
# NOTE(review): per the remark above, `cell_rmse` (and hence `s_c`) is expected
# to be zero for constant-volume optimizations -- confirm this is intended.
s_c = cell_rmse # Uncertainty in cell between structures
s_e = energy_rmse # Uncertainty in energy
ndim = 3 # GCH dimensions (includes energy)
numref = 100 # Number of reference structures
numshaken = 10 # Number of rattled structures per reference
conv = 0.50 # Convergence threshold: 100/conv hulls are constructed
mode = 'fps' # Selection mode for the reference structures
npca = None # Number of KPCA components: None for providing projections, <= 0 for taking all components
mp = 0.60 # Cutoff probability for determining the GCH vertices

In [None]:
for mixing in mixing_suffixes:
    # '' denotes the optimally mixed model; any other suffix is '_<mixing value>'
    mixing_label = 'optimal mixing' if mixing == '' else f'mixing = {mixing[1:]}'
    print(f'===== GCH for PCovR with {mixing_label} =====')

    # ---- Rebuild the reference PCovR model from its JSON dump ----
    pcovr_model_dict = recursive_array_convert(
        load_json(f'{model_dir}/{cutoff}/{model_data_dir}/pcovr_regressor{mixing}.json')
    )

    # The regressors are stored as nested attribute dictionaries; take them
    # out so that they can be turned back into Ridge objects
    pcovr_regressor_dict = pcovr_model_dict.pop('regressor')
    pcovr_fitted_regressor_dict = pcovr_model_dict.pop('regressor_')

    ridge = Ridge()
    ridge.__dict__ = pcovr_regressor_dict
    fitted_ridge = Ridge()
    fitted_ridge.__dict__ = pcovr_fitted_regressor_dict

    # Restore the PCovR attributes, then attach the two regressors
    pcovr = PCovR()
    pcovr.__dict__ = pcovr_model_dict
    pcovr.regressor = ridge
    pcovr.regressor_ = fitted_ridge

    # ---- Rebuild the scaler (centering and scale factors) that is applied
    # to the SOAP vectors of the rattled structures ----
    norm_scaler_dict = recursive_array_convert(
        load_json(f'{model_dir}/{cutoff}/{model_data_dir}/norm_scaler{mixing}.json')
    )
    norm_scaler = utils.StandardNormScaler()
    norm_scaler.__dict__ = norm_scaler_dict

    # ---- Initialize GCH ----
    pk = f'{gch_dir}/T{mixing}.dat' # File containing the kernel (or projections)
    pnrg = f'{gch_dir}/energies_per_si.dat' # File containing the energies
    setxyz = f'{gch_dir}/iza+deem.xyz' # File containing the structures
    wdir_local = f'{gch_dir}/rattled{mixing}' # Directory for the rattled reference structures

    gch_init(
        pk, pnrg, setxyz, wdir_local,
        s_c, s_e, ndim, numref, numshaken, conv, mode, npca
    )

    # ---- SOAP vectors of the rattled structures, read back from the
    # rattled directory and scaled with the stored scaler ----
    shaken_refs = read(f'{wdir_local}/shaketraj.xyz', index=':')
    shaken_ref_soaps = norm_scaler.transform(
        librascal_soap(
            shaken_refs,
            **soap_hyperparameters,
            **soap_args,
            **spline_args,
            average=True,
            concatenate=True
        )
    )

    # The rattled structures are projected with the stored PCovR model
    # rather than with a newly fitted one
    T_rattled = pcovr.transform(shaken_ref_soaps)

    # ---- Run GCH ----
    shk = f'{wdir_local}/T{mixing}.dat' # Projections of the rattled structures
    np.savetxt(shk, T_rattled)

    wdir = wdir_local # Directory in which the rattled reference structures reside
    gch_run(shk, wdir, mp, compute_distances=True)

## Build a standard convex hull

In [33]:
# Compute the hull distances
def hull_distances(hull, data):
    """Distances of points to the lower part of a convex hull.

    The first coordinate is treated as the energy. Facets whose outward
    normal has a positive energy component (the 'top' of the hull) are
    ignored.

    Parameters
    ----------
    hull : scipy.spatial.ConvexHull
        Hull whose ``equations`` rows are ``[normal, offset]``.
    data : array_like, shape (n_points, n_dim)
        Points to evaluate, with the energy in the first column.

    Returns
    -------
    hull_distance : numpy.ndarray, shape (n_points,)
        Smallest distance from each point to the hyperplanes of the
        retained facets (non-negative for points inside the hull).
    hull_distance_energy : numpy.ndarray, shape (n_points,)
        Smallest distance from each point to those hyperplanes measured
        along the energy axis. Facets parallel to the energy axis are left
        out here, because they have no energy-direction distance.

    Raises
    ------
    ValueError
        Raised by ``np.amin`` if no facet is retained for a reduction.
    """
    equations = hull.equations

    # Omit the facets on the 'top' of the hull
    lower_facets = equations[~(equations[:, 0] > 0.0)]
    normals = lower_facets[:, :-1]
    offsets = lower_facets[:, -1]

    # Distance of every point (rows) to every retained facet plane (columns)
    plane_distances = -1.0 * (np.matmul(data, normals.T) + offsets)
    hull_distance = np.amin(plane_distances, axis=1)

    # Converting to a distance along the energy axis divides by the energy
    # component of the normal. Facets parallel to the energy axis have a zero
    # component, which would produce inf/NaN and corrupt the minimum, so only
    # facets with a strictly negative component are used.
    sloped = normals[:, 0] < 0.0
    energy_distances = -1.0 * plane_distances[:, sloped] / normals[sloped, 0]
    hull_distance_energy = np.amin(energy_distances, axis=1)

    return hull_distance, hull_distance_energy

In [34]:
# The energies are the same for every mixing, so they are loaded once
pnrg = np.loadtxt(f'{gch_dir}/energies_per_si.dat')

for mixing in mixing_suffixes:
    # '' denotes the optimally mixed model; any other suffix is '_<mixing value>'
    mixing_label = 'optimal mixing' if mixing == '' else f'mixing = {mixing[1:]}'
    print(f'===== CH for PCovR with {mixing_label} =====')

    # Hull coordinates: the energy first, followed by the first ndim-1
    # columns of the projections
    pk = np.loadtxt(f'{gch_dir}/T{mixing}.dat')
    data = np.column_stack((pnrg, pk[:, :ndim - 1]))
    ch = ConvexHull(data)

    # Save the two hull distances and the indices of the hull vertices
    d, de = hull_distances(ch, data)
    for stem, values in (('hull_distances', d), ('hull_distances_energy', de)):
        np.savetxt(f'{gch_dir}/{stem}{mixing}.dat', values)
    np.savetxt(f'{gch_dir}/hull_vertices{mixing}.dat', ch.vertices, fmt='%d')

===== CH for PCovR with optimal mixing =====
===== CH for PCovR with mixing = 0.0 =====
===== CH for PCovR with mixing = 1.0 =====
