In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')
sys.path.append('/home/helfrech/Tools/GCH/GCH')

# Maths
import numpy as np
from scipy.spatial import ConvexHull, convex_hull_plot_2d

# Plotting
import matplotlib.pyplot as plt

# Atoms
import ase.io as aseio

# ML
from regression import KPCovR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from kernels import center_kernel, center_kernel_fast
from kernels import center_kernel_oos, center_kernel_oos_fast
from gch_init import gch_init
from gch_run import gch_run

# Utilities
import h5py
import json
import subprocess
import glob
from copy import deepcopy
import project_utils as utils
from tools import load_json

# SOAP
from soap import quippy_soap, librascal_soap

In /home/helfrech/.config/matplotlib/stylelib/cosmo.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/helfrech/.config/matplotlib/stylelib/cosmoLarge.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


In [None]:
#sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/kernel-tutorials')
#from utilities.sklearn_covr.kpcovr import KernelPCovR as KPCovR2

# Model setup

In [3]:
# Cutoff value used throughout the notebook: it is interpolated into the data
# and model paths and stored as soap_hyperparameters['interaction_cutoff']
cutoff = 6.0

In [None]:
# Indices of the SOAP components to keep (first column of FPS_components.idxs).
# The builtin `int` is used as dtype: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
component_idxs = np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/FPS_components.idxs',
                            usecols=0, dtype=int)

In [None]:
# Read the SOAP hyperparameters from their JSON file
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)

# Add the entries that are specific to this notebook
soap_hyperparameters.update(component_idxs=component_idxs,
                            interaction_cutoff=cutoff) # TODO: change this to do 3.5 cutoff also

centers = [14] # Center on Si, take Si and O in environment

In [None]:
# Canton label of every IZA framework (second column of cantons.txt)
cantons_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)

# RWY is located as the first entry with canton 4; its position is kept so the
# same structure can be dropped from the other IZA-ordered lists as well
RWY = np.flatnonzero(cantons_iza == 4)[0]
cantons_iza = np.delete(cantons_iza, RWY)

In [None]:
# Cell uncertainty from the CIFs (they are available for both the original,
# unoptimized structures and the optimized ones)
orig_glob = sorted(glob.glob('../Raw_Data/GULP/IZA_226/Orig_cif_files.d/*.cif'))
opt_glob = sorted(glob.glob('../Raw_Data/GULP/IZA_226/Opt_cif_files.d/*.cif'))

# Drop RWY from both lists; this relies on the sorted file names following the
# same alphabetical order as the IZA structures
del orig_glob[RWY]
del opt_glob[RWY]

# Squared norm of the cell difference for every original/optimized pair
cell_errors = np.asarray([
    np.linalg.norm(aseio.read(orig).cell - aseio.read(opt).cell)**2
    for orig, opt in zip(orig_glob, opt_glob)
])
cell_rmse = np.sqrt(np.mean(cell_errors))

print(cell_rmse)

In [None]:
# Load idxs_deem_train and idxs_iza_train: integer indices that are used
# further down to pick the training structures out of the DEEM and IZA SOAP lists
idxs_deem_train = np.loadtxt('../Processed_Data/DEEM_10k/train.idxs', dtype=int)
idxs_iza_train = np.loadtxt('../Processed_Data/IZA_226/train.idxs', dtype=int)

# Build a GCH based on KPCovR projections

In [1]:
# Load SOAPs to build kernel with rattled structures or compute appropriate centering and scaling.
# The HDF5 loader is only available through project_utils (imported as `utils`),
# so it has to be called with the module prefix.
deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5'
deem_soaps = utils.load_structures_from_hdf5(deem_file, datasets=None, concatenate=False)
deem_soaps = [deem_soaps[i] for i in idxs_deem_train]

iza_file = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/soaps.hdf5'
iza_soaps = utils.load_structures_from_hdf5(iza_file, datasets=None, concatenate=False)
# RWY is dropped first, then the train indices are applied to the remaining list
iza_soaps.pop(RWY)
iza_soaps = [iza_soaps[i] for i in idxs_iza_train]

NameError: name 'cutoff' is not defined

In [4]:
# Working directory for the GCH built on the kernel (KSVC-KPCovR) projections
gch_dir = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KSVC-KPCovR/4-Class/GCH'

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(gch_dir, exist_ok=True)

In [None]:
# Concatenate IZA and DEEM xyz files
# TODO: consolidate the OPT files -- IZA_OPT should just be IZA and DEEM_10000_OPT should be Raw_Data/DEEM_10k/DEEM_10000.xyz
deem = aseio.read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')
iza = aseio.read('../Raw_Data/GULP/IZA_226/IZA_OPT.xyz', index=':')
iza.pop(RWY) # All the IZA structures are in alphabetical order, so this should work
# IZA frames are written first, then DEEM; the arrays below use the same order
aseio.write(f'{gch_dir}/iza+deem.xyz', iza+deem, format='extxyz')

# Prepare energies and volumes
volumes_deem = np.loadtxt('../Processed_Data/DEEM_10k/structure_volumes.dat')
volumes_iza = np.loadtxt('../Processed_Data/IZA_226/structure_volumes.dat')
volumes = np.concatenate((volumes_iza, volumes_deem))

energies_deem = np.loadtxt('../Processed_Data/DEEM_10k/structure_energies.dat')
energies_deem_opt = np.loadtxt('../Processed_Data/DEEM_10k/structure_energies_opt.dat')
energies_iza = np.loadtxt('../Processed_Data/IZA_226/structure_energies.dat')
energies = np.concatenate((energies_iza, energies_deem))
# Difference between the DEEM energies and those in structure_energies_opt.dat;
# its RMS is passed to gch_init as the energy uncertainty
energy_errors = energies_deem - energies_deem_opt

# Center the concatenated energies in place (subtract their mean)
energies -= np.mean(energies, axis=0)
energy_rmse = np.sqrt(np.mean(energy_errors**2))

# Text copy of the centered energies; gch_init receives this file path
np.savetxt(f'{gch_dir}/energies_per_si.dat', energies)

print(energy_rmse)

In [None]:
# Load train kernel to center the rattled kernel
kernel_file = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/structure_kernels.hdf5'
with h5py.File(kernel_file, 'r') as f:
    K_train = f['K_train'][:]
    # Copy the attributes into a plain dict: the h5py attribute proxy cannot be
    # read once the file is closed, but the parameters are printed and passed
    # to build_kernel in later cells
    kernel_parameters = dict(f.attrs)

In [None]:
# Load projections to convert HDF5 to text so the GCH utility can load it
deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/Kernel_Models/Gaussian/KSVC-KPCovR/4-Class/pcovr_structures.hdf5'
T_deem = utils.load_structures_from_hdf5(deem_file, datasets=None, concatenate=True)

iza_file = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/Kernel_Models/Gaussian/KSVC-KPCovR/4-Class/pcovr_structures.hdf5'
T_iza = utils.load_structures_from_hdf5(iza_file, datasets=None, concatenate=True)

# Stack IZA first, then DEEM: the same order as iza+deem.xyz and the energies
np.savetxt(f'{gch_dir}/T.dat', np.vstack((T_iza, T_deem)))

In [None]:
# Load the reference KPCovR model from its JSON serialization.
# load_json is used the same way for the PCovR model further down, where its
# result is installed as the attribute dict of a fresh instance; do the same
# here, because `transform_K` is later called on `kpcovr` and a plain dict
# cannot provide it.
kpcovr_file = f'../Processed_Data/Models/{cutoff}/Kernel_Models/Gaussian/KSVC-KPCovR/kpcovr.json'
kpcovr_model_dict = load_json(kpcovr_file, array_convert=True)
kpcovr = KPCovR()
kpcovr.__dict__ = kpcovr_model_dict

In [None]:
# Initialize GCH
# gch_init is given file paths rather than arrays, so the projections, energies
# and structures written to gch_dir in the cells above are referenced here
pk = f'{gch_dir}/T.dat' # File containing the kernel (or projections)
pnrg = f'{gch_dir}/energies_per_si.dat' # File containing the energies
setxyz = f'{gch_dir}/iza+deem.xyz' # File containing the structures
wdir_local = f'{gch_dir}/rattled' # Directory in which to save the rattled reference structures
s_c = cell_rmse # Uncertainty in cell between structures
s_e = energy_rmse # Uncertainty in energy
ndim = 3 # GCH dimensions (includes energy)
numref = 100 # Number of reference structures
numshaken = 10 # Number of rattled structures per reference
conv = 0.20 # Convergence threshold: 100/conv hulls are constructed
mode = 'fps' # Selection mode for the reference structures
npca = None # Number of KPCA components: None for providing projections, <= 0 for taking all components

gch_init(pk, pnrg, setxyz, wdir_local, s_c, s_e, ndim, numref, numshaken, conv, mode, npca)

In [None]:
# Compute SOAPs for shaken structures
# shaketraj.xyz is read from the rattled directory (presumably written by
# gch_init -- TODO confirm)
shaken_refs = aseio.read(f'{gch_dir}/rattled/shaketraj.xyz', index=':')

# Same centers and hyperparameters as configured in the model setup
shaken_ref_soaps = librascal_soap(shaken_refs, centers, **soap_hyperparameters)

In [None]:
# Show the kernel parameters that were read from the kernel file's attributes
print(kernel_parameters)

In [None]:
# Compute kernel for shaken structures
# (rattled SOAPs against the IZA + DEEM train SOAPs, using the stored kernel parameters)
K_rattled = build_kernel(shaken_ref_soaps, iza_soaps + deem_soaps,
                         **kernel_parameters)
# Center relative to the train kernel
Kc_rattled = center_kernel_fast(K_rattled, K_ref=K_train)

# Scale by the mean diagonal element of K_train (trace / number of rows)
Kc_rattled /= np.trace(K_train) / K_train.shape[0]

In [None]:
# We initialize the GCH on all structures, but project the
# rattled structures using the same train set as was used to build
# the original KPCovR model
T_rattled = kpcovr.transform_K(Kc_rattled)
# Saved as text because gch_run is given this file path
np.savetxt(f'{gch_dir}/rattled/T.dat', T_rattled)

In [None]:
# Reload the projections of all structures and the indices stored in
# refstruct.idx (used below to highlight the reference structures)
T = np.loadtxt(f'{gch_dir}/T.dat')
ref_idxs = np.loadtxt(f'{gch_dir}/rattled/refstruct.idx', dtype=int)

In [None]:
# Overlay the rattled projections on the projections of all structures
fig, ax = plt.subplots()
ax.scatter(T[:, 0], T[:, 1])
ax.scatter(T_rattled[:, 0], T_rattled[:, 1])
plt.show()

In [None]:
# Highlight the reference structures among all projected structures
fig, ax = plt.subplots()
ax.scatter(T[:, 0], T[:, 1])
ax.scatter(T[ref_idxs, 0], T[ref_idxs, 1])
plt.show()

In [None]:
# Run GCH
# Uses the rattled projections saved above and the directory prepared by gch_init
shk = f'{gch_dir}/rattled/T.dat' # File containing the kernel (or projections) for the rattled structures
wdir = f'{gch_dir}/rattled' # Directory in which the rattled reference structures reside
mp = 0.99 # Cutoff probability for determining the GCH vertices
gch_run(shk, wdir, mp)

In [None]:
# Read vprobprune.dat from the rattled directory (presumably produced by
# gch_run -- TODO confirm) and print its shape as a quick check
vprobprune = np.loadtxt(f'{gch_dir}/rattled/vprobprune.dat')
print(vprobprune.shape)

In [None]:
# Hull distances
# Structures with a nonzero entry in the last row of vprobprune are taken as GCH vertices
gch_vertices = np.nonzero(vprobprune[-1])[0]
print(gch_vertices)

# 0/1 indicator per structure: 1 marks a GCH vertex
vertices = np.zeros(T.shape[0], dtype=int)
vertices[gch_vertices] = 1

e_gch = energies[gch_vertices]
T_gch = T[gch_vertices, :]

# Energy goes in the first column, followed by the projections; only the first
# ndim columns enter the hull
eT = np.hstack((e_gch[:, np.newaxis], T_gch))
eT_all = np.hstack((energies[:, np.newaxis], T))
vertex_hull = ConvexHull(eT[:, 0:ndim])

# Omit simplices on the "top" of the GCH
# (facets whose normal has a positive energy component)
vertex_hull_facets = np.delete(vertex_hull.equations, 
                               np.nonzero(vertex_hull.equations[:, 0] > 0.0),
                               axis=0)

# Each row of `equations` is [normal, offset]; evaluate -(x . normal + offset)
# for every structure (rows) against every remaining facet (columns)
d = -(np.matmul(eT_all[:, 0:ndim], vertex_hull_facets[:, 0:-1].T) + vertex_hull_facets[:, -1])
# Rescale by the energy component of each facet normal
de = -d / vertex_hull_facets[:, 0]
# Keep, per structure, the smallest absolute value over the facets
d = np.amin(np.abs(d), axis=1)
de = np.amin(np.abs(de), axis=1)

In [None]:
# Mean, minimum, maximum and standard deviation of `d` for each IZA canton.
# Integer indices are required: `d` covers more structures than `cantons_iza`.
for canton in range(1, np.amax(cantons_iza) + 1):
    d_canton = d[np.flatnonzero(cantons_iza == canton)]
    print(np.mean(d_canton), np.amin(d_canton), np.amax(d_canton), np.std(d_canton))

In [None]:
# Statistics of `d` for the IZA structures and for the DEEM structures.
# IZA entries come first in the stacked arrays, so the split index is the number
# of IZA structures; derive it from the canton list instead of hard-coding 225.
n_iza = len(cantons_iza)
for d_group in (d[:n_iza], d[n_iza:]):
    print(np.mean(d_group), np.amin(d_group), np.amax(d_group), np.std(d_group))

In [None]:
# Mean, minimum, maximum and standard deviation of `de` for each IZA canton.
# Integer indices are required: `de` covers more structures than `cantons_iza`.
for canton in range(1, np.amax(cantons_iza) + 1):
    de_canton = de[np.flatnonzero(cantons_iza == canton)]
    print(np.mean(de_canton), np.amin(de_canton), np.amax(de_canton), np.std(de_canton))

In [None]:
# Statistics of `de` for the IZA structures and for the DEEM structures.
# IZA entries come first in the stacked arrays, so the split index is the number
# of IZA structures; derive it from the canton list instead of hard-coding 225.
n_iza = len(cantons_iza)
for de_group in (de[:n_iza], de[n_iza:]):
    print(np.mean(de_group), np.amin(de_group), np.amax(de_group), np.std(de_group))

In [None]:
# Log-scaled inverse distances
d_inv = np.log10(1.0/d)
de_inv = np.log10(1.0/de)

# Infinite entries are replaced, in place, by the maximum over the
# non-infinite entries of the same array
for inv in (d_inv, de_inv):
    is_inf = np.isinf(inv)
    inv[is_inf] = np.amax(inv[~is_inf])

In [None]:
# Large black markers: the points of every simplex of the vertex hull
for vertex_simplex in vertex_hull.simplices:
    plt.scatter(T_gch[vertex_simplex, 0], T_gch[vertex_simplex, 1], c='k', s=100)
    
# Red markers: hull vertices; remaining structures are colored by `d`
plt.scatter(T_gch[vertex_hull.vertices,0], T_gch[vertex_hull.vertices,1], c='r', s=50)
plt.scatter(T[:,0], T[:,1], c=d, cmap='viridis', s=20)
plt.colorbar()
plt.show()

In [None]:
# Energy (column 0) against the first projection (column 1): GCH vertices in
# red, all structures colored by `de`
plt.scatter(eT_all[gch_vertices, 0], eT_all[gch_vertices, 1], c='r', s=50)
plt.scatter(eT_all[:,0], eT_all[:,1], c=de, cmap='viridis', s=20)
plt.colorbar()
plt.show()

In [None]:
# First two projections colored by energy, with the GCH vertices drawn in red
plt.scatter(T[:, 0], T[:, 1], c=energies, cmap='viridis')
plt.scatter(T[gch_vertices, 0], T[gch_vertices, 1], c='r', s=100)
plt.colorbar()
plt.show()

# Build a GCH based on PCovR projections

In [None]:
#Load centering and scale factors to apply to the rattled structures
# NOTE(review): unlike the other LSVC-LPCovR paths in this section, this one has
# no 'Power' directory component -- confirm the location of center_scale.json
center_scale = load_json(f'../Processed_Data/Models/{cutoff}/Linear_Models/LSVC-LPCovR/4-Class/OO+OSi+SiSi/center_scale.json', array_convert=True)

In [4]:
# Working directory for the GCH built on the linear (LSVC-LPCovR) projections
gch_dir = f'../Processed_Data/Models/{cutoff}/Linear_Models/LSVC-LPCovR/4-Class/Power/OO+OSi+SiSi/GCH'

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(gch_dir, exist_ok=True)

In [None]:
# Concatenate IZA and DEEM xyz files
# TODO: consolidate the OPT files -- IZA_OPT should just be IZA and DEEM_10000_OPT should be Raw_Data/DEEM_10k/DEEM_10000.xyz
deem = aseio.read('../Raw_Data/DEEM_330k/XYZ/DEEM_331172.xyz', index=':')
iza = aseio.read('../Raw_Data/GULP/IZA_226/IZA_OPT.xyz', index=':')
iza.pop(RWY) # All the IZA structures are in alphabetical order, so this should work
# IZA frames are written first, then DEEM; the arrays below use the same order
aseio.write(f'{gch_dir}/iza+deem.xyz', iza + deem, format='extxyz')

# Indices of the DEEM 10k set within the 330k structures
idxs_deem_10k = np.loadtxt('../Processed_Data/DEEM_330konDEEM_10k/deem_10k.idxs', dtype=int)

# Prepare energies and volumes
volumes_deem = np.loadtxt('../Processed_Data/DEEM_330k/structure_volumes.dat')
volumes_iza = np.loadtxt('../Processed_Data/IZA_226/structure_volumes.dat')
volumes = np.concatenate((volumes_iza, volumes_deem))

energies_deem = np.loadtxt('../Processed_Data/DEEM_330k/structure_energies.dat')
energies_deem_opt = np.loadtxt('../Processed_Data/DEEM_10k/structure_energies_opt.dat')
energies_iza = np.loadtxt('../Processed_Data/IZA_226/structure_energies.dat')
energies = np.concatenate((energies_iza, energies_deem))
# The `_opt` energies are loaded from the DEEM_10k directory, so the error is
# evaluated on the 10k subset of the 330k energies only
energy_errors = energies_deem[idxs_deem_10k] - energies_deem_opt

# Center the concatenated energies in place (subtract their mean)
energies -= np.mean(energies, axis=0)
energy_rmse = np.sqrt(np.mean(energy_errors**2))

# Text copy of the centered energies; gch_init receives this file path
np.savetxt(f'{gch_dir}/energies_per_si.dat', energies)

print(energy_rmse)

In [None]:
# Load projections to convert HDF5 to text so the GCH utility can load it;
# use all 330k DEEM structures
deem_file = f'../Processed_Data/DEEM_330k/Data/{cutoff}/Linear_Models/LSVC-LPCovR/4-Class/Power/OO+OSi+SiSi/pcovr_structures.hdf5'
T_deem = utils.load_structures_from_hdf5(deem_file, datasets=None, concatenate=True)

iza_file = f'../Processed_Data/IZA_226/Data/{cutoff}/Linear_Models/LSVC-LPCovR/4-Class/Power/OO+OSi+SiSi/pcovr_structures.hdf5'
T_iza = utils.load_structures_from_hdf5(iza_file, datasets=None, concatenate=True)

# Stack IZA first, then DEEM: the same order as iza+deem.xyz and the energies
np.savetxt(f'{gch_dir}/T.dat', np.vstack((T_iza, T_deem)))

In [None]:
# Load the reference PCovR model from its JSON serialization
from regression import PCovR # the import cell only brings in KPCovR

pcovr_file = f'../Processed_Data/Models/{cutoff}/Linear_Models/LSVC-LPCovR/4-Class/Power/OO+OSi+SiSi/pcovr.json'
pcovr_model_dict = load_json(pcovr_file, array_convert=True)
# Rebuild the model by installing the stored attributes on a fresh instance
pcovr = PCovR()
pcovr.__dict__ = pcovr_model_dict

In [None]:
# Initialize GCH
# gch_init is given file paths rather than arrays, so the projections, energies
# and structures written to gch_dir in the cells above are referenced here
pk = f'{gch_dir}/T.dat' # File containing the kernel (or projections)
pnrg = f'{gch_dir}/energies_per_si.dat' # File containing the energies
setxyz = f'{gch_dir}/iza+deem.xyz' # File containing the structures
wdir_local = f'{gch_dir}/rattled' # Directory in which to save the rattled reference structures
s_c = cell_rmse # Uncertainty in cell between structures
s_e = energy_rmse # Uncertainty in energy
ndim = 3 # GCH dimensions (includes energy)
numref = 100 # Number of reference structures
numshaken = 10 # Number of rattled structures per reference
conv = 0.20 # Convergence threshold: 100/conv hulls are constructed
mode = 'fps' # Selection mode for the reference structures
npca = None # Number of KPCA components: None for providing projections, <= 0 for taking all components

gch_init(pk, pnrg, setxyz, wdir_local, s_c, s_e, ndim, numref, numshaken, conv, mode, npca)

In [None]:
# Compute SOAPs for shaken structures
shaken_refs = aseio.read(f'{gch_dir}/rattled/shaketraj.xyz', index=':')

shaken_ref_soaps = librascal_soap(shaken_refs, centers, **soap_hyperparameters)
# Apply the stored centering and scaling. The result must be kept under the
# name `shaken_ref_soaps`, because that is the variable projected afterwards;
# assigning it to any other name would silently skip the preprocessing.
shaken_ref_soaps = (shaken_ref_soaps - center_scale['center']) / center_scale['scale']

In [None]:
# We initialize the GCH on all structures, but project the
# rattled structures using the same train set as was used to build
# the original PCovR model
# NOTE(review): `shaken_ref_soaps` holds SOAP features here, not a kernel --
# confirm that `transform_K` is the intended PCovR method for feature input
T_rattled = pcovr.transform_K(shaken_ref_soaps)
# Saved as text because gch_run is given this file path
np.savetxt(f'{gch_dir}/rattled/T.dat', T_rattled)

In [None]:
# Reload the projections of all structures and the indices stored in
# refstruct.idx (used below to highlight the reference structures)
T = np.loadtxt(f'{gch_dir}/T.dat')
ref_idxs = np.loadtxt(f'{gch_dir}/rattled/refstruct.idx', dtype=int)

In [None]:
# Overlay the rattled projections on the projections of all structures
fig, ax = plt.subplots()
ax.scatter(T[:, 0], T[:, 1])
ax.scatter(T_rattled[:, 0], T_rattled[:, 1])
plt.show()

In [None]:
# Highlight the reference structures among all projected structures
fig, ax = plt.subplots()
ax.scatter(T[:, 0], T[:, 1])
ax.scatter(T[ref_idxs, 0], T[ref_idxs, 1])
plt.show()

In [None]:
# Run GCH
# Uses the rattled projections saved above and the directory prepared by gch_init
shk = f'{gch_dir}/rattled/T.dat' # File containing the kernel (or projections) for the rattled structures
wdir = f'{gch_dir}/rattled' # Directory in which the rattled reference structures reside
mp = 0.99 # Cutoff probability for determining the GCH vertices
gch_run(shk, wdir, mp)

In [None]:
# Read vprobprune.dat from the rattled directory (presumably produced by
# gch_run -- TODO confirm) and print its shape as a quick check
vprobprune = np.loadtxt(f'{gch_dir}/rattled/vprobprune.dat')
print(vprobprune.shape)

In [None]:
# Hull distances
# Structures with a nonzero entry in the last row of vprobprune are taken as GCH vertices
gch_vertices = np.nonzero(vprobprune[-1])[0]
print(gch_vertices)

# 0/1 indicator per structure: 1 marks a GCH vertex
vertices = np.zeros(T.shape[0], dtype=int)
vertices[gch_vertices] = 1

e_gch = energies[gch_vertices]
T_gch = T[gch_vertices, :]

# Energy goes in the first column, followed by the projections; only the first
# ndim columns enter the hull
eT = np.hstack((e_gch[:, np.newaxis], T_gch))
eT_all = np.hstack((energies[:, np.newaxis], T))
vertex_hull = ConvexHull(eT[:, 0:ndim])

# Omit simplices on the "top" of the GCH
# (facets whose normal has a positive energy component)
vertex_hull_facets = np.delete(vertex_hull.equations, 
                               np.nonzero(vertex_hull.equations[:, 0] > 0.0),
                               axis=0)

# Each row of `equations` is [normal, offset]; evaluate -(x . normal + offset)
# for every structure (rows) against every remaining facet (columns)
d = -(np.matmul(eT_all[:, 0:ndim], vertex_hull_facets[:, 0:-1].T) + vertex_hull_facets[:, -1])
# Rescale by the energy component of each facet normal
de = -d / vertex_hull_facets[:, 0]
# Keep, per structure, the smallest absolute value over the facets
d = np.amin(np.abs(d), axis=1)
de = np.amin(np.abs(de), axis=1)

In [None]:
# Mean, minimum, maximum and standard deviation of `d` for each IZA canton.
# Integer indices are required: `d` covers more structures than `cantons_iza`.
for canton in range(1, np.amax(cantons_iza) + 1):
    d_canton = d[np.flatnonzero(cantons_iza == canton)]
    print(np.mean(d_canton), np.amin(d_canton), np.amax(d_canton), np.std(d_canton))

In [None]:
# Statistics of `d` for the IZA structures and for the DEEM structures.
# IZA entries come first in the stacked arrays, so the split index is the number
# of IZA structures; derive it from the canton list instead of hard-coding 225.
n_iza = len(cantons_iza)
for d_group in (d[:n_iza], d[n_iza:]):
    print(np.mean(d_group), np.amin(d_group), np.amax(d_group), np.std(d_group))

In [None]:
# Mean, minimum, maximum and standard deviation of `de` for each IZA canton.
# Integer indices are required: `de` covers more structures than `cantons_iza`.
for canton in range(1, np.amax(cantons_iza) + 1):
    de_canton = de[np.flatnonzero(cantons_iza == canton)]
    print(np.mean(de_canton), np.amin(de_canton), np.amax(de_canton), np.std(de_canton))

In [None]:
# Statistics of `de` for the IZA structures and for the DEEM structures.
# IZA entries come first in the stacked arrays, so the split index is the number
# of IZA structures; derive it from the canton list instead of hard-coding 225.
n_iza = len(cantons_iza)
for de_group in (de[:n_iza], de[n_iza:]):
    print(np.mean(de_group), np.amin(de_group), np.amax(de_group), np.std(de_group))

In [None]:
# Log-scaled inverse distances
d_inv = np.log10(1.0/d)
de_inv = np.log10(1.0/de)

# Infinite entries are replaced, in place, by the maximum over the
# non-infinite entries of the same array
for inv in (d_inv, de_inv):
    is_inf = np.isinf(inv)
    inv[is_inf] = np.amax(inv[~is_inf])

In [None]:
# Large black markers: the points of every simplex of the vertex hull
for vertex_simplex in vertex_hull.simplices:
    plt.scatter(T_gch[vertex_simplex, 0], T_gch[vertex_simplex, 1], c='k', s=100)
    
# Red markers: hull vertices; remaining structures are colored by `d`
plt.scatter(T_gch[vertex_hull.vertices,0], T_gch[vertex_hull.vertices,1], c='r', s=50)
plt.scatter(T[:,0], T[:,1], c=d, cmap='viridis', s=20)
plt.colorbar()
plt.show()

In [None]:
# Energy (column 0) against the first projection (column 1): GCH vertices in
# red, all structures colored by `de`
plt.scatter(eT_all[gch_vertices, 0], eT_all[gch_vertices, 1], c='r', s=50)
plt.scatter(eT_all[:,0], eT_all[:,1], c=de, cmap='viridis', s=20)
plt.colorbar()
plt.show()

In [None]:
# First two projections colored by energy, with the GCH vertices drawn in red
plt.scatter(T[:, 0], T[:, 1], c=energies, cmap='viridis')
plt.scatter(T[gch_vertices, 0], T[gch_vertices, 1], c='r', s=100)
plt.colorbar()
plt.show()

# Build a chemiscope

In [None]:
sys.path.append('/home/helfrech/Tools/chemiscope/utils')
from chemiscope_input import write_chemiscope_input

In [None]:
# True class labels: 'IZA<canton>' for the IZA frameworks, 'DEEM' for the rest
# NOTE(review): `deem` is assigned in both GCH sections (from the 10k and from
# the 330k xyz file), so its length here depends on which section ran last --
# confirm it matches the DEEM_10k predicted classes loaded below
true_classes_iza = np.array([f'IZA{i}' for i in cantons_iza])
true_classes_deem = np.array(['DEEM']*len(deem))
true_classes = np.concatenate((true_classes_iza, true_classes_deem))

predicted_classes_iza = \
    np.loadtxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/Kernel_Models/Gaussian/KSVC-KPCovR/4-Class/kpcovr_structure_cantons.dat', dtype=int)
predicted_classes_deem = \
    np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/Kernel_Models/Gaussian/KSVC-KPCovR/4-Class/kpcovr_structure_cantons.dat', dtype=int)
predicted_classes = np.concatenate((predicted_classes_iza, predicted_classes_deem))
# Predicted cantons below 4 become 'IZA<n>'; every other value becomes 'DEEM'
predicted_classes = np.array([f'IZA{pc}' if pc < 4 else 'DEEM' for pc in predicted_classes])

In [None]:
# Si fraction per structure: read from ChemComp_F.txt (second column) for IZA,
# fixed at 1/3 for every DEEM structure
Si_fraction_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/ChemComp_F.txt', usecols=1)
Si_fraction_deem = np.ones(len(deem)) / 3
Si_fraction = np.concatenate((Si_fraction_iza, Si_fraction_deem))
# RWY is removed after concatenation; the IZA values come first, so index RWY
# is applied at the start of the array where they sit
Si_fraction = np.delete(Si_fraction, RWY)

In [None]:
# Work on a copy of the DEEM frames so the loaded structures stay untouched
frames = deepcopy(deem)
for frame in frames:
    for i in ('Energy_unit', 'spacegroup', 'Energy', 'unit_cell', 'Energy_per_Si_Opt'):
        # `deem` can come from either of the two DEEM xyz files; a default keeps
        # the removal from raising KeyError on frames that lack one of the keys
        frame.info.pop(i, None)

# IZA frames first, then DEEM: the same order as every per-structure array below
frames = iza + frames

# Per-structure properties handed to the chemiscope writer
extra = dict(projection=dict(target='structure', values=T[:, 0:2]),
             energies=dict(target='structure', values=energies),
             volumes=dict(target='structure', values=volumes),
             vertex=dict(target='structure', values=vertices),
             true_class=dict(target='structure', values=true_classes),
             predicted_class=dict(target='structure', values=predicted_classes),
             hull_distance=dict(target='structure', values=d),
             inv_hull_distance=dict(target='structure', values=d_inv),
             hull_distance_energy=dict(target='structure', values=de),
             inv_hull_distance_energy=dict(target='structure', values=de_inv),
             Si_fraction=dict(target='structure', values=Si_fraction))

In [None]:
# Write the frames and the per-structure properties to a gzipped JSON file in
# the current gch_dir
write_chemiscope_input(f'{gch_dir}/svm-kpcovr-gch_chemiscope_test.json.gz', 
                       frames, 
                       extra=extra, 
                       meta=dict(name='SVM-KPCovR-GCH'), 
                       cutoff=None)

# TODO: move this to an analysis notebook

In [5]:
import gzip
import plotly.graph_objects as go

In [6]:
# Read the gzipped chemiscope JSON back in (binary mode, as json.load accepts bytes)
with gzip.open(f'{gch_dir}/svm-kpcovr-gch_chemiscope_test.json.gz', 'rb') as f:
    data = json.load(f)

In [7]:
# Pull the per-structure value lists out of the chemiscope 'properties' entry
# and convert each one to a NumPy array
T1 = np.array(data['properties']['projection[1]']['values'])
T2 = np.array(data['properties']['projection[2]']['values'])
E = np.array(data['properties']['Energy_per_Si']['values'])
hull_distance = np.array(data['properties']['hull_distance']['values'])
hull_distance_energy = np.array(data['properties']['hull_distance_energy']['values'])
true_class = np.array(data['properties']['true_class']['values'])
predicted_class = np.array(data['properties']['predicted_class']['values'])

In [8]:
# Marker symbol for each structure according to its true class; structures
# with any other class label contribute no entry
symbol_by_class = {'DEEM': 'circle', 'IZA1': 'square', 'IZA2': 'diamond', 'IZA3': 'cross'}
symbols = [symbol_by_class[t] for t in true_class if t in symbol_by_class]

In [9]:
# Distinct true-class labels in sorted order
classes = sorted(set(true_class))

In [10]:
fig = go.Figure()

# One trace per class, so each class gets its own marker symbol and legend
# entry while all traces share a single color axis
for t, s in zip(classes, ('circle', 'square', 'diamond', 'cross')):
    in_class = true_class == t
    marker = dict(color=hull_distance_energy[in_class],
                  coloraxis='coloraxis',
                  line_width=1,
                  size=3,
                  symbol=s)
    trace = go.Scatter3d(x=T1[in_class], y=T2[in_class], z=E[in_class],
                         mode='markers', marker=marker, name=t)
    fig.add_trace(trace)

scene = dict(xaxis_title='T<sub>1</sub>',
             yaxis_title='T<sub>2</sub>',
             zaxis_title='Energy per Si<br>(kJ/mol Si)')
legend = dict(x=0.0, y=1.0,
              xanchor='left', yanchor='top',
              itemsizing='constant')
coloraxis = dict(colorscale='Plasma',
                 colorbar=dict(title='Hull Distance<br>(kJ/mol)'))

fig.update_layout(template='plotly_white',
                  scene=scene,
                  legend=legend,
                  coloraxis=coloraxis,
                  autosize=False,
                  width=600, height=600)

fig.show()
fig.write_image('../Results/6.0/SVM-KPCovR-GCH.png')