In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import plotly.graph_objects as go

# ML
from regression import PCovR, KPCovR, SparseKPCovR
from regression import LR, KRR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from kernels import center_kernel, center_kernel_fast
from kernels import center_kernel_oos, center_kernel_oos_fast
from soap import compute_soap_density, reshape_soaps

from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression

# Atoms
from ase.io import read
from ase.neighborlist import neighbor_list

# Utilities
import h5py
import json
import itertools
from tqdm.notebook import tqdm
from project_utils import load_structures_from_hdf5

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

cosmostyle.set_style('article')
colorList = cosmostyle.color_cycle

In /home/helfrech/.config/matplotlib/stylelib/cosmo.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/helfrech/.config/matplotlib/stylelib/cosmoLarge.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.



In [3]:
# sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/kernel-tutorials')
# sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/KernelPCovR/analysis/scripts')
# from utilities.sklearn_covr.kpcovr import KernelPCovR as KPCovR2
# from utilities.sklearn_covr.pcovr import PCovR as PCovR2
# from helpers import l_regr, l_kpcovr

# Functions

In [4]:
def df_to_class(df, df_type, n_classes, use_df_sums=True):
    """
        Make class predictions based on a decision function.
        Based on the sci-kit learn SVC prediction, see
        `sklearn.multiclass._ovr_decision_function` 
        (sci-kit learn licensed under BSD 3-Clause license)
        TODO: could we also just use this function?
        
        ---Arguments---
        df: decision function on which to make class predictions
        df_type: decision function type, 'ovo' or 'ovr'
        n_classes: number of integer classes
        use_df_sums: augment the 'ovo' vote counts with
            decision function values (useful for tie breaks)
            
        ---Returns---
        predicted_class: predicted integer class
    """
    
    # Approximation to the number of classes, should be valid up to at least 1M
    #n_classes = int(np.sqrt(2*df.shape[-1])) + 1
    
    if n_classes > 2:
        if df_type == 'ovo':
            vote_matrix = np.zeros((df.shape[0], n_classes))
            df_sum = np.zeros((df.shape[0], n_classes))

            # Predicted class determined by majority vote
            col_idx = 0
            for i in range(0, n_classes):
                for j in range(i + 1, n_classes):
                    col_train = df[:, col_idx]
                    vote_matrix[col_train > 0, i] += 1
                    vote_matrix[col_train <= 0, j] += 1

                    # Add value of decision function
                    if use_df_sums:
                        df_sum[:, i] += df[:, col_idx]
                        df_sum[:, j] -= df[:, col_idx]

                    col_idx += 1

            # sci-kit learn transformation from 'ovo' to 'ovr'
            if use_df_sums:
                transformed_df_sum = df_sum / (3 * (np.abs(df_sum) + 1))
                vote_matrix += transformed_df_sum

            predicted_class = np.argmax(vote_matrix, axis=1) + 1

        elif df_type == 'ovr':

            # Predicted class determined by largest value of the decision function
            predicted_class = np.argmax(df, axis=1) + 1

        else:
            print("Error: invalid decision function. Use 'ovo' or 'ovr'")
    else:
        predicted_class = np.zeros(df.shape[0], dtype=int)
        
        # This appears to be the convention, which is "opposite" of that above
        # Default exactly zero decision function value to the "positive" class
        predicted_class[df >= 0] = 2
        predicted_class[df < 0] = 1
        
    return predicted_class

def rrw_neighbors(frame, center_species, env_species, cutoff, self_interaction=False):
    """
        Compute the neighbor list for every atom of the central atom species
        and generate the r, r', w for each pair of neighbors 
        
        ---Arguments---
        frame: atomic structure
        center_species: species of atoms to use as centers
        env_species: species of atoms to include in the environment
        cutoff: atomic environment cutoff
        self_interaction: include the central atom as its own neighbor
        
        ---Returns---
        rrw: list of a list of numpy 3D numpy arrays. 
            Each numpy array is of shape (3, n_neighbors_a, n_neighbors_b),
            where the axes are organized as follows:
            axis=0: distances to neighbor A from the central atom
            axis=1: distances to neighbor B from the central atom
            axis=2: angle between the distance vectors to neighbors A and B from the central atom
        idxs: same structure as rrw, but holds the indices of the atoms involved in the tuple, i.e.,
            axis=0: index of central atom
            axis=1: index of neighbor A
            axis=2: index of neighbor B
    """
    
    # Extract indices of central atoms and environment atoms
    center_species_idxs = [np.nonzero(frame.numbers == i)[0] for i in center_species]
    env_species_idxs = [np.nonzero(frame.numbers == i)[0] for i in env_species]
    
    # Build neighbor list for all atoms
    nl = {}
    nl['i'], nl['j'], nl['d'], nl['D'] = neighbor_list('ijdD', frame, cutoff, 
                                                       self_interaction=self_interaction)
    
    rrw = []
    idxs = []
    
    # Loop over centers grouped by species
    for center_idxs in center_species_idxs:
        for center in center_idxs:
            
            # Build subset of neighbor list that just has the neighbors of
            # the center
            center_nl_idxs = np.nonzero(nl['i'] == center)[0]
            nl_center = {}
            for k, v in nl.items():
                nl_center[k] = v[center_nl_idxs]
                
            rrw_species = []
            idxs_species = []
                
            # Loop over combinations of environment species
            for env_species_a, env_species_b in itertools.combinations_with_replacement(env_species_idxs, 2):
                a = np.nonzero(np.isin(nl_center['j'], env_species_a))[0]
                b = np.nonzero(np.isin(nl_center['j'], env_species_b))[0]

                # Extract distances to neighbors from the central atom (r, r')
                da = nl_center['d'][a]
                db = nl_center['d'][b]
                Da = nl_center['D'][a]
                Db = nl_center['D'][b]
                r_n, r_m = np.meshgrid(da, db, indexing='ij')                
                
                # Compute angles between neighbors and central atom (w)
                D = np.matmul(Da, Db.T)
                d = np.outer(da, db)
                d[d <= 0.0] = 1.0
                w = D / d

                # Extract indices of the atoms in the rr'w triplet
                ia = nl_center['j'][a]
                ib = nl_center['j'][b]
                j_n, j_m = np.meshgrid(ia, ib, indexing='ij')
                j_center = np.full(j_n.shape, center, dtype=int)
                
                # Build 3D matrix of rr'w triplets
                rrw_species.append(np.stack((r_n, r_m, w)))
                idxs_species.append(np.stack((j_center, j_n, j_m)))
            
            rrw.append(rrw_species)
            idxs.append(idxs_species)
    
    return rrw, idxs

def make_tuples(data):
    """
        Take a list of lists of rr'w formatted 3D arrays (see rrw_neighbors)
        and reshape into a list of lists of 2D arrays of shape (n_neighbor_pairs, 3),
        where each row is a rr'w triplet and the columns are in the order r, r', w
        
        ---Arguments---
        data: list of lists of arrays to "reshape"
        
        ---Returns---
        center_tuple: "reshaped" data list
    """
    n_centers = len(data)
    center_tuple = []
    
    # Loop over centers
    for nctr in range(0, n_centers):
        n_pairs = len(data[nctr])
        pair_tuple = []
        
        # Loop over species pairs
        for npr in range(0, n_pairs):
            data_shape = np.shape(data[nctr][npr])
            
            # Reshape the 3D array to a 2D array
            tuple_array = np.reshape(np.moveaxis(data[nctr][npr], 0, -1), 
                                     (np.prod(data_shape[1:]), data_shape[0]))
            
            pair_tuple.append(tuple_array)
        
        center_tuple.append(pair_tuple)
    
    return center_tuple

# Load and split data

In [5]:
# Load train and test set indices for Deem
idxs_deem_train = np.loadtxt('../Processed_Data/DEEM_10k/train.idxs', dtype=int)
idxs_deem_test = np.loadtxt('../Processed_Data/DEEM_10k/test.idxs', dtype=int)

# Total number of structures
n_deem_train = idxs_deem_train.size
n_deem_test = idxs_deem_test.size
n_deem = n_deem_train + n_deem_test

In [6]:
# Load SOAP cutoffs
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)
    
cutoffs = soap_hyperparameters['interaction_cutoff']

In [7]:
# Load IZA cantons
cantons_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)
RWY = np.nonzero(cantons_iza == 4)[0][0]
cantons_iza = np.delete(cantons_iza, RWY)
n_iza = len(cantons_iza)

In [8]:
cantons_iza = np.ones(n_iza, dtype=int)

In [9]:
# Select IZA sample
# (will be overwritten if we load a kernel
n_iza_train = n_iza // 2
n_iza_test = n_iza - n_iza_train
idxs_iza = np.arange(0, n_iza)
np.random.shuffle(idxs_iza)

idxs_iza_train = idxs_iza[0:n_iza_train]
idxs_iza_test = idxs_iza[n_iza_train:n_iza_train+n_iza_test]

In [10]:
# Make dummy DEEM cantons
cantons_deem = np.ones(n_deem, dtype=int) * 4

In [11]:
# Make dummy DEEM cantons
cantons_deem = np.ones(n_deem, dtype=int) * 2

# Load SOAPs and build kernels

In [12]:
# Flag to recompute existing kernels
remove_kernels = False

In [13]:
K_train = {}
K_test = {}
K_test_test = {}
kernel_type = {}
gamma = {}

In [14]:
for cutoff in cutoffs:
    kernel_file = f'../Processed_Data/Models/{cutoff}/structure_kernels.hdf5'
    
    # Start fresh
    if remove_kernels and os.path.exists(kernel_file):
        os.remove(kernel_file)
    
    # Load the kernels if they exist
    try:
        f = h5py.File(kernel_file, 'r')
        
        K_train[cutoff] = f['K_train'][:]
        K_test[cutoff] = f['K_test'][:]
        K_test_test[cutoff] = f['K_test_test'][:]
        kernel_type[cutoff] = f.attrs['kernel_type']
        gamma[cutoff] = f.attrs['gamma']
        
        # Don't need to store indices in a dictonary
        # since they are the same for all cutoffs
        idxs_iza_train = f.attrs['idxs_iza_train']
        idxs_iza_test = f.attrs['idxs_iza_test']
        idxs_deem_train = f.attrs['idxs_deem_train']
        idxs_deem_test = f.attrs['idxs_deem_test']
 
        f.close()
    
    # Compute the kernels if they don't exist
    except OSError:
    
        # Load kernel parameters
        model_file = f'../Processed_Data/Models/{cutoff}/volumes_mae_parameters.json'

        with open(model_file, 'r') as f:
            model_dict = json.load(f)

        kernel_type[cutoff] = model_dict['kernel_type']
        gamma[cutoff] = model_dict['gamma']

        # Load SOAPs
        deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5'
        deem_soaps = load_structures_from_hdf5(deem_file, datasets=None, concatenate=False)

        iza_file = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/soaps.hdf5'
        iza_soaps = load_structures_from_hdf5(iza_file, datasets=None, concatenate=False)
        iza_soaps.pop(RWY)

        # Build the collection of soap vectors
        # for the "master" kernel
        deem_train = [deem_soaps[i] for i in idxs_deem_train]
        deem_test = [deem_soaps[i] for i in idxs_deem_test]
        iza_train = [iza_soaps[i] for i in idxs_iza_train]
        iza_test = [iza_soaps[i] for i in idxs_iza_test]

        # Build "master" kernel between all DEEM and all IZA
        K_train[cutoff] = build_kernel(iza_train+deem_train, iza_train+deem_train, 
                                       kernel=kernel_type[cutoff], gamma=gamma[cutoff])
        K_test[cutoff] = build_kernel(iza_test+deem_test, iza_train+deem_train, 
                                      kernel=kernel_type[cutoff], gamma=gamma[cutoff])
        K_test_test[cutoff] = build_kernel(iza_test+deem_test, iza_test+deem_test, 
                                           kernel=kernel_type[cutoff], gamma=gamma[cutoff])
        
        # Save kernels for later
        g = h5py.File(kernel_file, 'w')
        
        g.create_dataset('K_train', data=K_train[cutoff])
        g.create_dataset('K_test', data=K_test[cutoff])
        g.create_dataset('K_test_test', data=K_test_test[cutoff])
        
        g.attrs['idxs_iza_train'] = idxs_iza_train
        g.attrs['idxs_iza_test'] = idxs_iza_test
        g.attrs['idxs_deem_train'] = idxs_deem_train
        g.attrs['idxs_deem_test'] = idxs_deem_test
        g.attrs['kernel_type'] = kernel_type[cutoff]
        g.attrs['gamma'] = gamma[cutoff]
        
        g.close()

In [15]:
# Overwrite with full SOAPs
soaps_train = {}
soaps_test = {}
soaps_center = {}
soaps_scale = {}
for cutoff in cutoffs:
    #deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps_full_avg.hdf5'
    #deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps_full_avg_nonorm.hdf5'
    deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps_radial_full_avg_nonorm.hdf5'
    soaps_deem = load_structures_from_hdf5(deem_file, datasets=None, concatenate=True)
    
    #iza_file = f'../Processed_Data/IZA_226/Data/{cutoff}/soaps_full_avg.hdf5'
    #iza_file = f'../Processed_Data/IZA_226/Data/{cutoff}/soaps_full_avg_nonorm.hdf5'
    iza_file = f'../Processed_Data/IZA_226/Data/{cutoff}/soaps_radial_full_avg_nonorm.hdf5'
    soaps_iza = load_structures_from_hdf5(iza_file, datasets=None, concatenate=True)
    soaps_iza = np.delete(soaps_iza, RWY, axis=0)
    
    soaps_deem_train = soaps_deem[idxs_deem_train]
    soaps_deem_test = soaps_deem[idxs_deem_test]
    
    soaps_iza_train = soaps_iza[idxs_iza_train]
    soaps_iza_test = soaps_iza[idxs_iza_test]
    
    soaps_train[cutoff] = np.concatenate((soaps_iza_train, soaps_deem_train))
    soaps_test[cutoff] = np.concatenate((soaps_iza_test, soaps_deem_test))
    
    soaps_center[cutoff] = np.zeros(soaps_train[cutoff].shape[1])
    #soaps_scale[cutoff] = 1.0
    soaps_scale[cutoff] = np.std(soaps_train[cutoff])
    
    #soaps_center[cutoff] = np.mean(soaps_train[cutoff], axis=0)
    #soaps_train[cutoff] -= soaps_center[cutoff]
    #soaps_test[cutoff] -= soaps_center[cutoff]
    
    #soaps_scale[cutoff] = np.linalg.norm(soaps_train[cutoff], axis=0) / np.sqrt(soaps_train[cutoff].shape[0] / soaps_train[cutoff].shape[1])
    #soaps_scale[cutoff] = np.linalg.norm(soaps_train[cutoff]) / np.sqrt(soaps_train[cutoff].shape[0])
    soaps_train[cutoff] /= soaps_scale[cutoff]
    soaps_test[cutoff] /= soaps_scale[cutoff]

In [16]:
# Save IZA indices for later
# (we do this after the kernel loading to make sure that if an existing
# kernel is loaded, the associated indices don't get overwritten)
#np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/train.idxs', idxs_iza_train, fmt='%d')
#np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/test.idxs', idxs_iza_test, fmt='%d')

In [17]:
# Build set of "master" canton labels
cantons_train = np.concatenate((cantons_iza[idxs_iza_train], cantons_deem[idxs_deem_train]))
cantons_test = np.concatenate((cantons_iza[idxs_iza_test], cantons_deem[idxs_deem_test]))
n_classes = np.amax(cantons_train)

In [18]:
# Center and scale kernels
for cutoff in cutoffs:
    K_test[cutoff] = center_kernel_fast(K_test[cutoff], K_ref=K_train[cutoff])
    K_train[cutoff] = center_kernel_fast(K_train[cutoff])

    K_scale = np.trace(K_train[cutoff]) / K_train[cutoff].shape[0]
    K_test[cutoff] /= K_scale
    K_train[cutoff] /= K_scale

# SVM on full test and train sets

In [19]:
# Use a different SVM regularization than the optimal
C_override = None

In [22]:
for cutoff in cutoffs:
    
    model_dir = f'../Processed_Data/Models/{cutoff}'
    with open(f'{model_dir}/ksvc_parameters.json', 'r') as f:
        model_dict = json.load(f)
        
    if C_override is not None:
        C = C_override
    else:
        C = model_dict['C']
            
    # Assemble kernels
    k_train = K_train[cutoff]
    k_test = K_test[cutoff]

    # SVC
#     svc = SVC(kernel='precomputed', decision_function_shape=model_dict['df_type'], 
#              class_weight=model_dict['class_weight'], C=C)
#     svc.fit(k_train, cantons_train)
    
#     df_train = svc.decision_function(k_train)
#     df_test = svc.decision_function(k_test)
    
#     predicted_cantons_train = svc.predict(k_train)
#     predicted_cantons_test = svc.predict(k_test)
    
#     print(svc.score(k_train, cantons_train))
#     print(svc.score(k_test, cantons_test))
    
    # TODO: if using LinearSVC, need to optimize with LinearSVC, but re-use C for now
    # NOTE: no ovo option for LinearSVC
    # NOTE: already centered data
    # NOTE: l1 penalty doesn't seem to work so well here, but with logistic regression it is a little better
    model_dict['df_type'] = 'ovr'
    svc = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, C=1.0, 
                    multi_class=model_dict['df_type'], fit_intercept=True, 
                    tol=1.0E-3, max_iter=1000)
    
    svc.fit(soaps_train[cutoff], cantons_train)
    
    df_train = svc.decision_function(soaps_train[cutoff])
    df_test = svc.decision_function(soaps_test[cutoff])
    
    predicted_cantons_train = svc.predict(soaps_train[cutoff])
    predicted_cantons_test = svc.predict(soaps_test[cutoff])

    print(svc.score(soaps_train[cutoff], cantons_train))
    print(svc.score(soaps_test[cutoff], cantons_test))
    print(svc.coef_)
    print(svc.intercept_)
    print(classification_report(cantons_test, predicted_cantons_test))
    print(confusion_matrix(cantons_test, predicted_cantons_test))

#     w = reshape_soaps(svc.coef_, 3, 12, 9)
#     density = compute_soap_density(12, 9, cutoff, w,
#                                    np.linspace(0, cutoff, 50),
#                                    np.linspace(-1, 1, 50),
#                                    chunk_size_r=10, chunk_size_p=10)

    # Save decision functions
    if n_classes == 2:
        df_deem = np.zeros(n_deem)
        df_iza = np.zeros(n_iza)
    else:
        if model_dict['df_type'] == 'ovo':
            n_df = n_classes * (n_classes - 1) // 2
        else:
            n_df = n_classes
            
        df_deem = np.zeros((n_deem, n_df))
        df_iza = np.zeros((n_iza, n_df))
    
    df_deem[idxs_deem_train] = df_train[n_iza_train:]
    df_deem[idxs_deem_test] = df_test[n_iza_test:]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat', df_deem)
    
    df_iza[idxs_iza_train] = df_train[0:n_iza_train]
    df_iza[idxs_iza_test] = df_test[0:n_iza_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat', df_iza)
    
    # Save SVC class predictions
    predicted_cantons_deem = np.zeros(n_deem)
    predicted_cantons_deem[idxs_deem_train] = predicted_cantons_train[n_iza_train:]
    predicted_cantons_deem[idxs_deem_test] = predicted_cantons_test[n_iza_test:]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_cantons.dat',
               predicted_cantons_deem, fmt='%d')
    
    predicted_cantons_iza = np.zeros(n_iza)
    predicted_cantons_iza[idxs_iza_train] = predicted_cantons_train[0:n_iza_train]
    predicted_cantons_iza[idxs_iza_test] = predicted_cantons_test[0:n_iza_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_cantons.dat', 
               predicted_cantons_iza, fmt='%d')

0.9857542610022895
0.9521794329242488
[[-0.03557808  0.04348423 -0.04869769  0.81227903  1.60380599 -0.08174649
  -1.09349027  0.86009147  0.01907264 -0.59745718  0.92694671  1.18224905
  -0.19608589 -0.13700014 -0.0433833  -0.01767375  0.00207597  0.06435977
  -0.2565276   0.31742197  0.76654069  0.08545561 -0.05903185 -0.04736582]]
[-0.05248742]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       113
           2       0.95      1.00      0.98      2250

    accuracy                           0.95      2363
   macro avg       0.48      0.50      0.49      2363
weighted avg       0.91      0.95      0.93      2363

[[   0  113]
 [   0 2250]]
0.9894428898499109
0.9610664409648751
[[ 1.10374281e-01  9.56425961e-02  1.19155951e+00 -1.01636374e+00
  -5.54478461e-01  2.88024395e+00  8.00689031e-01  3.97893145e-01
  -5.00260772e-01  2.59057339e-02  1.50644444e+00  1.19099079e+00
  -5.47239597e-01 -1.45174214e-03 -8.20166276e-03  1.0044762


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
print(classification_report(cantons_test, predicted_cantons_test))
print(confusion_matrix(cantons_test, predicted_cantons_test))

In [None]:
np.nonzero(predicted_cantons_test == 1)

In [None]:
np.mean(soaps_train[6.0], axis=0)

In [None]:
np.std(soaps_train[6.0])

# Regress full (averaged) SOAP on Linear SVM

In [25]:
# TODO: should probably just do this "legitimately" with a linear SVM on the full average SOAPs
# instead of regressing on the linear kernel SVM
# Could also convert from the dual to the primal weights, 
# but based on the sk-learn implementation this seems a bit messy,
# and it is still probably best just to do the linear SVM anyway.
# For a quick first pass though, we'll regress on the decision function of a linear kernel SVM
# built on the FPS'ed SOAPs

# Logistic Regression

In [26]:
cutoff = 6.0
coef = svc.coef_

In [27]:
df_max = np.argmax(np.matmul(soaps_test[cutoff], coef.T))
df_min = np.argmin(np.matmul(soaps_test[cutoff], coef.T))
print(df_max, df_min)

1419 111


In [28]:
np.dot(coef, soaps_test[cutoff][df_min])

array([-4.40345322])

In [29]:
np.dot(coef, soaps_test[cutoff][df_max])

array([16.25753674])

In [30]:
unscaled_soap = soaps_test[cutoff][df_max]*soaps_scale[cutoff]+soaps_center[cutoff]
w = coef * (1.0 - soaps_center[cutoff] / unscaled_soap) / soaps_scale[cutoff]
np.dot(w, unscaled_soap)

array([16.25753674])

In [31]:
soaps_scale[cutoff] = 1.0

In [32]:
# SOAP density
soap_density = compute_soap_density(12, 9, cutoff,
                                    reshape_soaps(soaps_test[cutoff][[df_max, df_min]]*soaps_scale[cutoff]+soaps_center[cutoff],
                                                  3, 12, 9),
                                    np.linspace(0, cutoff, 50), 
                                    np.linspace(-1, 1, 50),
                                    chunk_size_r=10, chunk_size_p=10)

mean_soap_density = compute_soap_density(12, 9, cutoff,
                                         reshape_soaps(np.mean(soaps_train[cutoff]*soaps_scale[cutoff]+soaps_center[cutoff], axis=0),
                                         3, 12, 9),
                                         np.linspace(0.0, cutoff, 50),
                                         np.linspace(-1, 1, 50),
                                         chunk_size_r=10, chunk_size_p=10)

density_overlap = density*soap_density

In [33]:
#real_space_w = coef * (1.0 - soaps_center[cutoff] / soaps_test[cutoff][[df_max, df_min]]) / soaps_scale[cutoff]
real_space_w = coef
real_space_density = compute_soap_density(12, 9, cutoff,
                                    reshape_soaps(real_space_w, 3, 12, 9),
                                    np.linspace(0, cutoff, 50), 
                                    np.linspace(-1, 1, 50),
                                    chunk_size_r=10, chunk_size_p=10)

density_overlap = real_space_density*soap_density

In [34]:
density_overlap.shape

(2, 3, 50, 50, 50)

In [35]:
dr = np.diff(np.linspace(0, cutoff, 50))[0]
dp = np.diff(np.linspace(-1, 1, 50))[0]
np.sum(density_overlap, axis=(1, 2, 3, 4))*dr*dr*dp

array([35.94566244, -4.53828658])

In [36]:
np.amin(soap_density), np.amax(soap_density)

(-4.039897703813097, 48.6212760239801)

In [37]:
np.amin(real_space_density), np.amax(real_space_density)

(-2.774295566411343, 2.930572774027066)

In [38]:
np.save('./real_space_soap_mean.npy', mean_soap_density)
np.save('./real_space_soap.npy', soap_density)
np.save('./real_space_weights.npy', density)

In [39]:
np.save('./nnl_soap_scaling.npy', soaps_scale)

In [40]:
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')
iza = read('../Raw_Data/GULP/IZA_226/IZA_OPT.xyz', index=':')
deem_10k_test = [deem_10k[i] for i in idxs_deem_test]
iza_test = [iza[i] for i in idxs_iza_test]
frames_test = iza_test + deem_10k_test
soap_neighbors, idxs_neighbors = zip(*[rrw_neighbors(frames_test[i], [14], [8, 14], cutoff, 
                                                     self_interaction=True) for i in [df_max, df_min]])

In [41]:
# 0: O-O
# 1: Si-O
# 2: Si-Si
species_pair_idx = 0
soap_idx = 0

def convert_species_idx(species_pair_idx):
    if species_pair_idx == 0:
        species_pair_label = 'OO'
    elif species_pair_idx == 1:
        species_pair_label = 'OSi'
    elif species_pair_idx == 2:
        species_pair_label = 'SiSi'
    else:
        species_pair_label = 'XX'
    return species_pair_label
 
def convert_soap_idx(soap_idx):
    if soap_idx == 0:
        soap_label = 'DEEM'
    elif soap_idx == 1:
        soap_label = 'IZA'
    else:
        soap_label = 'XXX'
    return soap_label

In [42]:
# Plot
for species_pair_idx in range(0, 3):
    species_pair_label = convert_species_idx(species_pair_idx)
    
    rx_grid, ry_grid, tz_grid = np.meshgrid(np.linspace(0, cutoff, 50), 
                                            np.linspace(0, cutoff, 50), 
                                            np.linspace(-1, 1, 50), indexing='ij')

    fig = go.Figure(data=go.Volume(x=rx_grid.flatten(),
                                   y=ry_grid.flatten(),
                                   z=tz_grid.flatten(),
                                   value=real_space_density[0][species_pair_idx].flatten(),
                                   #value=real_space_density[soap_idx][species_pair_idx].flatten(),
                                   coloraxis='coloraxis',
                                   isomin=-1.75E+0,
                                   isomax=1.75E+0,
                                   opacity=0.6,
                                   surface_count=4,
                                   caps=dict(x_show=False, y_show=False, z_show=False)))

    atom_stencil_x_deem = np.concatenate([soap_neighbors[0][center][species_pair_idx][0].flatten() 
                                     for center in range(0, len(soap_neighbors[0]))])
    atom_stencil_y_deem = np.concatenate([soap_neighbors[0][center][species_pair_idx][1].flatten()
                                     for center in range(0, len(soap_neighbors[0]))])
    atom_stencil_z_deem = np.concatenate([soap_neighbors[0][center][species_pair_idx][2].flatten()
                                     for center in range(0, len(soap_neighbors[0]))])

    atom_stencil_x_iza = np.concatenate([soap_neighbors[1][center][species_pair_idx][0].flatten() 
                                     for center in range(0, len(soap_neighbors[1]))])
    atom_stencil_y_iza = np.concatenate([soap_neighbors[1][center][species_pair_idx][1].flatten()
                                     for center in range(0, len(soap_neighbors[1]))])
    atom_stencil_z_iza = np.concatenate([soap_neighbors[1][center][species_pair_idx][2].flatten()
                                     for center in range(0, len(soap_neighbors[1]))])

    idx_x_deem = np.concatenate([idxs_neighbors[0][center][species_pair_idx][0].flatten() 
                            for center in range(0, len(soap_neighbors[0]))])
    idx_y_deem = np.concatenate([idxs_neighbors[0][center][species_pair_idx][1].flatten() 
                            for center in range(0, len(soap_neighbors[0]))])
    idx_z_deem = np.concatenate([idxs_neighbors[0][center][species_pair_idx][2].flatten() 
                            for center in range(0, len(soap_neighbors[0]))])

    idx_x_iza = np.concatenate([idxs_neighbors[1][center][species_pair_idx][0].flatten() 
                            for center in range(0, len(soap_neighbors[1]))])
    idx_y_iza = np.concatenate([idxs_neighbors[1][center][species_pair_idx][1].flatten() 
                            for center in range(0, len(soap_neighbors[1]))])
    idx_z_iza = np.concatenate([idxs_neighbors[1][center][species_pair_idx][2].flatten() 
                            for center in range(0, len(soap_neighbors[1]))])

    fig.add_trace(go.Scatter3d(x=atom_stencil_x_deem,
                               y=atom_stencil_y_deem,
                               z=atom_stencil_z_deem,
                               name=f'DEEM, {species_pair_label}',
                               mode='markers',
                               marker=dict(size=1,
                                           color='green'),
                               hovertemplate='x: %{x}<br>y: %{y}<br>z: %{z}<br>(i, j): %{text}',
                               text=['{}'.format(i) for i in zip(idx_x_deem, idx_y_deem, idx_z_deem)],
                               showlegend=True))

    fig.add_trace(go.Scatter3d(x=atom_stencil_x_iza,
                               y=atom_stencil_y_iza,
                               z=atom_stencil_z_iza,
                               name=f'IZA, {species_pair_label}',
                               mode='markers',
                               marker=dict(size=2,
                                           color='purple'),
                               hovertemplate='x: %{x}<br>y: %{y}<br>z: %{z}<br>(i, j): %{text}',
                               text=['{}'.format(i) for i in zip(idx_x_iza, idx_y_iza, idx_z_iza)],
                               showlegend=True))

    fig.update_layout(template='plotly_white',
                      scene=dict(xaxis_title='r',
                                 yaxis_title='r\'',
                                 zaxis_title='w'),
                      legend=dict(x=0.0, y=1.0,
                                  xanchor='left', yanchor='top',
                                  itemsizing='constant'),
                      coloraxis=dict(colorscale='RdBu',
                                     colorbar=dict(title='Weights')),
                      autosize=True)

    #fig.show()
    #fig.write_html(f'../Results/{cutoff}/real_space_weights-{soap_label}-{species_pair_label}.html')
    fig.write_html(f'../Results/{cutoff}/real_space_weights-IZA-DEEM-{species_pair_label}.html')
    fig.write_image(f'../Results/{cutoff}/real_space_weights-IZA-DEEM-{species_pair_label}.png')

In [43]:
# Plot
for species_pair_idx in range(0, 3):
    species_pair_label = convert_species_idx(species_pair_idx)
    for soap_idx in range(0, 2):
        soap_label = convert_soap_idx(soap_idx)
        
        rx_grid, ry_grid, tz_grid = np.meshgrid(np.linspace(0, cutoff, 50), 
                                                np.linspace(0, cutoff, 50), 
                                                np.linspace(-1, 1, 50), indexing='ij')

        fig = go.Figure(data=go.Volume(x=rx_grid.flatten(),
                                       y=ry_grid.flatten(),
                                       z=tz_grid.flatten(),
                                       value=soap_density[soap_idx][species_pair_idx].flatten(),
                                       coloraxis='coloraxis',
                                       isomin=1.0E+0,
                                       isomax=None,
                                       opacity=0.2,
                                       surface_count=20))

        atom_stencil_x = np.concatenate([soap_neighbors[soap_idx][center][species_pair_idx][0].flatten() 
                                         for center in range(0, len(soap_neighbors[soap_idx]))])
        atom_stencil_y = np.concatenate([soap_neighbors[soap_idx][center][species_pair_idx][1].flatten()
                                         for center in range(0, len(soap_neighbors[soap_idx]))])
        atom_stencil_z = np.concatenate([soap_neighbors[soap_idx][center][species_pair_idx][2].flatten()
                                         for center in range(0, len(soap_neighbors[soap_idx]))])

        idx_x = np.concatenate([idxs_neighbors[soap_idx][center][species_pair_idx][0].flatten() 
                                for center in range(0, len(soap_neighbors[soap_idx]))])
        idx_y = np.concatenate([idxs_neighbors[soap_idx][center][species_pair_idx][1].flatten() 
                                for center in range(0, len(soap_neighbors[soap_idx]))])
        idx_z = np.concatenate([idxs_neighbors[soap_idx][center][species_pair_idx][2].flatten() 
                                for center in range(0, len(soap_neighbors[soap_idx]))])


        fig.add_trace(go.Scatter3d(x=atom_stencil_x,
                                   y=atom_stencil_y,
                                   z=atom_stencil_z,
                                   name=f'{soap_label}, {species_pair_label}',
                                   mode='markers',
                                   marker=dict(size=1,
                                               color='cyan'),
                                   hovertemplate='x: %{x}<br>y: %{y}<br>z: %{z}<br>(i, j): %{text}',
                                   text=['{}'.format(i) for i in zip(idx_x, idx_y, idx_z)],
                                   showlegend=True))

        fig.update_layout(template='plotly_white',
                          scene=dict(xaxis_title='r',
                                     yaxis_title='r\'',
                                     zaxis_title='w'),
                          legend=dict(x=0.0, y=1.0,
                                      xanchor='left', yanchor='top',
                                      itemsizing='constant'),
                          coloraxis=dict(colorscale='Plasma',
                                         colorbar=dict(title='Density')),
                          autosize=True)

        #fig.show()
        fig.write_html(f'../Results/{cutoff}/real_space_soap-{soap_label}-{species_pair_label}.html')
        fig.write_image(f'../Results/{cutoff}/real_space_soap-{soap_label}-{species_pair_label}.png')

In [44]:
# Plot
for species_pair_idx in range(0, 3):
    species_pair_label = convert_species_idx(species_pair_idx)
    for soap_idx in range(0, 2):
        soap_label = convert_soap_idx(soap_idx)
        rx_grid, ry_grid, tz_grid = np.meshgrid(np.linspace(0, cutoff, 50), 
                                                np.linspace(0, cutoff, 50), 
                                                np.linspace(-1, 1, 50), indexing='ij')

        fig = go.Figure(data=go.Volume(x=rx_grid.flatten(),
                                       y=ry_grid.flatten(),
                                       z=tz_grid.flatten(),
                                       value=density_overlap[soap_idx][species_pair_idx].flatten()/10,
                                       coloraxis='coloraxis',
                                       isomin=-1.75E+0,
                                       isomax=1.75E+0,
                                       opacity=0.6,
                                       surface_count=4,
                                       caps=dict(x_show=False, y_show=False, z_show=False)))

        atom_stencil_x = np.concatenate([soap_neighbors[soap_idx][center][species_pair_idx][0].flatten() 
                                         for center in range(0, len(soap_neighbors[soap_idx]))])
        atom_stencil_y = np.concatenate([soap_neighbors[soap_idx][center][species_pair_idx][1].flatten()
                                         for center in range(0, len(soap_neighbors[soap_idx]))])
        atom_stencil_z = np.concatenate([soap_neighbors[soap_idx][center][species_pair_idx][2].flatten()
                                         for center in range(0, len(soap_neighbors[soap_idx]))])

        idx_x = np.concatenate([idxs_neighbors[soap_idx][center][species_pair_idx][0].flatten() 
                                for center in range(0, len(soap_neighbors[soap_idx]))])
        idx_y = np.concatenate([idxs_neighbors[soap_idx][center][species_pair_idx][1].flatten() 
                                for center in range(0, len(soap_neighbors[soap_idx]))])
        idx_z = np.concatenate([idxs_neighbors[soap_idx][center][species_pair_idx][2].flatten() 
                                for center in range(0, len(soap_neighbors[soap_idx]))])


        fig.add_trace(go.Scatter3d(x=atom_stencil_x,
                                   y=atom_stencil_y,
                                   z=atom_stencil_z,
                                   name=f'{soap_label}, {species_pair_label}',
                                   mode='markers',
                                   marker=dict(size=1,
                                               color='green'),
                                   hovertemplate='x: %{x}<br>y: %{y}<br>z: %{z}<br>(i, j): %{text}',
                                   text=['{}'.format(i) for i in zip(idx_x, idx_y, idx_z)],
                                   showlegend=True))

        fig.update_layout(template='plotly_white',
                          scene=dict(xaxis_title='r',
                                     yaxis_title='r\'',
                                     zaxis_title='w'),
                          legend=dict(x=0.0, y=1.0,
                                      xanchor='left', yanchor='top',
                                      itemsizing='constant'),
                          coloraxis=dict(colorscale='RdBu',
                                         colorbar=dict(title='Density*Weights')),
                          autosize=True)

        #fig.show()
        fig.write_html(f'../Results/{cutoff}/real_space_overlap-{soap_label}-{species_pair_label}.html')
        fig.write_image(f'../Results/{cutoff}/real_space_overlap-{soap_label}-{species_pair_label}.png')

# Check that decision functions can be predicted with KRR

In [None]:
for cutoff in cutoffs:
    
    # Assemble kernels
    k_train = K_train[cutoff]
    k_test = K_test[cutoff]
    
    # Load decision functions
    df_deem = np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')
    df_iza = np.loadtxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')
    
    df_deem_train = df_deem[idxs_deem_train]
    df_deem_test = df_deem[idxs_deem_test]
    
    df_iza_train = df_iza[idxs_iza_train]
    df_iza_test = df_iza[idxs_iza_test]
    
    df_train = np.concatenate((df_iza_train, df_deem_train))
    df_test = np.concatenate((df_iza_test, df_deem_test))
    
    # Center and scale decision functions
    df_center = np.mean(df_train, axis=0)
    df_train -= df_center
    df_test -= df_center

    if df_train.ndim == 1:
        df_scale = np.linalg.norm(df_train) / np.sqrt(df_train.size)
    else:
        df_scale = np.linalg.norm(df_train, axis=0) / np.sqrt(df_train.shape[0] / df_train.shape[1])
    
    df_train /= df_scale
    df_test /= df_scale

    # Test KRR on decision functions
    # NOTE: KRR can't predict the test set
    # decision function very well -- why? <-- TODO: is this only for LinearSVC or also SVC?
    
#     krr = KernelRidge(alpha=1.0E-12, kernel='precomputed')
#     krr.fit(k_train, df_krr_train)
#     dfp_krr_train = krr.predict(k_train)
#     dfp_krr_test = krr.predict(k_test)

    krr = KRR(regularization=1.0E-12)
    krr.fit(k_train, df_train)
    dfp_train = krr.transform(k_train)
    dfp_test = krr.transform(k_test)
    
    print(np.mean(np.abs(dfp_train - df_train), axis=0))
    print(np.mean(np.abs(dfp_test - df_test), axis=0))

# KPCovR on full test and train sets

In [None]:
# Use a different number of components than that used for the optimization
n_components_override = 6

# Use an alpha other than the optimal
alpha_override = 0.0

# Use a regularization other than the optimal
regularization_override = 1.0E-12

In [None]:
for cutoff in cutoffs:
    
    model_dir = f'../Processed_Data/Models/{cutoff}'
    with open(f'{model_dir}/kpcovr_parameters.json', 'r') as f:
        model_dict = json.load(f)
        
    with open(f'{model_dir}/ksvc_parameters.json', 'r') as f:
        df_type = json.load(f)['df_type']
        
    # Assemble kernels
    k_train = K_train[cutoff]
    k_test = K_test[cutoff]
        
    # Load decision functions
    df_deem = np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')
    df_iza = np.loadtxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')
    
    df_deem_train = df_deem[idxs_deem_train]
    df_deem_test = df_deem[idxs_deem_test]
    
    df_iza_train = df_iza[idxs_iza_train]
    df_iza_test = df_iza[idxs_iza_test]
    
    df_train = np.concatenate((df_iza_train, df_deem_train))
    df_test = np.concatenate((df_iza_test, df_deem_test))
    
    # Center and scale decision functions
    df_center = np.mean(df_train, axis=0)
    df_train -= df_center
    df_test -= df_center

    if df_train.ndim == 1:
        df_scale = np.linalg.norm(df_train) / np.sqrt(df_train.size)
    else:
        df_scale = np.linalg.norm(df_train, axis=0) / np.sqrt(df_train.shape[0] / df_train.shape[1])
    df_train /= df_scale
    df_test /= df_scale
    
    # Set KPCovR parameters
    if n_components_override is not None:
        n_components = n_components_override
    else:
        n_components = model_dict['n_components']
        
    if alpha_override is not None:
        alpha = alpha_override
    else:
        alpha = model_dict['alpha']
        
    if regularization_override is not None:
        regularization = regularization_override
    else:
        regularization = model_dict['regularization']

#     kpcovr = KPCovR2(n_components=n_components, kernel='precomputed',
#                      mixing=alpha,
#                      krr_params=dict(alpha=regularization))
#     kpcovr.fit(k_train, y_train)

#     T_train[cutoff] = kpcovr.transform(k_train)
#     yp_train[cutoff] = kpcovr.predict(k_train) 
#     T_test[cutoff] = kpcovr.transform(k_test) 
#     yp_test[cutoff] = kpcovr.predict(k_test)

    kpcovr = KPCovR(n_components=n_components, 
                    alpha=alpha, 
                    regularization=regularization)
    kpcovr.fit(k_train, df_train)
    
    T_train = kpcovr.transform_K(k_train)
    dfp_train = kpcovr.transform_Y(k_train)
    T_test = kpcovr.transform_K(k_test)
    dfp_test = kpcovr.transform_Y(k_test)
    
    dfp_train = np.squeeze(dfp_train) # TODO: move the squeezing to the KPCovR function
    dfp_test = np.squeeze(dfp_test)
        
    # Save KPCovR projections
    n_digits_deem = len(str(n_deem - 1))
    T_deem = np.zeros((n_deem, n_components)) # TODO: change this so just 1 df for 2-class
    T_deem[idxs_deem_train] = T_train[n_iza_train:]
    T_deem[idxs_deem_test] = T_test[n_iza_test:]
    
    g = h5py.File(f'../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structures.hdf5', 'w')
    for tdx, t in enumerate(T_deem):
        g.create_dataset(str(tdx).zfill(n_digits_deem), data=t)
        
    g.attrs['n_components'] = n_components
    g.attrs['alpha'] = alpha
    g.attrs['regularization'] = regularization
    
    g.close()
    
    n_digits_iza = len(str(n_iza - 1))
    T_iza = np.zeros((n_iza, n_components))
    T_iza[idxs_iza_train] = T_train[0:n_iza_train]
    T_iza[idxs_iza_test] = T_test[0:n_iza_test]
    
    g = h5py.File(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structures.hdf5', 'w')
    for tdx, t in enumerate(T_iza):
        g.create_dataset(str(tdx).zfill(n_digits_iza), data=t)
        
    g.attrs['n_components'] = n_components
    g.attrs['alpha'] = alpha
    g.attrs['regularization'] = regularization
        
    g.close()
                    
    # Pickle the models
    # Copy the dict so we can make the numpy arrays lists
    kpcovr_dict = kpcovr.__dict__.copy()

    # Convert arrays to lists
    for k, v in kpcovr_dict.items():
        if isinstance(v, np.ndarray):
            kpcovr_dict[k] = v.tolist()

    # Save
    with open(f'{model_dir}/kpcovr.json', 'w') as f:
        json.dump(kpcovr_dict, f)
    
    # Rescale to raw decision function
    dfp_train = dfp_train * df_scale + df_center
    dfp_test = dfp_test * df_scale + df_center

    # Predict classes based on KPCovRized decision functions
    predicted_cantons_train = df_to_class(dfp_train, df_type, n_classes, use_df_sums=True)
    predicted_cantons_test = df_to_class(dfp_test, df_type, n_classes, use_df_sums=True)
    
    # Save KPCovR decision function predictions
    if n_classes == 2:
        df_deem = np.zeros(n_deem)
        df_iza = np.zeros(n_iza)
    else:
        if df_type == 'ovo':
            n_df = n_classes * (n_classes - 1) // 2
        else:
            n_df = n_classes
            
        df_deem = np.zeros((n_deem, n_df))
        df_iza = np.zeros((n_iza, n_df))

    df_deem[idxs_deem_train] = dfp_train[n_iza_train:]
    df_deem[idxs_deem_test] = dfp_test[n_iza_test:]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structure_dfs.dat', df_deem)
    
    df_iza[idxs_iza_train] = dfp_train[0:n_iza_train]
    df_iza[idxs_iza_test] = dfp_test[0:n_iza_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structure_dfs.dat', df_iza)
    
    # Save KPCovR class predictions
    predicted_cantons_deem = np.zeros(n_deem)
    predicted_cantons_deem[idxs_deem_train] = predicted_cantons_train[n_iza_train:]
    predicted_cantons_deem[idxs_deem_test] = predicted_cantons_test[n_iza_test:]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structure_cantons.dat',
               predicted_cantons_deem, fmt='%d')
    
    predicted_cantons_iza = np.zeros(n_iza)
    predicted_cantons_iza[idxs_iza_train] = predicted_cantons_train[0:n_iza_train]
    predicted_cantons_iza[idxs_iza_test] = predicted_cantons_test[0:n_iza_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structure_cantons.dat', 
               predicted_cantons_iza, fmt='%d')