In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import plotly.graph_objects as go

# ML
from regression import PCovR, KPCovR, SparseKPCovR
from regression import LR, KRR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from kernels import center_kernel, center_kernel_fast
from kernels import center_kernel_oos, center_kernel_oos_fast
from soap import compute_soap_density, reshape_soaps
from soap import rrw_neighbors, make_tuples

from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression

# Atoms
from ase.io import read
from ase.neighborlist import neighbor_list

# Utilities
import h5py
import json
import itertools
from tqdm.notebook import tqdm
import project_utils as utils
from tools import load_json

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

cosmostyle.set_style('article')
colorList = cosmostyle.color_cycle

In /home/helfrech/.config/matplotlib/stylelib/cosmo.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/helfrech/.config/matplotlib/stylelib/cosmoLarge.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.



In [3]:
# sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/kernel-tutorials')
# sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/KernelPCovR/analysis/scripts')
# from utilities.sklearn_covr.kpcovr import KernelPCovR as KPCovR2
# from utilities.sklearn_covr.pcovr import PCovR as PCovR2
# from helpers import l_regr, l_kpcovr

# Functions

In [None]:
def load_soaps(deem_file, iza_file, 
               idxs_deem_train, idxs_deem_test, 
               idxs_iza_train, idxs_iza_test,
               idxs_deem_delete=(), idxs_iza_delete=()):
    """Load the DEEM and IZA SOAPs and assemble the train and test sets.

    Parameters
    ----------
    deem_file, iza_file
        HDF5 files passed to `load_structures_from_hdf5`.
    idxs_deem_train, idxs_deem_test, idxs_iza_train, idxs_iza_test
        Indices selecting the train/test structures of each dataset.
        They index the lists *after* the deletions below were applied.
    idxs_deem_delete, idxs_iza_delete
        Indices of structures to drop from the loaded lists before splitting.

    Returns
    -------
    soaps_train, soaps_test : list
        Per-structure SOAPs with the IZA entries first, followed by DEEM.
    """
    
    # NOTE(review): load_structures_from_hdf5 is not imported in the visible
    # import cell -- confirm where it is defined
    # Load SOAPs
    soaps_deem = load_structures_from_hdf5(deem_file, datasets=None, concatenate=False)

    # Delete from the back so that the remaining indices stay valid
    for i in sorted(idxs_deem_delete, reverse=True):
        soaps_deem.pop(i)
    
    soaps_iza = load_structures_from_hdf5(iza_file, datasets=None, concatenate=False)
    for i in sorted(idxs_iza_delete, reverse=True):
        soaps_iza.pop(i)
        
    # Build the train and test sets
    deem_train = [soaps_deem[i] for i in idxs_deem_train]
    deem_test = [soaps_deem[i] for i in idxs_deem_test]
    iza_train = [soaps_iza[i] for i in idxs_iza_train]
    iza_test = [soaps_iza[i] for i in idxs_iza_test]
    
    soaps_train = iza_train + deem_train
    soaps_test = iza_test + deem_test
    
    return soaps_train, soaps_test

def preprocess_soaps(soaps_train, soaps_test):
    """Scale the train and test SOAPs by the standard deviation of the train set.

    Parameters
    ----------
    soaps_train, soaps_test : array-like
        SOAP features; converted with `np.asarray`, so lists of equally
        shaped vectors are accepted as well.

    Returns
    -------
    soaps_train_scaled, soaps_test_scaled : numpy.ndarray
        The inputs divided by the (scalar) standard deviation taken over
        all entries of `soaps_train`.
    """
        
    soaps_train = np.asarray(soaps_train)
    soaps_test = np.asarray(soaps_test)

    # Can also do other scaling/centering here -- 
    # this is mostly just to get the SOAPs to a 'usable' magnitude
    soaps_scale = np.std(soaps_train)
    soaps_train_scaled = soaps_train / soaps_scale
    soaps_test_scaled = soaps_test / soaps_scale
    
    # Return the scaled arrays (the unscaled inputs were returned before)
    return soaps_train_scaled, soaps_test_scaled

def load_data(deem_file, iza_file,
              idxs_deem_train, idxs_deem_test,
              idxs_iza_train, idxs_iza_test):
    """Read DEEM and IZA text data and assemble combined train/test arrays.

    Both files are read with `np.loadtxt`, indexed with the given train/test
    indices, and concatenated with the IZA rows first, followed by DEEM.

    Returns
    -------
    train_data, test_data : numpy.ndarray
    """

    deem_values = np.loadtxt(deem_file)
    iza_values = np.loadtxt(iza_file)

    # IZA rows precede DEEM rows in both combined arrays
    train_parts = (iza_values[idxs_iza_train], deem_values[idxs_deem_train])
    test_parts = (iza_values[idxs_iza_test], deem_values[idxs_deem_test])

    return np.concatenate(train_parts), np.concatenate(test_parts)

def load_kernels(kernel_file):
    """Read the 'K_train', 'K_test' and 'K_test_test' datasets from an HDF5 file.

    Returns
    -------
    K_train, K_test, K_test_test
        Full copies (`[:]`) of the three datasets.
    """
    
    # The context manager closes the file even if one of the datasets is missing
    with h5py.File(kernel_file, 'r') as f:
        K_train = f['K_train'][:]
        K_test = f['K_test'][:]
        K_test_test = f['K_test_test'][:]
    
    return K_train, K_test, K_test_test
    
def compute_kernels(soaps_train, soaps_test, kernel_file=None, **kwargs):
    """Build the train, test-train and test-test kernels and optionally save them.

    Parameters
    ----------
    soaps_train, soaps_test
        Inputs handed to `build_kernel`.
    kernel_file : str, optional
        If given, the three kernels are written to this HDF5 file as the
        datasets 'K_train', 'K_test' and 'K_test_test', and `kwargs` are
        stored as file attributes.
    **kwargs
        Kernel parameters, forwarded unchanged to every `build_kernel` call.

    Returns
    -------
    K_train, K_test, K_test_test
    """

    K_train = build_kernel(soaps_train, soaps_train, **kwargs)
    K_test = build_kernel(soaps_test, soaps_train, **kwargs)
    K_test_test = build_kernel(soaps_test, soaps_test, **kwargs)
        
    if kernel_file is not None:
        
        # Save kernels for later
        with h5py.File(kernel_file, 'w') as g:
            g.create_dataset('K_train', data=K_train)
            g.create_dataset('K_test', data=K_test)
            g.create_dataset('K_test_test', data=K_test_test)

            # Record the parameters that were actually used to build the
            # kernels, rather than relying on a notebook-level global
            for k, v in kwargs.items():
                g.attrs[k] = v
    
    return K_train, K_test, K_test_test
    
def preprocess_kernels(K_train, K_test=[], K_test_test=[], unpack=True):
    """Center all kernels with `center_kernel` and scale them by a common factor.

    Every kernel in `K_test` and `K_test_test` is centered with the
    not-yet-centered `K_train` as reference; `K_train` is centered on its own.
    All kernels are then divided by trace(K_train) / n_rows of the centered
    training kernel.

    Returns
    -------
    With `unpack=True`, a flat tuple (K_train, *K_test, *K_test_test);
    otherwise (K_train, K_test, K_test_test) with the latter two as lists.
    """

    def _center_against(reference, kernels):
        return [center_kernel(kernel, K_ref=reference) for kernel in kernels]

    # The reference has to be the raw training kernel, so these two calls
    # must happen before K_train itself is centered
    test_test_centered = _center_against(K_train, K_test_test)
    test_centered = _center_against(K_train, K_test)
    K_train = center_kernel(K_train)

    K_scale = np.trace(K_train) / K_train.shape[0]

    test_test_scaled = [kernel / K_scale for kernel in test_test_centered]
    test_scaled = [kernel / K_scale for kernel in test_centered]
    K_train /= K_scale

    if not unpack:
        return K_train, test_scaled, test_test_scaled

    return (K_train, *test_scaled, *test_test_scaled)

def do_svc(train_data, test_data, train_classes, test_classes,
           svc_type='linear', outputs=['decision_functions', 'predictions', 'scores'], **kwargs):
    """Fit an SVC on the training data and collect the requested outputs.

    Parameters
    ----------
    train_data, test_data
        Inputs for `fit`, `decision_function`, `predict` and `score`.
    train_classes, test_classes
        Class labels of the train and test sets.
    svc_type : {'linear', 'kernel'}
        'linear' uses `LinearSVC`, 'kernel' uses `SVC`.
    outputs : list of str
        Any of 'decision_functions', 'predictions', 'scores'. Each entry
        contributes a (train, test) pair to the result, in the order given.
        Unknown entries are ignored.
    **kwargs
        Forwarded to the SVC constructor.

    Returns
    -------
    list or None
        Flat list of the requested outputs; None (after printing an error)
        if `svc_type` is invalid.
    """
    
    if svc_type == 'kernel':
        svc = SVC(**kwargs)
        
    elif svc_type == 'linear':
        svc = LinearSVC(**kwargs)
        
    else:
        print("Error: invalid svc_type; valid choices are 'kernel' and 'linear'")
        return

    svc.fit(train_data, train_classes)
    
    output_list = []
    
    # Structure in this way to return in the same order as given in the outputs list
    for out in outputs:
        if out == 'decision_functions':
            df_train = svc.decision_function(train_data)
            df_test = svc.decision_function(test_data)
            output_list.extend((df_train, df_test))
            
        elif out == 'predictions':
            predicted_train_classes = svc.predict(train_data)
            predicted_test_classes = svc.predict(test_data)
            output_list.extend((predicted_train_classes, predicted_test_classes))
        
            print(classification_report(test_classes, predicted_test_classes))
            print(confusion_matrix(test_classes, predicted_test_classes))
            
        elif out == 'scores':
            train_score = svc.score(train_data, train_classes)
            test_score = svc.score(test_data, test_classes)
            output_list.extend((train_score, test_score))

            print(train_score)
            print(test_score)
            
    return output_list
           
def regression_check(train_data, test_data,
                     train_target, test_target,
                     regression_type='linear'):
    """Fit a regression to the targets and print the train and test MAE.

    `regression_type` selects `LR` ('linear') or `KRR` ('kernel'); any other
    value prints an error message and returns. The model is built with
    regularization=1.0E-12, and the mean absolute error is taken along axis 0.
    Nothing is returned.
    """

    regression_func = (LR if regression_type == 'linear'
                       else KRR if regression_type == 'kernel'
                       else None)

    if regression_func is None:
        print("Error: invalid regression type; use 'linear' or 'kernel'")
        return

    # Test KRR on decision functions
    # NOTE: KRR can't predict the test set
    # decision function very well -- why? <-- TODO: is this only for LinearSVC or also SVC?
    model = regression_func(regularization=1.0E-12)
    model.fit(train_data, train_target)

    train_abs_error = np.abs(model.transform(train_data) - train_target)
    test_abs_error = np.abs(model.transform(test_data) - test_target)

    print(np.mean(train_abs_error, axis=0))
    print(np.mean(test_abs_error, axis=0))

def preprocess_decision_functions(df_train, df_test):
    """Center and scale the decision functions using training-set statistics.

    Both arrays are modified in place and also returned. The center is the
    mean of `df_train` along axis 0. For 1-D input the scale is
    norm(df_train) / sqrt(n_samples); otherwise it is the per-column norm
    divided by sqrt(n_samples / n_columns).

    Returns
    -------
    df_train, df_test, df_center, df_scale
    """
    center = np.mean(df_train, axis=0)

    # In-place updates: the caller's arrays are changed as well
    df_train -= center
    df_test -= center

    if df_train.ndim != 1:
        n_samples, n_columns = df_train.shape[0], df_train.shape[1]
        scale = np.linalg.norm(df_train, axis=0) / np.sqrt(n_samples / n_columns)
    else:
        scale = np.linalg.norm(df_train) / np.sqrt(df_train.size)

    df_train /= scale
    df_test /= scale

    return df_train, df_test, center, scale

def postprocess_decision_functions(df_train, df_test, df_center, df_scale):
    """Undo the centering/scaling of decision functions and convert them to classes.

    Parameters
    ----------
    df_train, df_test
        Centered and scaled decision functions.
    df_center, df_scale
        Center and scale that were removed from the raw decision functions.

    Returns
    -------
    predicted_cantons_train, predicted_cantons_test
        Output of `df_to_class` for the rescaled train and test decision functions.

    Notes
    -----
    `df_to_class`, `df_type` and `n_classes` are looked up in the notebook
    namespace. NOTE(review): `df_to_class` and `df_type` are not defined in
    the visible part of the notebook -- confirm they exist before running.
    """
    
    # Rescale to raw decision function
    dfp_train = df_train * df_scale + df_center
    dfp_test = df_test * df_scale + df_center

    # Predict classes based on KPCovRized decision functions
    predicted_cantons_train = df_to_class(dfp_train, df_type, n_classes)
    predicted_cantons_test = df_to_class(dfp_test, df_type, n_classes)

    return predicted_cantons_train, predicted_cantons_test
    
def split_and_save(train_data, test_data, 
                   train_idxs, test_idxs, 
                   train_slice, test_slice, 
                   hdf5_attrs=None, 
                   output=None, output_format='%f'):
    """Scatter one dataset's train/test rows back into dataset order and save them.

    Parameters
    ----------
    train_data, test_data : array-like
        Combined train and test arrays (1-D or N-D).
    train_idxs, test_idxs
        Positions in the saved array that receive the selected train/test rows.
    train_slice, test_slice
        Slices selecting the rows of one dataset from the combined arrays.
    hdf5_attrs : dict, optional
        If given, the data are written to an HDF5 file with one zero-padded,
        numbered dataset per row and these key/value pairs as file attributes.
        Otherwise the data are written as text with `np.savetxt`.
    output : str or file-like
        Destination file. Required; it has a default only so that it can
        follow `hdf5_attrs` in the signature.
    output_format : str
        Format passed to `np.savetxt` (text output only).

    Raises
    ------
    ValueError
        If `output` is not given.
    """

    if output is None:
        raise ValueError("split_and_save requires an 'output' file")

    train_part = np.asarray(train_data)[train_slice]
    test_part = np.asarray(test_data)[test_slice]

    # Size the output from the selected rows only, so that the rows of the
    # other dataset in the combined arrays do not add empty entries;
    # trailing dimensions are kept, so 1-D data work too
    n_samples = len(train_part) + len(test_part)
    data = np.zeros((n_samples,) + train_part.shape[1:])
    data[train_idxs] = train_part
    data[test_idxs] = test_part
    
    if hdf5_attrs is not None:
        n_digits = len(str(n_samples - 1))
        with h5py.File(output, 'w') as g:
            for ddx, d in enumerate(data):
                g.create_dataset(str(ddx).zfill(n_digits), data=d)

            for k, v in hdf5_attrs.items():
                g.attrs[k] = v
    else:
        np.savetxt(output, data, fmt=output_format)

def do_covr(train_data, test_data, 
            train_targets, test_targets,
            covr_type='linear', **covr_parameters):
    """Fit a (K)PCovR model and return projections and predicted targets.

    Parameters
    ----------
    train_data, test_data
        Inputs for the model.
    train_targets
        Targets used to fit the model.
    test_targets
        Not used by this function.
    covr_type : {'linear', 'kernel'}
        'linear' uses `PCovR`, 'kernel' uses `KPCovR`.
    **covr_parameters
        Forwarded to the model constructor.

    Returns
    -------
    T_train, T_test, predicted_train_target, predicted_test_target
        Projections from `transform_K` and squeezed predictions from `transform_Y`.

    Raises
    ------
    ValueError
        If `covr_type` is neither 'linear' nor 'kernel'.
    """
    
    if covr_type == 'linear':
        covr_func = PCovR
    elif covr_type == 'kernel':
        covr_func = KPCovR
    else:
        raise ValueError("invalid CovR type; use 'linear' or 'kernel'")
    
    covr = covr_func(**covr_parameters)
    
    covr.fit(train_data, train_targets)
    
    # NOTE(review): transform_K/transform_Y are called for both model types;
    # confirm that PCovR provides methods with these names
    T_train = covr.transform_K(train_data)
    predicted_train_target = covr.transform_Y(train_data)
    T_test = covr.transform_K(test_data)
    predicted_test_target = covr.transform_Y(test_data)
    
    # TODO: move the squeezing to the KPCovR function
    predicted_train_target = np.squeeze(predicted_train_target)
    predicted_test_target = np.squeeze(predicted_test_target)
    
    return T_train, T_test, predicted_train_target, predicted_test_target

# Load train and test splits

In [6]:
# Load SOAP cutoffs
# Reads the SOAP hyperparameters from JSON and keeps the 'interaction_cutoff'
# entry as `cutoffs`, which the model cells iterate over
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)
    
cutoffs = soap_hyperparameters['interaction_cutoff']

In [5]:
# Load train and test set indices for Deem
idxs_deem_train = np.loadtxt('../Processed_Data/DEEM_10k/train.idxs', dtype=int)
idxs_deem_test = np.loadtxt('../Processed_Data/DEEM_10k/test.idxs', dtype=int)

# Total number of structures, taken as the sum of the train and test set sizes
# (assumes the two index files together cover every DEEM structure once -- TODO confirm)
n_deem_train = idxs_deem_train.size
n_deem_test = idxs_deem_test.size
n_deem = n_deem_train + n_deem_test

In [10]:
# Make dummy DEEM cantons: every DEEM structure gets the label 4
cantons_deem = np.ones(n_deem, dtype=int) * 4

In [11]:
# Make dummy DEEM cantons: every DEEM structure gets the label 2
# NOTE: this reassigns `cantons_deem`; whichever of the dummy-canton cells
# was run last determines the labels used further down
cantons_deem = np.ones(n_deem, dtype=int) * 2

In [None]:
# Load IZA cantons (second column of the cantons file)
cantons_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)
# Index of the first IZA structure whose canton is 4
# (presumably the RWY framework, going by the variable name -- TODO confirm)
RWY = np.nonzero(cantons_iza == 4)[0][0]

In [7]:
# Remove the entry at index `RWY` from the IZA cantons and count what is left
# NOTE: not idempotent -- re-running this cell removes one more entry
cantons_iza = np.delete(cantons_iza, RWY)
n_iza = len(cantons_iza)

In [8]:
# Replace the loaded IZA cantons: every IZA structure gets the label 1
cantons_iza = np.ones(n_iza, dtype=int)

In [9]:
# Select IZA sample
# TODO: try loading existing indices first, otherwise generate and save
n_iza_train = n_iza // 2
n_iza_test = n_iza - n_iza_train

# Use a dedicated, seeded generator so that the split is the same on every run:
# the kernels cached on disk are only consistent with the canton labels if the
# IZA train/test assignment does not change between sessions
iza_split_rng = np.random.RandomState(0)
idxs_iza = iza_split_rng.permutation(n_iza)

# First half for training, the remainder for testing
idxs_iza_train = idxs_iza[:n_iza_train]
idxs_iza_test = idxs_iza[n_iza_train:]

In [17]:
# Build set of "master" canton labels
# IZA labels come first, followed by DEEM -- the same order in which
# load_soaps and load_data concatenate the two datasets
cantons_train = np.concatenate((cantons_iza[idxs_iza_train], cantons_deem[idxs_deem_train]))
cantons_test = np.concatenate((cantons_iza[idxs_iza_test], cantons_deem[idxs_deem_test]))
# The largest training label is taken as the number of classes
n_classes = np.amax(cantons_train)

In [None]:
# Slices that select the DEEM or IZA rows from the combined train/test arrays,
# in which the IZA entries come first and the DEEM entries follow
deem_train_slice = slice(n_iza_train, None)
deem_test_slice = slice(n_iza_test, None)
iza_train_slice = slice(0, n_iza_train)
iza_test_slice = slice(0, n_iza_test)

# Model setup

In [None]:
# Directories holding the kernel/linear model files and the DEEM/IZA data
# (paths are relative to the directory the notebook is run from)
kernel_model_dir = '../Processed_Data/Models/Kernel_Models'
linear_model_dir = '../Processed_Data/Models/Linear_Models'
deem_dir = '../Processed_Data/DEEM_10k/Data'
iza_dir = '../Processed_Data/IZA_226onDEEM_10k/Data'

In [None]:
# Global model parameters
# TODO: or use .get_params()?
# Keyword-argument dictionaries keyed by model flavour ('linear' or 'kernel')
# NOTE(review): neither dictionary is referenced by name in the model cells
# visible in this notebook -- confirm whether they are still needed
svc_kwargs = dict(linear=dict(penalty='l2',
                              loss='squared_hinge',
                              dual=False,
                              multi_class='ovr',
                              class_weight=None,
                              fit_intercept=True,
                              intercept_scaling=1.0,
                              tol=1.0E-3),
                  kernel=dict(kernel='precomputed',
                              decision_function_shape='ovr',
                              class_weight=None,
                              break_ties=False,
                              tol=1.0E-3))

covr_args = dict(linear=dict(n_components=None, alpha=0.0, regularization=1.0E-12),
                 kernel=dict(n_components=None, alpha=0.0, regularization=1.0E-12))

# Build kernels

In [None]:
# TODO: this can probably just stay in the optimization notebook once testing is done,
# it isn't really needed here if it is already in the optimization notebook
# Hmm...but the kernels will be different, right? maybe need to keep both

In [None]:
# Compute and save the kernels that do not exist on disk yet
for cutoff in cutoffs:
    for kernel_type in ('linear', 'gaussian'):
        kernel_name = kernel_type.capitalize()
        
        # File to store kernels for re-use
        kernel_file = f'{kernel_model_dir}/{kernel_name}/{cutoff}/structure_kernels.hdf5'
        kernel_parameter_file = f'{kernel_model_dir}/{kernel_name}/{cutoff}/volumes_mae_parameters.json'

        if not os.path.exists(kernel_file):
            
            # SOAP files (atomwise, FPS'ed features)
            deem_file = f'{deem_dir}/{cutoff}/soaps.hdf5'
            iza_file = f'{iza_dir}/{cutoff}/soaps.hdf5'

            # Assemble the train and test set SOAPs from IZA and DEEM
            soaps_train, soaps_test = load_soaps(deem_file, iza_file,
                                                 idxs_deem_train, idxs_deem_test,
                                                 idxs_iza_train, idxs_iza_test,
                                                 idxs_iza_delete=[RWY])

            # Compute kernels and write them to kernel_file, so that the model
            # cells can load them. compute_kernels only saves when it is given
            # kernel_file; every other keyword is forwarded to build_kernel.
            kernel_parameters = load_json(kernel_parameter_file)
            K_train, K_test, K_test_test = \
                compute_kernels(soaps_train, soaps_test,
                                kernel_file=kernel_file, **kernel_parameters)

# Kernel models

## KernelSVC

In [None]:
for cutoff in cutoffs:
    for kernel_type in ('linear', 'gaussian'):
        kernel_name = kernel_type.capitalize()
        
        # File to store kernels for re-use
        kernel_file = f'{kernel_model_dir}/{kernel_name}/{cutoff}/structure_kernels.hdf5'
        
        # Load kernels
        K_train, K_test, K_test_test = load_kernels(kernel_file)

        # Center and scale kernels
        K_train, K_test, K_test_test = \
            preprocess_kernels(K_train, K_test=[K_test], K_test_test=[K_test_test])
        
        for n_cantons in (2, 4):
            # TODO: MOVE KERNELS BEFORE RUNNING!
            # NOTE(review): cantons_train/cantons_test are the same for both
            # values of n_cantons -- confirm whether the labels should be
            # rebuilt for each n_cantons
            
            # Directory (below the per-cutoff data directories) to put all the output data in
            output_dir = f'KSVC-KPCovR/{n_cantons}-class/{kernel_name}'
            deem_output_dir = f'{deem_dir}/{cutoff}/{output_dir}'
            iza_output_dir = f'{iza_dir}/{cutoff}/{output_dir}'
            
            # Create the directories that the output files are written to
            for directory in (deem_output_dir, iza_output_dir):
                os.makedirs(directory, exist_ok=True)
            
            # Files to store IZA and DEEM KSVC decision functions
            ksvc_df_deem_file = f'{deem_output_dir}/ksvc_structure_dfs.dat'
            ksvc_df_iza_file = f'{iza_output_dir}/ksvc_structure_dfs.dat'
            
            # Files to store IZA and DEEM KSVC class predictions
            ksvc_canton_deem_file = f'{deem_output_dir}/ksvc_structure_cantons.dat'
            ksvc_canton_iza_file = f'{iza_output_dir}/ksvc_structure_cantons.dat'
            
            # Files to store IZA and DEEM KPCovR projections
            kpcovr_projection_deem_file = f'{deem_output_dir}/kpcovr_structures.hdf5' 
            kpcovr_projection_iza_file = f'{iza_output_dir}/kpcovr_structures.hdf5'
            
            # Files to store IZA and DEEM KPCovR decision functions
            kpcovr_df_deem_file = f'{deem_output_dir}/kpcovr_structure_dfs.dat'
            kpcovr_df_iza_file = f'{iza_output_dir}/kpcovr_structure_dfs.dat'
            
            # Files to store IZA and DEEM KPCovR class predictions
            kpcovr_canton_deem_file = f'{deem_output_dir}/kpcovr_structure_cantons.dat'
            kpcovr_canton_iza_file = f'{iza_output_dir}/kpcovr_structure_cantons.dat'
            
            # Files containing the hyperparameters for the kernel, KSVC, and KPCovR
            ksvc_parameter_file = f'{kernel_model_dir}/{cutoff}/{output_dir}/ksvc_parameters.json'
            kpcovr_parameter_file = f'{kernel_model_dir}/{cutoff}/{output_dir}/kpcovr_parameters.json'

            # Run KSVC
            ksvc_parameters = load_json(ksvc_parameter_file)
            df_train, df_test, predicted_cantons_train, predicted_cantons_test = \
                do_svc(K_train, K_test, cantons_train, cantons_test, 
                       svc_type='kernel', **ksvc_parameters,
                       outputs=['decision_functions', 'predictions'])
            
            # Save IZA and DEEM KSVC decision functions
            split_and_save(df_train, df_test,
                           idxs_deem_train, idxs_deem_test,
                           deem_train_slice, deem_test_slice,
                           hdf5_attrs=None, 
                           output=ksvc_df_deem_file, output_format='%f')
            
            split_and_save(df_train, df_test,
                           idxs_iza_train, idxs_iza_test,
                           iza_train_slice, iza_test_slice,
                           hdf5_attrs=None, 
                           output=ksvc_df_iza_file, output_format='%f')
            
            # Save IZA and DEEM KSVC canton predictions
            # (integer format, as for the KPCovR canton predictions)
            split_and_save(predicted_cantons_train, predicted_cantons_test,
                           idxs_deem_train, idxs_deem_test,
                           deem_train_slice, deem_test_slice,
                           hdf5_attrs=None, 
                           output=ksvc_canton_deem_file, output_format='%d')
            
            split_and_save(predicted_cantons_train, predicted_cantons_test,
                           idxs_iza_train, idxs_iza_test,
                           iza_train_slice, iza_test_slice,
                           hdf5_attrs=None, 
                           output=ksvc_canton_iza_file, output_format='%d')

## KRR check

In [None]:
# For every cutoff and kernel type, check that KRR on the preprocessed kernels
# can reproduce the saved KSVC decision functions
for cutoff, kernel_type in itertools.product(cutoffs, ('linear', 'gaussian')):
    kernel_name = kernel_type.capitalize()

    # Kernels stored on disk for this cutoff and kernel type
    kernel_file = f'{kernel_model_dir}/{kernel_name}/{cutoff}/structure_kernels.hdf5'
    K_train, K_test, K_test_test = load_kernels(kernel_file)

    # Center and scale kernels
    K_train, K_test, K_test_test = preprocess_kernels(
        K_train, K_test=[K_test], K_test_test=[K_test_test]
    )

    for n_cantons in (2, 4):

        # Directory that holds the KSVC output for this classification problem
        output_dir = f'KSVC-KPCovR/{n_cantons}-class/{kernel_name}'

        # Saved IZA and DEEM KSVC decision functions
        ksvc_df_deem_file = f'{deem_dir}/{cutoff}/{output_dir}/ksvc_structure_dfs.dat'
        ksvc_df_iza_file = f'{iza_dir}/{cutoff}/{output_dir}/ksvc_structure_dfs.dat'

        df_train, df_test = load_data(
            ksvc_df_deem_file, ksvc_df_iza_file,
            idxs_deem_train, idxs_deem_test,
            idxs_iza_train, idxs_iza_test,
        )

        # Center and scale the decision functions
        df_train, df_test, df_center, df_scale = preprocess_decision_functions(df_train, df_test)

        # Prints the train and test MAE of the KRR fit
        regression_check(K_train, K_test, df_train, df_test, regression_type='kernel')

## KPCovR

In [None]:
for cutoff in cutoffs:
    for kernel_type in ('linear', 'gaussian'):
        kernel_name = kernel_type.capitalize()
        
        # File to store kernels for re-use
        kernel_file = f'{kernel_model_dir}/{kernel_name}/{cutoff}/structure_kernels.hdf5'
        
        # Load kernels
        K_train, K_test, K_test_test = load_kernels(kernel_file)

        # Center and scale kernels
        K_train, K_test, K_test_test = \
            preprocess_kernels(K_train, K_test=[K_test], K_test_test=[K_test_test])
        
        for n_cantons in (2, 4):
            
            # Directory to put all the output data in
            output_dir = f'KSVC-KPCovR/{n_cantons}-class/{kernel_name}'
            deem_output_dir = f'{deem_dir}/{cutoff}/{output_dir}'
            iza_output_dir = f'{iza_dir}/{cutoff}/{output_dir}'
            
            # All file names are built here, for the current cutoff, kernel and
            # n_cantons, instead of re-using whatever the KernelSVC cell left
            # behind after its last iteration

            # Files to store IZA and DEEM KSVC decision functions
            ksvc_df_deem_file = f'{deem_output_dir}/ksvc_structure_dfs.dat'
            ksvc_df_iza_file = f'{iza_output_dir}/ksvc_structure_dfs.dat'
            
            # Files to store IZA and DEEM KPCovR projections
            kpcovr_projection_deem_file = f'{deem_output_dir}/kpcovr_structures.hdf5'
            kpcovr_projection_iza_file = f'{iza_output_dir}/kpcovr_structures.hdf5'
            
            # Files to store IZA and DEEM KPCovR decision functions
            kpcovr_df_deem_file = f'{deem_output_dir}/kpcovr_structure_dfs.dat'
            kpcovr_df_iza_file = f'{iza_output_dir}/kpcovr_structure_dfs.dat'
            
            # Files to store IZA and DEEM KPCovR class predictions
            kpcovr_canton_deem_file = f'{deem_output_dir}/kpcovr_structure_cantons.dat'
            kpcovr_canton_iza_file = f'{iza_output_dir}/kpcovr_structure_cantons.dat'
            
            # File containing the KPCovR hyperparameters
            kpcovr_parameter_file = f'{kernel_model_dir}/{cutoff}/{output_dir}/kpcovr_parameters.json'
            
            df_train, df_test = load_data(ksvc_df_deem_file, ksvc_df_iza_file,
                                          idxs_deem_train, idxs_deem_test,
                                          idxs_iza_train, idxs_iza_test)
            
            # Center and scale the decision functions
            df_train, df_test, df_center, df_scale = \
                preprocess_decision_functions(df_train, df_test)
            
            # Run KPCovR with the loaded hyperparameters
            # NOTE(review): assumes the JSON keys are valid KPCovR constructor
            # arguments -- confirm against the parameter files
            kpcovr_parameters = load_json(kpcovr_parameter_file)
            T_train, T_test, dfp_train, dfp_test = \
                do_covr(K_train, K_test, df_train, df_test, 
                        covr_type='kernel', **kpcovr_parameters)
            
            # Post process the KPCovR decision functions
            # (i.e., turn them back into canton predictions)
            predicted_cantons_train, predicted_cantons_test = \
                postprocess_decision_functions(dfp_train, dfp_test, df_center, df_scale)
            
            # Save IZA and DEEM KPCovR projections,
            # with the hyperparameters stored as HDF5 attributes
            split_and_save(T_train, T_test,
                           idxs_deem_train, idxs_deem_test,
                           deem_train_slice, deem_test_slice,
                           hdf5_attrs=kpcovr_parameters,
                           output=kpcovr_projection_deem_file, output_format='%f')
            
            split_and_save(T_train, T_test,
                           idxs_iza_train, idxs_iza_test,
                           iza_train_slice, iza_test_slice,
                           hdf5_attrs=kpcovr_parameters,
                           output=kpcovr_projection_iza_file, output_format='%f')
            
            # Save IZA and DEEM KPCovR decision functions
            split_and_save(dfp_train, dfp_test,
                           idxs_deem_train, idxs_deem_test,
                           deem_train_slice, deem_test_slice,
                           hdf5_attrs=None, 
                           output=kpcovr_df_deem_file, output_format='%f')
            
            split_and_save(dfp_train, dfp_test,
                           idxs_iza_train, idxs_iza_test,
                           iza_train_slice, iza_test_slice,
                           hdf5_attrs=None, 
                           output=kpcovr_df_iza_file, output_format='%f')
            
            # Save IZA and DEEM KPCovR canton predictions
            split_and_save(predicted_cantons_train, predicted_cantons_test,
                           idxs_deem_train, idxs_deem_test,
                           deem_train_slice, deem_test_slice,
                           hdf5_attrs=None, 
                           output=kpcovr_canton_deem_file, output_format='%d')
            
            split_and_save(predicted_cantons_train, predicted_cantons_test,
                           idxs_iza_train, idxs_iza_test,
                           iza_train_slice, iza_test_slice,
                           hdf5_attrs=None, 
                           output=kpcovr_canton_iza_file, output_format='%d')

# Linear Models

In [None]:
# Feature count: one block per unordered species pair (same-species pairs
# included), each with max_radial**2 * (max_angular + 1) entries
n_species = 2
n_species_pairs = n_species * (n_species + 1) // 2
n_features = n_species_pairs * soap_hyperparameters['max_radial']**2 \
    * (soap_hyperparameters['max_angular'] + 1)
# Feature index groups; the model cells pair them with ('OO', 'OSi', 'SiSi')
# NOTE(review): extract_species_pair_groups is not imported in the visible
# import cell -- confirm where it is defined
feature_groups = extract_species_pair_groups(n_features, n_species)

## LinearSVC

In [None]:
for cutoff in cutoffs:
    for spectrum_type in ('power', 'radial'):
        
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        
        # Assemble the train and test set SOAPs from IZA and DEEM
        soaps_train, soaps_test = load_soaps(deem_file, iza_file,
                                             idxs_deem_train, idxs_deem_test,
                                             idxs_iza_train, idxs_iza_test,
                                             idxs_iza_delete=[RWY])

        # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
        soaps_train, soaps_test = preprocess_soaps(soaps_train, soaps_test)
        
        for species_pairing, feature_idxs in zip(('OO', 'OSi', 'SiSi'), feature_groups):
            
            for n_cantons in (2, 4):
                # NOTE(review): cantons_train/cantons_test are the same for both
                # values of n_cantons -- confirm whether the labels should be
                # rebuilt for each n_cantons
                
                # Directory (below the per-cutoff data directories) to put all the output data in
                output_dir = f'LSVC-PCovR/{n_cantons}-class/{spectrum_type}/{species_pairing}'
                deem_output_dir = f'{deem_dir}/{cutoff}/{output_dir}'
                iza_output_dir = f'{iza_dir}/{cutoff}/{output_dir}'
                
                # Create the directories that the output files are written to
                for directory in (deem_output_dir, iza_output_dir):
                    os.makedirs(directory, exist_ok=True)
                
                # Files for IZA and DEEM LSVC decision functions          
                lsvc_df_deem_file = f'{deem_output_dir}/lsvc_structure_dfs.dat'
                lsvc_df_iza_file = f'{iza_output_dir}/lsvc_structure_dfs.dat'

                # Files for IZA and DEEM LSVC canton predictions
                lsvc_canton_deem_file = f'{deem_output_dir}/lsvc_structure_cantons.dat'
                lsvc_canton_iza_file = f'{iza_output_dir}/lsvc_structure_cantons.dat'
                
                # Files for IZA and DEEM PCovR projections
                pcovr_projection_deem_file = f'{deem_output_dir}/pcovr_structures.hdf5'
                pcovr_projection_iza_file = f'{iza_output_dir}/pcovr_structures.hdf5'
                
                # Files for IZA and DEEM PCovR decision functions
                pcovr_df_deem_file = f'{deem_output_dir}/pcovr_structure_dfs.dat'
                pcovr_df_iza_file = f'{iza_output_dir}/pcovr_structure_dfs.dat'
                
                # Files for IZA and DEEM PCovR canton predictions
                pcovr_canton_deem_file = f'{deem_output_dir}/pcovr_structure_cantons.dat'
                pcovr_canton_iza_file = f'{iza_output_dir}/pcovr_structure_cantons.dat'
                
                # Files for LSVC and PCovR hyperparameters
                # (linear_model_dir is the directory defined for the linear models)
                lsvc_parameter_file = f'{linear_model_dir}/{cutoff}/lsvc_parameters.json'
                pcovr_parameter_file = f'{linear_model_dir}/{output_dir}/pcovr_parameters.json'

                # Run LSVC
                lsvc_parameters = load_json(lsvc_parameter_file)
                df_train, df_test, predicted_cantons_train, predicted_cantons_test = \
                    do_svc(soaps_train[:, feature_idxs], soaps_test[:, feature_idxs], 
                           cantons_train, cantons_test, svc_type='linear', **lsvc_parameters,
                           outputs=['decision_functions', 'predictions'])

                # Save IZA and DEEM LSVC decision functions
                split_and_save(df_train, df_test,
                               idxs_deem_train, idxs_deem_test,
                               deem_train_slice, deem_test_slice,
                               hdf5_attrs=None, 
                               output=lsvc_df_deem_file, output_format='%f')
                
                split_and_save(df_train, df_test,
                               idxs_iza_train, idxs_iza_test,
                               iza_train_slice, iza_test_slice,
                               hdf5_attrs=None, 
                               output=lsvc_df_iza_file, output_format='%f')

                # Save IZA and DEEM LSVC canton predictions
                # (integer format, as for the PCovR canton predictions)
                split_and_save(predicted_cantons_train, predicted_cantons_test,
                               idxs_deem_train, idxs_deem_test,
                               deem_train_slice, deem_test_slice,
                               hdf5_attrs=None, 
                               output=lsvc_canton_deem_file, output_format='%d')
                
                split_and_save(predicted_cantons_train, predicted_cantons_test,
                               idxs_iza_train, idxs_iza_test,
                               iza_train_slice, iza_test_slice,
                               hdf5_attrs=None, 
                               output=lsvc_canton_iza_file, output_format='%d')

## LR check

In [None]:
for cutoff in cutoffs:
    for spectrum_type in ('power', 'radial'):
        
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        # Assemble the train and test set SOAPs from IZA and DEEM
        soaps_train, soaps_test = load_soaps(deem_file, iza_file,
                                             idxs_deem_train, idxs_deem_test,
                                             idxs_iza_train, idxs_iza_test,
                                             idxs_iza_delete=[RWY])
        
        # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
        soaps_train, soaps_test = preprocess_soaps(soaps_train, soaps_test)
        
        for species_pairing, feature_idxs in zip(('OO', 'OSi', 'SiSi'), feature_groups):
            
            for n_cantons in (2, 4):
                
                # Directory holding the LSVC output for this combination
                output_dir = f'LSVC-PCovR/{n_cantons}-class/{spectrum_type}/{species_pairing}'
                
                # Files with the IZA and DEEM LSVC decision functions
                lsvc_df_deem_file = f'{deem_dir}/{cutoff}/{output_dir}/lsvc_structure_dfs.dat'
                lsvc_df_iza_file = f'{iza_dir}/{cutoff}/{output_dir}/lsvc_structure_dfs.dat'
                
                # Load the decision functions that belong to this combination,
                # so that every iteration starts from the saved values and not
                # from arrays a previous cell or iteration left behind
                df_train, df_test = load_data(lsvc_df_deem_file, lsvc_df_iza_file,
                                              idxs_deem_train, idxs_deem_test,
                                              idxs_iza_train, idxs_iza_test)
                
                # Center and scale the decision functions
                df_train, df_test, df_center, df_scale = \
                    preprocess_decision_functions(df_train, df_test)

                # Check that LR can reproduce the decision functions
                regression_check(soaps_train[:, feature_idxs], soaps_test[:, feature_idxs],
                                 df_train, df_test, regression_type='linear')

## PCovR

In [None]:
for cutoff in cutoffs:
    for spectrum_type in ('power', 'radial'):
        
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        # Assemble the train and test set SOAPs from IZA and DEEM
        soaps_train, soaps_test = load_soaps(deem_file, iza_file,
                                             idxs_deem_train, idxs_deem_test,
                                             idxs_iza_train, idxs_iza_test,
                                             idxs_iza_delete=[RWY])
        
        # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
        soaps_train, soaps_test = preprocess_soaps(soaps_train, soaps_test)
        
        for species_pairing, feature_idxs in zip(('OO', 'OSi', 'SiSi'), feature_groups):
            
            for n_cantons in (2, 4):
                
                # Directory to put all the output data in
                output_dir = f'LSVC-PCovR/{n_cantons}-class/{spectrum_type}/{species_pairing}'
                deem_output_dir = f'{deem_dir}/{cutoff}/{output_dir}'
                iza_output_dir = f'{iza_dir}/{cutoff}/{output_dir}'
                
                # All file names are built here, for the current combination,
                # instead of re-using whatever the LinearSVC cell left behind
                # after its last iteration
                
                # Files with the IZA and DEEM LSVC decision functions
                lsvc_df_deem_file = f'{deem_output_dir}/lsvc_structure_dfs.dat'
                lsvc_df_iza_file = f'{iza_output_dir}/lsvc_structure_dfs.dat'
                
                # Files for IZA and DEEM PCovR projections
                pcovr_projection_deem_file = f'{deem_output_dir}/pcovr_structures.hdf5'
                pcovr_projection_iza_file = f'{iza_output_dir}/pcovr_structures.hdf5'
                
                # Files for IZA and DEEM PCovR decision functions
                pcovr_df_deem_file = f'{deem_output_dir}/pcovr_structure_dfs.dat'
                pcovr_df_iza_file = f'{iza_output_dir}/pcovr_structure_dfs.dat'
                
                # Files for IZA and DEEM PCovR canton predictions
                pcovr_canton_deem_file = f'{deem_output_dir}/pcovr_structure_cantons.dat'
                pcovr_canton_iza_file = f'{iza_output_dir}/pcovr_structure_cantons.dat'
                
                # File for the PCovR hyperparameters
                pcovr_parameter_file = f'{linear_model_dir}/{output_dir}/pcovr_parameters.json'
                
                # Load the LSVC decision functions for this combination
                df_train, df_test = load_data(lsvc_df_deem_file, lsvc_df_iza_file,
                                              idxs_deem_train, idxs_deem_test,
                                              idxs_iza_train, idxs_iza_test)
                
                # Center and scale the decision functions
                df_train, df_test, df_center, df_scale = \
                    preprocess_decision_functions(df_train, df_test)
                
                # Run PCovR with the loaded hyperparameters
                # NOTE(review): assumes the JSON keys are valid PCovR constructor
                # arguments -- confirm against the parameter files
                pcovr_parameters = load_json(pcovr_parameter_file)
                T_train, T_test, dfp_train, dfp_test = \
                    do_covr(soaps_train[:, feature_idxs], soaps_test[:, feature_idxs],
                            df_train, df_test, covr_type='linear', **pcovr_parameters)

                # Post process the PCovR decision functions
                # (i.e., turn them back into canton predictions)
                predicted_cantons_train, predicted_cantons_test = \
                    postprocess_decision_functions(dfp_train, dfp_test, df_center, df_scale)
                
                # Save IZA and DEEM PCovR projections
                split_and_save(T_train, T_test,
                               idxs_deem_train, idxs_deem_test,
                               deem_train_slice, deem_test_slice,
                               hdf5_attrs=pcovr_parameters,
                               output=pcovr_projection_deem_file, output_format='%f')
                
                split_and_save(T_train, T_test,
                               idxs_iza_train, idxs_iza_test,
                               iza_train_slice, iza_test_slice,
                               hdf5_attrs=pcovr_parameters,
                               output=pcovr_projection_iza_file, output_format='%f')

                # Save IZA and DEEM PCovR decision functions
                split_and_save(dfp_train, dfp_test,
                               idxs_deem_train, idxs_deem_test,
                               deem_train_slice, deem_test_slice,
                               hdf5_attrs=None, 
                               output=pcovr_df_deem_file, output_format='%f')
                
                split_and_save(dfp_train, dfp_test,
                               idxs_iza_train, idxs_iza_test,
                               iza_train_slice, iza_test_slice,
                               hdf5_attrs=None, 
                               output=pcovr_df_iza_file, output_format='%f')

                # Save IZA and DEEM PCovR canton predictions
                split_and_save(predicted_cantons_train, predicted_cantons_test,
                               idxs_deem_train, idxs_deem_test,
                               deem_train_slice, deem_test_slice,
                               hdf5_attrs=None, 
                               output=pcovr_canton_deem_file, output_format='%d')
                
                split_and_save(predicted_cantons_train, predicted_cantons_test,
                               idxs_iza_train, idxs_iza_test,
                               iza_train_slice, iza_test_slice,
                               hdf5_attrs=None, 
                               output=pcovr_canton_iza_file, output_format='%d')

# Logistic Regression