In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ML
from regression import PCovR, KPCovR, SparseKPCovR
from regression import LR, KRR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from kernels import center_kernel, center_kernel_fast
from kernels import center_kernel_oos, center_kernel_oos_fast

from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression

# Utilities
import h5py
import json
import itertools
from copy import deepcopy
from tqdm.notebook import tqdm
import project_utils as utils
from tools import load_json, save_json

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

# Select the 'article' style of the COSMO plotting toolkit and keep a handle
# on its color cycle under a notebook-level name
cosmostyle.set_style('article')
colorList = cosmostyle.color_cycle

In [None]:
sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/kernel-tutorials')
sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/KernelPCovR/analysis/scripts')
# from utilities.sklearn_covr.kpcovr import KernelPCovR as KPCovR2
# from utilities.sklearn_covr.pcovr import PCovR as PCovR2
from helpers import l_regr, l_kpcovr, l_proj

# Functions

In [None]:
def compute_regression_losses(lrs_train, lrs_test, lrts_train, ltrs_test, 
                              y_train, y_test, yp_train, yp_test):
    """
    Total regression loss on the train and test sets.

    The first four parameters (`lrs_train`, `lrs_test`, `lrts_train`,
    `ltrs_test`) are accepted but never read by this function.

    Parameters
    ----------
    y_train, y_test : reference targets for the train and test sets
    yp_train, yp_test : predicted targets for the train and test sets

    Returns
    -------
    (lr_train, lr_test) : `np.sum` of whatever `l_regr` returns for each set
    """
    # Evaluate test before train, one (reference, prediction) pair at a time
    totals = {}
    for split, (reference, prediction) in (('test', (y_test, yp_test)),
                                            ('train', (y_train, yp_train))):
        totals[split] = np.sum(l_regr(reference, prediction))

    return totals['train'], totals['test']

def compute_kernel_projection_losses(k_train, k_test, k_test_test, t_train, t_test):
    """
    Total kernel projection (KPCovR) loss on the train and test sets.

    The test loss passes all five arguments to `l_kpcovr`; the train loss
    passes only `k_train`, `t_train` and `t_test`.
    NOTE(review): the train loss also receives `t_test` -- confirm that
    `l_kpcovr` expects it when no test kernels are given.

    Returns
    -------
    (lp_train, lp_test) : `np.sum` of whatever `l_kpcovr` returns for each call
    """
    test_kwargs = dict(k_train=k_train, k_test=k_test, k_test_test=k_test_test,
                       t_train=t_train, t_test=t_test)
    train_kwargs = dict(k_train=k_train, t_train=t_train, t_test=t_test)

    # Test loss is evaluated first, then the train loss
    lp_test = np.sum(l_kpcovr(**test_kwargs))
    lp_train = np.sum(l_kpcovr(**train_kwargs))

    return lp_train, lp_test

def compute_linear_projection_losses(x_train, x_test, xr_train, xr_test):
    """
    Total linear projection loss on the train and test sets.

    Parameters
    ----------
    x_train, x_test : original features
    xr_train, xr_test : passed to `l_proj` as its `xr` keyword

    Returns
    -------
    (lp_train, lp_test) : `np.sum` of whatever `l_proj` returns for each set
    """
    # Train is evaluated first, then test
    pairs = ((x_train, xr_train), (x_test, xr_test))
    lp_train, lp_test = [np.sum(l_proj(x, xr=xr)) for x, xr in pairs]

    return lp_train, lp_test

def load_covr_losses(filename, dtype=None):
    """
    Load (K)PCovR optimization losses and arrange them on the
    (alpha, regularization) grid.

    Parameters
    ----------
    filename : path or file-like object handed to `np.loadtxt`
    dtype : structured dtype of one row; must provide the fields 'alpha',
        'lr_train', 'lr_test', 'lp_train' and 'lp_test' (the loss fields
        holding one value per subset). Defaults to the notebook-level
        `dt_covr`, looked up when the function is called.

    Returns
    -------
    alphas : alpha value of each row of the loss matrices
    opt_reg_idx : column (regularization) index of the minimum of
        `lr_test_matrix + lp_test_matrix`
    lr_train_matrix, lr_test_matrix, lp_train_matrix, lp_test_matrix :
        subset-averaged losses with shape (n_alphas, n_regularizations)

    Raises
    ------
    ValueError : if the file has no rows, or the number of rows is not a
        multiple of the number of distinct alpha values
    """
    if dtype is None:
        # `dt_covr` is created further down the notebook, so resolve it lazily
        dtype = dt_covr

    # A structured dtype is needed to address columns by field name;
    # atleast_1d keeps a single-row file from collapsing to a 0-d record
    covr_errors = np.atleast_1d(np.loadtxt(filename, dtype=dtype))
    if covr_errors.size == 0:
        raise ValueError(f'No (K)PCovR losses found in {filename}')

    # The grid shape is taken from the data itself instead of notebook globals.
    # Rows are expected in alpha-major order (all regularizations for the first
    # alpha, then the next alpha, ...), which is how the optimization loop
    # in this notebook appends them.
    n_alphas = np.unique(covr_errors['alpha']).size
    loss_matrix_shape = (n_alphas, -1)

    alpha_matrix = np.reshape(covr_errors['alpha'], loss_matrix_shape)

    # Average every loss over the subsets (axis 1 of each sub-array field)
    lr_train_matrix = np.reshape(np.mean(covr_errors['lr_train'], axis=1), loss_matrix_shape)
    lr_test_matrix = np.reshape(np.mean(covr_errors['lr_test'], axis=1), loss_matrix_shape)
    lp_train_matrix = np.reshape(np.mean(covr_errors['lp_train'], axis=1), loss_matrix_shape)
    lp_test_matrix = np.reshape(np.mean(covr_errors['lp_test'], axis=1), loss_matrix_shape)

    # Local name deliberately differs from the notebook-level `alphas`
    alpha_values = alpha_matrix[:, 0]

    # Regularization column holding the overall minimum of the summed test losses
    opt_reg_idx = np.unravel_index(np.argmin(lr_test_matrix + lp_test_matrix),
                                   lr_test_matrix.shape)[1]

    return alpha_values, opt_reg_idx, \
        lr_train_matrix, lr_test_matrix, lp_train_matrix, lp_test_matrix

# Load train and test splits

In [None]:
# Load SOAP cutoffs
# The hyperparameter file is plain JSON; only its 'interaction_cutoff' entry
# is used here, and the notebook later iterates over it (`for cutoff in cutoffs`)
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)
    
cutoffs = soap_hyperparameters['interaction_cutoff']

In [None]:
# Load the train set indices for Deem
# (the test index file is only read below, to count its entries)
idxs_deem = np.loadtxt('../Processed_Data/DEEM_10k/train.idxs', dtype=int)

# Total number of structures = entries in train.idxs + entries in test.idxs
n_deem = idxs_deem.size + np.loadtxt('../Processed_Data/DEEM_10k/test.idxs', dtype=int).size

In [None]:
idxs_deem_train_file = '../Processed_Data/Models/deem_train_ksvc-kpcovr.idxs'
idxs_deem_test_file = '../Processed_Data/Models/deem_test_ksvc-kpcovr.idxs'

# Load indices for kernel building
try:
    # Read from the file names defined above; the previous undefined names
    # raised a NameError that `except IOError` could not catch
    idxs_deem_train = np.loadtxt(idxs_deem_train_file, dtype=int)
    idxs_deem_test = np.loadtxt(idxs_deem_test_file, dtype=int)
    n_deem_train = len(idxs_deem_train)
    n_deem_test = len(idxs_deem_test)
    
    # Check to make sure the test and train set sizes are correct
    print(n_deem_train, n_deem_test)

# Compute indices if they don't exist
except IOError:

    n_deem_train = 5000
    n_deem_test = 2750
    
    # Deem is already shuffled, don't need to do so here:
    # take the first block for training and the block right after it for testing
    idxs_deem_train = idxs_deem[0:n_deem_train]
    idxs_deem_test = idxs_deem[n_deem_train:n_deem_train+n_deem_test]
    
    # Cache the selection so later runs take the loading branch
    np.savetxt(idxs_deem_train_file, idxs_deem_train, fmt='%d')
    np.savetxt(idxs_deem_test_file, idxs_deem_test, fmt='%d')

In [None]:
# Every DEEM structure gets canton label 4
cantons_deem = np.full(n_deem, 4, dtype=int)

In [None]:
# IZA canton labels are read from the second column of the cantons file
cantons_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)

# Position of the first IZA entry labelled as canton 4; that entry is
# removed from the IZA labels (the index is kept under the name RWY)
RWY = np.flatnonzero(cantons_iza == 4)[0]
cantons_iza = np.delete(cantons_iza, RWY)

n_iza = cantons_iza.shape[0]

In [None]:
idxs_iza_train_file = '../Processed_Data/Models/iza_train_ksvc-kpcovr.idxs'
idxs_iza_test_file = '../Processed_Data/Models/iza_test_ksvc-kpcovr.idxs'

try:
    # Re-use the cached IZA indices when both files are present
    idxs_iza_train, idxs_iza_test = [np.loadtxt(idxs_file, dtype=int)
                                     for idxs_file in (idxs_iza_train_file,
                                                       idxs_iza_test_file)]

    # Check to make sure the test and train set sizes are correct
    print(len(idxs_iza_train), len(idxs_iza_test))

except IOError:
    # No cache yet: both the train and the test selection
    # cover every remaining IZA structure
    idxs_iza_train = np.arange(n_iza)
    idxs_iza_test = np.arange(n_iza)

    for idxs_file, idxs in ((idxs_iza_train_file, idxs_iza_train),
                            (idxs_iza_test_file, idxs_iza_test)):
        np.savetxt(idxs_file, idxs, fmt='%d')

In [None]:
# "Master" canton labels, keyed by the number of classes.
# IZA entries always come first, DEEM entries second.
#   4 classes: the IZA cantons as loaded, DEEM as its own canton
#   2 classes: IZA -> 1, DEEM -> 2
cantons_train = {
    4: np.concatenate((cantons_iza[idxs_iza_train], cantons_deem[idxs_deem_train])),
    2: np.concatenate((np.full(len(idxs_iza_train), 1, dtype=int),
                       np.full(len(idxs_deem_train), 2, dtype=int))),
}
cantons_test = {
    4: np.concatenate((cantons_iza[idxs_iza_test], cantons_deem[idxs_deem_test])),
    2: np.concatenate((np.full(len(idxs_iza_test), 1, dtype=int),
                       np.full(len(idxs_deem_test), 2, dtype=int))),
}

## Optimization

In [None]:
# Number of train/test subsets used for hyperparameter optimization; it also
# fixes the length of the per-subset fields in the structured dtypes below
n_subsets = 5

In [None]:
idxs_iza_train_kernel_file = '../Processed_Data/Models/iza_train_ksvc-kpcovr_optimization.idxs'
idxs_iza_test_kernel_file = '../Processed_Data/Models/iza_test_ksvc-kpcovr_optimization.idxs'

# Load IZA array splits for optimization
try:
    idxs_iza_train_kernel = [np.asarray(i) for i in load_json(idxs_iza_train_kernel_file)]
    idxs_iza_test_kernel = [np.asarray(i) for i in load_json(idxs_iza_test_kernel_file)]
    
# Compute indices if they don't exist
except IOError:
    
    # Each subset is a fresh random half/half split of all IZA structures
    idxs_iza = np.arange(0, n_iza)
    n_iza_train_kernel = n_iza // 2
    n_iza_test_kernel = n_iza - n_iza_train_kernel

    idxs_iza_train_kernel = []
    idxs_iza_test_kernel = []
    for n in range(0, n_subsets):
        np.random.shuffle(idxs_iza)

        # Copy the slices: they are views of `idxs_iza`, which is shuffled in
        # place again on the next pass -- without the copy every stored
        # subset would end up showing the last shuffle
        idxs_iza_train_kernel.append(idxs_iza[0:n_iza_train_kernel].copy())
        idxs_iza_test_kernel.append(idxs_iza[n_iza_train_kernel:].copy())
        
    # JSON cannot hold arrays, so store plain lists.
    # Argument order (object, file name) follows the other save_json calls
    # in this notebook that write the dtype lists.
    save_json([array.tolist() for array in idxs_iza_train_kernel], idxs_iza_train_kernel_file)
    save_json([array.tolist() for array in idxs_iza_test_kernel], idxs_iza_test_kernel_file)

In [None]:
idxs_deem_train_kernel_file = '../Processed_Data/Models/deem_train_ksvc-kpcovr_optimization.idxs'
idxs_deem_test_kernel_file = '../Processed_Data/Models/deem_test_ksvc-kpcovr_optimization.idxs'

# Load DEEM array splits for optimization
try:
    idxs_deem_train_kernel = [np.asarray(i) for i in load_json(idxs_deem_train_kernel_file)]
    idxs_deem_test_kernel = [np.asarray(i) for i in load_json(idxs_deem_test_kernel_file)]

# Compute indices if they don't exist
except IOError:

    # DEEM positions are offset by the number of IZA entries, which come first.
    # The shuffled positions are cut into n_subsets disjoint, equally sized
    # parts (np.split raises if the size is not divisible by n_subsets).
    idxs_deem_train_kernel = np.arange(idxs_iza_train.size, idxs_iza_train.size+idxs_deem_train.size)
    np.random.shuffle(idxs_deem_train_kernel)
    idxs_deem_train_kernel = np.split(idxs_deem_train_kernel, n_subsets)

    idxs_deem_test_kernel = np.arange(idxs_iza_test.size, idxs_iza_test.size+idxs_deem_test.size)
    np.random.shuffle(idxs_deem_test_kernel)
    idxs_deem_test_kernel = np.split(idxs_deem_test_kernel, n_subsets)
    
    # JSON cannot hold arrays, so store plain lists.
    # Argument order (object, file name) follows the other save_json calls
    # in this notebook that write the dtype lists.
    save_json([array.tolist() for array in idxs_deem_train_kernel], idxs_deem_train_kernel_file)
    save_json([array.tolist() for array in idxs_deem_test_kernel], idxs_deem_test_kernel_file)

In [None]:
# Per subset, join the IZA indices (first) with the DEEM indices (second)
idxs_train_kernel = []
for iza, deem in zip(idxs_iza_train_kernel, idxs_deem_train_kernel):
    idxs_train_kernel.append(np.concatenate((iza, deem)))

idxs_test_kernel = []
for iza, deem in zip(idxs_iza_test_kernel, idxs_deem_test_kernel):
    idxs_test_kernel.append(np.concatenate((iza, deem)))

In [None]:
# Initialize datatypes for saving KSVC classification accuracies and KPCovR losses.
# Per-subset fields are sub-arrays of length n_subsets; their shape is always
# written as a 1-tuple so that every such field serializes to JSON the same way.
dt_covr_list = [('alpha', 'f8'),
                ('reg', 'f8'),
                ('lr_train', 'f8', (n_subsets,)),
                ('lr_test', 'f8', (n_subsets,)),
                ('lp_train', 'f8', (n_subsets,)),
                ('lp_test', 'f8', (n_subsets,))]

dt_svm_list = [('C', 'f8'),
               ('class_accuracy_train', 'f8', (n_subsets,)),
               ('class_accuracy_test', 'f8', (n_subsets,))]

# Keep the dtype descriptions next to the models
save_json(dt_covr_list, '../Processed_Data/Models/covr_optimization_dtype.json')
save_json(dt_svm_list, '../Processed_Data/Models/svm_optimization_dtype.json')

dt_covr = np.dtype(dt_covr_list)
dt_svm = np.dtype(dt_svm_list)

## Model evaluation

In [None]:
idxs_deem_train_all_file = '../Processed_Data/Models/deem_train_ksvc-kpcovr_evaluation.idxs'
idxs_deem_test_all_file = '../Processed_Data/Models/deem_test_ksvc-kpcovr_evaluation.idxs'

try:
    # Re-use the cached DEEM evaluation indices when both files are present
    idxs_deem_train_all, idxs_deem_test_all = [np.loadtxt(idxs_file, dtype=int)
                                               for idxs_file in (idxs_deem_train_all_file,
                                                                 idxs_deem_test_all_file)]

except IOError:
    # No cache yet: the evaluation set is the union of all optimization subsets
    idxs_deem_train_all = np.concatenate(idxs_deem_train_kernel)
    idxs_deem_test_all = np.concatenate(idxs_deem_test_kernel)

    for idxs_file, idxs in ((idxs_deem_train_all_file, idxs_deem_train_all),
                            (idxs_deem_test_all_file, idxs_deem_test_all)):
        np.savetxt(idxs_file, idxs, fmt='%d')

In [None]:
idxs_iza_train_all_file = '../Processed_Data/Models/iza_train_ksvc-kpcovr_evaluation.idxs'
idxs_iza_test_all_file = '../Processed_Data/Models/iza_test_ksvc-kpcovr_evaluation.idxs'

# Load IZA indices for running the models on the full optimization set
try:
    idxs_iza_train_all = np.loadtxt(idxs_iza_train_all_file, dtype=int)
    idxs_iza_test_all = np.loadtxt(idxs_iza_test_all_file, dtype=int)

# Compute indices if they don't exist
except IOError:

    # Random half/half split of all IZA structures
    idxs_iza = np.arange(0, n_iza)
    n_iza_train = n_iza // 2

    np.random.shuffle(idxs_iza)
    idxs_iza_train_all = idxs_iza[0:n_iza_train]
    idxs_iza_test_all = idxs_iza[n_iza_train:]
    
    np.savetxt(idxs_iza_train_all_file, idxs_iza_train_all, fmt='%d')
    np.savetxt(idxs_iza_test_all_file, idxs_iza_test_all, fmt='%d')

# The set sizes are needed later (index slices for the model outputs), so
# define them whether the indices were loaded from disk or freshly computed
n_iza_train = len(idxs_iza_train_all)
n_iza_test = len(idxs_iza_test_all)

In [None]:
# Concatenate the DEEM and IZA indices
# (IZA entries first, DEEM entries second)
idxs_train_all = np.concatenate((idxs_iza_train_all, idxs_deem_train_all))
idxs_test_all = np.concatenate((idxs_iza_test_all, idxs_deem_test_all))

# Model setup

In [None]:
# Root directory for all model outputs
model_dir = '../Processed_Data/Models'

# Dataset names and the directories that hold their data files
# (the SOAP files are read from '<dir>/<cutoff>/...' further down)
deem_name = 'DEEM_10k'
iza_name = 'IZA_226onDEEM_10k'
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'

In [None]:
# TODO: parameter optimization handling with defaults dictated here

In [None]:
# Global model parameters
# TODO: or use .get_params()?
# TODO: should we make sure break_ties=True?

# Default keyword arguments for the linear and the (precomputed) kernel SVC
svc_kwargs = {
    'linear': {
        'penalty': 'l2',
        'loss': 'squared_hinge',
        'dual': False,
        'multi_class': 'ovr',
        'class_weight': None,
        'fit_intercept': True,
        'intercept_scaling': 1.0,
        'tol': 1.0E-3,
        'C': 1.0,
    },
    'kernel': {
        'kernel': 'precomputed',
        'decision_function_shape': 'ovr',
        'class_weight': None,
        'break_ties': False,
        'tol': 1.0E-3,
        'C': 1.0,
    },
}

# Default (K)PCovR keyword arguments; 'linear' and 'kernel' hold separate dicts
n_components = 2
pcovr_kwargs = {
    model_type: {'n_components': n_components, 'alpha': 0.0, 'regularization': 1.0E-12}
    for model_type in ('linear', 'kernel')
}

# Hyperparameter grids scanned during optimization
C = np.logspace(-5, 5, 11)
alphas = np.linspace(0.0, 1.0, 11)
regularizations = np.logspace(-12, -1, 12)

In [None]:
# Index slices for saving KSVC and KPCovR outputs
# The first n_iza_train (n_iza_test) entries are treated as IZA,
# everything after them as DEEM
deem_train_slice = slice(n_iza_train, None)
deem_test_slice = slice(n_iza_test, None)
iza_train_slice = slice(0, n_iza_train)
iza_test_slice = slice(0, n_iza_test)

# Build kernels

In [None]:
# Compute the structure kernels for every (cutoff, kernel type) pair that
# does not have them on disk yet
for cutoff, kernel_type in itertools.product(cutoffs, ('linear', 'gaussian')):
    kernel_name = kernel_type.capitalize()

    work_dir = f'{model_dir}/{cutoff}/Kernel_Models/{kernel_name}/KSVC-KPCovR'

    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # File to store kernels for re-use
    kernel_file = f'{work_dir}/structure_kernels_optimization.hdf5'
    kernel_parameter_file = f'{work_dir}/volumes_mae_parameters.json'

    # Nothing to do when the kernels are already on disk
    if os.path.exists(kernel_file):
        continue

    # SOAP files (atomwise, FPS'ed features)
    deem_file = f'{deem_dir}/{cutoff}/soaps.hdf5'
    iza_file = f'{iza_dir}/{cutoff}/soaps.hdf5'

    # Assemble the train and test set SOAPs from IZA and DEEM
    soaps_train, soaps_test = utils.load_soaps(deem_file, iza_file,
                                               idxs_deem_train, idxs_deem_test,
                                               idxs_iza_train, idxs_iza_test,
                                               idxs_iza_delete=[RWY])

    # Compute kernels
    # TODO: this can be consolidated if doing linear KRR
    if kernel_type != 'gaussian':
        kernel_parameters = dict(kernel='linear', zeta=1)
    else:
        # Re-use the stored parameters, minus the two entries that are
        # not forwarded to the kernel computation
        kernel_parameters = load_json(kernel_parameter_file)
        kernel_parameters.pop('sigma')
        kernel_parameters.pop('regularization')

    utils.compute_kernels(soaps_train, soaps_test, kernel_file=kernel_file, **kernel_parameters)

# Kernel Models

## Optimize KernelSVC parameters

In [None]:
# Scan the SVC regularization parameter C for every cutoff, kernel type and
# number of classes; store the per-subset accuracies and the best C
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    
    for kernel_type in tqdm(('linear', 'gaussian'), desc='Kernel', leave=False):
        kernel_name = kernel_type.capitalize()
        
        work_dir = f'{model_dir}/{cutoff}/Kernel_Models/{kernel_name}/KSVC-KPCovR'
        
        # Load kernels
        kernel_file = f'{work_dir}/structure_kernels_optimization.hdf5'
        K_train, K_test, K_test_test = utils.load_kernels(kernel_file)
        
        for n_cantons in tqdm((2, 4), desc='Classes', leave=False):
            
            # Output setup
            output_dir = f'{n_cantons}-class'
            os.makedirs(f'{work_dir}/{output_dir}', exist_ok=True)
            
            svc_parameter_file = f'{work_dir}/{output_dir}/svc_parameters.json'
            svc_parameters = svc_kwargs['kernel'].copy()
            
            svm_errors = []
            
            for c in tqdm(C, desc='C', leave=False):

                # The defaults already carry a 'C' entry; overwrite it instead of
                # passing C a second time (a repeated keyword raises TypeError)
                svc_parameters.update(C=c)
                
                class_accuracy_train = np.zeros(n_subsets)
                class_accuracy_test = np.zeros(n_subsets)
                
                for n in tqdm(range(0, n_subsets), desc='n', leave=False):
                    
                    # Slice kernels
                    idxs_train = idxs_train_kernel[n]
                    idxs_test = idxs_test_kernel[n]
                    
                    k_train = K_train[idxs_train, :][:, idxs_train]
                    k_test = K_test[idxs_test, :][:, idxs_train]
                    k_test_test = K_test_test[idxs_test, :][:, idxs_test]
                    
                    # Center and scale kernels
                    k_train, [k_test], [k_test_test] = \
                        utils.preprocess_kernels(k_train, [k_test], [k_test_test])
                    
                    y_train = cantons_train[n_cantons][idxs_train]
                    y_test = cantons_test[n_cantons][idxs_test]
                    
                    # Run KSVC; with outputs=['scores'] only the two scores are
                    # unpacked, as in the LinearSVC optimization of this notebook
                    train_score, test_score = \
                        utils.do_svc(k_train, k_test, y_train, y_test, 
                                     svc_type='kernel', **svc_parameters, outputs=['scores'])
                    
                    class_accuracy_train[n] = train_score
                    class_accuracy_test[n] = test_score
                    
                # Collect errors: one record per C, once all subsets are filled in
                model = np.array([(c, class_accuracy_train, class_accuracy_test)], dtype=dt_svm)
                svm_errors.append(model)
            
            svm_errors = np.concatenate(svm_errors)
            
            # Stack the arrays in a writable form so we can save in plain text instead of npy/npz
            # NOTE(review): `save_structured_array` is neither defined nor imported
            # in the visible part of this notebook -- confirm where it comes from
            save_structured_array(f'{work_dir}/{output_dir}/svm_optimization.dat',
                                  svm_errors, dt_svm)
            
            # TODO: use accuracy only on IZA structures?
            class_accuracy_train_avg = np.mean(svm_errors['class_accuracy_train'], axis=1)
            class_accuracy_test_avg = np.mean(svm_errors['class_accuracy_test'], axis=1)
            
            # Save KSVC parameters with the C that maximizes the mean test accuracy
            idx_C = np.argmax(class_accuracy_test_avg)
            C_opt = float(svm_errors['C'][idx_C])
            svc_parameters.update(C=C_opt)

            # Argument order (object, file name) follows the other save_json
            # calls in this notebook
            save_json(svc_parameters, svc_parameter_file)

## Optimal KernelSVC

In [None]:
# Run the KSVC with the optimized parameters on the full evaluation split and
# store its decision functions
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    
    for kernel_type in tqdm(('linear', 'gaussian'), desc='Kernel', leave=False):
        kernel_name = kernel_type.capitalize()
        
        work_dir = f'{model_dir}/{cutoff}/Kernel_Models/{kernel_name}/KSVC-KPCovR'
        
        # Load kernels
        kernel_file = f'{work_dir}/structure_kernels_optimization.hdf5'
        K_train, K_test, K_test_test = utils.load_kernels(kernel_file)
        
        # Since we need decision functions for all IZA structures in the train set,
        # predict all structures
        k_train_all = K_train[:, idxs_train_all]
        k_test_all = K_test[:, idxs_train_all]

        # Restrict the rows to the evaluation train/test structures
        # (the test rows come from the column-sliced test kernel built above)
        k_train = k_train_all[idxs_train_all, :]
        k_test = k_test_all[idxs_test_all, :]
        
        # NOTE(review): other cells unpack preprocess_kernels as
        # `k_train, [k_test], [k_test_test]`; confirm that it returns a flat
        # 4-tuple when only K_test (with three kernels) is given
        k_train, k_test, k_train_all, k_test_all = \
            utils.preprocess_kernels(k_train, K_test=[k_test, k_train_all, k_test_all])
        
        for n_cantons in tqdm((2, 4), desc='Classes', leave=False):
            
            # Output setup
            output_dir = f'{n_cantons}-class'
            svc_parameter_file = f'{work_dir}/{output_dir}/svc_parameters.json'
            svc_df_train_file = f'{work_dir}/{output_dir}/svc_structure_dfs_train_optimization.dat'
            svc_df_test_file = f'{work_dir}/{output_dir}/svc_structure_dfs_test_optimization.dat'
                    
            # Assemble properties
            y_train = cantons_train[n_cantons][idxs_train_all]
            y_test = cantons_test[n_cantons][idxs_test_all]
            
            # KSVC with the optimized parameters; they were loaded but never
            # handed to do_svc before
            svc_parameters = load_json(svc_parameter_file)
            df_train, df_test, train_score, test_score = \
                utils.do_svc(k_train, k_test, y_train, y_test, svc_type='kernel',
                             **svc_parameters,
                             outputs=['decision_functions', 'scores'])
            
            # Save decision functions for KRR test and KPCovR optimization
            np.savetxt(svc_df_train_file, df_train)
            np.savetxt(svc_df_test_file, df_test)

## KRR check

In [None]:
# Check that kernel ridge regression can reproduce the stored KSVC decision
# functions
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    
    for kernel_type in tqdm(('linear', 'gaussian'), desc='Kernel', leave=False):
        kernel_name = kernel_type.capitalize()
        
        work_dir = f'{model_dir}/{cutoff}/Kernel_Models/{kernel_name}/KSVC-KPCovR'
        
        # Load kernels
        kernel_file = f'{work_dir}/structure_kernels_optimization.hdf5'
        K_train, K_test, K_test_test = utils.load_kernels(kernel_file)
        
        # Since we need decision functions for all IZA structures in the train set,
        # predict all structures
        k_train_all = K_train[:, idxs_train_all]
        k_test_all = K_test[:, idxs_train_all]

        # Restrict the rows to the evaluation train/test structures
        # (the test rows come from the column-sliced test kernel built above)
        k_train = k_train_all[idxs_train_all, :]
        k_test = k_test_all[idxs_test_all, :]

        # NOTE(review): other cells unpack preprocess_kernels as
        # `k_train, [k_test], [k_test_test]`; confirm that it returns a flat
        # 4-tuple when only K_test (with three kernels) is given
        k_train, k_test, k_train_all, k_test_all = \
            utils.preprocess_kernels(k_train, K_test=[k_test, k_train_all, k_test_all])
        
        for n_cantons in tqdm((2, 4), desc='Classes', leave=False):
            
            # Load decision functions
            input_dir = f'{n_cantons}-class'
            svc_df_train_file = f'{work_dir}/{input_dir}/svc_structure_dfs_train_optimization.dat'
            svc_df_test_file = f'{work_dir}/{input_dir}/svc_structure_dfs_test_optimization.dat'
            
            df_train = np.loadtxt(svc_df_train_file)
            df_test = np.loadtxt(svc_df_test_file)
            
            # Center and scale the decision functions
            df_train, df_test, df_center, df_scale = \
                utils.preprocess_data(df_train, df_test)

            # Check that KRR can reproduce the decision functions.
            # Use the sliced, preprocessed kernels: the decision functions were
            # produced from the evaluation split, not from the full raw kernels
            utils.regression_check(k_train, k_test, df_train, df_test, regression_type='kernel')

## Optimize KPCovR parameters

In [None]:
# Scan the KPCovR mixing parameter alpha and the regularization for every
# cutoff, kernel type and number of classes; store the per-subset losses and
# the combination with the lowest summed test loss
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    
    for kernel_type in tqdm(('linear', 'gaussian'), desc='Kernel', leave=False):
        kernel_name = kernel_type.capitalize()
        
        work_dir = f'{model_dir}/{cutoff}/Kernel_Models/{kernel_name}/KSVC-KPCovR'
        
        # Load kernels
        kernel_file = f'{work_dir}/structure_kernels_optimization.hdf5'
        K_train, K_test, K_test_test = utils.load_kernels(kernel_file)
        
        for n_cantons in tqdm((2, 4), desc='Classes', leave=False):
            
            # Prepare outputs
            output_dir = f'{n_cantons}-class' 
            pcovr_parameter_file = f'{work_dir}/{output_dir}/pcovr_parameters.json'
            svc_parameter_file = f'{work_dir}/{output_dir}/svc_parameters.json'

            # The decision function layout is read from the stored KSVC
            # parameters of this very model, not from a leftover notebook variable
            svc_parameters = load_json(svc_parameter_file)
            
            # Load decision functions (same file names the optimal-KSVC cell writes)
            svc_df_train_file = f'{work_dir}/{output_dir}/svc_structure_dfs_train_optimization.dat'
            svc_df_test_file = f'{work_dir}/{output_dir}/svc_structure_dfs_test_optimization.dat'
            
            df_train = np.loadtxt(svc_df_train_file)
            df_test = np.loadtxt(svc_df_test_file)
            
            kpcovr_errors = []
            
            for a in tqdm(alphas, desc='Alpha', leave=False):
                for r in tqdm(regularizations, desc='Reg', leave=False):

                    # The defaults already carry 'alpha' and 'regularization';
                    # overwrite them instead of passing them a second time
                    kpcovr_parameters = pcovr_kwargs['kernel'].copy()
                    kpcovr_parameters.update(alpha=a, regularization=r)
                    
                    # Initialize matrices of losses
                    lr_train = np.zeros(n_subsets)
                    lp_train = np.zeros(n_subsets)
                    lr_test = np.zeros(n_subsets)
                    lp_test = np.zeros(n_subsets)
                                        
                    for n in tqdm(range(0, n_subsets), desc='n', leave=False):
                        
                        # Assemble kernels
                        idxs_train = idxs_train_kernel[n]
                        k_train = K_train[idxs_train, :][:, idxs_train]

                        idxs_test = idxs_test_kernel[n]
                        k_test = K_test[idxs_test, :][:, idxs_train]
                        k_test_test = K_test_test[idxs_test, :][:, idxs_test]
                        
                        k_train, [k_test], [k_test_test] = \
                            utils.preprocess_kernels(k_train, K_test=[k_test], K_test_test=[k_test_test])

                        # Assemble properties
                        y_train, y_test, y_center, y_scale = \
                            utils.preprocess_data(df_train[idxs_train], df_test[idxs_test])
                        
                        # Run KPCovR
                        t_train, t_test, yp_train, yp_test = \
                            utils.do_pcovr(k_train, k_test, y_train, y_test, 
                                           pcovr_type='kernel', **kpcovr_parameters)

                        # Post process the KPCovR decision functions
                        # (i.e., turn them back into canton predictions)
                        predicted_cantons_train, predicted_cantons_test = \
                            utils.postprocess_decision_functions(yp_train, yp_test, y_center, y_scale,
                                                                 df_type=svc_parameters['decision_function_shape'],
                                                                 n_classes=n_cantons)

                        # Compute regression and projection (KPCovR) losses for each canton
                        # TODO: losses on just IZA?
                        lr_train[n], lr_test[n] = \
                            utils.compute_regression_losses(y_train, y_test, yp_train, yp_test)
                        
                        lp_train[n], lp_test[n] = \
                            utils.compute_kernel_projection_losses(k_train, k_test, k_test_test, 
                                                                   t_train, t_test)
                     
                    # Collect errors: one record per (alpha, regularization) pair
                    model = np.array([(a, r, lr_train, lr_test, lp_train, lp_test)], dtype=dt_covr)
                    kpcovr_errors.append(model)
            
            kpcovr_errors = np.concatenate(kpcovr_errors)
                    
            # NOTE(review): `save_structured_array` is neither defined nor imported
            # in the visible part of this notebook -- confirm where it comes from
            save_structured_array(f'{work_dir}/{output_dir}/kpcovr_optimization.dat', 
                                  kpcovr_errors, dt_covr)
            
            # Extract optimal hyperparameters: minimum of the subset-averaged
            # regression + projection test loss
            lr_avg_test = np.mean(kpcovr_errors['lr_test'], axis=1)
            lp_avg_test = np.mean(kpcovr_errors['lp_test'], axis=1)
            
            idx_opt = np.argmin(lr_avg_test + lp_avg_test)
            
            # Field names follow dt_covr ('alpha', 'reg'); plain floats for JSON
            opt_alpha = float(kpcovr_errors['alpha'][idx_opt])
            opt_reg = float(kpcovr_errors['reg'][idx_opt])

            print(opt_alpha, opt_reg)

            # Save KPCovR parameters
            kpcovr_parameters = dict(n_components=n_components,
                                     regularization=opt_reg,
                                     alpha=opt_alpha)
            
            save_json(kpcovr_parameters, pcovr_parameter_file)

## Check for optimization problems

In [None]:
# One figure per (cutoff, kernel type, number of classes): train and test
# losses against alpha at the optimal regularization
for cutoff, kernel_type, n_cantons in itertools.product(cutoffs,
                                                        ('linear', 'gaussian'),
                                                        (2, 4)):
    kernel_name = kernel_type.capitalize()
    work_dir = f'{model_dir}/{cutoff}/Kernel_Models/{kernel_name}/KSVC-KPCovR'

    print(f'Cutoff: {cutoff}, Kernel: {kernel_name}, {n_cantons}-Class')

    output_dir = f'{n_cantons}-class'
    loss_file = f'{work_dir}/{output_dir}/kpcovr_optimization.dat'

    # Load losses (this rebinds the notebook-level `alphas`)
    (alphas, opt_reg_idx,
     lr_train_matrix, lr_test_matrix,
     lp_train_matrix, lp_test_matrix) = load_covr_losses(loss_file)

    fig = plt.figure(figsize=(7.0, 7.0))
    axs_loss_sum_train = fig.add_subplot(1, 2, 1)
    axs_loss_sum_test = fig.add_subplot(1, 2, 2)

    # Left panel: train set, right panel: test set. Each panel shows the
    # regression loss, the projection loss and their sum.
    panels = ((axs_loss_sum_train, 'Train', lr_train_matrix, lp_train_matrix),
              (axs_loss_sum_test, 'Test', lr_test_matrix, lp_test_matrix))

    for axs, title, lr_matrix, lp_matrix in panels:
        axs.semilogy(alphas, lr_matrix[:, opt_reg_idx], 'o-', label='l_regr')
        axs.semilogy(alphas, lp_matrix[:, opt_reg_idx], 'o-', label='l_proj')
        axs.semilogy(alphas, lr_matrix[:, opt_reg_idx]+lp_matrix[:, opt_reg_idx],
                     'o-', label='l_regr+l_proj')

        axs.legend()
        axs.set_title(title)
        axs.set_xlabel('alpha')
        axs.set_ylabel('loss')

    plt.show()

# Linear

In [None]:
# Linear model setup
# n_species: number of chemical species, forwarded to the feature grouping helper
# group_names: output labels per spectrum type; they are zipped with the
# feature groups further down, so the order here must match the group order
n_species = 2
group_names = {'power': ['OO', 'OSi', 'SiSi'], 'radial': ['O', 'Si']}

## Optimize LinearSVC parameters

In [None]:
# Grid search over the LinearSVC regularization strength C.
# For every cutoff / spectrum type / species pairing / number of classes,
# each value of C is scored on `n_subsets` train/test splits; the per-split
# accuracies are written to `svm_optimization.dat` and the C with the best
# mean test accuracy is stored in `svc_parameters.json`.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/LSVC-LPCovR'
    os.makedirs(work_dir, exist_ok=True)

    for spectrum_type in tqdm(('power', 'radial'), desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # Load SOAPs
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        soaps_train, soaps_test = utils.load_soaps(deem_file, iza_file,
                                                   idxs_deem_train, idxs_deem_test,
                                                   idxs_iza_train, idxs_iza_test,
                                                   idxs_iza_delete=[RWY],
                                                   train_test_concatenate=True)

        # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
        soaps_train, soaps_test = utils.preprocess_soaps(soaps_train, soaps_test)

        n_features = soaps_train.shape[1]
        feature_groups = extract_species_pair_groups(n_features, n_species,
                                                     spectrum_type=spectrum_type)

        for species_pairing, feature_idxs in zip(tqdm(group_names[spectrum_type],
                                                      desc='Species', leave=False),
                                                 feature_groups):

            for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                # Prepare outputs
                output_dir = f'{n_cantons}-class/{spectrum_name}/{species_pairing}'
                os.makedirs(f'{work_dir}/{output_dir}', exist_ok=True)

                svc_parameter_file = f'{work_dir}/{output_dir}/svc_parameters.json'
                svc_parameters = svc_kwargs['linear'].copy()

                # Fresh accumulator for every (species pairing, class count):
                # it is turned into an ndarray below, so it must not be reused.
                svm_errors = []

                # The progress-bar options belong to tqdm, not to enumerate
                for c in tqdm(C, desc='C', leave=False):
                    svc_parameters.update(C=c)

                    # Per-split accuracies for this value of C
                    class_accuracy_train = np.zeros(n_subsets)
                    class_accuracy_test = np.zeros(n_subsets)

                    for n in tqdm(range(n_subsets), desc='n', leave=False):

                        # Slice SOAPs
                        idxs_train = idxs_train_kernel[n]
                        idxs_test = idxs_test_kernel[n]

                        # Select rows first, then columns: with two index arrays,
                        # `a[rows, cols]` would broadcast them against each other
                        # instead of extracting the rows x columns sub-block.
                        x_train = soaps_train[idxs_train][:, feature_idxs]
                        x_test = soaps_test[idxs_test][:, feature_idxs]

                        # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
                        x_train, x_test = utils.preprocess_soaps(x_train, x_test)

                        y_train = cantons_train[n_cantons][idxs_train]
                        y_test = cantons_test[n_cantons][idxs_test]

                        # Run LSVC
                        train_score, test_score = \
                            utils.do_svc(x_train, x_test, y_train, y_test,
                                         svc_type='linear', **svc_parameters, outputs=['scores'])

                        class_accuracy_train[n] = train_score
                        class_accuracy_test[n] = test_score

                    model = np.array([(c, class_accuracy_train, class_accuracy_test)], dtype=dt_svm)
                    svm_errors.append(model)

                # Everything below is per class count, so that the 2-class and
                # 4-class results are stored (and optimized) separately.
                svm_errors = np.concatenate(svm_errors)

                # Stack the arrays in a writable form so we can save in plain text instead of npy/npz
                save_structured_array(f'{work_dir}/{output_dir}/svm_optimization.dat',
                                      svm_errors, dt_svm)

                # TODO: use accuracy only on IZA structures?
                class_accuracy_train_avg = np.mean(svm_errors['class_accuracy_train'], axis=1)
                class_accuracy_test_avg = np.mean(svm_errors['class_accuracy_test'], axis=1)

                # Save LSVC parameters (C with the highest mean test accuracy)
                idx_C = np.argmax(class_accuracy_test_avg)
                C_opt = svm_errors['C'][idx_C]
                svc_parameters.update(C=C_opt)

                # NOTE(review): the PCovR optimization cell calls
                # save_json(parameters, file) with the arguments the other way
                # round -- confirm the order against tools.save_json.
                save_json(svc_parameter_file, svc_parameters)

## Optimal LinearSVC

In [None]:
# Fit the LinearSVC with the optimized parameters on the full train set and
# save the train/test decision functions, which are the regression targets
# for the LR check and the PCovR optimization.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/LSVC-LPCovR'

    for spectrum_type in tqdm(('power', 'radial'), desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # Load SOAPs
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        soaps_train, soaps_test = utils.load_soaps(deem_file, iza_file,
                                                   idxs_deem_train, idxs_deem_test,
                                                   idxs_iza_train, idxs_iza_test,
                                                   idxs_iza_delete=[RWY],
                                                   train_test_concatenate=True)

        # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
        soaps_train, soaps_test = utils.preprocess_soaps(soaps_train, soaps_test)

        n_features = soaps_train.shape[1]
        feature_groups = extract_species_pair_groups(n_features, n_species,
                                                     spectrum_type=spectrum_type)

        for species_pairing, feature_idxs in zip(tqdm(group_names[spectrum_type],
                                                      desc='Species', leave=False),
                                                 feature_groups):

            for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                # Prepare outputs
                output_dir = f'{n_cantons}-class/{spectrum_name}/{species_pairing}'
                svc_parameter_file = f'{work_dir}/{output_dir}/svc_parameters.json'

                svc_df_train_file = f'{work_dir}/{output_dir}/svc_structure_dfs_train_optimization.dat'
                svc_df_test_file = f'{work_dir}/{output_dir}/svc_structure_dfs_test_optimization.dat'

                # Assemble properties
                y_train = cantons_train[n_cantons][idxs_train_all]
                y_test = cantons_test[n_cantons][idxs_test_all]

                # Select rows first, then columns: with two index arrays,
                # `a[rows, cols]` would broadcast them against each other
                # instead of extracting the rows x columns sub-block.
                # The test features must use the *test* indices so that they
                # line up with y_test.
                x_train = soaps_train[idxs_train_all][:, feature_idxs]
                x_test = soaps_test[idxs_test_all][:, feature_idxs]

                # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
                x_train, x_test = utils.preprocess_soaps(x_train, x_test)

                # LSVC on the species-pair features, using the optimized
                # parameters saved by the C optimization
                svc_parameters = load_json(svc_parameter_file)
                df_train, df_test, train_score, test_score = \
                    utils.do_svc(x_train, x_test, y_train, y_test,
                                 svc_type='linear', **svc_parameters,
                                 outputs=['decision_functions', 'scores'])

                # Save decision functions for LR test and PCovR optimization
                np.savetxt(svc_df_train_file, df_train)
                np.savetxt(svc_df_test_file, df_test)

## LR check

In [None]:
# Sanity check: for every cutoff / spectrum / species pairing / class count,
# verify that a linear regression on the (centered and scaled) SOAP features
# reproduces the saved LSVC decision functions.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/LSVC-LPCovR'

    for spectrum_type in tqdm(('power', 'radial'), desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # SOAP vectors for the Deem and IZA sets at this cutoff
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        soaps_train, soaps_test = utils.load_soaps(
            deem_file, iza_file,
            idxs_deem_train, idxs_deem_test,
            idxs_iza_train, idxs_iza_test,
            idxs_iza_delete=[RWY],
            train_test_concatenate=True
        )

        n_features = soaps_train.shape[1]
        feature_groups = extract_species_pair_groups(
            n_features, n_species, spectrum_type=spectrum_type
        )

        species_bar = tqdm(group_names[spectrum_type], desc='Species', leave=False)
        for species_pairing, feature_idxs in zip(species_bar, feature_groups):

            # Keep every structure, but only the columns of this species pairing
            x_train, x_test = soaps_train[:, feature_idxs], soaps_test[:, feature_idxs]

            # Center and scale the features, exactly as is done for the
            # decision functions below, before regressing
            (x_train, x_test,
             x_center, x_scale) = utils.preprocess_data(x_train, x_test)

            for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                # Location of the decision functions written by the LSVC cell
                input_dir = f'{n_cantons}-class/{spectrum_name}/{species_pairing}'

                svc_df_train_file = f'{work_dir}/{input_dir}/svc_structure_dfs_train_optimization.dat'
                svc_df_test_file = f'{work_dir}/{input_dir}/svc_structure_dfs_test_optimization.dat'

                df_train, df_test = np.loadtxt(svc_df_train_file), np.loadtxt(svc_df_test_file)

                # Regression targets: centered and scaled decision functions
                (df_train, df_test,
                 df_center, df_scale) = utils.preprocess_data(df_train, df_test)

                # LR should be able to reproduce the decision functions
                utils.regression_check(
                    x_train, x_test, df_train, df_test,
                    regression_type='linear'
                )

## Optimize PCovR parameters

In [None]:
# Grid search over the PCovR mixing parameter (alpha) and regularization.
# For every cutoff / spectrum type / species pairing / number of classes,
# each (alpha, regularization) pair is evaluated on `n_subsets` train/test
# splits against the LSVC decision functions; the per-split regression and
# projection losses are written to `pcovr_optimization.dat` and the pair with
# the lowest mean test (regression + projection) loss is stored in
# `pcovr_parameters.json`.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/LSVC-LPCovR'

    for spectrum_type in tqdm(('power', 'radial'), desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # Load SOAPs
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        soaps_train, soaps_test = utils.load_soaps(deem_file, iza_file,
                                                   idxs_deem_train, idxs_deem_test,
                                                   idxs_iza_train, idxs_iza_test,
                                                   idxs_iza_delete=[RWY],
                                                   train_test_concatenate=True)

        # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
        soaps_train, soaps_test = utils.preprocess_soaps(soaps_train, soaps_test)

        n_features = soaps_train.shape[1]
        feature_groups = extract_species_pair_groups(n_features, n_species,
                                                     spectrum_type=spectrum_type)

        for species_pairing, feature_idxs in zip(tqdm(group_names[spectrum_type],
                                                      desc='Species', leave=False),
                                                 feature_groups):

            for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                # Prepare outputs
                output_dir = f'{n_cantons}-class/{spectrum_name}/{species_pairing}'
                pcovr_parameter_file = f'{work_dir}/{output_dir}/pcovr_parameters.json'

                # Load decision functions
                svc_df_train_file = f'{work_dir}/{output_dir}/svc_structure_dfs_train_optimization.dat'
                svc_df_test_file = f'{work_dir}/{output_dir}/svc_structure_dfs_test_optimization.dat'

                df_train = np.loadtxt(svc_df_train_file)
                df_test = np.loadtxt(svc_df_test_file)

                # Base PCovR settings; they do not depend on alpha,
                # regularization or the split, so copy them once here.
                pcovr_parameters = pcovr_kwargs['linear'].copy()

                # Fresh accumulator for every (species pairing, class count):
                # it is turned into an ndarray below, so it must not be reused.
                pcovr_errors = []

                for a in tqdm(alphas, desc='Alpha', leave=False):
                    for r in tqdm(regularizations, desc='Reg', leave=False):

                        # Initialize matrices of losses
                        lr_train = np.zeros(n_subsets)
                        lp_train = np.zeros(n_subsets)
                        lr_test = np.zeros(n_subsets)
                        lp_test = np.zeros(n_subsets)

                        for n in tqdm(range(n_subsets), desc='n', leave=False):

                            # Slice the SOAPs
                            idxs_train = idxs_train_kernel[n]
                            idxs_test = idxs_test_kernel[n]

                            # Select rows first, then columns: with two index
                            # arrays, `a[rows, cols]` would broadcast them
                            # against each other instead of extracting the
                            # rows x columns sub-block.
                            x_train = soaps_train[idxs_train][:, feature_idxs]
                            x_test = soaps_test[idxs_test][:, feature_idxs]

                            # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
                            x_train, x_test = utils.preprocess_soaps(x_train, x_test)

                            y_train = df_train[idxs_train]
                            y_test = df_test[idxs_test]

                            # Assemble properties
                            y_train, y_test, y_center, y_scale = \
                                utils.preprocess_data(y_train, y_test)

                            # Run PCovR
                            t_train, t_test, yp_train, yp_test, xr_train, xr_test = \
                                utils.do_pcovr(x_train, x_test, y_train, y_test,
                                               pcovr_type='linear', compute_xr=True,
                                               **pcovr_parameters,
                                               alpha=a, regularization=r)

                            # Post process the PCovR decision functions
                            # (i.e., turn them back into canton predictions)
                            predicted_cantons_train, predicted_cantons_test = \
                                utils.postprocess_decision_functions(yp_train, yp_test, y_center, y_scale)

                            # Collect regression and projection (PCovR) losses for each canton
                            # TODO: losses on just IZA?
                            lr_train[n], lr_test[n] = \
                                utils.compute_regression_losses(y_train, y_test, yp_train, yp_test)

                            lp_train[n], lp_test[n] = \
                                utils.compute_linear_projection_losses(x_train, x_test,
                                                                       xr_train, xr_test)

                        # Collect errors
                        model = np.array([(a, r, lr_train, lr_test, lp_train, lp_test)], dtype=dt_covr)
                        pcovr_errors.append(model)

                pcovr_errors = np.concatenate(pcovr_errors)

                # Save with the same dtype the records were built with
                save_structured_array(f'{work_dir}/{output_dir}/pcovr_optimization.dat',
                                      pcovr_errors, dt_covr)

                # Extract optimal hyperparameters from the losses computed in
                # this cell (not from a KPCovR array left over in the kernel)
                lr_avg_test = np.mean(pcovr_errors['lr_test'], axis=1)
                lp_avg_test = np.mean(pcovr_errors['lp_test'], axis=1)

                idx_opt = np.argmin(lr_avg_test + lp_avg_test)

                opt_alpha = pcovr_errors['alpha'][idx_opt]
                opt_reg = pcovr_errors['regularization'][idx_opt]

                print(opt_alpha, opt_reg)

                # Save PCovR parameters
                pcovr_parameters.update(alpha=opt_alpha,
                                        regularization=opt_reg)

                # NOTE(review): the LSVC optimization cell calls
                # save_json(file, parameters) with the arguments the other way
                # round -- confirm the order against tools.save_json.
                save_json(pcovr_parameters, pcovr_parameter_file)

## Check for optimization problems

In [None]:
# Diagnostic plots for the PCovR optimization: for every cutoff / spectrum /
# species pairing / class count, plot the regression loss, the projection
# loss and their sum against alpha (at the regularization index returned by
# `load_covr_losses`), for the train and the test set side by side.
for cutoff in cutoffs:
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/LSVC-LPCovR'

    for spectrum_type in ('power', 'radial'):
        spectrum_name = spectrum_type.capitalize()

        # Only the pairing names are needed here. Iterate over them directly
        # rather than zipping with a `feature_groups` left over from another
        # cell, which belongs to a single spectrum type and could silently
        # truncate this loop.
        for species_pairing in group_names[spectrum_type]:

            for n_cantons in (2, 4):

                print(f'Cutoff: {cutoff}, Spectrum: {spectrum_name}, Species: {species_pairing}, {n_cantons}-Class')

                output_dir = f'{n_cantons}-class/{spectrum_name}/{species_pairing}'

                loss_file = f'{work_dir}/{output_dir}/pcovr_optimization.dat'

                # Load losses. The alphas read from file get their own name so
                # the global `alphas` grid used by the optimization cell is
                # not overwritten.
                loss_alphas, opt_reg_idx, \
                lr_train_matrix, lr_test_matrix, \
                lp_train_matrix, lp_test_matrix = load_covr_losses(loss_file)

                fig = plt.figure(figsize=(7.0, 7.0))
                axs_loss_sum_train = fig.add_subplot(1, 2, 1)
                axs_loss_sum_test = fig.add_subplot(1, 2, 2)

                # Same three curves on both panels: regression loss,
                # projection loss, and their sum
                panels = ((axs_loss_sum_train, lr_train_matrix, lp_train_matrix, 'Train'),
                          (axs_loss_sum_test, lr_test_matrix, lp_test_matrix, 'Test'))

                for axs, lr_matrix, lp_matrix, title in panels:
                    lr = lr_matrix[:, opt_reg_idx]
                    lp = lp_matrix[:, opt_reg_idx]

                    axs.semilogy(loss_alphas, lr, 'o-', label='l_regr')
                    axs.semilogy(loss_alphas, lp, 'o-', label='l_proj')
                    axs.semilogy(loss_alphas, lr + lp, 'o-', label='l_regr+l_proj')

                    axs.legend()
                    axs.set_title(title)
                    axs.set_xlabel('alpha')
                    axs.set_ylabel('loss')

                plt.show()

                # One figure per iteration: release it so they do not pile up
                plt.close(fig)

# Logistic Regression