In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ML
from regression import PCovR, KPCovR, SparseKPCovR
from regression import LR, KRR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from kernels import center_kernel, center_kernel_fast
from kernels import center_kernel_oos, center_kernel_oos_fast
from soap import compute_soap_density, reshape_soaps

from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.kernel_ridge import KernelRidge

# Utilities
import h5py
import json
from tqdm.notebook import tqdm
from project_utils import load_structures_from_hdf5

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

cosmostyle.set_style('article')
colorList = cosmostyle.color_cycle

In [None]:
# sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/kernel-tutorials')
# sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/KernelPCovR/analysis/scripts')
# from utilities.sklearn_covr.kpcovr import KernelPCovR as KPCovR2
# from utilities.sklearn_covr.pcovr import PCovR as PCovR2
# from helpers import l_regr, l_kpcovr

# Functions

In [None]:
def df_to_class(df, df_type, n_classes, use_df_sums=True):
    """
        Make class predictions based on a decision function.
        Based on the scikit-learn SVC prediction, see
        `sklearn.multiclass._ovr_decision_function`
        (scikit-learn is licensed under the BSD 3-Clause license)
        TODO: could we also just use this function?

        ---Arguments---
        df: decision function on which to make class predictions.
            For more than two classes it is indexed as a 2D array with
            one column per decision function: for 'ovo' one column per
            class pair (i, j) with i < j, ordered with i as the outer
            loop; for 'ovr' one column per class.
            For two classes it is compared elementwise against zero.
        df_type: decision function type, 'ovo' or 'ovr'
            (only consulted when n_classes > 2)
        n_classes: number of integer classes
        use_df_sums: augment the 'ovo' vote counts with
            decision function values (useful for tie breaks)

        ---Returns---
        predicted_class: predicted integer class (classes are numbered from 1)

        ---Raises---
        ValueError: if n_classes > 2 and df_type is neither 'ovo' nor 'ovr'
    """

    # Approximation to the number of classes, should be valid up to at least 1M
    #n_classes = int(np.sqrt(2*df.shape[-1])) + 1

    if n_classes > 2:
        if df_type == 'ovo':
            vote_matrix = np.zeros((df.shape[0], n_classes))
            df_sum = np.zeros((df.shape[0], n_classes))

            # Predicted class determined by majority vote:
            # a positive value votes for class i, anything else for class j
            col_idx = 0
            for i in range(0, n_classes):
                for j in range(i + 1, n_classes):
                    col_train = df[:, col_idx]
                    vote_matrix[col_train > 0, i] += 1
                    vote_matrix[col_train <= 0, j] += 1

                    # Add value of decision function
                    if use_df_sums:
                        df_sum[:, i] += col_train
                        df_sum[:, j] -= col_train

                    col_idx += 1

            # scikit-learn transformation from 'ovo' to 'ovr':
            # the summed decision values are squashed into (-1/3, 1/3) so that
            # they can break ties but never outweigh a whole vote
            if use_df_sums:
                transformed_df_sum = df_sum / (3 * (np.abs(df_sum) + 1))
                vote_matrix += transformed_df_sum

            predicted_class = np.argmax(vote_matrix, axis=1) + 1

        elif df_type == 'ovr':

            # Predicted class determined by largest value of the decision function
            predicted_class = np.argmax(df, axis=1) + 1

        else:
            # Fail loudly: previously this only printed a message and then
            # crashed on the return statement with an unbound local variable
            raise ValueError("Error: invalid decision function. Use 'ovo' or 'ovr'")
    else:
        predicted_class = np.zeros(df.shape[0], dtype=int)

        # This appears to be the convention, which is "opposite" of that above
        # Default exactly zero decision function value to the "positive" class
        predicted_class[df >= 0] = 2
        predicted_class[df < 0] = 1

    return predicted_class

# Load and split data

In [None]:
# Read the DEEM train and test structure indices from their index files
deem_idx_files = ('../Processed_Data/DEEM_10k/train.idxs',
                  '../Processed_Data/DEEM_10k/test.idxs')
idxs_deem_train, idxs_deem_test = (np.loadtxt(path, dtype=int) for path in deem_idx_files)

# Number of DEEM structures in each split and in total
n_deem_train, n_deem_test = idxs_deem_train.size, idxs_deem_test.size
n_deem = n_deem_train + n_deem_test

In [None]:
# Read the SOAP hyperparameters and pull out the interaction cutoffs,
# which the cells below iterate over
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.loads(f.read())

cutoffs = soap_hyperparameters['interaction_cutoff']

In [None]:
# Read the IZA canton labels (second column of the cantons file)
cantons_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)

# RWY is the position of the first entry labelled with canton 4;
# it is dropped from the labels and the remaining entries are counted
RWY = np.flatnonzero(cantons_iza == 4)[0]
cantons_iza = np.delete(cantons_iza, RWY)
n_iza = cantons_iza.shape[0]

In [None]:
# Replace the loaded canton labels: every IZA structure gets class 1
cantons_iza = np.full(n_iza, 1, dtype=int)

In [None]:
# Select IZA sample
# (will be overwritten if we load a kernel)
# Half of the IZA structures (rounded down) go to the train set,
# the remainder to the test set
n_iza_train = n_iza // 2
n_iza_test = n_iza - n_iza_train

# Use a dedicated, seeded generator so that the random split is the same
# every time the notebook is run from a fresh kernel, and so that the
# global NumPy random state is left untouched
IZA_SPLIT_SEED = 0
idxs_iza = np.random.RandomState(IZA_SPLIT_SEED).permutation(n_iza)

idxs_iza_train = idxs_iza[:n_iza_train]
idxs_iza_test = idxs_iza[n_iza_train:]

In [None]:
# Dummy DEEM canton labels: every DEEM structure gets class 2
cantons_deem = np.full(n_deem, 2, dtype=int)

# Load SOAPs and build kernels

In [None]:
# Flag to recompute existing kernels:
# when True, a saved kernel file is deleted before the load is attempted,
# so the kernels are rebuilt from the SOAP vectors instead of read from disk
remove_kernels = False

In [None]:
# Per-cutoff containers (keyed by SOAP cutoff) for the train-train,
# test-train and test-test kernels ...
K_train, K_test, K_test_test = dict(), dict(), dict()

# ... and for the kernel settings that go with them
kernel_type, gamma = dict(), dict()

In [None]:
# For every cutoff: read the saved structure kernels if the kernel file can
# be opened, otherwise build them from the SOAP vectors and save them
for cutoff in cutoffs:
    kernel_file = f'../Processed_Data/Models/{cutoff}/structure_kernels.hdf5'

    # Start fresh
    if remove_kernels and os.path.exists(kernel_file):
        os.remove(kernel_file)

    # Load the kernels if they exist
    try:
        # The context manager closes the file even when a dataset or
        # attribute is missing and the read fails part-way through
        with h5py.File(kernel_file, 'r') as f:
            K_train[cutoff] = f['K_train'][:]
            K_test[cutoff] = f['K_test'][:]
            K_test_test[cutoff] = f['K_test_test'][:]
            kernel_type[cutoff] = f.attrs['kernel_type']
            gamma[cutoff] = f.attrs['gamma']

            # Don't need to store indices in a dictionary
            # since they are the same for all cutoffs
            # (these replace the indices selected earlier in the notebook)
            idxs_iza_train = f.attrs['idxs_iza_train']
            idxs_iza_test = f.attrs['idxs_iza_test']
            idxs_deem_train = f.attrs['idxs_deem_train']
            idxs_deem_test = f.attrs['idxs_deem_test']

    # Compute the kernels if they don't exist
    except OSError:

        # Load kernel parameters
        model_file = f'../Processed_Data/Models/{cutoff}/volumes_mae_parameters.json'

        with open(model_file, 'r') as f:
            model_dict = json.load(f)

        kernel_type[cutoff] = model_dict['kernel_type']
        gamma[cutoff] = model_dict['gamma']

        # Load SOAPs
        deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5'
        deem_soaps = load_structures_from_hdf5(deem_file, datasets=None, concatenate=False)

        iza_file = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/soaps.hdf5'
        iza_soaps = load_structures_from_hdf5(iza_file, datasets=None, concatenate=False)

        # Drop RWY so that the IZA indices line up with `cantons_iza`,
        # from which the same entry was deleted
        iza_soaps.pop(RWY)

        # Build the collection of soap vectors
        # for the "master" kernel
        deem_train = [deem_soaps[i] for i in idxs_deem_train]
        deem_test = [deem_soaps[i] for i in idxs_deem_test]
        iza_train = [iza_soaps[i] for i in idxs_iza_train]
        iza_test = [iza_soaps[i] for i in idxs_iza_test]

        # Build "master" kernel between all DEEM and all IZA
        # (IZA structures come first, followed by the DEEM structures)
        K_train[cutoff] = build_kernel(iza_train+deem_train, iza_train+deem_train,
                                       kernel=kernel_type[cutoff], gamma=gamma[cutoff])
        K_test[cutoff] = build_kernel(iza_test+deem_test, iza_train+deem_train,
                                      kernel=kernel_type[cutoff], gamma=gamma[cutoff])
        K_test_test[cutoff] = build_kernel(iza_test+deem_test, iza_test+deem_test,
                                           kernel=kernel_type[cutoff], gamma=gamma[cutoff])

        # Save kernels for later, together with the indices and
        # kernel settings they were built with
        with h5py.File(kernel_file, 'w') as g:
            g.create_dataset('K_train', data=K_train[cutoff])
            g.create_dataset('K_test', data=K_test[cutoff])
            g.create_dataset('K_test_test', data=K_test_test[cutoff])

            g.attrs['idxs_iza_train'] = idxs_iza_train
            g.attrs['idxs_iza_test'] = idxs_iza_test
            g.attrs['idxs_deem_train'] = idxs_deem_train
            g.attrs['idxs_deem_test'] = idxs_deem_test
            g.attrs['kernel_type'] = kernel_type[cutoff]
            g.attrs['gamma'] = gamma[cutoff]

In [None]:
# Replace the kernels with linear kernels built on per-structure mean SOAPs
def _mean_soaps(soaps, idxs):
    """Stack the axis-0 mean of `soaps[i]` for every index `i` in `idxs`."""
    return np.vstack([np.mean(soaps[i], axis=0) for i in idxs])


for cutoff in cutoffs:

    # Read the DEEM and IZA SOAPs for this cutoff; RWY is removed from IZA
    deem_file = f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps.hdf5'
    iza_file = f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/soaps.hdf5'
    deem_soaps = load_structures_from_hdf5(deem_file, datasets=None, concatenate=False)
    iza_soaps = load_structures_from_hdf5(iza_file, datasets=None, concatenate=False)
    iza_soaps.pop(RWY)

    # One averaged SOAP vector per selected structure
    deem_train = _mean_soaps(deem_soaps, idxs_deem_train)
    deem_test = _mean_soaps(deem_soaps, idxs_deem_test)
    iza_train = _mean_soaps(iza_soaps, idxs_iza_train)
    iza_test = _mean_soaps(iza_soaps, idxs_iza_test)

    # "Master" feature matrices: IZA rows first, then DEEM rows
    X_train = np.vstack((iza_train, deem_train))
    X_test = np.vstack((iza_test, deem_test))

    # "Master" linear kernels between all DEEM and all IZA
    K_train[cutoff] = linear_kernel(X_train, X_train, zeta=1)
    K_test[cutoff] = linear_kernel(X_test, X_train, zeta=1)
    K_test_test[cutoff] = linear_kernel(X_test, X_test, zeta=1)

    # We can also do this, but it is slow
    #K_train[cutoff] = build_kernel(iza_train+deem_train, iza_train+deem_train, 
    #                               kernel='linear', zeta=1)
    #K_test[cutoff] = build_kernel(iza_test+deem_test, iza_train+deem_train, 
    #                              kernel='linear', zeta=1)
    #K_test_test[cutoff] = build_kernel(iza_test+deem_test, iza_test+deem_test, 
    #                                   kernel='linear', zeta=1)

In [None]:
# Save IZA indices for later
# (we do this after the kernel loading to make sure that if an existing
# kernel is loaded, the associated indices don't get overwritten)
#np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/train.idxs', idxs_iza_train, fmt='%d')
#np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/test.idxs', idxs_iza_test, fmt='%d')

In [None]:
# Combined canton labels for the "master" train and test sets,
# IZA labels first and DEEM labels after them
cantons_train = np.concatenate([cantons_iza[idxs_iza_train], cantons_deem[idxs_deem_train]])
cantons_test = np.concatenate([cantons_iza[idxs_iza_test], cantons_deem[idxs_deem_test]])

# The largest training label is used as the number of classes
n_classes = cantons_train.max()

In [None]:
# Center both kernels against the (uncentered) training kernel, then divide
# them by the mean diagonal element of the centered training kernel
for cutoff in cutoffs:
    k_reference = K_train[cutoff]

    # The test kernel must be centered before the training kernel is replaced
    k_test_centered = center_kernel_fast(K_test[cutoff], K_ref=k_reference)
    k_train_centered = center_kernel_fast(k_reference)

    K_scale = np.trace(k_train_centered) / k_train_centered.shape[0]
    k_test_centered /= K_scale
    k_train_centered /= K_scale

    K_test[cutoff] = k_test_centered
    K_train[cutoff] = k_train_centered

# SVM on full test and train sets

In [None]:
# Use a different SVM regularization than the optimal
# (None keeps the 'C' value read from each cutoff's ksvc_parameters.json)
C_override = None

In [None]:
# Fit a precomputed-kernel SVC per cutoff and write its decision functions
# and class predictions back out per dataset (DEEM and IZA)
for cutoff in cutoffs:

    # Stored SVC settings for this cutoff
    model_dir = f'../Processed_Data/Models/{cutoff}'
    with open(f'{model_dir}/ksvc_parameters.json', 'r') as f:
        model_dict = json.load(f)

    # An explicit override wins over the stored regularization
    C = model_dict['C'] if C_override is None else C_override

    # Assemble kernels
    k_train, k_test = K_train[cutoff], K_test[cutoff]

    # SVC
    svc = SVC(kernel='precomputed', decision_function_shape=model_dict['df_type'],
              class_weight=model_dict['class_weight'], C=C)
    svc.fit(k_train, cantons_train)

    df_train, df_test = svc.decision_function(k_train), svc.decision_function(k_test)
    predicted_cantons_train, predicted_cantons_test = svc.predict(k_train), svc.predict(k_test)

    # Storage for the decision functions: a flat array for two classes,
    # otherwise one column per decision function
    if n_classes == 2:
        deem_df_shape, iza_df_shape = (n_deem,), (n_iza,)
    else:
        n_df = n_classes * (n_classes - 1) // 2 if model_dict['df_type'] == 'ovo' else n_classes
        deem_df_shape, iza_df_shape = (n_deem, n_df), (n_iza, n_df)

    df_deem = np.zeros(deem_df_shape)
    df_iza = np.zeros(iza_df_shape)

    # Rows of the train/test results: IZA structures first, DEEM after them
    iza_rows_train, deem_rows_train = slice(0, n_iza_train), slice(n_iza_train, None)
    iza_rows_test, deem_rows_test = slice(0, n_iza_test), slice(n_iza_test, None)

    # Save decision functions, placed back at the original structure indices
    df_deem[idxs_deem_train] = df_train[deem_rows_train]
    df_deem[idxs_deem_test] = df_test[deem_rows_test]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat', df_deem)

    df_iza[idxs_iza_train] = df_train[iza_rows_train]
    df_iza[idxs_iza_test] = df_test[iza_rows_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat', df_iza)

    # Save KSVC class predictions in the same way
    predicted_cantons_deem = np.zeros(n_deem)
    predicted_cantons_deem[idxs_deem_train] = predicted_cantons_train[deem_rows_train]
    predicted_cantons_deem[idxs_deem_test] = predicted_cantons_test[deem_rows_test]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_cantons.dat',
               predicted_cantons_deem, fmt='%d')

    predicted_cantons_iza = np.zeros(n_iza)
    predicted_cantons_iza[idxs_iza_train] = predicted_cantons_train[iza_rows_train]
    predicted_cantons_iza[idxs_iza_test] = predicted_cantons_test[iza_rows_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_cantons.dat', 
               predicted_cantons_iza, fmt='%d')

# Regress full (averaged) SOAP on Linear SVM

In [None]:
# TODO: should probably just do this "legitimately" with a linear SVM on the full average SOAPs
# instead of regressing on the linear kernel SVM
# Could also convert from the dual to the primal weights, 
# but based on the sk-learn implementation this seems a bit messy,
# and it is still probably best just to do the linear SVM anyway.
# For a quick first pass though, we'll regress on the decision function of a linear kernel SVM
# built on the FPS'ed SOAPs

In [None]:
# For every cutoff: regress the saved KSVC decision functions on the full
# averaged SOAP vectors and turn the regression weights into a density.
# Note that `density` is overwritten on every pass, so only the result
# for the last cutoff is available after the loop.
for cutoff in cutoffs:

    # Load decision functions
    df_deem = np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')
    df_iza = np.loadtxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')

    df_deem_train = df_deem[idxs_deem_train]
    df_deem_test = df_deem[idxs_deem_test]

    df_iza_train = df_iza[idxs_iza_train]
    df_iza_test = df_iza[idxs_iza_test]

    # Same ordering as the kernels: IZA first, then DEEM
    df_train = np.concatenate((df_iza_train, df_deem_train))
    df_test = np.concatenate((df_iza_test, df_deem_test))

    # Center and scale decision functions
    # (statistics come from the train set only and are applied to both sets)
    df_center = np.mean(df_train, axis=0)
    df_train -= df_center
    df_test -= df_center

    if df_train.ndim == 1:
        df_scale = np.linalg.norm(df_train) / np.sqrt(df_train.size)
    else:
        df_scale = np.linalg.norm(df_train, axis=0) / np.sqrt(df_train.shape[0] / df_train.shape[1])

    df_train /= df_scale
    df_test /= df_scale

    # Load SOAPs
    soaps_deem = load_structures_from_hdf5(f'../Processed_Data/DEEM_10k/Data/{cutoff}/soaps_full_avg.hdf5',
                                           datasets=None, concatenate=True)
    soaps_iza = load_structures_from_hdf5(f'../Processed_Data/IZA_226/Data/{cutoff}/soaps_full_avg.hdf5',
                                          datasets=None, concatenate=True)

    # The IZA indices refer to the IZA set with RWY removed (RWY is deleted
    # from `cantons_iza` and popped from the SOAP lists used for the kernels).
    # If the averaged SOAPs still contain the RWY row, drop it here as well;
    # otherwise every structure after RWY would be paired with the decision
    # function of its neighbour.
    # NOTE(review): assumes one row per structure in the loaded array and the
    # same structure ordering as the IZA_226onDEEM_10k files -- confirm
    if soaps_iza.shape[0] == n_iza + 1:
        soaps_iza = np.delete(soaps_iza, RWY, axis=0)

    soaps_deem_train = soaps_deem[idxs_deem_train]
    soaps_deem_test = soaps_deem[idxs_deem_test]

    soaps_iza_train = soaps_iza[idxs_iza_train]
    soaps_iza_test = soaps_iza[idxs_iza_test]

    soaps_train = np.concatenate((soaps_iza_train, soaps_deem_train))
    soaps_test = np.concatenate((soaps_iza_test, soaps_deem_test))

    # Center and scale the SOAPs with train-set statistics
    soaps_center = np.mean(soaps_train, axis=0)
    soaps_train -= soaps_center
    soaps_test -= soaps_center

    soaps_scale = np.linalg.norm(soaps_train, axis=0) / np.sqrt(soaps_train.shape[0] / soaps_train.shape[1])
    soaps_train /= soaps_scale
    soaps_test /= soaps_scale

    # Linear regression on decision functions
    lr = LR(regularization=1.0E-12)
    lr.fit(soaps_train, df_train)

    # Test that the prediction is good
    # (mean absolute error on the train and test sets)
    dfp_train = lr.transform(soaps_train)
    dfp_test = lr.transform(soaps_test)

    print(np.mean(np.abs(dfp_train - df_train), axis=0))
    print(np.mean(np.abs(dfp_test - df_test), axis=0))

    # Extract weights
    w = lr.W.T
    print(w.shape)

    # Compute LR weight density
    # TODO: set n_pairs, n_max, l_max in a robust way, but for now hard-code the hyperparameters
    w = reshape_soaps(w, 3, 12, 9)
    density = compute_soap_density(12, 9, cutoff, w,
                                   np.linspace(0, cutoff, 50),
                                   np.linspace(-1, 1, 50),
                                   chunk_size_r=10, chunk_size_p=50)

In [None]:
# Inspect the shape of the weight density; `density` is reassigned on every
# pass of the regression loop, so this is the result for the last cutoff
density.shape

In [None]:
# Plot
import plotly.graph_objects as go

# `density` holds the result for the last cutoff processed by the regression
# loop, where it was evaluated on 50 radial points in [0, cutoff] and 50
# points in [-1, 1]. Build the plotting grid from that same cutoff rather
# than from a hard-coded radius, so the axes always match the data.
r_max = cutoffs[-1]
r_grid = np.linspace(0, r_max, 50)
rx_grid, ry_grid, tz_grid = np.meshgrid(r_grid,
                                        r_grid,
                                        np.linspace(-1, 1, 50))

# Volume rendering of the first density block
# NOTE(review): isomin=1000 is an unexplained threshold -- confirm it suits the data
fig = go.Figure(data=go.Volume(x=rx_grid.flatten(),
                               y=ry_grid.flatten(),
                               z=tz_grid.flatten(),
                               value=density[0][0].flatten(),
                               isomin=1000,
                               isomax=None,
                               opacity=0.2,
                               surface_count=20))
fig.show()

# Check that decision functions can be predicted with KRR

In [None]:
# For every cutoff: fit KRR on the saved KSVC decision functions and print
# the mean absolute error of its predictions on the train and test sets
for cutoff in cutoffs:

    # Assemble kernels
    k_train, k_test = K_train[cutoff], K_test[cutoff]

    # Read back the decision functions written by the SVC cell
    df_deem = np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')
    df_iza = np.loadtxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')

    df_deem_train, df_deem_test = df_deem[idxs_deem_train], df_deem[idxs_deem_test]
    df_iza_train, df_iza_test = df_iza[idxs_iza_train], df_iza[idxs_iza_test]

    # IZA rows first, DEEM rows after them (same ordering as the kernels)
    df_train = np.concatenate([df_iza_train, df_deem_train])
    df_test = np.concatenate([df_iza_test, df_deem_test])

    # Remove the train-set mean from both sets
    df_center = df_train.mean(axis=0)
    df_train -= df_center
    df_test -= df_center

    # Scale factor from the train set: one value for a flat decision
    # function, one per column otherwise
    if df_train.ndim != 1:
        df_scale = np.linalg.norm(df_train, axis=0) / np.sqrt(df_train.shape[0] / df_train.shape[1])
    else:
        df_scale = np.linalg.norm(df_train) / np.sqrt(df_train.size)

    df_train /= df_scale
    df_test /= df_scale

    # Test KRR on decision functions
    # NOTE: KRR can't predict the test set
    # decision function very well -- why?

#     krr = KernelRidge(alpha=1.0E-12, kernel='precomputed')
#     krr.fit(k_train, df_krr_train)
#     dfp_krr_train = krr.predict(k_train)
#     dfp_krr_test = krr.predict(k_test)

    krr = KRR(regularization=1.0E-12)
    krr.fit(k_train, df_train)
    dfp_train, dfp_test = krr.transform(k_train), krr.transform(k_test)

    # Mean absolute errors (train, then test)
    print(np.mean(np.abs(dfp_train - df_train), axis=0))
    print(np.mean(np.abs(dfp_test - df_test), axis=0))

# KPCovR on full test and train sets

In [None]:
# KPCovR overrides: each value below replaces the corresponding entry read
# from kpcovr_parameters.json; set an override to None to use the stored value

# Use a different number of components than that used for the optimization
n_components_override = 6

# Use an alpha other than the optimal
alpha_override = 0.0

# Use a regularization other than the optimal
regularization_override = 1.0E-12

In [None]:
# For every cutoff: fit KPCovR on the saved KSVC decision functions, then
# save the projections, the model, the predicted decision functions and the
# resulting class predictions per dataset (DEEM and IZA)
for cutoff in cutoffs:

    model_dir = f'../Processed_Data/Models/{cutoff}'
    with open(f'{model_dir}/kpcovr_parameters.json', 'r') as f:
        model_dict = json.load(f)

    with open(f'{model_dir}/ksvc_parameters.json', 'r') as f:
        df_type = json.load(f)['df_type']

    # Assemble kernels
    k_train = K_train[cutoff]
    k_test = K_test[cutoff]

    # Load decision functions
    df_deem = np.loadtxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')
    df_iza = np.loadtxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat')

    df_deem_train = df_deem[idxs_deem_train]
    df_deem_test = df_deem[idxs_deem_test]

    df_iza_train = df_iza[idxs_iza_train]
    df_iza_test = df_iza[idxs_iza_test]

    # Same ordering as the kernels: IZA first, then DEEM
    df_train = np.concatenate((df_iza_train, df_deem_train))
    df_test = np.concatenate((df_iza_test, df_deem_test))

    # Center and scale decision functions
    # (statistics come from the train set only and are applied to both sets)
    df_center = np.mean(df_train, axis=0)
    df_train -= df_center
    df_test -= df_center

    if df_train.ndim == 1:
        df_scale = np.linalg.norm(df_train) / np.sqrt(df_train.size)
    else:
        df_scale = np.linalg.norm(df_train, axis=0) / np.sqrt(df_train.shape[0] / df_train.shape[1])
    df_train /= df_scale
    df_test /= df_scale

    # Set KPCovR parameters
    # (an override that is not None wins over the stored value)
    if n_components_override is not None:
        n_components = n_components_override
    else:
        n_components = model_dict['n_components']

    if alpha_override is not None:
        alpha = alpha_override
    else:
        alpha = model_dict['alpha']

    if regularization_override is not None:
        regularization = regularization_override
    else:
        regularization = model_dict['regularization']

#     kpcovr = KPCovR2(n_components=n_components, kernel='precomputed',
#                      mixing=alpha,
#                      krr_params=dict(alpha=regularization))
#     kpcovr.fit(k_train, y_train)

#     T_train[cutoff] = kpcovr.transform(k_train)
#     yp_train[cutoff] = kpcovr.predict(k_train) 
#     T_test[cutoff] = kpcovr.transform(k_test) 
#     yp_test[cutoff] = kpcovr.predict(k_test)

    kpcovr = KPCovR(n_components=n_components,
                    alpha=alpha,
                    regularization=regularization)
    kpcovr.fit(k_train, df_train)

    T_train = kpcovr.transform_K(k_train)
    dfp_train = kpcovr.transform_Y(k_train)
    T_test = kpcovr.transform_K(k_test)
    dfp_test = kpcovr.transform_Y(k_test)

    dfp_train = np.squeeze(dfp_train) # TODO: move the squeezing to the KPCovR function
    dfp_test = np.squeeze(dfp_test)

    # Save KPCovR projections
    # (one dataset per structure, named by its zero-padded index)
    n_digits_deem = len(str(n_deem - 1))
    T_deem = np.zeros((n_deem, n_components)) # TODO: change this so just 1 df for 2-class
    T_deem[idxs_deem_train] = T_train[n_iza_train:]
    T_deem[idxs_deem_test] = T_test[n_iza_test:]

    # The context manager closes the file even if a write fails
    with h5py.File(f'../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structures.hdf5', 'w') as g:
        for tdx, t in enumerate(T_deem):
            g.create_dataset(str(tdx).zfill(n_digits_deem), data=t)

        g.attrs['n_components'] = n_components
        g.attrs['alpha'] = alpha
        g.attrs['regularization'] = regularization

    n_digits_iza = len(str(n_iza - 1))
    T_iza = np.zeros((n_iza, n_components))
    T_iza[idxs_iza_train] = T_train[0:n_iza_train]
    T_iza[idxs_iza_test] = T_test[0:n_iza_test]

    with h5py.File(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structures.hdf5', 'w') as g:
        for tdx, t in enumerate(T_iza):
            g.create_dataset(str(tdx).zfill(n_digits_iza), data=t)

        g.attrs['n_components'] = n_components
        g.attrs['alpha'] = alpha
        g.attrs['regularization'] = regularization

    # Save the model as JSON
    # Copy the dict so we can make the numpy arrays lists
    kpcovr_dict = kpcovr.__dict__.copy()

    # Convert arrays to lists (JSON cannot store numpy arrays)
    for k, v in kpcovr_dict.items():
        if isinstance(v, np.ndarray):
            kpcovr_dict[k] = v.tolist()

    # Save
    with open(f'{model_dir}/kpcovr.json', 'w') as f:
        json.dump(kpcovr_dict, f)

    # Rescale to raw decision function
    # (undo the centering and scaling applied before the fit)
    dfp_train = dfp_train * df_scale + df_center
    dfp_test = dfp_test * df_scale + df_center

    # Predict classes based on KPCovRized decision functions
    predicted_cantons_train = df_to_class(dfp_train, df_type, n_classes, use_df_sums=True)
    predicted_cantons_test = df_to_class(dfp_test, df_type, n_classes, use_df_sums=True)

    # Save KPCovR decision function predictions
    # (a flat array for two classes, otherwise one column per decision function)
    if n_classes == 2:
        df_deem = np.zeros(n_deem)
        df_iza = np.zeros(n_iza)
    else:
        if df_type == 'ovo':
            n_df = n_classes * (n_classes - 1) // 2
        else:
            n_df = n_classes

        df_deem = np.zeros((n_deem, n_df))
        df_iza = np.zeros((n_iza, n_df))

    df_deem[idxs_deem_train] = dfp_train[n_iza_train:]
    df_deem[idxs_deem_test] = dfp_test[n_iza_test:]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structure_dfs.dat', df_deem)

    df_iza[idxs_iza_train] = dfp_train[0:n_iza_train]
    df_iza[idxs_iza_test] = dfp_test[0:n_iza_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structure_dfs.dat', df_iza)

    # Save KPCovR class predictions
    predicted_cantons_deem = np.zeros(n_deem)
    predicted_cantons_deem[idxs_deem_train] = predicted_cantons_train[n_iza_train:]
    predicted_cantons_deem[idxs_deem_test] = predicted_cantons_test[n_iza_test:]
    np.savetxt(f'../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structure_cantons.dat',
               predicted_cantons_deem, fmt='%d')

    predicted_cantons_iza = np.zeros(n_iza)
    predicted_cantons_iza[idxs_iza_train] = predicted_cantons_train[0:n_iza_train]
    predicted_cantons_iza[idxs_iza_test] = predicted_cantons_test[0:n_iza_test]
    np.savetxt(f'../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structure_cantons.dat', 
               predicted_cantons_iza, fmt='%d')