In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ML
from regression import PCovR
from soap import extract_species_pair_groups

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.linear_model import Ridge

# Utilities
import h5py
import json
import itertools
from copy import deepcopy
from tqdm.notebook import tqdm
import project_utils as utils
from tools import load_json, save_json

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

# Apply the 'article' style from the COSMO plotting toolkit to all figures
cosmostyle.set_style('article')
# Keep a handle to the color cycle exposed by the style module
colorList = cosmostyle.color_cycle

In [5]:
# Extend the module search path before importing the loss helpers.
# NOTE(review): these are absolute, machine-specific paths (presumably where
# `helpers` lives); they must be adjusted to run the notebook elsewhere.
sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/kernel-tutorials')
sys.path.append('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/KernelPCovR/analysis/scripts')
# from utilities.sklearn_covr.kpcovr import KernelPCovR as KPCovR2
# from utilities.sklearn_covr.pcovr import PCovR as PCovR2
# Loss functions wrapped by the compute_*_losses functions defined below
from helpers import l_regr, l_kpcovr, l_proj

In [None]:
# TODO: custom scorer for PCovR with loss functions

# Functions

In [6]:
def compute_regression_losses(y_train, y_test, yp_train, yp_test):
    """Sum the regression losses of the train and test sets.

    Each (target, prediction) pair is passed to `l_regr` and everything it
    returns is summed into a single number (the original comment describes
    the summed entries as canton-wise losses).

    Parameters
    ----------
    y_train, y_test
        Reference targets of the train and test sets.
    yp_train, yp_test
        Predicted targets of the train and test sets.

    Returns
    -------
    tuple
        `(lr_train, lr_test)`, the summed train and test regression losses.
    """
    # The test loss is evaluated first, then the train loss
    splits = {
        'test': (y_test, yp_test),
        'train': (y_train, yp_train),
    }
    totals = {
        split: np.sum(l_regr(reference, predicted))
        for split, (reference, predicted) in splits.items()
    }

    return totals['train'], totals['test']

def compute_kernel_projection_losses(k_train, k_test, k_test_test, t_train, t_test):
    """Sum the kernel projection losses of the train and test sets.

    Both losses are obtained from `l_kpcovr`: the test loss is computed with
    the test kernels supplied, the train loss with the test kernels omitted.

    Parameters
    ----------
    k_train
        Kernel passed to `l_kpcovr` as `k_train`.
    k_test, k_test_test
        Kernels passed to `l_kpcovr` as `k_test` and `k_test_test`
        (test loss only).
    t_train, t_test
        Projections passed to `l_kpcovr` as `t_train` and `t_test`.

    Returns
    -------
    tuple
        `(lp_train, lp_test)`, the summed train and test projection losses.
    """
    # Arguments common to both calls.
    # NOTE(review): `t_test` is also passed for the train loss -- confirm
    # this is what `l_kpcovr` expects when no test kernels are given.
    shared_kwargs = dict(k_train=k_train, t_train=t_train, t_test=t_test)

    # Sum over all entries of the returned losses, test set first
    lp_test = np.sum(l_kpcovr(k_test=k_test, k_test_test=k_test_test,
                              **shared_kwargs))
    lp_train = np.sum(l_kpcovr(**shared_kwargs))

    return lp_train, lp_test

def compute_linear_projection_losses(x_train, x_test, xr_train, xr_test):
    """Sum the linear projection losses of the train and test sets.

    Each feature matrix is passed to `l_proj` with its counterpart given as
    the `xr` keyword, and everything `l_proj` returns is summed.

    Parameters
    ----------
    x_train, x_test
        Original features of the train and test sets.
    xr_train, xr_test
        Counterparts of the features passed to `l_proj` as `xr`
        (presumably the reconstructed features -- confirm against `l_proj`).

    Returns
    -------
    tuple
        `(lp_train, lp_test)`, the summed train and test projection losses.
    """
    # Evaluated lazily in order: train first, then test
    feature_pairs = ((x_train, xr_train), (x_test, xr_test))
    lp_train, lp_test = (
        np.sum(l_proj(features, xr=counterpart))
        for features, counterpart in feature_pairs
    )

    return lp_train, lp_test

def load_pcovr_losses(filename, alphas, regularizations, dtype):
    """Load PCovR optimization losses and arrange them on the search grid.

    The file is read with `np.loadtxt` into a structured array with the
    fields 'alpha', 'regularization', 'lr_train', 'lr_test', 'lp_train' and
    'lp_test'. The four loss fields are averaged over their second axis and
    reshaped to `(len(alphas), len(regularizations))`.

    Parameters
    ----------
    filename
        File containing one record per (alpha, regularization) pair.
    alphas, regularizations
        Search grids; only their lengths are used, to fix the matrix shape.
    dtype
        Structured dtype describing one record of the file.

    Returns
    -------
    tuple
        `(alphas, opt_reg_idx, lr_train_matrix, lr_test_matrix,
        lp_train_matrix, lp_test_matrix)`, where `alphas` is the first column
        of the reshaped 'alpha' field and `opt_reg_idx` is the column index
        of the minimum of `lr_test_matrix + lp_test_matrix`.
    """
    grid_shape = (len(alphas), len(regularizations))
    records = np.loadtxt(filename, dtype=dtype)

    def on_grid(values):
        # Arrange one value per record on the (alpha, regularization) grid
        return np.reshape(values, grid_shape)

    def averaged_on_grid(field):
        # Average a loss field over its second axis, then put it on the grid
        return on_grid(np.mean(records[field], axis=1))

    alpha_grid = on_grid(records['alpha'])
    # The regularization grid is not used further; the reshape is kept so a
    # file that does not match the grid fails at the same point as before
    on_grid(records['regularization'])

    lr_train_matrix, lr_test_matrix, lp_train_matrix, lp_test_matrix = (
        averaged_on_grid(field)
        for field in ('lr_train', 'lr_test', 'lp_train', 'lp_test')
    )

    # Column (regularization) index of the smallest total test loss
    total_test_loss = lr_test_matrix + lp_test_matrix
    _, opt_reg_idx = np.unravel_index(np.argmin(total_test_loss),
                                      lr_test_matrix.shape)

    return (alpha_grid[:, 0], opt_reg_idx,
            lr_train_matrix, lr_test_matrix, lp_train_matrix, lp_test_matrix)

# Load train and test splits

In [7]:
# Load SOAP cutoffs
# The hyperparameters are stored as JSON; only the 'interaction_cutoff'
# entry is extracted here
with open('../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.load(f)
    
# Iterated over below to build the per-cutoff model and data paths
cutoffs = soap_hyperparameters['interaction_cutoff']

In [8]:
# Load train set and CV indices for Deem
# Both files are read as integer arrays; the CV indices are later shifted by
# the number of IZA training structures and stacked below the IZA CV indices
deem_train_idxs = np.loadtxt('../Processed_Data/DEEM_330k/train.idxs', dtype=int)
deem_cv_idxs = np.loadtxt('../Processed_Data/DEEM_330k/cv_2.idxs', dtype=int)

In [None]:
# Load train set and CV indices for IZA
# Both files are read as integer arrays; the train indices are used below to
# select the canton labels of the IZA training structures
iza_train_idxs = np.loadtxt('../Processed_Data/IZA_226/train.idxs', dtype=int)
iza_cv_idxs = np.loadtxt('../Processed_Data/IZA_226/cv_2.idxs', dtype=int)

In [11]:
# Load IZA cantons
# The integer canton labels are read from the second column of the file
cantons_iza = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)
# Index of the first structure labelled with canton 4
# NOTE(review): the name suggests this is the RWY framework -- confirm
RWY = np.nonzero(cantons_iza == 4)[0][0]

# Remove that structure from the labels and count the remaining IZA structures
cantons_iza = np.delete(cantons_iza, RWY)
n_iza = len(cantons_iza)

In [13]:
# Build the "master" canton labels of the combined training set, keyed by the
# number of classes; the IZA structures come first, followed by the Deem ones
cantons = {
    # 4-class labels: IZA structures keep their own canton,
    # every Deem structure is assigned canton 4
    4: np.concatenate((
        cantons_iza[iza_train_idxs],
        np.full(len(deem_train_idxs), 4, dtype=int),
    )),
    # 2-class labels: 1 for every IZA structure, 2 for every Deem structure
    2: np.concatenate((
        np.full(len(iza_train_idxs), 1, dtype=int),
        np.full(len(deem_train_idxs), 2, dtype=int),
    )),
}

In [None]:
# Concatenate IZA and Deem CV idxs
# The Deem indices are shifted by the number of IZA training structures,
# matching the IZA-then-Deem order of the concatenated canton labels.
# NOTE(review): np.vstack stacks along the first axis, so both index arrays
# must have the same number of columns -- presumably one column per CV fold;
# confirm against the layout of the cv_2.idxs files.
cv_idxs = np.vstack((iza_cv_idxs, deem_cv_idxs + len(iza_train_idxs)))

# Model setup

In [22]:
# Root directory of the models; the per-cutoff work directories used below
# are built from it
model_dir = '../Processed_Data/Models'

# Dataset names and the directories of their processed data, from which the
# per-cutoff SOAP file names are built
deem_name = 'DEEM_330k'
iza_name = 'IZA_226'
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'

In [23]:
# TODO: parameter optimization handling with defaults dictated here

In [24]:
# Value used for the `n_components` entry of the PCovR keyword arguments
n_components = 2
# Keyword arguments collected for the PCovR models
# (a flat dict with the keys n_components, alpha and regularization)
pcovr_kwargs = dict(n_components=n_components, alpha=0.0, regularization=1.0E-12)

# Hyperparameter grids: 11 evenly spaced alphas between 0 and 1 ...
alphas = np.linspace(0.0, 1.0, 11)
# ... and logarithmically spaced regularizations.
# NOTE(review): with num=2 this grid holds only the two endpoints
# (1e-12 and 1e-1) -- confirm that such a coarse grid is intended.
regularizations = np.logspace(-12, -1, 2)

# Linear PCovR

In [26]:
# Linear model setup
# Number of species passed to extract_species_pair_groups
# (presumably O and Si, judging by the group labels below)
n_species = 2
# Labels of the feature groups for each spectrum type. They are paired
# positionally with the groups returned by extract_species_pair_groups, so
# their order has to match the order of those groups; the labels are also
# used as directory names for the per-group outputs.
group_names = {'power': ['OO', 'OSi', 'SiSi', 
                         'OO+OSi', 'OO+SiSi', 'OSi+SiSi',
                         'OO+OSi+SiSi'], 
               'radial': ['O', 'Si', 'O+Si']}

In [27]:
# Dataset names and processed-data directories used to locate the SOAP files.
# NOTE(review): these four assignments repeat, with identical values, the ones
# made in the "Model setup" section; this cell looks redundant.
deem_name = 'DEEM_330k'
iza_name = 'IZA_226'
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'

In [None]:
# Set up the linear PCovR optimization for every combination of SOAP cutoff,
# spectrum type, species pairing and number of classes.
# NOTE(review): this cell is still a work in progress. `soaps_train` is not
# defined anywhere in the visible notebook (the SOAP loading step below is a
# TODO), so the cell cannot run until that step is filled in.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/PCovR'

    for spectrum_type in tqdm(('power', 'radial'), desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # Load SOAPs
        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        # TODO: new SOAP loading with concatenation

        # Split the feature indices into one group per species pairing
        # (including combinations of pairings)
        n_features = soaps_train.shape[1]
        feature_groups = extract_species_pair_groups(n_features, n_species,
                                                     spectrum_type=spectrum_type,
                                                     combinations=True)

        # The group labels are paired positionally with the feature groups
        for species_pairing, feature_idxs in zip(tqdm(group_names[spectrum_type],
                                                      desc='Species', leave=False),
                                                 feature_groups):

            # Scale the SOAPs so they are of a 'usable' magnitude for the SVC
            # TODO: SOAP feature selection (before scaling!)
            # TODO: new SOAP preprocessing with pipeline

            for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                # Prepare inputs and outputs
                output_dir = f'{n_cantons}-Class/{spectrum_name}/{species_pairing}'

                pcovr_parameter_file = f'{work_dir}/{output_dir}/pcovr_parameters.json'
                # `pcovr_kwargs` is defined above as a flat dict of PCovR keyword
                # arguments, so indexing it with 'linear' raised a KeyError.
                # Accept the flat layout as well as one nested under a 'linear'
                # key, and work on a copy so the defaults stay untouched.
                pcovr_parameters = dict(pcovr_kwargs.get('linear', pcovr_kwargs))

                # Load decision functions
                svc_df_train_file = f'{work_dir}/{output_dir}/svc_structure_dfs_train_optimization.dat'
                svc_df_test_file = f'{work_dir}/{output_dir}/svc_structure_dfs_test_optimization.dat'

                # TODO: clean up this decision function structure, we don't need separate for optimization
                df_train = np.loadtxt(svc_df_train_file)
                df_test = np.loadtxt(svc_df_test_file)

                # TODO: preprocess decision functions with pipeline
                # TODO: gridsearchcv with pipeline
                # TODO: custom scorer
                # TODO: save CV run in JSON

## Check for optimization problems

In [None]:
# Plot the train and test losses as a function of alpha for every model, at
# the regularization index returned by load_pcovr_losses (the one with the
# smallest summed test loss), so that optimization problems can be spotted.
# NOTE(review): `dt_pcovr` (the record dtype of the loss files) is not defined
# in the visible part of the notebook -- confirm it exists before this cell runs.
for cutoff in cutoffs:
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/PCovR'

    for spectrum_type in ('power', 'radial'):
        spectrum_name = spectrum_type.capitalize()

        # Iterate over the species pairings directly. Zipping them with
        # `feature_groups` relied on a variable left over from the last
        # iteration of the optimization cell, which silently truncates the
        # loop whenever that list is shorter than the list of pairings; the
        # feature indices are not needed for plotting anyway.
        for species_pairing in group_names[spectrum_type]:

            for n_cantons in (2, 4):

                print(f'Cutoff: {cutoff}, Spectrum: {spectrum_name}, Species: {species_pairing}, {n_cantons}-Class')

                output_dir = f'{n_cantons}-Class/{spectrum_name}/{species_pairing}'

                loss_file = f'{work_dir}/{output_dir}/pcovr_optimization.dat'

                # Load losses. The alphas read from the file get their own name
                # so that the global `alphas` search grid is not overwritten.
                (loss_alphas, opt_reg_idx,
                 lr_train_matrix, lr_test_matrix,
                 lp_train_matrix, lp_test_matrix) = load_pcovr_losses(loss_file, alphas, regularizations, dt_pcovr)

                fig, (axs_loss_sum_train, axs_loss_sum_test) = plt.subplots(1, 2, figsize=(7.0, 3.5))

                # One panel per data split: regression loss, projection loss and
                # their sum (each summed over all cantons) against alpha
                panels = (
                    (axs_loss_sum_train, 'Train', lr_train_matrix, lp_train_matrix),
                    (axs_loss_sum_test, 'Test', lr_test_matrix, lp_test_matrix),
                )
                for axs, title, lr_matrix, lp_matrix in panels:
                    lr_opt = lr_matrix[:, opt_reg_idx]
                    lp_opt = lp_matrix[:, opt_reg_idx]

                    axs.semilogy(loss_alphas, lr_opt, 'o-', label='l_regr')
                    axs.semilogy(loss_alphas, lp_opt, 'o-', label='l_proj')
                    axs.semilogy(loss_alphas, lr_opt + lp_opt, 'o-', label='l_regr+l_proj')

                    axs.legend()
                    axs.set_title(title)
                    axs.set_xlabel('alpha')
                    axs.set_ylabel('loss')

                plt.show()
                # Release the figure explicitly; many figures are created in this loop
                plt.close(fig)