In [1]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')
sys.path.append('..')

# Maths
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ML
from sklearn.metrics import classification_report, confusion_matrix
from errors import MAE

# Utilities
import h5py
import json
from project_utils import load_structures_from_hdf5

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

# Activate the 'article' style preset from the COSMO plotting toolkit
cosmostyle.set_style('article')
# Keep the toolkit's color cycle under a short name
# NOTE(review): colorList is not used in the cells visible here — confirm it is needed later
colorList = cosmostyle.color_cycle

In /home/helfrech/.config/matplotlib/stylelib/cosmo.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/helfrech/.config/matplotlib/stylelib/cosmoLarge.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
  self[key] = other[key]


# Plot KPCovR projections and predicted classes

In [2]:
# Read the SOAP hyperparameter file and keep the list of interaction cutoffs;
# every per-cutoff loop further down iterates over `cutoffs`
with open('../../Processed_Data/soap_hyperparameters.json', 'r') as f:
    soap_hyperparameters = json.loads(f.read())

cutoffs = soap_hyperparameters['interaction_cutoff']

In [3]:
# DEEM train/test split: integer index arrays read from the two .idxs files
idxs_deem_train, idxs_deem_test = (
    np.loadtxt(idxs_file, dtype=int)
    for idxs_file in ('../../Processed_Data/DEEM_10k/train.idxs',
                      '../../Processed_Data/DEEM_10k/test.idxs')
)

# Split sizes; the DEEM total is taken to be train + test
n_deem_train, n_deem_test = idxs_deem_train.size, idxs_deem_test.size
n_deem = n_deem_train + n_deem_test

In [4]:
# IZA train/test split: integer index arrays read from the two .idxs files
idxs_iza_train, idxs_iza_test = (
    np.loadtxt(idxs_file, dtype=int)
    for idxs_file in ('../../Processed_Data/IZA_226onDEEM_10k/train.idxs',
                      '../../Processed_Data/IZA_226onDEEM_10k/test.idxs')
)

In [5]:
# Canton label of every IZA structure (second column of cantons.txt)
cantons_iza = np.loadtxt('../../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)

# Locate the first entry labelled canton 4 and remove it from the IZA labels.
# NOTE(review): presumably this is the RWY framework and the only canton-4 entry — confirm.
canton4_positions, = np.nonzero(cantons_iza == 4)
RWY = canton4_positions[0]
cantons_iza = np.delete(cantons_iza, RWY)

# Number of IZA structures left after the removal
n_iza = len(cantons_iza)

In [6]:
# Every DEEM structure gets the same placeholder canton label, 4
cantons_deem = np.full(n_deem, 4, dtype=int)

# Print classification statistics

In [7]:
# Reference ("master") canton labels per split, IZA entries first, then DEEM
cantons_train = np.concatenate([
    cantons_iza[idxs_iza_train],
    cantons_deem[idxs_deem_train],
])
cantons_test = np.concatenate([
    cantons_iza[idxs_iza_test],
    cantons_deem[idxs_deem_test],
])

# The class count is taken as the largest training label; this matches the
# number of classes only when labels are consecutive integers starting at 1
n_classes = cantons_train.max()

## SVM

In [8]:
for cutoff in cutoffs:
    print(f'===== {cutoff} =====')

    # Predicted canton per structure, read from the ksvc_* files for this cutoff
    predicted_cantons_iza = np.loadtxt(
        f'../../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_cantons.dat',
        dtype=int
    )
    predicted_cantons_deem = np.loadtxt(
        f'../../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_cantons.dat',
        dtype=int
    )

    # Join in the same order as the reference labels: IZA first, then DEEM
    predicted_cantons_train = np.concatenate([
        predicted_cantons_iza[idxs_iza_train],
        predicted_cantons_deem[idxs_deem_train],
    ])
    predicted_cantons_test = np.concatenate([
        predicted_cantons_iza[idxs_iza_test],
        predicted_cantons_deem[idxs_deem_test],
    ])

    print('----- Train -----')
    print(classification_report(cantons_train, predicted_cantons_train))
    matrix_train = confusion_matrix(cantons_train, predicted_cantons_train)
    # Accuracy: correctly classified samples (the diagonal) over all samples
    print('Train accuracy:', np.diag(matrix_train).sum() / matrix_train.sum())
    print(matrix_train)
    print('\n')

    print('----- Test -----')
    print(classification_report(cantons_test, predicted_cantons_test))
    matrix_test = confusion_matrix(cantons_test, predicted_cantons_test)
    print('Test accuracy:', np.diag(matrix_test).sum() / matrix_test.sum())
    print(matrix_test)
    print('\n')

===== 3.5 =====
----- Train -----
              precision    recall  f1-score   support

           1       0.67      0.38      0.48        21
           2       0.73      0.41      0.52        54
           3       1.00      0.27      0.43        37
           4       0.99      1.00      1.00      7750

    accuracy                           0.99      7862
   macro avg       0.85      0.51      0.61      7862
weighted avg       0.99      0.99      0.99      7862

Train accuracy: 0.9905876367336556
[[   8    3    0   10]
 [   4   22    0   28]
 [   0    3   10   24]
 [   0    2    0 7748]]


----- Test -----
              precision    recall  f1-score   support

           1       0.11      0.07      0.09        14
           2       0.63      0.25      0.36        68
           3       0.20      0.06      0.10        31
           4       0.97      1.00      0.98      2250

    accuracy                           0.96      2363
   macro avg       0.48      0.35      0.38      2363
weig

## KPCovR

In [9]:
# Choose how the IZA and DEEM decision functions are joined. The choice depends
# only on n_classes, so it is made once here instead of on every cutoff iteration.
# NOTE(review): presumably the two-class decision-function files load as 1D arrays
# and the multi-class ones as 2D (one column per decision function) — confirm.
if n_classes == 2:
    stacking_func = np.concatenate
else:
    stacking_func = np.vstack

for cutoff in cutoffs:
    print(f'===== {cutoff} =====')

    # Predicted canton per structure, read from the kpcovr_* files for this cutoff
    predicted_cantons_iza = np.loadtxt(
        f'../../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structure_cantons.dat',
        dtype=int
    )
    predicted_cantons_deem = np.loadtxt(
        f'../../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structure_cantons.dat',
        dtype=int
    )

    # Join in the same order as cantons_train / cantons_test: IZA first, then DEEM
    predicted_cantons_train = np.concatenate((predicted_cantons_iza[idxs_iza_train],
                                              predicted_cantons_deem[idxs_deem_train]))
    predicted_cantons_test = np.concatenate((predicted_cantons_iza[idxs_iza_test],
                                             predicted_cantons_deem[idxs_deem_test]))

    # Decision functions stored in the ksvc_* files (the reference for the MAE below)
    dfs_ksvc_iza = np.loadtxt(
        f'../../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat'
    )
    dfs_ksvc_deem = np.loadtxt(
        f'../../Processed_Data/DEEM_10k/Data/{cutoff}/ksvc_structure_dfs.dat'
    )

    # Decision functions stored in the kpcovr_* files
    dfs_kpcovr_iza = np.loadtxt(
        f'../../Processed_Data/IZA_226onDEEM_10k/Data/{cutoff}/kpcovr_structure_dfs.dat'
    )
    dfs_kpcovr_deem = np.loadtxt(
        f'../../Processed_Data/DEEM_10k/Data/{cutoff}/kpcovr_structure_dfs.dat'
    )

    # Assemble per-split decision functions, again IZA rows first, then DEEM
    dfs_ksvc_train = stacking_func((dfs_ksvc_iza[idxs_iza_train],
                                    dfs_ksvc_deem[idxs_deem_train]))
    dfs_ksvc_test = stacking_func((dfs_ksvc_iza[idxs_iza_test],
                                   dfs_ksvc_deem[idxs_deem_test]))

    dfs_kpcovr_train = stacking_func((dfs_kpcovr_iza[idxs_iza_train],
                                      dfs_kpcovr_deem[idxs_deem_train]))
    dfs_kpcovr_test = stacking_func((dfs_kpcovr_iza[idxs_iza_test],
                                     dfs_kpcovr_deem[idxs_deem_test]))

    print('----- Train -----')
    print(classification_report(cantons_train, predicted_cantons_train))
    matrix_train = confusion_matrix(cantons_train, predicted_cantons_train)
    # Accuracy: correctly classified samples (the diagonal) over all samples
    print('Train accuracy:', np.sum(np.diag(matrix_train)) / np.sum(matrix_train))
    print(matrix_train)
    # Error between the ksvc and kpcovr decision functions, as computed by MAE
    print('Train DF MAE:', MAE(dfs_ksvc_train, dfs_kpcovr_train))
    print('\n')

    print('----- Test -----')
    print(classification_report(cantons_test, predicted_cantons_test))
    matrix_test = confusion_matrix(cantons_test, predicted_cantons_test)
    print('Test accuracy:', np.sum(np.diag(matrix_test)) / np.sum(matrix_test))
    print(matrix_test)
    print('Test DF MAE:', MAE(dfs_ksvc_test, dfs_kpcovr_test))
    print('\n')

===== 3.5 =====
----- Train -----
              precision    recall  f1-score   support

           1       0.73      0.38      0.50        21
           2       0.74      0.43      0.54        54
           3       1.00      0.27      0.43        37
           4       0.99      1.00      1.00      7750

    accuracy                           0.99      7862
   macro avg       0.87      0.52      0.62      7862
weighted avg       0.99      0.99      0.99      7862

Train accuracy: 0.9907148308318494
[[   8    3    0   10]
 [   3   23    0   28]
 [   0    3   10   24]
 [   0    2    0 7748]]
Train DF MAE: [9.34808791e-10 8.65415481e-11 5.40572994e-10 8.55770132e-10
 1.11703007e-09 6.67273683e-10]


----- Test -----
              precision    recall  f1-score   support

           1       0.12      0.07      0.09        14
           2       0.63      0.25      0.36        68
           3       0.20      0.06      0.10        31
           4       0.97      1.00      0.98      2250

    a