In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ML
from soap import extract_species_pair_groups

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

# Utilities
import h5py
import json
import itertools
from tqdm.notebook import tqdm
import project_utils as utils
from tools import load_json, save_json
from tempfile import mkdtemp
from shutil import rmtree

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

# Apply the cosmoplot 'article' style and keep a handle on its color cycle
cosmostyle.set_style('article')
colorList = cosmostyle.color_cycle

# Load train and test splits

In [5]:
# Read the SOAP hyperparameters from JSON and pull out the interaction cutoffs;
# the cutoffs drive the outermost loop of the model cells below
soap_hyperparameters = load_json('../Processed_Data/soap_hyperparameters.json')
cutoffs = soap_hyperparameters['interaction_cutoff']

In [6]:
# Load train and test set indices for Deem
# (integer index files, one split per file)
deem_train_idxs = np.loadtxt('../Processed_Data/DEEM_330k/train.idxs', dtype=int)
deem_test_idxs = np.loadtxt('../Processed_Data/DEEM_330k/test.idxs', dtype=int)
# Number of Deem structures covered by the train and test splits together
n_deem = len(deem_train_idxs) + len(deem_test_idxs)

7750 2250


In [None]:
# Load train and test set indices for IZA
# (integer index files, one split per file)
iza_train_idxs = np.loadtxt('../Processed_Data/IZA_226/train.idxs', dtype=int)
iza_test_idxs = np.loadtxt('../Processed_Data/IZA_226/test.idxs', dtype=int)
# Number of IZA structures covered by the train and test splits together;
# also used below as the offset of the Deem entries in the combined arrays
n_iza = len(iza_train_idxs) + len(iza_test_idxs)

In [9]:
# Load IZA cantons
# Only the second column (usecols=1) of the file is read, as integers
iza_cantons = np.loadtxt('../Raw_Data/GULP/IZA_226/cantons.txt', usecols=1, dtype=int)

In [None]:
# Load DEEM cantons
# One integer label file per classification problem (2-class and 4-class)
deem_cantons_2 = np.loadtxt('../Processed_Data/DEEM_330k/Data/cantons_2-class.dat', dtype=int)
deem_cantons_4 = np.loadtxt('../Processed_Data/DEEM_330k/Data/cantons_4-class.dat', dtype=int)

In [11]:
# Assemble the "master" canton labels for the combined IZA+DEEM set,
# keyed by the number of classes; the IZA entries always come first
cantons = {
    # 4-class problem: IZA canton labels followed by the DEEM 4-class labels
    4: np.concatenate((iza_cantons, deem_cantons_4)),
    # 2-class problem: every IZA structure is labelled 1,
    # followed by the DEEM 2-class labels
    2: np.concatenate((np.ones(n_iza, dtype=int), deem_cantons_2)),
}

In [12]:
# Dummy canton labels for Deem, keyed by the number of classes
dummy_cantons = {
    2: np.loadtxt('../Processed_Data/DEEM_330k/dummy_cantons_2-class.dat', dtype=int),
    4: np.loadtxt('../Processed_Data/DEEM_330k/dummy_cantons_4-class.dat', dtype=int),
}

In [None]:
# Concatenate IZA and Deem indices
# The Deem indices are shifted by n_iza so that they address the Deem entries
# of arrays that list the IZA structures first (e.g. the `cantons` labels)
train_idxs = np.concatenate((iza_train_idxs, deem_train_idxs + n_iza))
test_idxs = np.concatenate((iza_test_idxs, deem_test_idxs + n_iza))

# Model setup

In [13]:
# Root directory of the per-cutoff model parameter files
model_dir = '../Processed_Data/Models'

In [18]:
# Dataset names and the per-dataset data directories derived from them;
# the SOAP vectors are read from, and the model outputs written to, these directories
deem_name = 'DEEM_330k'
iza_name = 'IZA_226'
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'

In [14]:
# Class names for each classification problem, keyed by the number of classes
class_names = {
    2: ['IZA', 'DEEM'],
    4: ['IZA1', 'IZA2', 'IZA3', 'DEEM'],
}

# Linear SVC

In [17]:
# Linear model setup
# Number of species handed to `extract_species_pair_groups`
n_species = 2

# Names of the feature groups for each spectrum type; in the model cells they
# are zipped one-to-one with the groups from `extract_species_pair_groups`
group_names = dict(
    power=['OO', 'OSi', 'SiSi', 'OO+OSi', 'OO+SiSi', 'OSi+SiSi', 'OO+OSi+SiSi'],
    radial=['O', 'Si', 'O+Si'],
)

In [22]:
# Fit a linear SVC on the IZA+DEEM training set and a second SVC on the
# dummy-labelled DEEM training set, for every combination of SOAP cutoff,
# spectrum type, species pairing, and number of classes. For each combination
# the decision functions, the predicted cantons, and the attributes of the
# fitted pipeline steps are written to disk.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    linear_dir = f'{model_dir}/{cutoff}/Linear_Models/SVC'

    for spectrum_type in tqdm(('power', 'radial'), desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # Load SOAPs
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_soaps = utils.load_hdf5(iza_file)

        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        # The context manager closes the HDF5 file even if an iteration raises
        with h5py.File(deem_file, 'r') as f:
            deem_330k_dataset = f['0']

            # Read the DEEM training rows once; they are shared by both models
            # NOTE(review): h5py fancy indexing expects increasing indices --
            # confirm that the stored train indices are sorted
            deem_train_soaps = deem_330k_dataset[deem_train_idxs]

            # Training features, ordered like `train_idxs` (IZA first, then DEEM)
            soaps = np.vstack((iza_soaps[iza_train_idxs], deem_train_soaps))

            n_features = soaps.shape[1]
            feature_groups = extract_species_pair_groups(n_features, n_species,
                                                         spectrum_type=spectrum_type,
                                                         combinations=True)

            # TODO: set batches (like PCovR) if the pipeline isn't computationally feasible

            for species_pairing, feature_idxs in zip(tqdm(group_names[spectrum_type],
                                                          desc='Species', leave=False),
                                                     feature_groups):

                for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                    # Prepare outputs
                    output_dir = f'Linear_Models/SVC/{n_cantons}-Class/{spectrum_name}/{species_pairing}'

                    os.makedirs(f'{deem_dir}/{cutoff}/{output_dir}', exist_ok=True)
                    os.makedirs(f'{iza_dir}/{cutoff}/{output_dir}', exist_ok=True)

                    parameter_dir = f'{linear_dir}/{n_cantons}-Class/{spectrum_name}/{species_pairing}'

                    # TODO: manually set the class weights?
                    svc_parameters = load_json(f'{parameter_dir}/svc_parameters_accuracy.json')
                    dummy_svc_parameters = load_json(f'{parameter_dir}/dummy_svc_parameters_accuracy.json')

                    # IZA+DEEM classification
                    cache_dir = mkdtemp()
                    try:
                        pipeline = Pipeline(
                            [
                                ('norm_scaler', utils.NormScaler(with_mean=False)),
                                ('svc', LinearSVC(**svc_parameters))
                            ],
                            memory=cache_dir
                        )

                        # `cantons` covers every structure, whereas `soaps` only holds
                        # the training rows, so the labels are restricted to `train_idxs`
                        pipeline.fit(soaps[:, feature_idxs], cantons[n_cantons][train_idxs])

                        # Read the IZA structures and compute decision functions
                        # and canton predictions
                        iza_dfs = pipeline.decision_function(iza_soaps[:, feature_idxs])
                        iza_predicted_cantons = pipeline.predict(iza_soaps[:, feature_idxs])

                        np.savetxt(f'{iza_dir}/{cutoff}/{output_dir}/svc_structure_dfs.dat', iza_dfs)
                        np.savetxt(f'{iza_dir}/{cutoff}/{output_dir}/svc_structure_cantons.dat', iza_predicted_cantons)

                        # Read the DEEM structures and compute decision functions
                        # and canton predictions

                        # TODO: make sure this is computationally feasible, otherwise batches
                        # We could load up the whole dataset beforehand and probably make this faster
                        # since we wouldn't have to read the HDF5 for both the DFs and predictions,
                        # but the single-read seems to be memory-prohibitive and this
                        # multiple-read construction appears tolerably fast
                        # and also seems to use much less memory
                        deem_dfs = pipeline.decision_function(deem_330k_dataset[:, feature_idxs])
                        deem_predicted_cantons = pipeline.predict(deem_330k_dataset[:, feature_idxs])

                        np.savetxt(f'{deem_dir}/{cutoff}/{output_dir}/svc_structure_dfs.dat', deem_dfs)
                        np.savetxt(f'{deem_dir}/{cutoff}/{output_dir}/svc_structure_cantons.dat', deem_predicted_cantons)

                        # Save the SVC model and the scaler
                        # (`named_steps` is a mapping, so iterate over its items)
                        for key, model in pipeline.named_steps.items():
                            save_json(model.__dict__, f'{parameter_dir}/{key}.json', array_convert=True)
                    finally:
                        # Always remove the pipeline cache, even if fitting or saving failed
                        rmtree(cache_dir)

                    # Dummy DEEM classification
                    dummy_labels = dummy_cantons[n_cantons]
                    # NOTE(review): if the dummy labels cover the whole DEEM set rather than
                    # only the training structures, restrict them to the training rows -- confirm
                    if len(dummy_labels) != len(deem_train_idxs):
                        dummy_labels = dummy_labels[deem_train_idxs]

                    dummy_cache_dir = mkdtemp()
                    try:
                        dummy_pipeline = Pipeline(
                            [
                                ('norm_scaler', utils.NormScaler(with_mean=False)),
                                ('svc', LinearSVC(**dummy_svc_parameters))
                            ],
                            memory=dummy_cache_dir
                        )

                        # The dummy model only sees DEEM structures, so it uses the DEEM
                        # training rows (not the combined, offset `train_idxs`)
                        dummy_train_soaps = deem_train_soaps[:, feature_idxs]
                        dummy_pipeline.fit(dummy_train_soaps, dummy_labels)

                        # Could also just choose a subset of the DEEM 330k as test
                        dummy_deem_dfs = dummy_pipeline.decision_function(dummy_train_soaps)
                        dummy_deem_predicted_cantons = dummy_pipeline.predict(dummy_train_soaps)

                        np.savetxt(f'{deem_dir}/{cutoff}/{output_dir}/dummy_svc_structure_dfs.dat',
                                   dummy_deem_dfs)
                        np.savetxt(f'{deem_dir}/{cutoff}/{output_dir}/dummy_svc_structure_cantons.dat',
                                   dummy_deem_predicted_cantons)

                        # Save the dummy SVC model and the scaler
                        for key, model in dummy_pipeline.named_steps.items():
                            save_json(model.__dict__, f'{parameter_dir}/dummy_{key}.json', array_convert=True)
                    finally:
                        rmtree(dummy_cache_dir)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Spectrum', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Species', max=7.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…




## LR check: ridge regression of the SVC decision functions

In [None]:
# Check how well a linear (ridge) regression on the SOAP features reproduces
# the SVC decision functions: fit on the training structures and predict the
# decision functions of the held-out test structures, for every combination
# of SOAP cutoff, spectrum type, species pairing, and number of classes.
for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):
    work_dir = f'{model_dir}/{cutoff}/Linear_Models/SVC'

    for spectrum_type in tqdm(('power', 'radial'), desc='Spectrum', leave=False):
        spectrum_name = spectrum_type.capitalize()

        # Load SOAPs
        iza_file = f'{iza_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'
        iza_soaps = utils.load_hdf5(iza_file)

        deem_file = f'{deem_dir}/{cutoff}/soaps_{spectrum_type}_full_avg_nonorm.hdf5'

        # Only the train and test rows are needed, so the HDF5 file is closed
        # again as soon as they have been read into memory
        # NOTE(review): h5py fancy indexing expects increasing indices --
        # confirm that the stored train/test indices are sorted
        with h5py.File(deem_file, 'r') as f:
            deem_330k_dataset = f['0']
            soaps = np.vstack((iza_soaps[iza_train_idxs], deem_330k_dataset[deem_train_idxs]))
            soaps_test = np.vstack((iza_soaps[iza_test_idxs], deem_330k_dataset[deem_test_idxs]))

        n_features = soaps.shape[1]
        feature_groups = extract_species_pair_groups(n_features, n_species,
                                                     spectrum_type=spectrum_type,
                                                     combinations=True)

        for species_pairing, feature_idxs in zip(tqdm(group_names[spectrum_type],
                                                      desc='Species', leave=False),
                                                 feature_groups):

            for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

                # Directory (relative to the dataset/cutoff directory) in which
                # the SVC cell stored the decision functions
                df_dir = f'Linear_Models/SVC/{n_cantons}-Class/{spectrum_name}/{species_pairing}'

                # Load decision functions
                iza_dfs = np.loadtxt(f'{iza_dir}/{cutoff}/{df_dir}/svc_structure_dfs.dat')

                deem_dfs = np.loadtxt(f'{deem_dir}/{cutoff}/{df_dir}/svc_structure_dfs.dat')

                # Targets in the same row order as `soaps` / `soaps_test`
                # (IZA first, then DEEM)
                dfs = np.concatenate((iza_dfs[iza_train_idxs], deem_dfs[deem_train_idxs]))
                dfs_test = np.concatenate((iza_dfs[iza_test_idxs], deem_dfs[deem_test_idxs]))

                cache_dir = mkdtemp()
                try:
                    pipeline = Pipeline(
                        [
                            ('norm_scaler', utils.NormScaler()),
                            ('ridge', TransformedTargetRegressor(
                                regressor=Ridge(alpha=1.0E-12),
                                transformer=utils.NormScaler(featurewise=True)
                            ))
                        ],
                        memory=cache_dir
                    )

                    # `soaps` and `dfs` already hold only the training rows, so they
                    # are not indexed with `train_idxs` again; only the columns of
                    # the current species pairing are used
                    pipeline.fit(soaps[:, feature_idxs], dfs)

                    # The final step is a regressor, so the pipeline is evaluated
                    # with `predict` on the held-out rows
                    predicted_dfs = pipeline.predict(soaps_test[:, feature_idxs])

                    # TODO: score the results (MAE, RMSE) of `predicted_dfs` against `dfs_test`
                finally:
                    # Always remove the pipeline cache, even if fitting failed
                    rmtree(cache_dir)