In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys

# Maths
import numpy as np
from numpy.random import default_rng

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ML
from soap import extract_species_pair_groups

from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier

# Utilities
import h5py
import json
import itertools
from tempfile import mkdtemp
from shutil import rmtree
from copy import deepcopy
from tqdm.auto import tqdm
import project_utils as utils
from tools import load_json, save_json

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.style as cosmostyle

# Select the 'article' plotting style first, then read the color cycle
# exposed by the style module for use in later plots
cosmostyle.set_style('article')
colorList = cosmostyle.color_cycle

In [3]:
# Per-structure volumes and energies of the DEEM_330k set,
# read in the order (volumes, energies)
deem_volumes, deem_energies = (
    np.loadtxt(f'../Processed_Data/DEEM_330k/Data/structure_{quantity}.dat')
    for quantity in ('volumes', 'energies')
)
# deem_n_Si = np.loadtxt('../Processed_Data/DEEM_330k/Data/n_Si.dat', dtype=int)

# Feature matrix with one (volume, energy) row per structure
deem_ev = np.column_stack((deem_volumes, deem_energies))
n_deem = len(deem_volumes)

In [4]:
# Per-structure volumes and energies of the IZA_230 set,
# read in the order (volumes, energies)
iza_volumes, iza_energies = (
    np.loadtxt(f'../Processed_Data/IZA_230/Data/structure_{quantity}.dat')
    for quantity in ('volumes', 'energies')
)
# iza_n_Si = np.loadtxt('../Processed_Data/IZA_230/Data/n_Si.dat', dtype=int)

# Feature matrix with one (volume, energy) row per structure
iza_ev = np.column_stack((iza_volumes, iza_energies))

In [5]:
# SVM training-set indices, one file per dataset
deem_train_idxs = np.loadtxt('../Processed_Data/DEEM_330k/svm_train.idxs', dtype=int)
iza_train_idxs = np.loadtxt('../Processed_Data/IZA_230/svm_train.idxs', dtype=int)

# Permutation that sorts the IZA training indices, and the permutation
# that undoes it (the argsort of a permutation is its inverse)
iza_sort_idxs = np.argsort(iza_train_idxs)
iza_unsort_idxs = np.argsort(iza_sort_idxs)

In [6]:
# Energy-volume features of the training structures:
# the IZA rows are stacked first, the Deem rows follow
ev = np.vstack((iza_ev[iza_train_idxs], deem_ev[deem_train_idxs]))

In [7]:
# IZA canton labels: second column (index 1) of the raw compositions file
iza_cantons = np.loadtxt(
    '../Raw_Data/IZA_230/cantons_compositions.dat',
    dtype=int,
    usecols=1,
)

# Deem canton labels for the 2-class and the 4-class problem
deem_cantons_2, deem_cantons_4 = (
    np.loadtxt(f'../Processed_Data/DEEM_330k/Data/cantons_{n_classes}-class.dat', dtype=int)
    for n_classes in (2, 4)
)

In [8]:
# "Master" canton labels of the training set, keyed by the number of classes.
# IZA samples come first and Deem samples second, as in the `ev` feature matrix.
cantons = {
    4: np.concatenate((
        iza_cantons[iza_train_idxs],
        deem_cantons_4[deem_train_idxs],
    )),
    # In the 2-class problem every IZA training sample is labelled 1
    2: np.concatenate((
        np.ones(len(iza_train_idxs), dtype=int),
        deem_cantons_2[deem_train_idxs],
    )),
}

# Per-sample class weights, used for centering and scaling
class_weights = {
    n_classes: utils.balanced_class_weights(cantons[n_classes])
    for n_classes in (2, 4)
}

# Model setup

In [9]:
# Dataset names; they select the per-dataset data directories below
iza_name = 'IZA_230'
deem_name = 'DEEM_330k'

# Where the models live, and where each dataset keeps its processed data
model_dir = '../Processed_Data/Models'
iza_dir = f'../Processed_Data/{iza_name}/Data'
deem_dir = f'../Processed_Data/{deem_name}/Data'

In [10]:
# Number of folds used in the cross-validation
n_splits = 2

# Base SVC settings shared by all models.
# Under OneVsRestClassifier, SVC only ever receives n_classes binary
# problems, so the decision function shape has no impact in that case.
svc_parameters = {
    'kernel': 'precomputed',
    'decision_function_shape': 'ovo',
    'class_weight': 'balanced',
    'tol': 1.0E-3,
    'cache_size': 1000,
}

# Linear SVC

In [11]:
# Decision function types to run: one-vs-rest ('OvR') and one-vs-one ('OvO').
# Trim this list to restrict the run to a single type.
df_types = ['OvR', 'OvO']

## Optimize LinearSVC parameters

In [12]:
# Candidate regularization strengths for cross-validation: 1e-4 ... 1e4, one per decade
C = np.logspace(-4, 4, num=9)

# Grid per decision function type. When the SVC is wrapped in
# OneVsRestClassifier its C sits one level deeper (svc -> estimator -> C).
parameter_grid = {
    'OvR': {'svc__estimator__C': C},
    'OvO': {'svc__C': C},
}

In [13]:
# Cross-validate the regularization strength for every decision function
# type and for both the 2-class and the 4-class problem
for df_type in tqdm(df_types, desc='DF', leave=True):
    work_dir = f'{model_dir}/Classical/LSVC/{df_type}'
    os.makedirs(work_dir, exist_ok=True)

    for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

        # IZA + Deem classifier, wrapped for one-vs-rest when requested
        svc = SVC(**svc_parameters)
        svc = OneVsRestClassifier(svc) if df_type == 'OvR' else svc

        # Scale the features, build the kernel, then classify
        pipeline = utils.ClassBalancedPipeline([
            ('norm_scaler', utils.StandardNormScaler(featurewise=True)),
            ('kernel_constructor', utils.KernelConstructor()),
            ('svc', svc),
        ])

        # Shuffled, stratified folds with a fixed seed
        folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

        # refit=False: only the CV scores are needed here
        gscv = GridSearchCV(
            pipeline,
            parameter_grid[df_type],
            scoring=['accuracy', 'balanced_accuracy'],
            cv=folds,
            refit=False,
            return_train_score=True,
            error_score='raise',
            n_jobs=4,
        )

        # NOTE(review): presumably ClassBalancedPipeline computes the balanced
        # sample weights itself for the fit parameters named under 'keys',
        # instead of taking precomputed class_weights -- confirm in project_utils
        fit_params = {'keys': ['norm_scaler__sample_weight']}
        gscv.fit(ev, cantons[n_cantons], **fit_params)

        # Store the raw cross-validation results next to the model
        output_dir = f'{n_cantons}-Class/Energy-Volume'
        result_dir = f'{work_dir}/{output_dir}'
        os.makedirs(result_dir, exist_ok=True)
        save_json(gscv.cv_results_, f'{result_dir}/cv_results.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='DF', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…




## Check the cross-validated parameters

In [14]:
# IZA + DEEM classification: report the best cross-validated parameters
# for each (decision function type, number of classes) pair and save them
for df_type, n_cantons in itertools.product(df_types, (2, 4)):
    work_dir = f'{model_dir}/Classical/LSVC/{df_type}'
    result_dir = f'{n_cantons}-Class/Energy-Volume'
    cv_results = load_json(f'{work_dir}/{result_dir}/cv_results.json')
    print(f'-----Optimal Parameters for {df_type} {n_cantons} -----')

    for score in ('accuracy', 'balanced_accuracy'):
        # Position of the parameter set with the lowest (best) rank for this score
        idx = np.argmin(cv_results[f'rank_test_{score}'])
        opt_parameters = utils.get_optimal_parameters(cv_results, score, **svc_parameters)

        print(f'{score} =', cv_results[f'mean_test_{score}'][idx])
        print(opt_parameters)
        print('')

        # One parameter file per scoring metric
        save_json(opt_parameters, f'{work_dir}/{result_dir}/svc_parameters_{score}.json')

-----Optimal Parameters for OvR 2 -----
accuracy = 0.8629490110211938
{'kernel': 'precomputed', 'decision_function_shape': 'ovo', 'class_weight': 'balanced', 'tol': 0.001, 'cache_size': 1000, 'C': 10000.0}

balanced_accuracy = 0.8829976746226438
{'kernel': 'precomputed', 'decision_function_shape': 'ovo', 'class_weight': 'balanced', 'tol': 0.001, 'cache_size': 1000, 'C': 10000.0}

-----Optimal Parameters for OvR 4 -----
accuracy = 0.8099472383498998
{'kernel': 'precomputed', 'decision_function_shape': 'ovo', 'class_weight': 'balanced', 'tol': 0.001, 'cache_size': 1000, 'C': 100.0}

balanced_accuracy = 0.5622278472361139
{'kernel': 'precomputed', 'decision_function_shape': 'ovo', 'class_weight': 'balanced', 'tol': 0.001, 'cache_size': 1000, 'C': 0.01}

-----Optimal Parameters for OvO 2 -----
accuracy = 0.8629490110211938
{'kernel': 'precomputed', 'decision_function_shape': 'ovo', 'class_weight': 'balanced', 'tol': 0.001, 'cache_size': 1000, 'C': 10000.0}

balanced_accuracy = 0.8829976746

# Evaluate the SVM

In [15]:
# Number of Deem structures pushed through the fitted pipeline per batch
batch_size = 20_000

In [16]:
# Refit each classifier on the full training set with its cross-validated
# parameters, then compute decision functions and canton predictions for
# every IZA and Deem structure and write them to disk.

# The Deem set is evaluated in batches. The batch count is a ceiling
# division that does not depend on the loop variables, so it is computed once.
n_samples_330k = n_deem
n_batches = -(-n_samples_330k // batch_size)

for df_type in tqdm(df_types, desc='DF', leave=True):
    linear_dir = f'{model_dir}/Classical/LSVC/{df_type}'

    for n_cantons in tqdm((2, 4), desc='Classes', leave=False):

        # Prepare outputs
        output_dir = f'LSVC/{df_type}/{n_cantons}-Class/Energy-Volume'

        os.makedirs(f'{deem_dir}/Classical/{output_dir}', exist_ok=True)
        os.makedirs(f'{iza_dir}/Classical/{output_dir}', exist_ok=True)

        parameter_dir = f'{linear_dir}/{n_cantons}-Class/Energy-Volume'

        # Load the tuned parameters into their own name. Rebinding the global
        # `svc_parameters` here would replace the base settings that the
        # cross-validation cells read, so re-running them after this cell
        # would silently use a different configuration (including a fixed C).
        opt_svc_parameters = load_json(f'{parameter_dir}/svc_parameters_balanced_accuracy.json')

        # IZA+DEEM classification
        svc = SVC(**opt_svc_parameters)
        if df_type == 'OvR':
            svc = OneVsRestClassifier(svc, n_jobs=4)

        pipeline = Pipeline(
            [
                ('norm_scaler', utils.StandardNormScaler(featurewise=True)),
                ('kernel_constructor', utils.KernelConstructor()),
                ('svc', svc)
            ],
        )
        # The scaler is fitted with the per-sample balanced class weights
        fit_params = {'norm_scaler__sample_weight': class_weights[n_cantons]}
        pipeline.fit(ev, cantons[n_cantons], **fit_params)

        # Compute decision functions and canton predictions
        # for all IZA structures
        iza_dfs = pipeline.decision_function(iza_ev)
        iza_predicted_cantons = pipeline.predict(iza_ev)

        np.savetxt(f'{iza_dir}/Classical/{output_dir}/svc_structure_dfs.dat', iza_dfs)
        np.savetxt(f'{iza_dir}/Classical/{output_dir}/svc_structure_cantons.dat', iza_predicted_cantons, fmt='%d')

        # Allocate the Deem outputs. The decision function of the same fitted
        # pipeline has the same trailing shape for every input, so take it from
        # the IZA result instead of hard-coding it per df_type / n_cantons
        # (1D for two classes, one column per class for OvR, one column per
        # class pair for OvO).
        deem_dfs = np.zeros((n_deem,) + np.shape(iza_dfs)[1:])

        # Canton labels are integers, so store them as such
        deem_predicted_cantons = np.zeros(n_deem, dtype=int)

        # Compute decision functions and canton predictions
        # for all Deem structures, batch by batch
        for i in tqdm(range(n_batches), desc='Batch', leave=False):
            # The last slice may run past the end; slicing clips it
            batch_slice = slice(i * batch_size, (i + 1) * batch_size)

            deem_330k_batch = deem_ev[batch_slice]
            deem_dfs[batch_slice] = pipeline.decision_function(deem_330k_batch)
            deem_predicted_cantons[batch_slice] = pipeline.predict(deem_330k_batch)

        np.savetxt(f'{deem_dir}/Classical/{output_dir}/svc_structure_dfs.dat', deem_dfs)
        np.savetxt(f'{deem_dir}/Classical/{output_dir}/svc_structure_cantons.dat', deem_predicted_cantons, fmt='%d')

        # Save the SVC model and the scaler
        # We don't save the KernelConstructor b/c it is really big
        save_json(pipeline.named_steps['norm_scaler'].__dict__, f'{parameter_dir}/norm_scaler.json', array_convert=True)
        save_json(pipeline.named_steps['svc'].__dict__, f'{parameter_dir}/svc.json', array_convert=True)

HBox(children=(FloatProgress(value=0.0, description='DF', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=17.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=17.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Classes', max=2.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=17.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=17.0, style=ProgressStyle(description_width='…


