In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# ML
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Utilities
import h5py
from tqdm.auto import tqdm
import project_utils as utils
from tempfile import mkdtemp
from shutil import rmtree
from tools import load_json, save_json

# Initial setup

In [3]:
# Load the training-set indices (the same ones used for the ridge models)
train_idxs = np.loadtxt(
    '../Processed_Data/DEEM_330k/ridge_train.idxs',
    dtype=int,
)

In [4]:
# Read the SOAP hyperparameters and extract the interaction cutoffs
soap_hyperparameters = load_json(
    '../Processed_Data/soap_hyperparameters.json'
)
cutoffs = soap_hyperparameters['interaction_cutoff']

# Linear PCA: IZA and DEEM 330k projected with a PCA fit on DEEM 10k

In [5]:
# Read the IZA canton labels: column index 1 of the compositions file, as integers
cantons_iza = np.loadtxt(
    '../Raw_Data/IZA_230/cantons_compositions.dat',
    usecols=1,
    dtype=int,
)

# Number of IZA entries that were read
n_iza = len(cantons_iza)

In [6]:
# DEEM dataset: name and processed-data directory
deem_name = 'DEEM_330k'
deem_dir = f'../Processed_Data/{deem_name}/Data'

# IZA dataset: name and processed-data directory
iza_name = 'IZA_230'
iza_dir = f'../Processed_Data/{iza_name}/Data'

In [7]:
# Number of rows read from the HDF5 dataset and transformed per batch
batch_size = 10_000

# Number of principal components to keep
n_components = 3

In [8]:
# For every SOAP cutoff: fit a (norm scaler -> PCA) pipeline on the DEEM
# training subset, project the IZA and the full DEEM 330k SOAPs with it,
# and save both the projections and the fitted model parameters.

# The PCA settings and the output sub-directory do not depend on the cutoff,
# so they are defined once instead of on every iteration
pca_parameters = dict(n_components=n_components)
output_dir = 'LPCA'

for cutoff in tqdm(cutoffs, desc='Cutoff', leave=True):

    # Set model directory
    model_dir = f'../Processed_Data/Models/{cutoff}/{output_dir}'

    # Prepare output directories
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(f'{deem_dir}/{cutoff}/{output_dir}', exist_ok=True)
    os.makedirs(f'{iza_dir}/{cutoff}/{output_dir}', exist_ok=True)

    # SOAP files
    deem_file = f'{deem_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'
    iza_file = f'{iza_dir}/{cutoff}/soaps_power_full_avg_nonorm.hdf5'

    # Temporary directory handed to the pipeline as its `memory` cache.
    # The try/finally guarantees it is removed even if a step below raises.
    cache_dir = mkdtemp()
    try:
        # Open the DEEM 330k SOAPs read-only; the context manager closes
        # the file on every exit path, including exceptions
        with h5py.File(deem_file, 'r') as f:
            deem_330k = f['0']

            # Read the training rows into memory.
            # NOTE(review): h5py fancy indexing expects the index list to be
            # in increasing order -- confirm train_idxs is stored sorted.
            deem_10k = deem_330k[train_idxs, :]

            # Number of batches needed to cover the 330k (ceiling division)
            n_samples_330k = deem_330k.len()
            n_batches = (n_samples_330k + batch_size - 1) // batch_size

            # Load IZA SOAPs
            iza = utils.load_hdf5(iza_file)

            # Initialize the pipeline and fit it on the training subset
            pipeline = Pipeline(
                [
                    ('norm_scaler', utils.StandardNormScaler()),
                    ('pca', PCA(**pca_parameters))
                ],
                memory=cache_dir
            )
            pipeline.fit(deem_10k)

            # Compute IZA PCA projections
            T_iza = pipeline.transform(iza)

            # Prepare the output array for batch processing
            T_deem_330k = np.zeros((n_samples_330k, n_components))

            # Read the DEEM_330k structures and compute their
            # PCA projections in batches
            for i in tqdm(range(n_batches), desc='Batch', leave=False):
                batch_slice = slice(i * batch_size, (i + 1) * batch_size)
                deem_330k_batch = deem_330k[batch_slice, :]
                T_deem_330k[batch_slice] = pipeline.transform(deem_330k_batch)

        # Save the projections (the input file is already closed here)
        utils.save_hdf5(f'{iza_dir}/{cutoff}/{output_dir}/pca_structures.hdf5', T_iza)
        utils.save_hdf5(f'{deem_dir}/{cutoff}/{output_dir}/pca_structures.hdf5', T_deem_330k)

        # Save the fitted model: attribute dictionaries of both pipeline steps
        save_json(pipeline.named_steps['norm_scaler'].__dict__, f'{model_dir}/norm_scaler.json', array_convert=True)
        save_json(pipeline.named_steps['pca'].__dict__, f'{model_dir}/pca.json', array_convert=True)
    finally:
        # Always remove the pipeline cache directory
        rmtree(cache_dir)

HBox(children=(FloatProgress(value=0.0, description='Cutoff', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=34.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Batch', max=34.0, style=ProgressStyle(description_width='…


