In [10]:
# Import necessary libraries
import sys
sys.path.append('/home/cbo27/gbcompare')
import os
import glob
import pickle
import numpy as np
import scipy.linalg.interpolative as sli
from sklearn.cluster import KMeans
from largest_simplex import find_largest_simplex

# Input directory holding the descriptor pickles (SOAP in this example) and
# one output directory per reduction method.
dir_path_soap = 'soap/soap_data'
output_dir_average = 'soap/average'
output_dir_kmeans = 'soap/kmeans'
output_dir_skeleton = 'soap/skeleton'
output_dir_largest_simplex = 'soap/largest_simplex'

# Create any output directory that is missing; existing ones are left alone.
for _out_dir in (
    output_dir_average,
    output_dir_kmeans,
    output_dir_skeleton,
    output_dir_largest_simplex,
):
    os.makedirs(_out_dir, exist_ok=True)

# Offset of the bare file name inside a "<dir_path_soap>/<name>" path:
# the directory length plus one character for the path separator.
start_indx = 1 + len(dir_path_soap)

# Function to process data and save the averaged result
def process_and_save_average(data, filename, output_dir):
    """Average ``data`` along axis 0 and pickle the result.

    Parameters
    ----------
    data : array-like
        Input handed to ``np.average(..., axis=0)``.
    filename : str
        Path of the source file. Only its base name, with the last extension
        removed, is used to build the output name.
    output_dir : str
        Directory that receives ``<stem>_averaged.pickle``. It must already
        exist.

    Returns
    -------
    None
        The averaged array is written to disk and a status line is printed.
    """
    fixed_data = np.average(data, axis=0)
    # Take the stem from the path itself instead of slicing by a module-level
    # character offset, so the name is right wherever the input file lives.
    stem = os.path.splitext(os.path.basename(filename))[0]
    output_path = os.path.join(output_dir, stem + '_averaged.pickle')
    with open(output_path, 'wb') as f:
        pickle.dump(fixed_data, f)
    print(f'Processed and saved averaged results for {filename} to {output_path}')

# Function to perform KMeans clustering and save the result
def process_and_save_kmeans(data, filename, output_dir, n_clusters=20):
    """Fit KMeans on ``data`` and pickle the cluster centres.

    Parameters
    ----------
    data : array-like
        Samples passed to ``KMeans.fit``.
    filename : str
        Path of the source file. Only its base name, with the last extension
        removed, is used to build the output name.
    output_dir : str
        Directory that receives ``<stem>_kmeans.pickle``. It must already
        exist.
    n_clusters : int, optional
        Number of clusters requested from KMeans (default 20).

    Returns
    -------
    None
        ``cluster_centers_`` is written to disk and a status line is printed.
    """
    # random_state is fixed so repeated runs give the same centres.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit(data)
    cluster_centers = kmeans.cluster_centers_
    # Take the stem from the path itself instead of slicing by a module-level
    # character offset, so the name is right wherever the input file lives.
    stem = os.path.splitext(os.path.basename(filename))[0]
    output_path = os.path.join(output_dir, stem + '_kmeans.pickle')
    with open(output_path, 'wb') as f:
        pickle.dump(cluster_centers, f)
    print(f'Processed and saved KMeans results for {filename} to {output_path}')

# Function to perform skeleton decomposition and save the result
def process_and_save_skeleton(data, filename, output_dir, k=20):
    """Run a rank-``k`` interpolative decomposition and pickle the skeleton.

    The decomposition is applied to ``data`` transposed, so the skeleton
    columns it picks correspond to rows of ``data``. The saved array is the
    skeleton matrix transposed back, i.e. one selected row of ``data`` per row.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array. The notebook casts it to float64 before calling this
        function.
    filename : str
        Path of the source file. Only its base name, with the last extension
        removed, is used to build the output name.
    output_dir : str
        Directory that receives ``<stem>_skeleton.pickle``. It must already
        exist.
    k : int, optional
        Rank of the decomposition (default 20, the value previously
        hard-coded).

    Returns
    -------
    None
        The transposed skeleton matrix is written to disk and a status line
        is printed.
    """
    data_t = np.transpose(data)
    # Only the column indices are needed; the interpolation coefficients
    # returned alongside them are not used for the skeleton.
    idx, _ = sli.interp_decomp(data_t, k)
    B = sli.reconstruct_skel_matrix(data_t, k, idx)
    # Take the stem from the path itself instead of slicing by a module-level
    # character offset, so the name is right wherever the input file lives.
    stem = os.path.splitext(os.path.basename(filename))[0]
    output_path = os.path.join(output_dir, stem + '_skeleton.pickle')
    with open(output_path, 'wb') as f:
        pickle.dump(np.transpose(B), f)
    print(f'Processed and saved skeleton decomposition for {filename} to {output_path}')

# Function to find the largest simplex and save the result
def process_and_save_largest_simplex(data, filename, output_dir, n_select=20):
    """Call ``find_largest_simplex`` on ``data`` and pickle what it returns.

    Parameters
    ----------
    data : array-like
        First argument forwarded to ``find_largest_simplex``.
    filename : str
        Path of the source file. Only its base name, with the last extension
        removed, is used to build the output name.
    output_dir : str
        Directory that receives ``<stem>_largest_simplex.pickle``. It must
        already exist.
    n_select : int, optional
        Second positional argument forwarded to ``find_largest_simplex``
        (default 20, the value previously hard-coded). Its exact meaning is
        defined by that helper.
        NOTE(review): presumably the number of points selected - confirm.

    Returns
    -------
    None
        The helper's result is written to disk and a status line is printed.
    """
    my_matrix = find_largest_simplex(data, n_select)
    # Take the stem from the path itself instead of slicing by a module-level
    # character offset, so the name is right wherever the input file lives.
    stem = os.path.splitext(os.path.basename(filename))[0]
    output_path = os.path.join(output_dir, stem + '_largest_simplex.pickle')
    with open(output_path, 'wb') as f:
        pickle.dump(my_matrix, f)
    print(f'Processed and saved largest simplex for {filename} to {output_path}')

## Average

In [11]:
# Process files for average representation.
# glob returns paths in arbitrary order; sorting keeps the run order and the
# log reproducible across machines and re-runs.
for filename in sorted(glob.glob(os.path.join(dir_path_soap, '*.pickle'))):
    # pickle.load can execute arbitrary code: only point this at descriptor
    # files you produced yourself or otherwise trust.
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    # NOTE(review): unlike the KMeans / skeleton / largest-simplex cells, this
    # cell does not replace NaN/inf values before reducing, so any NaN in the
    # input propagates into the averaged descriptor - confirm this is intended.
    process_and_save_average(data, filename, output_dir_average)

Processed and saved averaged results for soap/soap_data/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134.pickle to soap/average/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134_averaged.pickle
Processed and saved averaged results for soap/soap_data/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199.pickle to soap/average/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199_averaged.pickle
Processed and saved averaged results for soap/soap_data/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015.pickle to soap/average/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015_averaged.pickle
Processed and saved averaged results for soap/soap_data/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551.pickle to soap/average/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551_averaged.pickle
Processed and saved averaged results for soap/soap_data/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934.pickle to soap/average/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934_averaged.pickle
Processed and saved averaged results for soap/s

## Kmeans

In [14]:
# Process files for KMeans clustering.
# glob returns paths in arbitrary order; sorting keeps the run order and the
# log reproducible across machines and re-runs.
for filename in sorted(glob.glob(os.path.join(dir_path_soap, '*.pickle'))):
    # pickle.load can execute arbitrary code: only point this at descriptor
    # files you produced yourself or otherwise trust.
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    # Replace NaN and +/-inf entries with a small finite value before fitting.
    data = np.nan_to_num(data, nan=1e-6, posinf=1e-6, neginf=1e-6)
    process_and_save_kmeans(data, filename, output_dir_kmeans)


Processed and saved KMeans results for soap/soap_data/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134.pickle to soap/kmeans/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134_kmeans.pickle
Processed and saved KMeans results for soap/soap_data/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199.pickle to soap/kmeans/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199_kmeans.pickle
Processed and saved KMeans results for soap/soap_data/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015.pickle to soap/kmeans/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015_kmeans.pickle
Processed and saved KMeans results for soap/soap_data/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551.pickle to soap/kmeans/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551_kmeans.pickle
Processed and saved KMeans results for soap/soap_data/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934.pickle to soap/kmeans/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934_kmeans.pickle
Processed and saved KMeans results for soap/soap_data/struct_S103e_fcc_N

## Skeleton/CUR

In [15]:
# Process files for skeleton decomposition.
# glob returns paths in arbitrary order; sorting keeps the run order and the
# log reproducible across machines and re-runs.
for filename in sorted(glob.glob(os.path.join(dir_path_soap, '*.pickle'))):
    # pickle.load can execute arbitrary code: only point this at descriptor
    # files you produced yourself or otherwise trust.
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    # Cast to double precision before the decomposition.
    # NOTE(review): presumably required by the interpolative-decomposition
    # routines - confirm against the SciPy version in use.
    data = data.astype(np.float64)
    # Replace NaN and +/-inf entries with a small finite value.
    data = np.nan_to_num(data, nan=1e-6, posinf=1e-6, neginf=1e-6)
    process_and_save_skeleton(data, filename, output_dir_skeleton)


Processed and saved skeleton decomposition for soap/soap_data/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134.pickle to soap/skeleton/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134_skeleton.pickle
Processed and saved skeleton decomposition for soap/soap_data/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199.pickle to soap/skeleton/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199_skeleton.pickle
Processed and saved skeleton decomposition for soap/soap_data/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015.pickle to soap/skeleton/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015_skeleton.pickle
Processed and saved skeleton decomposition for soap/soap_data/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551.pickle to soap/skeleton/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551_skeleton.pickle
Processed and saved skeleton decomposition for soap/soap_data/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934.pickle to soap/skeleton/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934_skeleton.pickle
Processed an

## Largest Simplex

In [16]:
# Process files for finding the largest simplex.
# glob returns paths in arbitrary order; sorting keeps the run order and the
# log reproducible across machines and re-runs.
for filename in sorted(glob.glob(os.path.join(dir_path_soap, '*.pickle'))):
    # pickle.load can execute arbitrary code: only point this at descriptor
    # files you produced yourself or otherwise trust.
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    # Replace NaN and +/-inf entries with a small finite value before the search.
    data = np.nan_to_num(data, nan=1e-6, posinf=1e-6, neginf=1e-6)
    process_and_save_largest_simplex(data, filename, output_dir_largest_simplex)

Processed and saved largest simplex for soap/soap_data/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134.pickle to soap/largest_simplex/struct_S103e_fcc_N119_19_205_Al_M99_200714.3134_largest_simplex.pickle
Processed and saved largest simplex for soap/soap_data/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199.pickle to soap/largest_simplex/struct_S103e_fcc_N176_106_195_Al_M99_210622.2199_largest_simplex.pickle
Processed and saved largest simplex for soap/soap_data/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015.pickle to soap/largest_simplex/struct_S103e_fcc_N167_282_99_Al_M99_210624.2015_largest_simplex.pickle
Processed and saved largest simplex for soap/soap_data/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551.pickle to soap/largest_simplex/struct_S103e_fcc_N18_60_89_Al_M99_210625.1551_largest_simplex.pickle
Processed and saved largest simplex for soap/soap_data/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934.pickle to soap/largest_simplex/struct_S103e_fcc_N133_203_11_Al_M99_200714.2934_