In [1]:
import os
import sys
import subprocess
import pickle
import numpy as np
import ase.io as aseIO
from sklearn.preprocessing import MinMaxScaler

def getMinIdx(filename):
    """Return the (width, sigma) pair with the smallest error in a saved grid.

    Parameters
    ----------
    filename : str
        Path to a ``.npy`` file readable by ``np.load``. The array is indexed
        as ``x[0, i, j, 0, 0, field]``; field 5 is treated as the error,
        field 1 as the width and field 2 as sigma.
        NOTE(review): the meaning of the remaining axes/fields is not visible
        here -- confirm against learningCurves.py.

    Returns
    -------
    tuple
        ``(width, sigma)`` taken at the grid position ``(i, j)`` where
        ``x[0, :, :, 0, 0, 5]`` is smallest.
    """
    x = np.load(filename)
    xErr = x[0, :, :, 0, 0, 5]

    # Convert the flat argmin into integer (row, column) indices.
    # The previous xMin/nCol arithmetic only gave an integer under Python 2
    # classic division; with true division it yields a float, which is not
    # a valid array index.
    row, col = np.unravel_index(np.argmin(xErr), xErr.shape)

    # Output width, sigma
    return (x[0, row, col, 0, 0, 1],
            x[0, row, col, 0, 0, 2])

# Sample sizes (inserted into the raw DEEM file names) and the short labels
# used for the processed-data directories
sampleSizes, sampleNames = ['1000', '10000'], ['1k', '10k']

# Structure properties passed to the learning scripts, and their display names
# TODO: for DEEM 10k (and IZA_OPT) add "_Opt" to p variable
properties, propertyNames = ['Energy_per_Si', 'volume'], ['Energy', 'Volume']

# SOAP cutoff values (passed as -c to SOAP.py)
cutoffs = ['3.5', '6.0']

# Kernel identifiers (passed as -kernel) and their display names
kernels, kernelNames = ['gaussian', 'linear'], ['Gaussian', 'Linear']

# Training-set size used in the hyperparameter searches, one per sample size
maxTrain = ['800', '8000']

# Training-set sizes for the learning curves, keyed by sample label
trainPts = dict([
    ('1k', ['10', '30', '50', '100', '300', '500', '800']),
    ('10k', ['10', '30', '50', '100', '300', '500',
             '1000', '3000', '5000', '8000']),
])

# NOTE(review): presumably the numbers of PCA components examined for each
# property -- confirm against the cells that consume pcaPts
pcaPts = dict([
    ('Energy', ['1', '2', '4', '10', '20', '100', '300']),
    ('Volume', ['1', '2', '4', '10', '20', '50', '100']),
])

# Load previously optimized hyperparameters if the cache file exists;
# otherwise start from an empty dictionary that later cells fill in.
# NOTE: pickle.load can execute arbitrary code -- only load a
# hyperparameters.pkl that was written by this notebook or a trusted source.
try:
    # 'with' guarantees the file is closed even if unpickling raises
    with open('hyperparameters.pkl', 'rb') as parameterFile:
        print('Found hyperparameter file')
        hypers = pickle.load(parameterFile)
except IOError:
    # The file is missing or unreadable: begin with no stored hyperparameters
    hypers = {}
    print('Creating new hyperparameters')

Found hyperparameter file


# TODO: Re-run the hyperparameter optimization and property decomposition for DEEM 10k using the optimized energies

In [3]:
# Show the hyperparameters currently held in memory
print(hypers)

{'10k-SOAP3.5-V-L': (nan, 0.03), '10k-SOAP3.5-V-G': (0.3, 0.1), '10k-SOAP6.0-E-L': (nan, 0.001), '1k-DA-E-L': (nan, 0.01), '1k-DA-E-G': (0.3, 0.3), '1k-SOAP3.5-E-L': (nan, 0.003), '1k-SOAP3.5-E-G': (1.0, 0.01), '1k-DA-V-G': (1.0, 0.03), '1k-SOAP6.0-E-G': (3.0, 0.001), '10k-SOAP6.0-E-G': (1.0, 0.01), '10k-DA-E-L': (nan, 0.01), '10k-SOAP3.5-E-G': (0.3, 0.03), '10k-SOAP3.5-E-L': (nan, 0.001), '10k-DA-E-G': (0.3, 0.3), '1k-SOAP6.0-E-L': (nan, 0.003), '10k-DA-V-G': (0.3, 0.3), '1k-SOAP3.5-V-G': (0.3, 0.3), '1k-SOAP6.0-V-L': (nan, 0.3), '1k-SOAP3.5-V-L': (nan, 0.3), '10k-DA-V-L': (nan, 0.01), '1k-SOAP6.0-V-G': (1.0, 0.3), '10k-SOAP6.0-V-G': (1.0, 0.1), '1k-DA-V-L': (nan, 0.3), '10k-SOAP6.0-V-L': (nan, 0.3)}


# Make k-Folds

In [4]:
# DEEM: run kFolds.py once per sample size
for ss, sn in zip(sampleSizes, sampleNames):
    print('Current construction: %s' % sn)
    workDir = '../Processed_Data/DEEM_%s' % sn

    # Make sure the output directory exists
    subprocess.call(['mkdir', '-p', workDir])

    # Assemble the kFolds.py command option by option, then run it
    kFoldsCmd = ['python', 'kFolds.py']
    kFoldsCmd += ['-nt', ss]
    kFoldsCmd += ['-f', '1.0']
    kFoldsCmd += ['-k', '5']
    kFoldsCmd += ['-output', workDir]
    subprocess.call(kFoldsCmd)

Current construction: 1k
Current construction: 10k


In [3]:
# Optimized IZA

workDir = '../Processed_Data/IZA_OPT'
print('Current construction: IZA_OPT')

# Make sure the output directory exists
subprocess.call(['mkdir', '-p', workDir])

# kFolds.py command for the IZA_OPT set
kFoldsCmd = ['python', 'kFolds.py']
kFoldsCmd.extend(['-nt', '35'])
kFoldsCmd.extend(['-f', '1.0'])
kFoldsCmd.extend(['-k', '5'])
kFoldsCmd.extend(['-output', workDir])

# Kept as the last expression so the return code is shown as the cell output
subprocess.call(kFoldsCmd)

Current construction: IZA_OPT


0

# Extract atom data and properties

## DEEM 10k

In [15]:
# Attach the GULP energies to the DEEM 10k structures under the property
# name Energy_per_Si_Opt
# NOTE(review): '-c 8' presumably selects a column of the .dat file -- confirm
appendCmd = ['python', 'appendStructureProperty.py']
appendCmd.extend(['-f', '../Raw_Data/GULP/DEEM_10000_EnergyComponents.dat'])
appendCmd.extend(['-xyz', '../Raw_Data/DEEM_10000.xyz'])
appendCmd.extend(['-c', '8'])
appendCmd.extend(['-n', 'Energy_per_Si_Opt'])
subprocess.call(appendCmd)

# Move the *_SP.xyz file into the GULP directory under the name that the
# later cells read; last expression, so its return code is the cell output
subprocess.call(['mv',
                 '../Raw_Data/DEEM_10000_SP.xyz',
                 '../Raw_Data/GULP/DEEM_10000_OPT.xyz'])

0

## Optimized IZA

In [16]:
# Attach the GULP energies to the optimized IZA structures under the property
# name Energy_per_Si_Opt
# NOTE(review): '-c 8' presumably selects a column of the .dat file -- confirm
izaOptFile = '../Raw_Data/GULP/IZA_OPT.xyz'
appendCmd = ['python', 'appendStructureProperty.py']
appendCmd.extend(['-f', '../Raw_Data/GULP/Energies_IZA_ConPAfterConV.dat'])
appendCmd.extend(['-xyz', '../Raw_Data/GULP/Final_Confs_IZA_ConPAfterConV.xyz'])
appendCmd.extend(['-c', '8'])
appendCmd.extend(['-n', 'Energy_per_Si_Opt'])
subprocess.call(appendCmd)

# Rename the *_SP.xyz file to the name that the later cells read
subprocess.call(['mv',
                 '../Raw_Data/GULP/Final_Confs_IZA_ConPAfterConV_SP.xyz',
                 izaOptFile])

# Remove the space after "Filename=" in place; last expression, so its return
# code is the cell output
subprocess.call(['sed', '-i', "s/Filename= /Filename=/g", izaOptFile])

0

## 1k sample

In [5]:
# Extract SOAP environment atoms (DEEM 1k)
atomLabelsCmd = ['python', 'atomLabels.py']
atomLabelsCmd += ['-input', '../Raw_Data/DEEM_1000.xyz']
atomLabelsCmd += ['-output', '../Processed_Data/DEEM_1k/atoms.dat']
atomLabelsCmd += ['-Z', '14']
atomLabelsCmd += ['-sp', 'Energy_per_Si', 'Filename']

# Last expression: the return code becomes the cell output
subprocess.call(atomLabelsCmd)

0

## 10k sample

In [17]:
# Extract SOAP environment atoms (DEEM 10k, structure file with optimized energies)
atomLabelsCmd = ['python', 'atomLabels.py']
atomLabelsCmd += ['-input', '../Raw_Data/GULP/DEEM_10000_OPT.xyz']
atomLabelsCmd += ['-output', '../Processed_Data/DEEM_10k/atoms.dat']
atomLabelsCmd += ['-Z', '14']
atomLabelsCmd += ['-sp', 'Energy_per_Si', 'Energy_per_Si_Opt', 'Filename']

# Last expression: the return code becomes the cell output
subprocess.call(atomLabelsCmd)

0

## IZA

In [11]:
# Extract SOAP environment atoms (IZA)
atomLabelsCmd = ['python', 'atomLabels.py']
atomLabelsCmd += ['-input', '../Raw_Data/IZA.xyz']
atomLabelsCmd += ['-output', '../Processed_Data/IZA/atoms.dat']
atomLabelsCmd += ['-Z', '14']
atomLabelsCmd += ['-sp', 'Filename']

# Last expression: the return code becomes the cell output
subprocess.call(atomLabelsCmd)

0

## OPT IZA

In [18]:
# Extract SOAP environment atoms (optimized IZA)
atomLabelsCmd = ['python', 'atomLabels.py']
atomLabelsCmd += ['-input', '../Raw_Data/GULP/IZA_OPT.xyz']
atomLabelsCmd += ['-output', '../Processed_Data/IZA_OPT/atoms.dat']
atomLabelsCmd += ['-Z', '14']
atomLabelsCmd += ['-sp', 'Energy_per_Si_Opt', 'Filename']

# Last expression: the return code becomes the cell output
subprocess.call(atomLabelsCmd)

0

# Compute SOAP

## DEEM 1k 

In [7]:
# DEEM 1k: full SOAP -> FPS on components -> reduced SOAP -> FPS on environments
for c in cutoffs:
    print('Current construction: %s' % c)
    structureFile = '../Raw_Data/DEEM_1000.xyz'
    workDir = '../Processed_Data/DEEM_1k/PCA/%s' % c
    soapFiles = '%s/SOAPFiles.dat' % workDir
    fpsOutput = '%s/FPS.idxs' % workDir
    componentIdxs = '%s/FPS-c.idxs' % workDir

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    # SOAP.py options shared by the full and the reduced calculation
    soapCmd = ['python', 'SOAP.py',
               '-structure', structureFile,
               '-n', '12', '-l', '9',
               '-c', c,
               '-cw', '0.3', '-g', '0.3',
               '-Z', '14',
               '-z', '14', '8']

    # Compute full SOAP vectors
    subprocess.call(soapCmd + ['-output', workDir])

    # Select FPS components; FPS.py's output is renamed so that the second
    # FPS run below does not overwrite it
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFiles,
                     '-fps', '500',
                     '-c',
                     '-output', workDir])
    os.rename(fpsOutput, componentIdxs)

    # Recompute SOAPs, retain only the FPS components
    subprocess.call(soapCmd + ['-idxs', componentIdxs,
                               '-output', workDir])

    # Select representative environments
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFiles,
                     '-fps', '2000',
                     '-output', workDir])
    os.rename(fpsOutput, '%s/FPS-rSOAP.idxs' % workDir)

Current construction: 3.5
Current construction: 6.0


## DEEM 10k

In [3]:
# DEEM 10k: the FPS components are chosen from SOAP vectors of a random
# subset of structures, then the reduced SOAP vectors are computed for the
# whole set
for c in cutoffs:
    print('Current construction: %s' % c)
    structureFile = '../Raw_Data/DEEM_10000.xyz'
    workDir = '../Processed_Data/DEEM_10k/PCA/%s' % c
    soapFiles = '%s/SOAPFiles.dat' % workDir
    fpsOutput = '%s/FPS.idxs' % workDir
    componentIdxs = '%s/FPS-c.idxs' % workDir

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    # SOAP.py options that are identical in both SOAP runs of this iteration
    soapOpts = ['-n', '12', '-l', '9',
                '-c', c,
                '-cw', '0.3', '-g', '0.3',
                '-Z', '14',
                '-z', '14', '8']

    # Select random structures
    subprocess.call(['python', 'randomStructureSelect.py',
                     '-structure', structureFile,
                     '-nt', '10000',
                     '-nr', '2000',
                     '-output', workDir])

    # Compute SOAP vectors for random structures
    subprocess.call(['python', 'SOAP.py',
                     '-structure', '%s/randomSelection.xyz' % workDir]
                    + soapOpts
                    + ['-output', workDir])

    # Select FPS components; the output is renamed so that the second FPS
    # run below does not overwrite it
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFiles,
                     '-fps', '500',
                     '-c',
                     '-output', workDir])
    os.rename(fpsOutput, componentIdxs)

    # Re-compute SOAPs, retain only FPS components
    subprocess.call(['python', 'SOAP.py',
                     '-structure', structureFile]
                    + soapOpts
                    + ['-idxs', componentIdxs,
                       '-batchsize', '500',
                       '-output', workDir])

    # Select representative environments
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFiles,
                     '-fps', '2000',
                     '-output', workDir])
    os.rename(fpsOutput, '%s/FPS-rSOAP.idxs' % workDir)

Current construction: 3.5
Current construction: 6.0


## IZA

In [12]:
# IZA: full SOAP -> FPS on components -> reduced SOAP. No environment
# selection is done; every environment is listed in FPS-rSOAP.idxs.

# Number of environments = number of lines in atoms.dat. Counted once in pure
# Python (previously `wc -l` was spawned on every loop iteration): this is
# portable, avoids repeated identical work, and also counts a final line that
# lacks a trailing newline.
with open('../Processed_Data/IZA/atoms.dat') as atomsFile:
    nEnv = sum(1 for _ in atomsFile)

for c in cutoffs:
    print('Current construction: %s' % c)
    structureFile = '../Raw_Data/IZA.xyz'
    workDir = '../Processed_Data/IZA/PCA/%s' % c

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    # Compute full SOAP vectors
    subprocess.call(['python', 'SOAP.py',
                    '-structure', structureFile,
                    '-n', '12',
                    '-l', '9',
                    '-c', c,
                    '-cw', '0.3',
                    '-g', '0.3',
                    '-Z', '14',
                    '-z', '14', '8',
                    '-output', workDir])

    # Select FPS components
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/SOAPFiles.dat' % workDir,
                    '-fps', '500',
                    '-c',
                    '-output', workDir])

    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-c.idxs' % workDir)

    # Recompute SOAPs, retain only the FPS components
    subprocess.call(['python', 'SOAP.py',
                    '-structure', structureFile,
                    '-n', '12',
                    '-l', '9',
                    '-c', c,
                    '-cw', '0.3',
                    '-g', '0.3',
                    '-Z', '14',
                    '-z', '14', '8',
                    '-idxs', '%s/FPS-c.idxs' % workDir,
                    '-output', workDir])

    # Create dummy FPS file that includes all environments (0 .. nEnv-1)
    np.savetxt('%s/FPS-rSOAP.idxs' % workDir, np.arange(0, nEnv), fmt='%d')

Current construction: 3.5
Current construction: 6.0


## OPT IZA

In [19]:
# Optimized IZA: full SOAP -> FPS on components -> reduced SOAP. No
# environment selection is done; every environment is listed in
# FPS-rSOAP.idxs.

# Number of environments = number of lines in atoms.dat. Counted once in pure
# Python (previously `wc -l` was spawned on every loop iteration): this is
# portable, avoids repeated identical work, and also counts a final line that
# lacks a trailing newline.
with open('../Processed_Data/IZA_OPT/atoms.dat') as atomsFile:
    nEnv = sum(1 for _ in atomsFile)

for c in cutoffs:
    print('Current construction: %s' % c)
    structureFile = '../Raw_Data/GULP/IZA_OPT.xyz'
    workDir = '../Processed_Data/IZA_OPT/PCA/%s' % c

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    # Compute full SOAP vectors
    subprocess.call(['python', 'SOAP.py',
                    '-structure', structureFile,
                    '-n', '12',
                    '-l', '9',
                    '-c', c,
                    '-cw', '0.3',
                    '-g', '0.3',
                    '-Z', '14',
                    '-z', '14', '8',
                    '-output', workDir])

    # Select FPS components
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/SOAPFiles.dat' % workDir,
                    '-fps', '500',
                    '-c',
                    '-output', workDir])

    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-c.idxs' % workDir)

    # Recompute SOAPs, retain only the FPS components
    subprocess.call(['python', 'SOAP.py',
                    '-structure', structureFile,
                    '-n', '12',
                    '-l', '9',
                    '-c', c,
                    '-cw', '0.3',
                    '-g', '0.3',
                    '-Z', '14',
                    '-z', '14', '8',
                    '-idxs', '%s/FPS-c.idxs' % workDir,
                    '-output', workDir])

    # Create dummy FPS file that includes all environments (0 .. nEnv-1)
    np.savetxt('%s/FPS-rSOAP.idxs' % workDir, np.arange(0, nEnv), fmt='%d')

Current construction: 3.5
Current construction: 6.0


## Hyperparameter optimization

In [9]:
# DEEM: grid search over -width and -sigma for every combination of sample
# size, cutoff, property and kernel. For each combination the best
# (width, sigma) pair, as picked by getMinIdx from maeAvgTest.npy, is stored
# in hypers under a key such as '1k-SOAP3.5-E-G'.

# Candidate values, used for both -width and -sigma
paramGrid = ['0.001', '0.003', '0.01', '0.03',
             '0.1', '0.3', '1.0', '3.0', '10.0']

for ss, sn, mt in zip(sampleSizes, sampleNames, maxTrain):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                print('Current optimization: %s %s %s %s' % (sn, c, pn, kn))
                workDir = '../Processed_Data/DEEM_%s/%s/%s/ParameterSearch/%s' % (sn, pn, c, kn)
                dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
                foldDir = '../Processed_Data/DEEM_%s' % sn
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                # Key: <sample>-SOAP<cutoff>-<property initial>-<kernel initial>
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, pn[0], kn[0])

                # Create directories
                subprocess.call(['mkdir', '-p', workDir])

                subprocess.call(['python', 'learningCurves.py',
                                 '-structure', structureFile,
                                 '-soap', '%s/SOAPFiles.dat' % dataDir,
                                 '-idxs', '%s/FPS-rSOAP.idxs' % dataDir,
                                 '-p', p,
                                 '-Z', '14',
                                 '-k', '5',
                                 '-kernel', k,
                                 '-width'] + paramGrid +
                                ['-sigma'] + paramGrid +
                                ['-zeta', '1',
                                 '-ntrain', mt,
                                 '-train', '%s/kTrain.idxs' % foldDir,
                                 '-validate', '%s/kValidate.idxs' % foldDir,
                                 '-shuffle',
                                 '-output', workDir])
                optParams = getMinIdx('%s/maeAvgTest.npy' % workDir)
                hypers[dictKey] = optParams

# Persist the updated hyperparameters; 'with' closes the file even if
# pickling fails part-way
with open('hyperparameters.pkl', 'wb') as parameterFile:
    pickle.dump(hypers, parameterFile)

Current optimization: 1k 3.5 Energy Gaussian
Current optimization: 1k 3.5 Energy Linear
Current optimization: 1k 3.5 Volume Gaussian
Current optimization: 1k 3.5 Volume Linear
Current optimization: 1k 6.0 Energy Gaussian
Current optimization: 1k 6.0 Energy Linear
Current optimization: 1k 6.0 Volume Gaussian
Current optimization: 1k 6.0 Volume Linear
Current optimization: 10k 3.5 Energy Gaussian
Current optimization: 10k 3.5 Energy Linear
Current optimization: 10k 3.5 Volume Gaussian
Current optimization: 10k 3.5 Volume Linear
Current optimization: 10k 6.0 Energy Gaussian
Current optimization: 10k 6.0 Energy Linear
Current optimization: 10k 6.0 Volume Gaussian
Current optimization: 10k 6.0 Volume Linear


In [None]:
# TODO: hyperparameter optimization for DEEM 10k using the optimized energies goes here

In [9]:
# opt iza optimized volumes and energies here
# TODO: probably don't need this cell, as long as we don't need IZA_OPT decomposed energies
# (i.e., using the unique IZA_OPT SOAP vectors and not those with the DEEM FPS SOAP components)
#
# Grid search over -width and -sigma for every combination of cutoff,
# property and kernel. The best (width, sigma) pair, as picked by getMinIdx
# from maeAvgTest.npy, is stored in hypers under 'IZA_OPT-SOAP<cutoff>-<P>-<K>'.

# Candidate values, used for both -width and -sigma
paramGrid = ['0.001', '0.003', '0.01', '0.03',
             '0.1', '0.3', '1.0', '3.0', '10.0']

for c in cutoffs:
    for p, pn in zip(['Energy_per_Si_Opt', 'volume'], propertyNames):
        for k, kn in zip(kernels, kernelNames):
            print('Current optimization: %s %s %s' % (c, pn, kn))
            workDir = '../Processed_Data/IZA_OPT/%s/%s/ParameterSearch/%s' % (pn, c, kn)
            dataDir = '../Processed_Data/IZA_OPT/PCA/%s' % c
            foldDir = '../Processed_Data/IZA_OPT'
            structureFile = '../Raw_Data/GULP/IZA_OPT.xyz'
            # Key: IZA_OPT-SOAP<cutoff>-<property initial>-<kernel initial>
            dictKey = 'IZA_OPT-SOAP%s-%s-%s' % (c, pn[0], kn[0])

            # Create directories
            subprocess.call(['mkdir', '-p', workDir])

            subprocess.call(['python', 'learningCurves.py',
                             '-structure', structureFile,
                             '-soap', '%s/SOAPFiles.dat' % dataDir,
                             '-idxs', '%s/FPS-rSOAP.idxs' % dataDir,
                             '-p', p,
                             '-Z', '14',
                             '-k', '5',
                             '-kernel', k,
                             '-width'] + paramGrid +
                            ['-sigma'] + paramGrid +
                            ['-zeta', '1',
                             '-ntrain', '28',
                             '-train', '%s/kTrain.idxs' % foldDir,
                             '-validate', '%s/kValidate.idxs' % foldDir,
                             '-shuffle',
                             '-output', workDir])
            optParams = getMinIdx('%s/maeAvgTest.npy' % workDir)
            hypers[dictKey] = optParams

# Persist the updated hyperparameters; 'with' closes the file even if
# pickling fails part-way
with open('hyperparameters.pkl', 'wb') as parameterFile:
    pickle.dump(hypers, parameterFile)

Current optimization: 3.5 Energy Gaussian
Current optimization: 3.5 Energy Linear
Current optimization: 3.5 Volume Gaussian
Current optimization: 3.5 Volume Linear
Current optimization: 6.0 Energy Gaussian
Current optimization: 6.0 Energy Linear
Current optimization: 6.0 Volume Gaussian
Current optimization: 6.0 Volume Linear


# Build PCA

## Build linear PCA

In [4]:
def _fitAndProjectPCA(workDir):
    """Run SOAP-PCA.py twice in workDir: first -dopca, then -dotransform.

    The second call is given the eigenvectors.dat and mean.dat files located
    in workDir. The directory is created first if it does not exist.
    """
    soapFiles = '%s/SOAPFiles.dat' % workDir

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    subprocess.call(['python', 'SOAP-PCA.py',
                     '-soap', soapFiles,
                     '-dopca',
                     '-pca', '500',
                     '-output', workDir])

    subprocess.call(['python', 'SOAP-PCA.py',
                     '-soap', soapFiles,
                     '-dotransform',
                     '-w', '%s/eigenvectors.dat' % workDir,
                     '-mean', '%s/mean.dat' % workDir,
                     '-output', workDir])


# DEEM: PCA followed by an FPS selection on the PCA output
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print('Current construction: %s %s' % (sn, c))
        workDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
        _fitAndProjectPCA(workDir)

        subprocess.call(['python', 'FPS.py',
                         '-soap', '%s/PCAFiles.dat' % workDir,
                         '-fps', '2000',
                         '-output', workDir])

        os.rename('%s/FPS.idxs' % workDir,
                  '%s/FPS-rPCA.idxs' % workDir)

# IZA: PCA only, no FPS selection
for c in cutoffs:
    print('Current construction: IZA %s' % c)
    workDir = '../Processed_Data/IZA/PCA/%s' % c
    _fitAndProjectPCA(workDir)

Current construction: 1k 3.5
Current construction: 1k 6.0
Current construction: 10k 3.5
Current construction: 10k 6.0
Current construction: IZA 3.5
Current construction: IZA 6.0


In [20]:
# OPT IZA: run SOAP-PCA.py with -dopca, then with -dotransform, on the
# IZA_OPT SOAP vectors
for c in cutoffs:
    # Progress label names IZA_OPT (it previously said "IZA", which is a
    # different data set handled by another cell)
    print('Current construction: IZA_OPT %s' % c)
    workDir = '../Processed_Data/IZA_OPT/PCA/%s' % c
    soapFiles = '%s/SOAPFiles.dat' % workDir

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    # Run SOAP-PCA.py in -dopca mode with '-pca 500'
    subprocess.call(['python', 'SOAP-PCA.py',
                     '-soap', soapFiles,
                     '-dopca',
                     '-pca', '500',
                     '-output', workDir])

    # Run SOAP-PCA.py in -dotransform mode, passing the eigenvectors and mean
    # files located in workDir
    subprocess.call(['python', 'SOAP-PCA.py',
                     '-soap', soapFiles,
                     '-dotransform',
                     '-w', '%s/eigenvectors.dat' % workDir,
                     '-mean', '%s/mean.dat' % workDir,
                     '-output', workDir])

Current construction: IZA 3.5
Current construction: IZA 6.0


## Build KPCA

In [5]:
# DEEM: Gaussian KPCA with the stored kernel width, then FPS on the KPCA output
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print('Current construction: %s %s' % (sn, c))
        dictKey = '%s-SOAP%s-%s-%s' % (sn, c, 'V', 'G') # Use volume as default choice
        workDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
        kernelWidth = str(hypers[dictKey][0])

        # Create directories
        subprocess.call(['mkdir', '-p', workDir])

        kpcaCmd = ['python', 'SOAP-KPCA.py']
        kpcaCmd += ['-soap', '%s/SOAPFiles.dat' % workDir]
        kpcaCmd += ['-pca', '500']
        # Don't need linear b/c we have the PCA, which is faster
        kpcaCmd += ['-kernel', 'gaussian']
        kpcaCmd += ['-width', kernelWidth]
        kpcaCmd += ['-lowmem']
        kpcaCmd += ['-idxs', '%s/FPS-rSOAP.idxs' % workDir]
        kpcaCmd += ['-output', workDir]
        subprocess.call(kpcaCmd)

        subprocess.call(['python', 'FPS.py',
                         '-soap', '%s/KPCAFiles.dat' % workDir,
                         '-fps', '2000',
                         '-output', workDir])

        os.rename('%s/FPS.idxs' % workDir,
                  '%s/FPS-rKPCA.idxs' % workDir)

# IZA: KPCA only, no FPS selection
sn = '10k' # Use the 10k hyperparameters
for c in cutoffs:
    print('Current construction: IZA %s' % c)
    dictKey = '%s-SOAP%s-%s-%s' % (sn, c, 'V', 'G') # Use volume as default choice
    workDir = '../Processed_Data/IZA/PCA/%s' % c
    kernelWidth = str(hypers[dictKey][0])

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    kpcaCmd = ['python', 'SOAP-KPCA.py']
    kpcaCmd += ['-soap', '%s/SOAPFiles.dat' % workDir]
    kpcaCmd += ['-pca', '500']
    # Don't need linear b/c we have the PCA, which is faster
    kpcaCmd += ['-kernel', 'gaussian']
    kpcaCmd += ['-width', kernelWidth]
    kpcaCmd += ['-lowmem']
    kpcaCmd += ['-idxs', '%s/FPS-rSOAP.idxs' % workDir]
    kpcaCmd += ['-output', workDir]
    subprocess.call(kpcaCmd)

Current construction: 1k 3.5
Current construction: 1k 6.0
Current construction: 10k 3.5
Current construction: 10k 6.0
Current construction: IZA 3.5
Current construction: IZA 6.0


In [23]:
# OPT IZA: Gaussian KPCA on the IZA_OPT SOAP vectors, using the kernel width
# stored for the DEEM 10k volume model
sn = '10k' # Use the 10k hyperparameters
for c in cutoffs:
    # Progress label names IZA_OPT (it previously said "IZA", which is a
    # different data set handled by another cell)
    print('Current construction: IZA_OPT %s' % c)
    dictKey = '%s-SOAP%s-%s-%s' % (sn, c, 'V', 'G') # Use volume as default choice
    workDir = '../Processed_Data/IZA_OPT/PCA/%s' % c

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    subprocess.call(['python', 'SOAP-KPCA.py',
                    '-soap', '%s/SOAPFiles.dat' % workDir,
                    '-pca', '500',
                    '-kernel', 'gaussian', # Don't need linear b/c we have the PCA, which is faster
                    '-width', str(hypers[dictKey][0]),
                    '-lowmem',
                    '-idxs', '%s/FPS-rSOAP.idxs' % workDir,
                    '-output', workDir])

Current construction: IZA 3.5
Current construction: IZA 6.0


# Build Classical Descriptors

## Distances

In [3]:
# Build the distance descriptors for each DEEM sample and select
# representatives from them with FPS.py
for ss, sn in zip(sampleSizes, sampleNames):
    print('Current construction: %s' % sn)
    workDir = '../Processed_Data/DEEM_%s/Distances' % sn
    structureFile = '../Raw_Data/DEEM_%s_Distances.xyz' % ss

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    subprocess.call(['python', 'buildClassicalDescriptors.py',
                    '-input', structureFile,
                    '-output', '%s/distances.dat' % workDir,
                    '-p', 'distances'])

    # FPS.py is pointed at a file that holds the absolute path of the data
    # file; 'with' closes it even if the write fails
    with open('%s/distanceFiles.dat' % workDir, 'w') as f:
        f.write(os.path.abspath('%s/distances.dat' % workDir))

    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/distanceFiles.dat' % workDir,
                    '-fps', '2000',
                    '-output', workDir])

    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-rDistances.idxs' % workDir)

Current construction: 1k
Current construction: 10k


## Angles

In [4]:
# Build the angle descriptors for each DEEM sample and select
# representatives from them with FPS.py
for ss, sn in zip(sampleSizes, sampleNames):
    print('Current construction: %s' % sn)
    workDir = '../Processed_Data/DEEM_%s/Angles' % sn
    structureFile = '../Raw_Data/DEEM_%s_Angles.xyz' % ss

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    subprocess.call(['python', 'buildClassicalDescriptors.py',
                    '-input', structureFile,
                    '-output', '%s/angles.dat' % workDir,
                    '-p', 'angles'])

    # FPS.py is pointed at a file that holds the absolute path of the data
    # file; 'with' closes it even if the write fails
    with open('%s/angleFiles.dat' % workDir, 'w') as f:
        f.write(os.path.abspath('%s/angles.dat' % workDir))

    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/angleFiles.dat' % workDir,
                    '-fps', '2000',
                    '-output', workDir])

    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-rAngles.idxs' % workDir)

Current construction: 1k
Current construction: 10k


## Distances + Angles

In [5]:
# Combine the distance and angle descriptors column-wise, rescale them, and
# select representatives from the combined feature with FPS.py
for ss, sn in zip(sampleSizes, sampleNames):
    print('Current construction: %s' % sn)
    workDir = '../Processed_Data/DEEM_%s/Distances+Angles' % sn

    # Create directories
    subprocess.call(['mkdir', '-p', workDir])

    # Make the combined feature
    d = np.loadtxt('../Processed_Data/DEEM_%s/Distances/distances.dat' % sn)
    a = np.loadtxt('../Processed_Data/DEEM_%s/Angles/angles.dat' % sn)
    da = np.column_stack((d, a))

    # Scale features between 0 and 1,
    # since distances and angles have very different scales
    mms = MinMaxScaler()
    da_mms = mms.fit_transform(da)
    np.savetxt('%s/da.dat' % workDir, da_mms)

    # FPS.py is pointed at a file that holds the absolute path of the data
    # file; 'with' closes it even if the write fails
    with open('%s/daFiles.dat' % workDir, 'w') as f:
        f.write(os.path.abspath('%s/da.dat' % workDir))

    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/daFiles.dat' % workDir,
                    '-fps', '2000',
                    '-output', workDir])

    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-rDA.idxs' % workDir)

Current construction: 1k
Current construction: 10k


## Hyperparameter optimization (Distances + Angles)

In [12]:
# Distances+Angles: grid search over -width and -sigma for every combination
# of sample size, property and kernel. The best (width, sigma) pair, as
# picked by getMinIdx from maeAvgTest.npy, is stored in hypers under a key
# such as '1k-DA-E-G'.

# Candidate values, used for both -width and -sigma
paramGrid = ['0.01', '0.03', '0.1', '0.3', '1.0', '3.0', '10.0']

for ss, sn, mt in zip(sampleSizes, sampleNames, maxTrain):
    for p, pn in zip(properties, propertyNames):
        for k, kn in zip(kernels, kernelNames):
            print('Current optimization: %s %s %s' % (sn, pn, kn))
            workDir = '../Processed_Data/DEEM_%s/%s/Distances+Angles/ParameterSearch/%s' % (sn, pn, kn)
            structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
            dataFile = '../Processed_Data/DEEM_%s/Distances+Angles/daFiles.dat' % sn
            idxsFile = '../Processed_Data/DEEM_%s/Distances+Angles/FPS-rDA.idxs' % sn
            trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
            validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
            # Key: <sample>-DA-<property initial>-<kernel initial>
            dictKey = '%s-DA-%s-%s' % (sn, pn[0], kn[0])

            # Create directories
            subprocess.call(['mkdir', '-p', workDir])

            subprocess.call(['python', 'learningCurves.py',
                             '-structure', structureFile,
                             '-soap', dataFile,
                             '-idxs', idxsFile,
                             '-p', p,
                             '-Z', '14',
                             '-k', '5',
                             '-kernel', k,
                             '-width'] + paramGrid +
                            ['-sigma'] + paramGrid +
                            ['-zeta', '1',
                             '-ntrain', mt,
                             '-train', trainFile,
                             '-validate', validateFile,
                             '-shuffle',
                             '-output', workDir])
            optParams = getMinIdx('%s/maeAvgTest.npy' % workDir)
            hypers[dictKey] = optParams

# Persist the updated hyperparameters; 'with' closes the file even if
# pickling fails part-way
with open('hyperparameters.pkl', 'wb') as parameterFile:
    pickle.dump(hypers, parameterFile)

Current optimization: 1k Energy Gaussian
Current optimization: 1k Energy Linear
Current optimization: 1k Volume Gaussian
Current optimization: 1k Volume Linear
Current optimization: 10k Energy Gaussian
Current optimization: 10k Energy Linear
Current optimization: 10k Volume Gaussian
Current optimization: 10k Volume Linear


## Learning Curves (Distances+Angles)

In [14]:
# Learning curves for the Distances+Angles feature, run with the stored
# (width, sigma) of each sample / property / kernel combination
for ss, sn in zip(sampleSizes, sampleNames):
    for p, pn in zip(properties, propertyNames):
        for k, kn in zip(kernels, kernelNames):
            print('Current model: %s %s %s' % (sn, pn, kn))
            workDir = '../Processed_Data/DEEM_%s/%s/Distances+Angles/%s' % (sn, pn, kn)
            structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
            dataFile = '../Processed_Data/DEEM_%s/Distances+Angles/daFiles.dat' % sn
            idxsFile = '../Processed_Data/DEEM_%s/Distances+Angles/FPS-rDA.idxs' % sn
            trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
            validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
            dictKey = '%s-DA-%s-%s' % (sn, pn[0], kn[0])

            # Create directories
            subprocess.call(['mkdir', '-p', workDir])

            # Assemble the command piece by piece; -ntrain takes the whole
            # list of training-set sizes for this sample
            args = ['python', 'learningCurves.py']
            args.extend(['-structure', structureFile])
            args.extend(['-soap', dataFile])
            args.extend(['-idxs', idxsFile])
            args.extend(['-p', p])
            args.extend(['-Z', '14'])
            args.extend(['-k', '5'])
            args.extend(['-kernel', k])
            args.extend(['-width', str(hypers[dictKey][0])])
            args.extend(['-sigma', str(hypers[dictKey][1])])
            args.extend(['-zeta', '1'])
            args.append('-ntrain')
            args.extend(trainPts[sn])
            args.extend(['-train', trainFile])
            args.extend(['-validate', validateFile])
            args.append('-shuffle')
            args.extend(['-output', workDir])
            subprocess.call(args)

Current model: 1k Energy Gaussian
Current model: 1k Energy Linear
Current model: 1k Volume Gaussian
Current model: 1k Volume Linear
Current model: 10k Energy Gaussian
Current model: 10k Energy Linear
Current model: 10k Volume Gaussian
Current model: 10k Volume Linear


# Projection of IZA onto DEEM

In [7]:
# Create SOAPs for projection: IZA SOAP vectors restricted to the FPS
# components selected for each DEEM sample / cutoff

# Number of environments = number of lines in atoms.dat. Counted once in pure
# Python (previously `wc -l` was spawned on every loop iteration): this is
# portable, avoids repeated identical work, and also counts a final line that
# lacks a trailing newline.
with open('../Processed_Data/IZA/atoms.dat') as atomsFile:
    nEnv = sum(1 for _ in atomsFile)

for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print('Current construction: %s %s' % (sn, c))
        structureFile = '../Raw_Data/IZA.xyz'
        workDir = '../Processed_Data/IZAonDEEM_%s/PCA/%s' % (sn, c)
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)

        # Create directories
        subprocess.call(['mkdir', '-p', workDir])

        # Compute SOAPs, retain only the FPS components that are consistent with DEEM
        subprocess.call(['python', 'SOAP.py',
                        '-structure', structureFile,
                        '-n', '12',
                        '-l', '9',
                        '-c', c,
                        '-cw', '0.3',
                        '-g', '0.3',
                        '-Z', '14',
                        '-z', '14', '8',
                        '-idxs', '%s/FPS-c.idxs' % dataDir,
                        '-output', workDir])

        # Create dummy FPS file that includes all environments (0 .. nEnv-1)
        np.savetxt('%s/FPS-rSOAP.idxs' % workDir, np.arange(0, nEnv), fmt='%d')

Current construction: 1k 3.5
Current construction: 1k 6.0
Current construction: 10k 3.5
Current construction: 10k 6.0


In [24]:
# Create SOAPs for projection: OPT IZA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print 'Current construction: %s %s' % (sn, c)
        structureFile = '../Raw_Data/GULP/IZA_OPT.xyz'
        workDir = '../Processed_Data/IZA_OPTonDEEM_%s/PCA/%s' % (sn, c)
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)

        # Create directories
        subprocess.call(['mkdir', '-p', workDir])

        # Compute SOAPs, retain only the FPS components that are consistent with DEEM
        subprocess.call(['python', 'SOAP.py',
                        '-structure', structureFile,
                        '-n', '12',
                        '-l', '9',
                        '-c', c,
                        '-cw', '0.3',
                        '-g', '0.3',
                        '-Z', '14',
                        '-z', '14', '8',
                        '-idxs', '%s/FPS-c.idxs' % dataDir,
                        '-output', workDir])

        # Create dummy FPS file that includes all environments
        s = subprocess.Popen(['wc', '-l', '../Processed_Data/IZA_OPT/atoms.dat'],
                             stdout=subprocess.PIPE)
        nEnv = int(s.communicate()[0].strip().split()[0])

        np.savetxt('%s/FPS-rSOAP.idxs' % workDir, np.arange(0, nEnv), fmt='%d')

Current construction: 1k 3.5
Current construction: 1k 6.0
Current construction: 10k 3.5
Current construction: 10k 6.0


In [28]:
# Linear PCA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print 'Current construction: PCA %s %s' % (sn, c)
        workDir = '../Processed_Data/IZAonDEEM_%s/PCA/%s' % (sn, c)
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
        
        # Create directories
        subprocess.call(['mkdir', '-p', workDir])
        
        subprocess.call(['python', 'SOAP-PCA.py',
                        '-soap', '%s/SOAPFiles.dat' % workDir,
                        '-dotransform',
                        '-w', '%s/eigenvectors.dat' % dataDir,
                        '-mean', '%s/mean.dat' % dataDir,
                        '-output', workDir])

# Kernel PCA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print 'Current construction: KPCA %s %s' % (sn, c)
        dictKey = '%s-SOAP%s-%s-%s' % (sn, c, 'V', 'G') # Use volume as default choice
        workDir = '../Processed_Data/IZAonDEEM_%s/PCA/%s' % (sn, c)
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
        
        # Create directories
        subprocess.call(['mkdir', '-p', workDir])

        subprocess.call(['python', 'SOAP-KPCA.py',
                        '-soap', '%s/SOAPFiles.dat' % dataDir,
                        '-pca', '500',
                        '-idxs', '%s/FPS-rSOAP.idxs' % dataDir,
                        '-kernel', 'gaussian', # Don't need linear b/c we have the PCA, which is faster
                        '-width', str(hypers[dictKey][0]),
                        '-lowmem',
                        '-dotransform', '%s/SOAPFiles.dat' % workDir,
                        '-w', '%s/UFiles.dat' % dataDir,
                        '-mean', '%s/G-mean.npy' % dataDir,
                        '-g', '%s/GFiles.dat' % dataDir, 
                        '-output', workDir])

Current construction: PCA 1k 3.5
Current construction: PCA 1k 6.0
Current construction: PCA 10k 3.5
Current construction: PCA 10k 6.0
Current construction: KPCA 1k 3.5
Current construction: KPCA 1k 6.0
Current construction: KPCA 10k 3.5
Current construction: KPCA 10k 6.0


In [29]:
# Linear PCA: OPT IZA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print 'Current construction: PCA %s %s' % (sn, c)
        workDir = '../Processed_Data/IZA_OPTonDEEM_%s/PCA/%s' % (sn, c)
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
        
        # Create directories
        subprocess.call(['mkdir', '-p', workDir])
        
        subprocess.call(['python', 'SOAP-PCA.py',
                        '-soap', '%s/SOAPFiles.dat' % workDir,
                        '-dotransform',
                        '-w', '%s/eigenvectors.dat' % dataDir,
                        '-mean', '%s/mean.dat' % dataDir,
                        '-output', workDir])

# Kernel PCA: OPT IZA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        print 'Current construction: KPCA %s %s' % (sn, c)
        dictKey = '%s-SOAP%s-%s-%s' % (sn, c, 'V', 'G') # Use volume as default choice
        workDir = '../Processed_Data/IZA_OPTonDEEM_%s/PCA/%s' % (sn, c)
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
        
        # Create directories
        subprocess.call(['mkdir', '-p', workDir])

        subprocess.call(['python', 'SOAP-KPCA.py',
                        '-soap', '%s/SOAPFiles.dat' % dataDir,
                        '-idxs', '%s/FPS-rSOAP.idxs' % dataDir,
                        '-pca', '500',
                        '-kernel', 'gaussian', # Don't need linear b/c we have the PCA, which is faster
                        '-width', str(hypers[dictKey][0]),
                        '-lowmem',
                        '-dotransform', '%s/SOAPFiles.dat' % workDir,
                        '-w', '%s/UFiles.dat' % dataDir,
                        '-mean', '%s/G-mean.npy' % dataDir,
                        '-g', '%s/GFiles.dat' % dataDir, 
                        '-output', workDir])

Current construction: PCA 1k 3.5
Current construction: PCA 1k 6.0
Current construction: PCA 10k 3.5
Current construction: PCA 10k 6.0
Current construction: KPCA 1k 3.5
Current construction: KPCA 1k 6.0
Current construction: KPCA 10k 3.5
Current construction: KPCA 10k 6.0


# Property Decomposition

## SOAP

In [9]:
# DEEM
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                print 'Current model: %s %s %s %s' % (sn, c, pn, kn)
                workDir = '../Processed_Data/DEEM_%s/%s/%s/%s' % (sn, pn, c, kn)
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dataFile = '../Processed_Data/DEEM_%s/PCA/%s/SOAPFiles.dat' % (sn, c)
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/FPS-rSOAP.idxs' % (sn, c)
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, pn[0], kn[0])
                
                # Create directories
                subprocess.call(['mkdir', '-p', workDir])

                subprocess.call(['python', 'propertyRegression.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                '-p', p, 
                                '-Z', '14',
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-sigma', str(hypers[dictKey][1]),
                                '-zeta', '1',
                                '-ntrain', ss,
                                '-env',
                                '-lowmem',
                                '-output', workDir])

Current model: 1k 3.5 Energy Gaussian
Current model: 1k 3.5 Energy Linear
Current model: 1k 3.5 Volume Gaussian
Current model: 1k 3.5 Volume Linear
Current model: 1k 6.0 Energy Gaussian
Current model: 1k 6.0 Energy Linear
Current model: 1k 6.0 Volume Gaussian
Current model: 1k 6.0 Volume Linear
Current model: 10k 3.5 Energy Gaussian
Current model: 10k 3.5 Energy Linear
Current model: 10k 3.5 Volume Gaussian
Current model: 10k 3.5 Volume Linear
Current model: 10k 6.0 Energy Gaussian
Current model: 10k 6.0 Energy Linear
Current model: 10k 6.0 Volume Gaussian
Current model: 10k 6.0 Volume Linear


In [None]:
# TODO: DEEM 10k property decomposition with optimized energies goes here

In [16]:
# Decompose IZA based on DEEM 1k and 10k by using saved weights
                
# IZA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                print 'Current model: %s %s %s %s' % (sn, c, pn, kn)
                workDir = '../Processed_Data/IZAonDEEM_%s/%s/%s/%s' % (sn, pn, c, kn)
                dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
                wFile = '../Processed_Data/DEEM_%s/%s/%s/%s/w.dat' % (sn, pn, c, kn)
                projFile = '../Processed_Data/IZAonDEEM_%s/PCA/%s/SOAPFiles.dat' % (sn, c)
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, pn[0], kn[0])
                
                # Create directories
                subprocess.call(['mkdir', '-p', workDir])

                subprocess.call(['python', 'propertyRegression.py',
                                '-structure', structureFile,
                                '-soap', '%s/SOAPFiles.dat' % dataDir,
                                '-idxs', '%s/FPS-rSOAP.idxs' % dataDir,
                                '-p', p, 
                                '-Z', '14',
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-sigma', str(hypers[dictKey][1]),
                                '-zeta', '1',
                                '-ntrain', ss,
                                '-w', wFile,
                                '-project', projFile,
                                '-lowmem',
                                '-output', workDir])

Current model: 1k 3.5 Energy Gaussian
Current model: 1k 3.5 Energy Linear
Current model: 1k 3.5 Volume Gaussian
Current model: 1k 3.5 Volume Linear
Current model: 1k 6.0 Energy Gaussian
Current model: 1k 6.0 Energy Linear
Current model: 1k 6.0 Volume Gaussian
Current model: 1k 6.0 Volume Linear
Current model: 10k 3.5 Energy Gaussian
Current model: 10k 3.5 Energy Linear
Current model: 10k 3.5 Volume Gaussian
Current model: 10k 3.5 Volume Linear
Current model: 10k 6.0 Energy Gaussian
Current model: 10k 6.0 Energy Linear
Current model: 10k 6.0 Volume Gaussian
Current model: 10k 6.0 Volume Linear


In [30]:
# Decompose IZA based on DEEM 1k and 10k by using saved weights
                
# OPT IZA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                print 'Current model: %s %s %s %s' % (sn, c, pn, kn)
                workDir = '../Processed_Data/IZA_OPTonDEEM_%s/%s/%s/%s' % (sn, pn, c, kn)
                dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
                wFile = '../Processed_Data/DEEM_%s/%s/%s/%s/w.dat' % (sn, pn, c, kn)
                projFile = '../Processed_Data/IZA_OPTonDEEM_%s/PCA/%s/SOAPFiles.dat' % (sn, c)
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, pn[0], kn[0])
                
                # Create directories
                subprocess.call(['mkdir', '-p', workDir])

                subprocess.call(['python', 'propertyRegression.py',
                                '-structure', structureFile,
                                '-soap', '%s/SOAPFiles.dat' % dataDir,
                                '-idxs', '%s/FPS-rSOAP.idxs' % dataDir,
                                '-p', p, 
                                '-Z', '14',
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-sigma', str(hypers[dictKey][1]),
                                '-zeta', '1',
                                '-ntrain', ss,
                                '-w', wFile,
                                '-project', projFile,
                                '-lowmem',
                                '-output', workDir])

Current model: 1k 3.5 Energy Gaussian
Current model: 1k 3.5 Energy Linear
Current model: 1k 3.5 Volume Gaussian
Current model: 1k 3.5 Volume Linear
Current model: 1k 6.0 Energy Gaussian
Current model: 1k 6.0 Energy Linear
Current model: 1k 6.0 Volume Gaussian
Current model: 1k 6.0 Volume Linear
Current model: 10k 3.5 Energy Gaussian
Current model: 10k 3.5 Energy Linear
Current model: 10k 3.5 Volume Gaussian
Current model: 10k 3.5 Volume Linear
Current model: 10k 6.0 Energy Gaussian
Current model: 10k 6.0 Energy Linear
Current model: 10k 6.0 Volume Gaussian
Current model: 10k 6.0 Volume Linear


In [30]:
# Decompose IZA
# TODO: use this or decompose IZA with Deem SOAP components? Probably the latter, i.e., delete this cell                
# OPT IZA
for c in cutoffs:
    for p, pn in zip(['Energy_per_Si_Opt', 'volume'], propertyNames):
        for k, kn in zip(kernels, kernelNames):
            print 'Current model: %s %s %s' % (c, pn, kn)
            workDir = '../Processed_Data/IZA_OPT/%s/%s/%s' % (pn, c, kn)
            structureFile = '../Raw_Data/GULP/IZA_OPT.xyz'
            dataFile = '../Processed_Data/IZA_OPT/PCA/%s/SOAPFiles.dat' % c
            idxsFile = '../Processed_Data/IZA_OPT/PCA/%s/FPS-rSOAP.idxs' % c
            dictKey = 'IZA_OPT-SOAP%s-%s-%s' % (c, pn[0], kn[0])
                
            # Create directories
            subprocess.call(['mkdir', '-p', workDir])

            subprocess.call(['python', 'propertyRegression.py',
                            '-structure', structureFile,
                            '-soap', dataFile,
                            '-idxs', idxsFile,
                            '-p', p, 
                            '-Z', '14',
                            '-kernel', k,
                            '-width', str(hypers[dictKey][0]),
                            '-sigma', str(hypers[dictKey][1]),
                            '-zeta', '1',
                            '-ntrain', '35',
                            '-env',
                            '-lowmem',
                            '-output', workDir])

Current model: 1k 3.5 Energy Gaussian
Current model: 1k 3.5 Energy Linear
Current model: 1k 3.5 Volume Gaussian
Current model: 1k 3.5 Volume Linear
Current model: 1k 6.0 Energy Gaussian
Current model: 1k 6.0 Energy Linear
Current model: 1k 6.0 Volume Gaussian
Current model: 1k 6.0 Volume Linear
Current model: 10k 3.5 Energy Gaussian
Current model: 10k 3.5 Energy Linear
Current model: 10k 3.5 Volume Gaussian
Current model: 10k 3.5 Volume Linear
Current model: 10k 6.0 Energy Gaussian
Current model: 10k 6.0 Energy Linear
Current model: 10k 6.0 Volume Gaussian
Current model: 10k 6.0 Volume Linear


# Find unique environments with FPS

## DEEM 1k

In [17]:
# Find unique environments with FPS
for c in cutoffs:
    print 'Current construction: %s' % c
    workDir = '../Processed_Data/DEEM_1k/PCA/%s' % c
    
    # SOAP
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/SOAPFiles.dat' % workDir,
                    '-fps', '75000',
                    '-nobatch',
                    '-d',
                    '-output', workDir])
    
    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-uSOAP.idxs' % workDir)
    
    # PCA
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/PCAFiles.dat' % workDir,
                    '-fps', '75000',
                    '-nobatch',
                    '-d',
                    '-output', workDir])
    
    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-uPCA.idxs' % workDir)
    
    # KPCA
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/KPCAFiles.dat' % workDir,
                    '-fps', '75000',
                    '-nobatch',
                    '-d',
                    '-output', workDir])
    
    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-uKPCA.idxs' % workDir)

Current construction: 3.5
Current construction: 6.0


## DEEM 10k

In [19]:
# Find unique environments with FPS
for c in cutoffs:
    print 'Current construction: %s' % c
    workDir = '../Processed_Data/DEEM_10k/PCA/%s' % c
    
    # SOAP
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/SOAPFiles.dat' % workDir,
                    '-fps', '75000',
                    '-nobatch',
                    '-d',
                    '-output', workDir])
    
    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-uSOAP.idxs' % workDir)
    
    # PCA
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/PCAFiles.dat' % workDir,
                    '-fps', '75000',
                    '-nobatch',
                    '-d',
                    '-output', workDir])
    
    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-uPCA.idxs' % workDir)
    
    # KPCA
    subprocess.call(['python', 'FPS.py',
                    '-soap', '%s/KPCAFiles.dat' % workDir,
                    '-fps', '75000',
                    '-nobatch',
                    '-d',
                    '-output', workDir])
    
    os.rename('%s/FPS.idxs' % workDir,
              '%s/FPS-uKPCA.idxs' % workDir)

Current construction: 3.5
Current construction: 6.0


## IZA

In [15]:
# Find unique environments with FPS
for iza in ['IZA', 'IZAonDEEM_1k', 'IZAonDEEM_10k']:
    for c in cutoffs:
        print 'Current construction: %s %s' % (iza, c)
        workDir = '../Processed_Data/%s/PCA/%s' % (iza, c)

        # SOAP
        subprocess.call(['python', 'FPS.py',
                        '-soap', '%s/SOAPFiles.dat' % workDir,
                        '-fps', '75000',
                        '-nobatch',
                        '-d',
                        '-output', workDir])

        os.rename('%s/FPS.idxs' % workDir,
                  '%s/FPS-uSOAP.idxs' % workDir)

        # PCA
        subprocess.call(['python', 'FPS.py',
                        '-soap', '%s/PCAFiles.dat' % workDir,
                        '-fps', '75000',
                        '-nobatch',
                        '-d',
                        '-output', workDir])

        os.rename('%s/FPS.idxs' % workDir,
                  '%s/FPS-uPCA.idxs' % workDir)

        # KPCA
        if iza == 'IZA':
            subprocess.call(['python', 'FPS.py',
                            '-soap', '%s/KPCAFiles.dat' % workDir,
                            '-fps', '75000',
                            '-nobatch',
                            '-d',
                            '-output', workDir])

            os.rename('%s/FPS.idxs' % workDir,
                      '%s/FPS-uKPCA.idxs' % workDir)
        
        else:
            subprocess.call(['python', 'FPS.py',
                            '-soap', '%s/projFiles.dat' % workDir,
                            '-fps', '75000',
                            '-nobatch',
                            '-d',
                            '-output', workDir])

            os.rename('%s/FPS.idxs' % workDir,
                      '%s/FPS-uKPCA.idxs' % workDir)

Current construction: IZA 3.5
Current construction: IZA 6.0
Current construction: IZAonDEEM_1k 3.5
Current construction: IZAonDEEM_1k 6.0
Current construction: IZAonDEEM_10k 3.5
Current construction: IZAonDEEM_10k 6.0


In [31]:
# Find unique environments with FPS
for iza in ['IZA_OPT', 'IZA_OPTonDEEM_1k', 'IZA_OPTonDEEM_10k']:
    for c in cutoffs:
        print 'Current construction: %s %s' % (iza, c)
        workDir = '../Processed_Data/%s/PCA/%s' % (iza, c)

        # SOAP
        subprocess.call(['python', 'FPS.py',
                        '-soap', '%s/SOAPFiles.dat' % workDir,
                        '-fps', '75000',
                        '-nobatch',
                        '-d',
                        '-output', workDir])

        os.rename('%s/FPS.idxs' % workDir,
                  '%s/FPS-uSOAP.idxs' % workDir)

        # PCA
        subprocess.call(['python', 'FPS.py',
                        '-soap', '%s/PCAFiles.dat' % workDir,
                        '-fps', '75000',
                        '-nobatch',
                        '-d',
                        '-output', workDir])

        os.rename('%s/FPS.idxs' % workDir,
                  '%s/FPS-uPCA.idxs' % workDir)

        # KPCA
        if iza == 'IZA_OPT':
            subprocess.call(['python', 'FPS.py',
                            '-soap', '%s/KPCAFiles.dat' % workDir,
                            '-fps', '75000',
                            '-nobatch',
                            '-d',
                            '-output', workDir])

            os.rename('%s/FPS.idxs' % workDir,
                      '%s/FPS-uKPCA.idxs' % workDir)
        
        else:
            subprocess.call(['python', 'FPS.py',
                            '-soap', '%s/projFiles.dat' % workDir,
                            '-fps', '75000',
                            '-nobatch',
                            '-d',
                            '-output', workDir])

            os.rename('%s/FPS.idxs' % workDir,
                      '%s/FPS-uKPCA.idxs' % workDir)

Current construction: IZA_OPT 3.5
Current construction: IZA_OPT 6.0
Current construction: IZA_OPTonDEEM_1k 3.5
Current construction: IZA_OPTonDEEM_1k 6.0
Current construction: IZA_OPTonDEEM_10k 3.5
Current construction: IZA_OPTonDEEM_10k 6.0


# Distance Histograms

## SOAP

In [5]:
# Build random selection indices for the distance histograms
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        workDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)

        # Run FPS.py with -nr 2000 on the DEEM SOAPs
        randomArgs = ['python', 'FPS.py',
                      '-soap', '%s/SOAPFiles.dat' % workDir,
                      '-nr', '2000',
                      '-output', workDir]
        subprocess.call(randomArgs)

        # Give the resulting index file a histogram-specific name
        randomSrc = '%s/random.idxs' % workDir
        randomDst = '%s/random-dhSOAP.idxs' % workDir
        os.rename(randomSrc, randomDst)

In [10]:
# Build FPS selection indices for the distance histograms
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        workDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)

        # Run FPS.py with -fps 5000 on the DEEM SOAPs
        fpsArgs = ['python', 'FPS.py',
                   '-soap', '%s/SOAPFiles.dat' % workDir,
                   '-fps', '5000',
                   '-nobatch',
                   '-output', workDir]
        subprocess.call(fpsArgs)

        # Give the resulting index file a histogram-specific name
        fpsSrc = '%s/FPS.idxs' % workDir
        fpsDst = '%s/FPS-dhSOAP.idxs' % workDir
        os.rename(fpsSrc, fpsDst)

In [11]:
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for k, kn in zip(kernels, kernelNames):
            for r, rn in zip(('FPS-dhSOAP.idxs', 'random-dhSOAP.idxs'), ('FPS', 'Random')):
                print 'Current construction: %s %s %s %s' % (sn, c, kn, rn)
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, 'V', 'G') # Use volume as default choice
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/%s' % (sn, c, r)
                deemDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
                izaDir = '../Processed_Data/IZAonDEEM_%s/PCA/%s' % (sn, c)
                workDir = '../Processed_Data/IZAonDEEM_%s/Histograms/SOAP/%s/%s/%s' % (sn, c, kn, rn)

                # Create directories
                subprocess.call(['mkdir', '-p', workDir])

                subprocess.call(['python', 'histograms.py',
                                '-deem', '%s/SOAPFiles.dat' % deemDir,
                                '-idxs', idxsFile,
                                '-iza', '%s/SOAPFiles.dat' % izaDir,
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-zeta', '1',
                                '-nbins', '200',
                                '-output', workDir])

Current construction: 1k 3.5 Gaussian FPS
Current construction: 1k 3.5 Gaussian Random
Current construction: 1k 3.5 Linear FPS
Current construction: 1k 3.5 Linear Random
Current construction: 1k 6.0 Gaussian FPS
Current construction: 1k 6.0 Gaussian Random
Current construction: 1k 6.0 Linear FPS
Current construction: 1k 6.0 Linear Random
Current construction: 10k 3.5 Gaussian FPS
Current construction: 10k 3.5 Gaussian Random
Current construction: 10k 3.5 Linear FPS
Current construction: 10k 3.5 Linear Random
Current construction: 10k 6.0 Gaussian FPS
Current construction: 10k 6.0 Gaussian Random
Current construction: 10k 6.0 Linear FPS
Current construction: 10k 6.0 Linear Random


In [13]:
# Opt IZA
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for k, kn in zip(kernels, kernelNames):
            for r, rn in zip(('FPS-dhSOAP.idxs', 'random-dhSOAP.idxs'), ('FPS', 'Random')):
                print 'Current construction: %s %s %s %s' % (sn, c, kn, rn)
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, 'V', 'G') # Use volume as default choice
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/%s' % (sn, c, r)
                deemDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
                izaDir = '../Processed_Data/IZA_OPTonDEEM_%s/PCA/%s' % (sn, c)
                workDir = '../Processed_Data/IZA_OPTonDEEM_%s/Histograms/SOAP/%s/%s/%s' % (sn, c, kn, rn)

                # Create directories
                subprocess.call(['mkdir', '-p', workDir])

                subprocess.call(['python', 'histograms.py',
                                '-deem', '%s/SOAPFiles.dat' % deemDir,
                                '-idxs', idxsFile,
                                '-iza', '%s/SOAPFiles.dat' % izaDir,
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-zeta', '1',
                                '-nbins', '200',
                                '-output', workDir])

Current construction: 1k 3.5 Gaussian FPS
Current construction: 1k 3.5 Gaussian Random
Current construction: 1k 3.5 Linear FPS
Current construction: 1k 3.5 Linear Random
Current construction: 1k 6.0 Gaussian FPS
Current construction: 1k 6.0 Gaussian Random
Current construction: 1k 6.0 Linear FPS
Current construction: 1k 6.0 Linear Random
Current construction: 10k 3.5 Gaussian FPS
Current construction: 10k 3.5 Gaussian Random
Current construction: 10k 3.5 Linear FPS
Current construction: 10k 3.5 Linear Random
Current construction: 10k 6.0 Gaussian FPS
Current construction: 10k 6.0 Gaussian Random
Current construction: 10k 6.0 Linear FPS
Current construction: 10k 6.0 Linear Random
