In [None]:
import os
import subprocess
import sys

import numpy as np

def getMinIdx(filename):
    """Return the (width, sigma) pair with the lowest error in a saved grid.

    Loads the array stored in `filename` with `np.load`, takes the 2-D slice
    `x[0, :, :, 0, 0, 5]` as the error surface, and reads entries 1 (width)
    and 2 (sigma) of the last axis at the position of the smallest error.

    Parameters
    ----------
    filename : str
        Path to a .npy file holding an array with at least six axes.

    Returns
    -------
    tuple
        `(width, sigma)` taken from the array at the minimum-error position.
    """
    x = np.load(filename)
    xErr = x[0, :, :, 0, 0, 5]

    # argmin indexes the flattened slice; unravel_index turns it back into
    # integer (row, column) indices. The former `xMin/nCol` yields a float
    # under Python 3 true division, which numpy does not accept as an index.
    row, col = np.unravel_index(np.argmin(xErr), xErr.shape)

    # Output width, sigma
    return (x[0, row, col, 0, 0, 1],
            x[0, row, col, 0, 0, 2])

# Parallel lists: each value passed to the scripts is paired (by position)
# with the short name used in directory names and dictionary keys.
sampleSizes, sampleNames = ['1000', '10000'], ['1k', '10k']
properties, propertyNames = ['Energy_per_Si', 'volume'], ['Energy', 'Volume']
kernels, kernelNames = ['gaussian', 'linear'], ['Gaussian', 'Linear']

# Values iterated as `c` by the SOAP/PCA cells
cutoffs = ['3.5', '6.0']

# Filled in later with [width, sigma] lists, keyed by a descriptive string
hypers = dict()

# Sort ring files

In [None]:
# Ring-file variants; each one exists once per sample size
files = ['King_Binary', 'King_Distribution',
        'Short_Binary', 'Short_Distribution']

# Run sort_rings.py on every (sample size, variant) file, writing the result
# next to the input with a "_Sorted" suffix.
for ss in sampleSizes:
    for f in files:
        sortCmd = ['python', 'sort_rings.py']
        sortCmd += ['-input', '../Raw_Data/DEEM_%s_Rings_%s.xyz' % (ss, f)]
        sortCmd += ['-output', '../Raw_Data/DEEM_%s_Rings_%s_Sorted.xyz' % (ss, f)]
        subprocess.call(sortCmd)

# Make k-Folds

In [None]:
# Run kFolds.py once per sample size; the output goes to that sample's
# Processed_Data directory.
for ss, sn in zip(sampleSizes, sampleNames):
    foldCmd = ['python', 'kFolds.py', '-nt', ss, '-f', '1.0', '-k', '5']
    foldCmd.extend(['-output', '../Processed_Data/DEEM_%s' % sn])
    subprocess.call(foldCmd)

# Compute SOAP vectors

## Compute all SOAP vectors (1k only)

In [None]:
# SOAP pipeline for the 1k dataset, run once per entry of `cutoffs`:
#   1. compute full SOAP vectors
#   2. FPS-select 500 components (FPS.py with the -c flag)
#   3. recompute SOAP vectors restricted to those components
#   4. FPS-select 2000 representative environments
for c in cutoffs:
    structureFile = '../Raw_Data/DEEM_1000.xyz'
    workDir = '../Processed_Data/DEEM_1k/PCA/%s' % c
    soapFile = '%s/SOAPFiles.dat' % workDir

    # Options shared by both SOAP.py passes. '-c' follows the loop variable;
    # it used to be hard-coded to '3.5', so the 6.0 directory received the
    # same vectors as the 3.5 one.
    # NOTE(review): assumes SOAP.py's -c option is the cutoff -- confirm.
    soapCmd = ['python', 'SOAP.py',
               '-structure', structureFile,
               '-n', '12',
               '-l', '9',
               '-c', c,
               '-cw', '0.3',
               '-g', '0.3',
               '-Z', '14',
               '-z', '14', '8']

    # Compute full SOAP vectors
    subprocess.call(soapCmd + ['-output', workDir])

    # Select FPS components
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFile,
                     '-fps', '500',
                     '-c',
                     '-output', workDir])

    # Recompute SOAPs, retain only the FPS components
    subprocess.call(soapCmd + ['-idxs', '%s/FPS-c.idxs' % workDir,
                               '-output', workDir])

    # Select representative environments. The SOAP file is addressed through
    # workDir, as in the component selection above; the bare 'SOAPFiles.dat'
    # used before resolved against the notebook's working directory.
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFile,
                     '-fps', '2000',
                     '-output', workDir])

## Compute SOAP vectors (10k only)

In [None]:
# SOAP pipeline for the 10k dataset, run once per entry of `cutoffs`:
#   1. pick 2000 random structures
#   2. compute SOAP vectors for that random selection
#   3. FPS-select 500 components (FPS.py with the -c flag)
#   4. recompute SOAP vectors for the full structure file, restricted to
#      those components
#   5. FPS-select 2000 representative environments
for c in cutoffs:
    structureFile = '../Raw_Data/DEEM_10000.xyz'
    workDir = '../Processed_Data/DEEM_10k/PCA/%s' % c
    soapFile = '%s/SOAPFiles.dat' % workDir

    # SOAP.py options shared by both passes. '-c' follows the loop variable;
    # it used to be hard-coded to '3.5' for every cutoff directory.
    # NOTE(review): assumes SOAP.py's -c option is the cutoff -- confirm.
    soapOpts = ['-n', '12',
                '-l', '9',
                '-c', c,
                '-cw', '0.3',
                '-g', '0.3',
                '-Z', '14',
                '-z', '14', '8']

    # Select random structures
    subprocess.call(['python', 'randomStructureSelect.py',
                     '-structure', structureFile,
                     '-nt', '10000',
                     '-nr', '2000',
                     '-output', workDir])

    # Compute SOAP vectors for random structures
    subprocess.call(['python', 'SOAP.py',
                     '-structure', '%s/randomSelection.xyz' % workDir]
                    + soapOpts
                    + ['-output', workDir])

    # Select FPS components. Files written to workDir are read back through
    # workDir (as the 1k cell does for its first FPS call); the bare
    # 'SOAPFiles.dat' / 'FPS-c.idxs' names used before resolved against the
    # notebook's working directory instead.
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFile,
                     '-fps', '500',
                     '-c',
                     '-output', workDir])

    # Re-compute SOAPs, retain only FPS components
    subprocess.call(['python', 'SOAP.py',
                     '-structure', structureFile]
                    + soapOpts
                    + ['-idxs', '%s/FPS-c.idxs' % workDir,
                       '-output', workDir])

    # Select representative environments
    subprocess.call(['python', 'FPS.py',
                     '-soap', soapFile,
                     '-fps', '2000',
                     '-output', workDir])

## Hyperparameter optimization

In [None]:
# Width/sigma grid search on the SOAP vectors for every combination of
# sample size, cutoff, property and kernel. The best pair found by each run
# is stored in `hypers` under the key "<sample>-SOAP<cutoff>-<P>-<K>".
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                workDir = '../Processed_Data/DEEM_%s/%s/%s/ParameterSearch/%s' % (sn, pn, c, kn)
                dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
                foldDir = '../Processed_Data/DEEM_%s' % sn
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, pn[0], kn[0])

                subprocess.call(['python', 'learningCurves.py',
                                '-structure', structureFile,
                                '-soap', '%s/SOAPFiles.dat' % dataDir,
                                # the comma after this value was missing (SyntaxError)
                                '-idxs', '%s/FPS-rSOAP.idxs' % dataDir,
                                '-p', p,
                                '-Z', '14',
                                '-k', '5',
                                '-kernel', k,
                                '-width', '0.001', '0.003', '0.01', '0.03',
                                          '0.1', '0.3', '1.0', '3.0', '10.0',
                                '-sigma', '0.001', '0.003', '0.01', '0.03',
                                          '0.1', '0.3', '1.0', '3.0', '10.0',
                                '-zeta', '1',
                                '-ntrain', '8000',
                                '-train', '%s/kTrain.idxs' % foldDir,
                                '-validate', '%s/kValidate.idxs' % foldDir,
                                '-shuffle',
                                '-output', workDir])

                # Store [width, sigma] as a new entry. Assigning to
                # hypers[dictKey][0] raised KeyError because the key does not
                # exist yet in the initially empty dictionary.
                hypers[dictKey] = list(getMinIdx('%s/maeAvgTest.npy' % workDir))

# Build PCA

## Build linear PCA

In [None]:
# Linear PCA of the SOAP vectors, per sample size and cutoff. Three commands
# run in order inside the same data directory.
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)
        soapFile = '%s/SOAPFiles.dat' % dataDir

        pcaSteps = [
            # SOAP-PCA.py with -dopca, 500 components
            ['python', 'SOAP-PCA.py',
             '-soap', soapFile,
             '-dopca',
             '-pca', '500',
             '-output', dataDir],
            # SOAP-PCA.py with -dotransform, reading eigenvectors.dat and
            # mean.dat from the data directory
            ['python', 'SOAP-PCA.py',
             '-soap', soapFile,
             '-dotransform',
             '-w', '%s/eigenvectors.dat' % dataDir,
             '-mean', '%s/mean.dat' % dataDir,
             '-output', dataDir],
            # FPS.py on PCAFiles.dat, 2000 selections
            ['python', 'FPS.py',
             '-soap', '%s/PCAFiles.dat' % dataDir,
             '-fps', '2000',
             '-output', dataDir],
        ]
        for stepCmd in pcaSteps:
            subprocess.call(stepCmd)

## Build KPCA

In [None]:
# Kernel PCA of the SOAP vectors, per sample size and cutoff: center the
# data, run SOAP-KPCA.py on the centered file, then FPS.py on KPCAFiles.dat.
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        dataDir = '../Processed_Data/DEEM_%s/PCA/%s' % (sn, c)

        centerCmd = ['python', 'centerData.py']
        centerCmd += ['-soap', '%s/SOAPFiles.dat' % dataDir]
        subprocess.call(centerCmd)

        kpcaCmd = ['python', 'SOAP-KPCA.py']
        kpcaCmd += ['-soap', '%s/SOAPFiles-centered.dat' % dataDir]
        kpcaCmd += ['-pca', '500']
        kpcaCmd += ['-kernel', 'gaussian']
        kpcaCmd += ['-width', '1.0']  # optimal from SOAP kernel hyperparameter search
        kpcaCmd += ['-lowmem']
        kpcaCmd += ['-idxs', '%s/FPS-rSOAP.idxs' % dataDir]
        subprocess.call(kpcaCmd)

        fpsCmd = ['python', 'FPS.py']
        fpsCmd += ['-soap', '%s/KPCAFiles.dat' % dataDir]
        fpsCmd += ['-fps', '2000']
        fpsCmd += ['-output', dataDir]
        subprocess.call(fpsCmd)

# Build classical descriptors

## Distances

In [None]:
# Run buildClassicalDescriptors.py in 'distances' mode for each sample size.
for ss, sn in zip(sampleSizes, sampleNames):
    dataDir = '../Processed_Data/DEEM_%s/Distances' % sn

    distCmd = ['python', 'buildClassicalDescriptors.py']
    distCmd += ['-input', '../Raw_Data/DEEM_%s_Distances.xyz' % ss]
    distCmd += ['-output', '%s/distances.dat' % dataDir]
    distCmd += ['-p', 'distances']
    subprocess.call(distCmd)

## Angles

In [None]:
# Run buildClassicalDescriptors.py in 'angles' mode for each sample size.
for ss, sn in zip(sampleSizes, sampleNames):
    dataDir = '../Processed_Data/DEEM_%s/Angles' % sn

    angleCmd = ['python', 'buildClassicalDescriptors.py']
    angleCmd += ['-input', '../Raw_Data/DEEM_%s_Angles.xyz' % ss]
    angleCmd += ['-output', '%s/angles.dat' % dataDir]
    angleCmd += ['-p', 'angles']
    subprocess.call(angleCmd)

## Rings

In [None]:
# Run buildClassicalDescriptors.py in 'rings' mode on each sorted ring file:
# every sample size x {King, Short} x {Binary, Distribution}.
for ss, sn in zip(sampleSizes, sampleNames):
    for ks in ['King', 'Short']:
        for bd in ['Binary', 'Distribution']:
            dataDir = '../Processed_Data/DEEM_%s/Rings/%s/%s' % (sn, ks, bd)

            ringCmd = ['python', 'buildClassicalDescriptors.py']
            ringCmd += ['-input', '../Raw_Data/DEEM_%s_Rings_%s_%s_Sorted.xyz' % (ss, ks, bd)]
            ringCmd += ['-output', '%s/rings.dat' % dataDir]
            ringCmd += ['-p', 'rings']
            subprocess.call(ringCmd)

# Hyperparameter Optimization

## PCA

In [None]:
# Width/sigma grid search on the PCA-projected data for every combination of
# sample size, cutoff, property and kernel. The best pair found by each run
# is stored in `hypers` under the key "<sample>-PCA<cutoff>-<P>-<K>".
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                workDir = '../Processed_Data/DEEM_%s/%s/%s/PCALearn/ParameterSearch/%s' % (sn, pn, c, kn)
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dataFile = '../Processed_Data/DEEM_%s/PCA/%s/PCAFiles.dat' % (sn, c)
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/FPS-rPCA.idxs' % (sn, c)
                trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
                dictKey = '%s-PCA%s-%s-%s' % (sn, c, pn[0], kn[0])

                subprocess.call(['python', 'learningCurves.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                '-p', p,
                                '-Z', '14',
                                '-k', '5',
                                '-kernel', k,
                                '-width', '0.001', '0.003', '0.01', '0.03',
                                          '0.1', '0.3', '1.0', '3.0', '10.0',
                                '-sigma', '0.001', '0.003', '0.01', '0.03',
                                          '0.1', '0.3', '1.0', '3.0', '10.0',
                                '-zeta', '1',
                                '-ntrain', '8000',
                                '-train', trainFile,
                                '-validate', validateFile,
                                '-shuffle',
                                '-output', workDir])

                # Store [width, sigma] as a new entry. Assigning to
                # hypers[dictKey][0] raised KeyError when the key was absent.
                hypers[dictKey] = list(getMinIdx('%s/maeAvgTest.npy' % workDir))

## KPCA

In [None]:
# Width/sigma grid search on the KPCA-projected data for every combination
# of sample size, cutoff, property and kernel. The best pair found by each
# run is stored in `hypers` under the key "<sample>-KPCA<cutoff>-<P>-<K>".
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                workDir = '../Processed_Data/DEEM_%s/%s/%s/KPCALearn/ParameterSearch/%s' % (sn, pn, c, kn)
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dataFile = '../Processed_Data/DEEM_%s/PCA/%s/KPCAFiles.dat' % (sn, c)
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/FPS-rKPCA.idxs' % (sn, c)
                trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
                dictKey = '%s-KPCA%s-%s-%s' % (sn, c, pn[0], kn[0])

                subprocess.call(['python', 'learningCurves.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                '-p', p,
                                '-Z', '14',
                                '-k', '5',
                                # Follow the kernel loop. This was hard-coded
                                # to 'gaussian', so the run saved under the
                                # 'Linear' directory and '-L' key was a
                                # second gaussian search.
                                '-kernel', k,
                                '-width', '0.001', '0.003', '0.01', '0.03',
                                          '0.1', '0.3', '1.0', '3.0', '10.0',
                                '-sigma', '0.001', '0.003', '0.01', '0.03',
                                          '0.1', '0.3', '1.0', '3.0', '10.0',
                                '-zeta', '1',
                                '-ntrain', '8000',
                                '-train', trainFile,
                                '-validate', validateFile,
                                '-shuffle',
                                '-output', workDir])

                # Store [width, sigma] as a new entry. Assigning to
                # hypers[dictKey][0] raised KeyError when the key was absent.
                hypers[dictKey] = list(getMinIdx('%s/maeAvgTest.npy' % workDir))

## Distances

In [None]:
# Width/sigma grid search on the distance descriptors for every combination
# of sample size, property and kernel. The best pair found by each run is
# stored in `hypers` under the key "<sample>-Dist-<P>-<K>".
for ss, sn in zip(sampleSizes, sampleNames):
    for p, pn in zip(properties, propertyNames):
        for k, kn in zip(kernels, kernelNames):
            workDir = '../Processed_Data/DEEM_%s/%s/Distances/ParameterSearch/%s' % (sn, pn, kn)
            structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
            soapFile = '../Processed_Data/DEEM_%s/Distances/distanceFiles.dat' % sn
            idxsFile = '../Processed_Data/DEEM_%s/Distances/FPS-rDistances.idxs' % sn
            trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
            validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
            dictKey = '%s-Dist-%s-%s' % (sn, pn[0], kn[0])

            subprocess.call(['python', 'learningCurves.py',
                            '-structure', structureFile,
                            '-soap', soapFile,
                            '-idxs', idxsFile,
                            '-p', p,
                            '-Z', '14',
                            '-k', '5',
                            '-kernel', k,
                            '-width', '0.001', '0.003', '0.01', '0.03',
                                      '0.1', '0.3', '1.0',
                            '-sigma', '0.001', '0.003', '0.01', '0.03',
                                      '0.1', '0.3', '1.0',
                            '-zeta', '1',
                            '-ntrain', '8000',
                            '-train', trainFile,
                            '-validate', validateFile,
                            '-shuffle',
                            '-output', workDir])

            # Store [width, sigma] as a new entry. Assigning to
            # hypers[dictKey][0] raised KeyError when the key was absent.
            hypers[dictKey] = list(getMinIdx('%s/maeAvgTest.npy' % workDir))

## Angles

In [None]:
# Width/sigma grid search on the angle descriptors for every combination of
# sample size, property and kernel. The best pair found by each run is
# stored in `hypers` under the key "<sample>-Ang-<P>-<K>".
for ss, sn in zip(sampleSizes, sampleNames):
    for p, pn in zip(properties, propertyNames):
        for k, kn in zip(kernels, kernelNames):
            workDir = '../Processed_Data/DEEM_%s/%s/Angles/ParameterSearch/%s' % (sn, pn, kn)
            structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
            dataFile = '../Processed_Data/DEEM_%s/Angles/angleFiles.dat' % sn
            idxsFile = '../Processed_Data/DEEM_%s/Angles/FPS-rAngles.idxs' % sn
            trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
            validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
            dictKey = '%s-Ang-%s-%s' % (sn, pn[0], kn[0])

            subprocess.call(['python', 'learningCurves.py',
                            '-structure', structureFile,
                            '-soap', dataFile,
                            '-idxs', idxsFile,
                            '-p', p,
                            '-Z', '14',
                            '-k', '5',
                            '-kernel', k,
                            '-width', '1.0', '3.0', '10.0', '30.0',
                                      '100.0', '300.0', '1000.0',
                            '-sigma', '0.001', '0.003', '0.01', '0.03',
                                      '0.1', '0.3', '1.0',
                            '-zeta', '1',
                            '-ntrain', '8000',
                            '-train', trainFile,
                            '-validate', validateFile,
                            '-shuffle',
                            '-output', workDir])

            # Store [width, sigma] as a new entry. Assigning to
            # hypers[dictKey][0] raised KeyError when the key was absent.
            hypers[dictKey] = list(getMinIdx('%s/maeAvgTest.npy' % workDir))

## Rings

In [None]:
# Width/sigma grid search on the ring descriptors for every combination of
# sample size, property, ring type (King/Short), encoding
# (Binary/Distribution) and kernel. The best pair found by each run is
# stored in `hypers` under keys such as "1k-KingB-E-G".
for ss, sn in zip(sampleSizes, sampleNames):
    for p, pn in zip(properties, propertyNames):
        for ks in ['King', 'Short']:
            for bd in ['Binary', 'Distribution']:
                for k, kn in zip(kernels, kernelNames):
                    workDir = '../Processed_Data/DEEM_%s/%s/Rings/%s/%s/ParameterSearch/%s' % (sn, pn, ks, bd, kn)
                    structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                    dataFile = '../Processed_Data/DEEM_%s/Rings/%s/%s/ringFiles.dat' % (sn, ks, bd)
                    idxsFile = '../Processed_Data/DEEM_%s/Rings/%s/%s/FPS-rRings.idxs' % (sn, ks, bd)
                    trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                    validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
                    dictKey = '%s-%s%s-%s-%s' % (sn, ks, bd[0], pn[0], kn[0])

                    subprocess.call(['python', 'learningCurves.py',
                                    '-structure', structureFile,
                                    '-soap', dataFile,
                                    '-idxs', idxsFile,
                                    '-p', p,
                                    '-Z', '14',
                                    '-k', '5',
                                    # was 'kernel' without the leading dash,
                                    # unlike every other learningCurves.py call
                                    '-kernel', k,
                                    # NOTE(review): this grid has no '30.0',
                                    # unlike the angles search -- confirm that
                                    # is intended.
                                    '-width', '1.0', '3.0', '10.0', '100.0',
                                              '300.0', '1000.0',
                                    '-sigma', '1E-5', '3E-5', '1E-4', '3E-4',
                                              '0.001', '0.003', '0.01', '0.03',
                                              '0.1', '0.3', '1.0',
                                    '-zeta', '1',
                                    '-ntrain', '8000',
                                    '-train', trainFile,
                                    '-validate', validateFile,
                                    '-shuffle',
                                    '-output', workDir])

                    # Store [width, sigma] as a new entry. Assigning to
                    # hypers[dictKey][0] raised KeyError when the key was absent.
                    hypers[dictKey] = list(getMinIdx('%s/maeAvgTest.npy' % workDir))

# Dictionary of hyperparameters

In [None]:
# Manual definition (overwrite) of hyperparameters

# Dictionary string has the format "S-R-P-K":
# S = sample size, {1k, 10k}
# R = representation, {SOAP3.5, SOAP6.0, PCA3.5, PCA6.0, KPCA3.5, KPCA6.0,
#                      Dist, Ang, KingB, KingD, ShortB, ShortD}
# P = property, {E, V}
# K = kernel, {G, L}

# Each element of the dictionary has the hypers [width, sigma].
# None marks a value that has not been entered yet. Every entry carries both
# slots so that hypers[key][0] and hypers[key][1] are always valid lookups;
# the former `[, ]` placeholders were a SyntaxError and `[None, ]` was a
# one-element list.
# TODO: replace the remaining None values before running the cells that pass
# str(hypers[key][i]) to the learning scripts.

hypers = {
    '1k-SOAP3.5-V-G':  [0.3, 0.03],
    '1k-SOAP3.5-V-L':  [None, None],
    '1k-SOAP3.5-E-G':  [0.3, 0.03],
    '1k-SOAP3.5-E-L':  [None, None],
    '1k-SOAP6.0-V-G':  [1.0, 0.03],
    '1k-SOAP6.0-V-L':  [None, None],
    '1k-SOAP6.0-E-G':  [3.0, 0.003],
    '1k-SOAP6.0-E-L':  [None, None],
    '1k-PCA3.5-V-G':   [None, None],
    '1k-PCA3.5-V-L':   [None, None],
    '1k-PCA3.5-E-G':   [None, None],
    '1k-PCA3.5-E-L':   [None, None],
    '1k-PCA6.0-V-G':   [None, None],
    '1k-PCA6.0-V-L':   [None, None],
    '1k-PCA6.0-E-G':   [None, None],
    '1k-PCA6.0-E-L':   [None, None],
    '1k-KPCA3.5-V-G':  [1.0, 0.03],
    '1k-KPCA3.5-V-L':  [None, None],
    '1k-KPCA3.5-E-G':  [3.0, 0.03],
    '1k-KPCA3.5-E-L':  [None, None],
    '1k-KPCA6.0-V-G':  [1.0, 0.03],
    '1k-KPCA6.0-V-L':  [None, None],
    '1k-KPCA6.0-E-G':  [3.0, 0.01],
    '1k-KPCA6.0-E-L':  [None, None],
    '1k-Dist-V-G':     [None, None],
    '1k-Dist-V-L':     [None, None],
    '1k-Dist-E-G':     [None, None],
    '1k-Dist-E-L':     [None, None],
    '1k-Ang-V-G':      [None, None],
    '1k-Ang-V-L':      [None, None],
    '1k-Ang-E-G':      [None, None],
    '1k-Ang-E-L':      [None, None],
    '1k-KingB-V-G':    [None, None],
    '1k-KingB-V-L':    [None, None],
    '1k-KingB-E-G':    [None, None],
    '1k-KingB-E-L':    [None, None],
    '1k-KingD-V-G':    [None, None],
    '1k-KingD-V-L':    [None, None],
    '1k-KingD-E-G':    [None, None],
    '1k-KingD-E-L':    [None, None],
    '1k-ShortB-V-G':   [None, None],
    '1k-ShortB-V-L':   [None, None],
    '1k-ShortB-E-G':   [None, None],
    '1k-ShortB-E-L':   [None, None],
    '1k-ShortD-V-G':   [None, None],
    '1k-ShortD-V-L':   [None, None],
    '1k-ShortD-E-G':   [None, None],
    '1k-ShortD-E-L':   [None, None],
    '10k-SOAP3.5-V-G': [None, None],
    '10k-SOAP3.5-V-L': [None, None],
    '10k-SOAP3.5-E-G': [None, None],
    '10k-SOAP3.5-E-L': [None, None],
    '10k-SOAP6.0-V-G': [None, None],
    '10k-SOAP6.0-V-L': [None, None],
    '10k-SOAP6.0-E-G': [None, None],
    '10k-SOAP6.0-E-L': [None, None],
    '10k-PCA3.5-V-G':  [None, None],
    '10k-PCA3.5-V-L':  [None, None],
    '10k-PCA3.5-E-G':  [None, None],
    '10k-PCA3.5-E-L':  [None, None],
    '10k-PCA6.0-V-G':  [None, None],
    '10k-PCA6.0-V-L':  [None, None],
    '10k-PCA6.0-E-G':  [None, None],
    '10k-PCA6.0-E-L':  [None, None],
    '10k-KPCA3.5-V-G': [1.0, 0.03],
    '10k-KPCA3.5-V-L': [None, None],
    '10k-KPCA3.5-E-G': [None, None],
    '10k-KPCA3.5-E-L': [None, None],
    '10k-KPCA6.0-V-G': [None, None],
    '10k-KPCA6.0-V-L': [None, None],
    '10k-KPCA6.0-E-G': [None, None],
    '10k-KPCA6.0-E-L': [None, None],
    '10k-Dist-V-G':    [None, None],
    '10k-Dist-V-L':    [None, None],
    '10k-Dist-E-G':    [None, None],
    '10k-Dist-E-L':    [None, None],
    '10k-Ang-V-G':     [None, None],
    '10k-Ang-V-L':     [None, None],
    '10k-Ang-E-G':     [None, None],
    '10k-Ang-E-L':     [None, None],
    '10k-KingB-V-G':   [None, None],
    '10k-KingB-V-L':   [None, None],
    '10k-KingB-E-G':   [None, None],
    '10k-KingB-E-L':   [None, None],
    '10k-KingD-V-G':   [None, None],
    '10k-KingD-V-L':   [None, None],
    '10k-KingD-E-G':   [None, None],
    '10k-KingD-E-L':   [None, None],
    '10k-ShortB-V-G':  [None, None],
    '10k-ShortB-V-L':  [None, None],
    '10k-ShortB-E-G':  [None, None],
    '10k-ShortB-E-L':  [None, None],
    '10k-ShortD-V-G':  [None, None],
    '10k-ShortD-V-L':  [None, None],
    '10k-ShortD-E-G':  [None, None],
    '10k-ShortD-E-L':  [None, None],
}

# Learning Curves

## SOAP

In [None]:
# Learning curves on the SOAP vectors for every combination of sample size,
# cutoff, property and kernel, using the [width, sigma] stored in `hypers`.
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                # One placeholder per argument. The template previously had
                # five %s for four values and raised
                # "TypeError: not enough arguments for format string".
                # NOTE(review): layout chosen to mirror the ParameterSearch
                # directory of the SOAP search, minus the 'ParameterSearch'
                # level -- confirm the intended output directory.
                workDir = '../Processed_Data/DEEM_%s/%s/%s/%s' % (sn, pn, c, kn)
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dataFile = '../Processed_Data/DEEM_%s/PCA/%s/SOAPFiles.dat' % (sn, c)
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/FPS-rSOAP.idxs' % (sn, c)
                trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, pn[0], kn[0])

                subprocess.call(['python', 'learningCurves.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                '-p', p,
                                '-Z', '14',
                                '-k', '5',
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-sigma', str(hypers[dictKey][1]),
                                '-zeta', '1',
                                '-ntrain', '10', '30', '50', '100', '300', '500',
                                           '1000', '3000', '5000', '8000',
                                '-train', trainFile,
                                '-validate', validateFile,
                                '-shuffle',
                                '-output', workDir])

## PCA

In [None]:
# Learning curves on the PCA-projected data for every combination of sample
# size, cutoff, property and kernel, using the [width, sigma] stored in
# `hypers`.
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                # One placeholder per argument. The template previously had
                # five %s for four values and raised
                # "TypeError: not enough arguments for format string".
                # NOTE(review): layout mirrors the PCALearn/ParameterSearch
                # directory of the PCA search, minus the 'ParameterSearch'
                # level -- confirm the intended output directory.
                workDir = '../Processed_Data/DEEM_%s/%s/%s/PCALearn/%s' % (sn, pn, c, kn)
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dataFile = '../Processed_Data/DEEM_%s/PCA/%s/PCAFiles.dat' % (sn, c)
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/FPS-rPCA.idxs' % (sn, c)
                trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
                dictKey = '%s-PCA%s-%s-%s' % (sn, c, pn[0], kn[0])

                subprocess.call(['python', 'learningCurves.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                # Follow the property loop. This was
                                # hard-coded to 'volume', so the run stored
                                # under the Energy directory also learned
                                # the volume.
                                '-p', p,
                                '-Z', '14',
                                '-k', '5',
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-sigma', str(hypers[dictKey][1]),
                                '-zeta', '1',
                                '-ntrain', '10', '30', '50', '100', '300', '500',
                                           '1000', '3000', '5000', '8000',
                                '-train', trainFile,
                                '-validate', validateFile,
                                '-shuffle',
                                '-output', workDir])

## KPCA

In [None]:
# Learning curves on the KPCA-projected data for every combination of sample
# size, cutoff, property and kernel, using the [width, sigma] stored in
# `hypers`.
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                # One placeholder per argument. The template previously had
                # five %s for four values and raised
                # "TypeError: not enough arguments for format string".
                # NOTE(review): layout mirrors the KPCALearn/ParameterSearch
                # directory of the KPCA search, minus the 'ParameterSearch'
                # level -- confirm the intended output directory.
                workDir = '../Processed_Data/DEEM_%s/%s/%s/KPCALearn/%s' % (sn, pn, c, kn)
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dataFile = '../Processed_Data/DEEM_%s/PCA/%s/KPCAFiles.dat' % (sn, c)
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/FPS-rKPCA.idxs' % (sn, c)
                trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
                dictKey = '%s-KPCA%s-%s-%s' % (sn, c, pn[0], kn[0])

                subprocess.call(['python', 'learningCurves.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                # Follow the property loop. This was
                                # hard-coded to 'volume', so the run stored
                                # under the Energy directory also learned
                                # the volume.
                                '-p', p,
                                '-Z', '14',
                                '-k', '5',
                                '-kernel', k,
                                '-width', str(hypers[dictKey][0]),
                                '-sigma', str(hypers[dictKey][1]),
                                '-zeta', '1',
                                '-ntrain', '10', '30', '50', '100', '300', '500',
                                           '1000', '3000', '5000', '8000',
                                '-train', trainFile,
                                '-validate', validateFile,
                                '-shuffle',
                                '-output', workDir])

## Distances

In [None]:
# Learning curves on the distance descriptors, one learningCurves.py run per
# (sample size, property, kernel), with width/sigma taken from `hypers`.
for ss, sn in zip(sampleSizes, sampleNames):
    for p, pn in zip(properties, propertyNames):
        for k, kn in zip(kernels, kernelNames):
            # Output location and hyperparameter key for this combination
            workDir = '../Processed_Data/DEEM_%s/%s/Distances/%s' % (sn, pn, kn)
            dictKey = '%s-Dist-%s-%s' % (sn, pn[0], kn[0])

            # Input files
            structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
            dataFile = '../Processed_Data/DEEM_%s/Distances/distanceFiles.dat' % sn
            idxsFile = '../Processed_Data/DEEM_%s/Distances/FPS-rDistances.idxs' % sn
            trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
            validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn

            curveCmd = ['python', 'learningCurves.py',
                        '-structure', structureFile,
                        '-soap', dataFile,
                        '-idxs', idxsFile,
                        '-p', p,
                        '-Z', '14',
                        '-k', '5',
                        '-kernel', k]
            curveCmd += ['-width', str(hypers[dictKey][0])]
            curveCmd += ['-sigma', str(hypers[dictKey][1])]
            curveCmd += ['-zeta', '1']
            curveCmd += ['-ntrain', '10', '30', '50', '100', '300', '500',
                         '1000', '3000', '5000', '8000']
            curveCmd += ['-train', trainFile,
                         '-validate', validateFile,
                         '-shuffle',
                         '-output', workDir]
            subprocess.call(curveCmd)

## Angles

In [None]:
# Learning curves on the angle descriptors, one learningCurves.py run per
# (sample size, property, kernel), with width/sigma taken from `hypers`.
for ss, sn in zip(sampleSizes, sampleNames):
    for p, pn in zip(properties, propertyNames):
        for k, kn in zip(kernels, kernelNames):
            # Output location and hyperparameter key for this combination
            workDir = '../Processed_Data/DEEM_%s/%s/Angles/%s' % (sn, pn, kn)
            dictKey = '%s-Ang-%s-%s' % (sn, pn[0], kn[0])

            # Input files
            structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
            dataFile = '../Processed_Data/DEEM_%s/Angles/angleFiles.dat' % sn
            idxsFile = '../Processed_Data/DEEM_%s/Angles/FPS-rAngles.idxs' % sn
            trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
            validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn

            curveCmd = ['python', 'learningCurves.py',
                        '-structure', structureFile,
                        '-soap', dataFile,
                        '-idxs', idxsFile,
                        '-p', p,
                        '-Z', '14',
                        '-k', '5',
                        '-kernel', k]
            curveCmd += ['-width', str(hypers[dictKey][0])]
            curveCmd += ['-sigma', str(hypers[dictKey][1])]
            curveCmd += ['-zeta', '1']
            curveCmd += ['-ntrain', '10', '30', '50', '100', '300', '500',
                         '1000', '3000', '5000', '8000']
            curveCmd += ['-train', trainFile,
                         '-validate', validateFile,
                         '-shuffle',
                         '-output', workDir]
            subprocess.call(curveCmd)

## Rings

In [None]:
# Learning curves on the ring descriptors, one learningCurves.py run per
# (sample size, property, ring type, encoding, kernel), with width/sigma
# taken from `hypers`.
for ss, sn in zip(sampleSizes, sampleNames):
    for p, pn in zip(properties, propertyNames):
        for ks in ['King', 'Short']:
            for bd in ['Binary', 'Distribution']:
                for k, kn in zip(kernels, kernelNames):
                    # Output location and hyperparameter key
                    workDir = '../Processed_Data/DEEM_%s/%s/Rings/%s/%s/%s' % (sn, pn, ks, bd, kn)
                    dictKey = '%s-%s%s-%s-%s' % (sn, ks, bd[0], pn[0], kn[0])

                    # Input files
                    structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                    dataFile = '../Processed_Data/DEEM_%s/Rings/%s/%s/ringFiles.dat' % (sn, ks, bd)
                    idxsFile = '../Processed_Data/DEEM_%s/Rings/%s/%s/FPS-rRings.idxs' % (sn, ks, bd)
                    trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                    validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn

                    curveCmd = ['python', 'learningCurves.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                '-p', p,
                                '-Z', '14',
                                '-k', '5',
                                '-kernel', k]
                    curveCmd += ['-width', str(hypers[dictKey][0])]
                    curveCmd += ['-sigma', str(hypers[dictKey][1])]
                    curveCmd += ['-zeta', '1']
                    curveCmd += ['-ntrain', '10', '30', '50', '100', '300', '500',
                                 '1000', '3000', '5000', '8000']
                    curveCmd += ['-train', trainFile,
                                 '-validate', validateFile,
                                 '-shuffle',
                                 '-output', workDir]
                    subprocess.call(curveCmd)

# Property Decomposition

## SOAP

In [None]:
# Run propertyRegression.py on the SOAP vectors for every combination of
# sample size, cutoff, property and kernel, using the [width, sigma] stored
# in `hypers`.
for ss, sn in zip(sampleSizes, sampleNames):
    for c in cutoffs:
        for p, pn in zip(properties, propertyNames):
            for k, kn in zip(kernels, kernelNames):
                # One placeholder per argument. The template previously had
                # five %s for four values and raised
                # "TypeError: not enough arguments for format string".
                # NOTE(review): workDir, trainFile and validateFile are not
                # passed to the script below -- confirm whether
                # propertyRegression.py should receive them.
                workDir = '../Processed_Data/DEEM_%s/%s/%s/%s' % (sn, pn, c, kn)
                structureFile = '../Raw_Data/DEEM_%s.xyz' % ss
                dataFile = '../Processed_Data/DEEM_%s/PCA/%s/SOAPFiles.dat' % (sn, c)
                idxsFile = '../Processed_Data/DEEM_%s/PCA/%s/FPS-rSOAP.idxs' % (sn, c)
                trainFile = '../Processed_Data/DEEM_%s/kTrain.idxs' % sn
                validateFile = '../Processed_Data/DEEM_%s/kValidate.idxs' % sn
                dictKey = '%s-SOAP%s-%s-%s' % (sn, c, pn[0], kn[0])

                subprocess.call(['python', 'propertyRegression.py',
                                '-structure', structureFile,
                                '-soap', dataFile,
                                '-idxs', idxsFile,
                                '-p', p,
                                '-Z', '14',
                                # NOTE(review): the kernel name goes to '-k'
                                # here, whereas the learningCurves.py calls in
                                # this notebook use '-kernel' for the kernel
                                # and '-k' for '5' -- confirm the option names
                                # of propertyRegression.py.
                                '-k', k,
                                '-width', str(hypers[dictKey][0]),
                                '-sigma', str(hypers[dictKey][1]),
                                '-zeta', '1',
                                '-ntrain', '10000',
                                '-env',
                                '-lowmem'])