In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# ML
from regression import SparseKRR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from split import simple_split

# Utilities
from selection import FPS, random_selection

# SOAP
from soap import quippy_soap, librascal_soap

# Initial setup

In [3]:
# Setup SOAP parameters
soap_hyperparameters = dict(max_radial=12,
                            max_angular=9,
                            interaction_cutoff=6.0,
                            cutoff_smooth_width=0.3,
                            gaussian_sigma_constant=0.3)

# Load DEEM_10k

In [4]:
# Load DEEM 10k
deem_10k = read('../Raw_Data/DEEM_10k/DEEM_10000.xyz', index=':')

In [5]:
# Extract energies per Si atom
deem_10k_volumes = []
deem_10k_energies = []
n_Si = []

for structure in deem_10k:
    Z = structure.get_atomic_numbers()
    n_Si.append(np.count_nonzero(Z == 14))
    deem_10k_volumes.append(structure.cell.volume / n_Si[-1])
    deem_10k_energies.append(structure.info['Energy_per_Si'])
    
deem_10k_volumes = np.asarray(deem_10k_volumes)
deem_10k_energies = np.asarray(deem_10k_energies)

# Check DEEM database energies vs. GULP

In [6]:
# Load GULP energies, which are in the same order as the loaded structures
deem_10k_energies_gulp = np.loadtxt('../Raw_Data/GULP/DEEM_10k/Energies_DEEM.dat', usecols=8)

# Check that GULP energies are similar to the database energies
abs_err = np.abs(deem_10k_energies - deem_10k_energies_gulp)

# mean, median, and maximum absolute error
print(np.mean(abs_err))
print(np.median(abs_err))
print(np.amax(abs_err))

0.08162864922236804
0.07408146875241073
8.360833570570321


# Truncate DEEM dataset for testing

In [7]:
n = 2000
stride = len(deem_10k) // n
structures = deem_10k[::stride]
structure_volumes = deem_10k_volumes[::stride]
structure_energies = deem_10k_energies[::stride]
print(len(structures))

2000


In [8]:
f = 0.75
idxs = np.arange(0, len(structures))
np.random.shuffle(idxs)
train_idxs = idxs[0:int(n*f)]
test_idxs = idxs[int(n*f):]

# Test FPS on environment SOAPs vs. FPS on mean SOAPs

In [9]:
# Compute environment SOAPs
soaps = librascal_soap(structures, [14],
                         **soap_hyperparameters)

In [10]:
# Compute avg SOAPs
soaps_avg = librascal_soap(structures, [14],
                              **soap_hyperparameters,
                              average=True)
soaps_avg = np.asarray(soaps_avg)

In [11]:
n_components = 500

In [12]:
# FPS on environment SOAPs
fps, _ = FPS(np.concatenate([soaps[i] for i in train_idxs], axis=0).T, n=n_components, start=0)
print(fps.size)

500


In [13]:
# FPS on average SOAPs
fps_avg, _ = FPS(soaps_avg[train_idxs, :].T, n=n_components, start=0)
print(fps_avg.size)

500


In [14]:
print(len(np.setdiff1d(fps, fps_avg)))
print(len(np.setdiff1d(fps_avg, fps)))

70
70


# Truncate the SOAPs

In [15]:
soaps_avg = [soap[:, fps_avg] for soap in soaps]
soaps = [soap[:, fps] for soap in soaps]

In [16]:
print(np.concatenate(soaps, axis=0).shape)
print(np.concatenate(soaps_avg, axis=0).shape)

(100527, 500)
(100527, 500)


# Get representative environments

In [17]:
n_representatives = 2000
representatives, _ = FPS(np.concatenate([soaps[i] for i in train_idxs], axis=0), n=n_representatives)
representatives_avg, _ = FPS(np.concatenate([soaps_avg[i] for i in train_idxs], axis=0), n=n_representatives)

In [18]:
soaps_rep = np.concatenate([soaps[i] for i in train_idxs], axis=0)[representatives, :]
soaps_rep_avg = np.concatenate([soaps_avg[i] for i in train_idxs], axis=0)[representatives_avg, :]

In [19]:
print(soaps_rep.shape)
print(soaps_rep_avg.shape)

(2000, 500)
(2000, 500)


# Center properties and build kernels

In [20]:
# Center properties
avg_volume = np.mean(structure_volumes[train_idxs])
avg_energy = np.mean(structure_energies[train_idxs])

structure_volumes -= avg_volume
structure_energies -= avg_energy

In [21]:
# Build sparse linear kernels
zeta = 1
KMM_linear = build_kernel(soaps_rep, soaps_rep,
                         kernel='linear', zeta=zeta)
KNM_train_linear = build_kernel([soaps[i] for i in train_idxs], soaps_rep, 
                                kernel='linear', zeta=zeta)
KNM_test_linear = build_kernel([soaps[i] for i in test_idxs], soaps_rep, 
                               kernel='linear', zeta=zeta)

KMM_linear_avg = build_kernel(soaps_rep_avg, soaps_rep_avg,
                             kernel='linear', zeta=zeta)
KNM_train_linear_avg = build_kernel([soaps_avg[i] for i in train_idxs], soaps_rep_avg, 
                                    kernel='linear', zeta=zeta)
KNM_test_linear_avg = build_kernel([soaps_avg[i] for i in test_idxs], soaps_rep_avg, 
                                   kernel='linear', zeta=zeta)

In [22]:
# Build sparse Gaussian kernels
gamma = 0.5
KMM_gaussian = build_kernel(soaps_rep, soaps_rep,
                           kernel='gaussian', gamma=gamma)
KNM_train_gaussian = build_kernel([soaps[i] for i in train_idxs], soaps_rep, 
                                  kernel='gaussian', gamma=gamma)
KNM_test_gaussian = build_kernel([soaps[i] for i in test_idxs], soaps_rep, 
                                 kernel='gaussian', gamma=gamma)

KMM_gaussian_avg = build_kernel(soaps_rep_avg, soaps_rep_avg,
                               kernel='gaussian', gamma=gamma)
KNM_train_gaussian_avg = build_kernel([soaps_avg[i] for i in train_idxs], soaps_rep_avg, 
                                      kernel='gaussian', gamma=gamma)
KNM_test_gaussian_avg = build_kernel([soaps_avg[i] for i in test_idxs], soaps_rep_avg, 
                                     kernel='gaussian', gamma=gamma)

# Kernel speed

In [42]:
K = build_kernel(soaps_rep, soaps_rep, kernel='gaussian', gamma=1.0)

In [43]:
%%timeit
build_kernel(soaps_rep, soaps_rep, kernel='gaussian', gamma=3.0)

991 ms ± 860 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
%%timeit
K**3.0

172 ms ± 160 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [45]:
print(build_kernel(soaps_rep, soaps_rep, kernel='gaussian', gamma=3.0))
print(K**3.0)

[[1.         0.33513696 0.38982442 ... 0.69105943 0.81470392 0.83777551]
 [0.33513696 1.         0.25213475 ... 0.39278405 0.33036088 0.48896545]
 [0.38982442 0.25213475 1.         ... 0.7012866  0.54833497 0.4063832 ]
 ...
 [0.69105943 0.39278405 0.7012866  ... 1.         0.83186931 0.71877921]
 [0.81470392 0.33036088 0.54833497 ... 0.83186931 1.         0.78297762]
 [0.83777551 0.48896545 0.4063832  ... 0.71877921 0.78297762 1.        ]]
[[1.         0.33513696 0.38982442 ... 0.69105943 0.81470392 0.83777551]
 [0.33513696 1.         0.25213475 ... 0.39278405 0.33036088 0.48896545]
 [0.38982442 0.25213475 1.         ... 0.7012866  0.54833497 0.4063832 ]
 ...
 [0.69105943 0.39278405 0.7012866  ... 1.         0.83186931 0.71877921]
 [0.81470392 0.33036088 0.54833497 ... 0.83186931 1.         0.78297762]
 [0.83777551 0.48896545 0.4063832  ... 0.71877921 0.78297762 1.        ]]


# Volume regression

## Linear

In [46]:
# SOAPs
sigma = 0.1**2
delta = np.var(structure_volumes[train_idxs])*KMM_linear.shape[0]/np.trace(KMM_linear)

skrr = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr.fit(KNM_train_linear*delta, KMM_linear*delta, structure_volumes[train_idxs]*delta)
predicted_volumes = skrr.transform(KNM_test_linear)
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes)))

1.1394364632306542


In [47]:
# Avg SOAPs with avg FPS
sigma = 0.1**2
delta = np.var(structure_volumes[train_idxs])*KMM_linear_avg.shape[0]/np.trace(KMM_linear_avg)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr_avg.fit(KNM_train_linear_avg*delta, KMM_linear_avg*delta, structure_volumes[train_idxs]*delta)
predicted_volumes_avg = skrr_avg.transform(KNM_test_linear_avg)
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes_avg)))

1.133300566550664


## Gaussian

In [48]:
# SOAPs
sigma = 0.1**2
delta = np.var(structure_volumes[train_idxs])*KMM_gaussian.shape[0]/np.trace(KMM_gaussian)

skrr = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr.fit(KNM_train_gaussian*delta, KMM_gaussian*delta, structure_volumes[train_idxs]*delta)
predicted_volumes = skrr.transform(KNM_test_gaussian)
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes)))

1.1304934982923796


In [49]:
# Avg SOAPs with avg FPS
sigma = 0.1**2
delta = np.var(structure_volumes[train_idxs])*KMM_gaussian_avg.shape[0]/np.trace(KMM_gaussian_avg)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr_avg.fit(KNM_train_gaussian_avg*delta, KMM_gaussian_avg*delta, structure_volumes[train_idxs]*delta)
predicted_volumes_avg = skrr_avg.transform(KNM_test_gaussian_avg)
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes_avg)))

1.119316543995585


# Energy regression

## Linear

In [50]:
# SOAPs
sigma = 0.01**2
delta = np.var(structure_energies[train_idxs])*KMM_linear.shape[0]/np.trace(KMM_linear)

skrr = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr.fit(KNM_train_linear*delta, KMM_linear*delta, structure_energies[train_idxs]*delta)
predicted_energies = skrr.transform(KNM_test_linear)
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies)))

0.6400137389996662


In [51]:
# Avg SOAPs with avg FPS
sigma = 0.01**2
delta = np.var(structure_energies[train_idxs])*KMM_linear_avg.shape[0]/np.trace(KMM_linear_avg)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr_avg.fit(KNM_train_linear_avg*delta, KMM_linear_avg*delta, structure_energies[train_idxs]*delta)
predicted_energies_avg = skrr_avg.transform(KNM_test_linear_avg)
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies_avg)))

0.6283474537982984


## Gaussian

In [52]:
# SOAPs
sigma = 0.01**2
delta = np.var(structure_energies[train_idxs])*KMM_gaussian.shape[0]/np.trace(KMM_gaussian)

skrr = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr.fit(KNM_train_gaussian*delta, KMM_gaussian*delta, structure_energies[train_idxs]*delta)
predicted_energies = skrr.transform(KNM_test_gaussian)
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies)))

0.662661060440006


In [53]:
# Avg SOAPs with avg FPS
sigma = 0.01**2
delta = np.var(structure_energies[train_idxs])*KMM_gaussian_avg.shape[0]/np.trace(KMM_gaussian_avg)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr_avg.fit(KNM_train_gaussian_avg*delta, KMM_gaussian_avg*delta, structure_energies[train_idxs]*delta)
predicted_energies_avg = skrr_avg.transform(KNM_test_gaussian_avg)
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies_avg)))

0.6627608443840436
