In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np

# Atoms
from ase.io import read

# ML
from regression import SparseKRR
from kernels import build_kernel, linear_kernel, gaussian_kernel
from split import simple_split

# Utilities
from selection import FPS, random_selection
import multiprocessing
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist
import itertools

# SOAP
from soap import quippy_soap, librascal_soap

# Initial setup

In [None]:
# SOAP hyperparameters, unpacked as keyword arguments into the SOAP calls
soap_hyperparameters = {
    'max_radial': 12,
    'max_angular': 9,
    'interaction_cutoff': 6.0,
    'cutoff_smooth_width': 0.3,
    'gaussian_sigma_constant': 0.3,
}

# Load DEEM_10k

In [None]:
# Read all frames (index=':') of the DEEM 10k structure file
deem_10k = read(
    '../Raw_Data/DEEM_10k/DEEM_10000.xyz',
    index=':',
)

In [None]:
# Number of Si atoms (atomic number 14) in each structure
n_Si = [np.count_nonzero(frame.get_atomic_numbers() == 14) for frame in deem_10k]

# Cell volume per Si atom of each structure
deem_10k_volumes = np.asarray(
    [frame.cell.volume / count for frame, count in zip(deem_10k, n_Si)]
)

# Stored 'Energy_per_Si' entry of each structure
deem_10k_energies = np.asarray(
    [frame.info['Energy_per_Si'] for frame in deem_10k]
)

# Check DEEM database energies vs. GULP

In [None]:
# GULP energies (column index 8 of the data file); the rows are in the same
# order as the loaded structures
deem_10k_energies_gulp = np.loadtxt(
    '../Raw_Data/GULP/DEEM_10k/Energies_DEEM.dat',
    usecols=8,
)

# Absolute deviation between the database energies and the GULP energies
abs_err = np.abs(deem_10k_energies - deem_10k_energies_gulp)

# Mean, median, and maximum absolute error, one per line
print(np.mean(abs_err), np.median(abs_err), np.amax(abs_err), sep='\n')

# Truncate DEEM dataset for testing

In [None]:
# Subsample (at most) n evenly strided structures from DEEM 10k for quick testing
n = 2000

# Never let the stride be 0: slicing with a zero step raises ValueError when
# fewer than n structures were loaded
stride = max(1, len(deem_10k) // n)

# Cap at n: when len(deem_10k) is not a multiple of n, the strided slice alone
# returns more than n items
structures = deem_10k[::stride][:n]
structure_volumes = deem_10k_volumes[::stride][:n]
structure_energies = deem_10k_energies[::stride][:n]
print(len(structures))

In [None]:
# Random train/test split over the selected structures
f = 0.75  # fraction of the structures used for training
idxs = np.arange(0, len(structures))

# Seeded, local generator: the split is the same on every run and the global
# NumPy random state is left untouched
np.random.RandomState(0).shuffle(idxs)

# Size the split from the structures actually available, not from the requested n
n_train = int(len(structures)*f)
train_idxs = idxs[0:n_train]
test_idxs = idxs[n_train:]

# Test FPS on environment SOAPs vs. FPS on mean SOAPs

In [None]:
# Environment SOAPs of the selected structures; 14 is the atomic number of Si
# (presumably the center species -- confirm in soap.librascal_soap)
soaps = librascal_soap(
    structures,
    [14],
    **soap_hyperparameters
)

In [None]:
# Averaged SOAPs (average=True), converted to a single array
soaps_avg = np.asarray(
    librascal_soap(structures, [14], **soap_hyperparameters, average=True)
)

In [None]:
# Number of SOAP features requested from each FPS feature selection (passed as n=)
n_components = 500

In [None]:
# FPS on the transposed stack of training environment SOAPs
fps, _ = FPS(
    np.concatenate([soaps[i] for i in train_idxs], axis=0).T,
    n=n_components,
    start=0,
)
print(fps.size)

In [None]:
# FPS on the transposed averaged SOAPs of the training structures
fps_avg, _ = FPS(
    soaps_avg[train_idxs, :].T,
    n=n_components,
    start=0,
)
print(fps_avg.size)

In [None]:
# Number of selected indices unique to each of the two FPS selections
print(np.setdiff1d(fps, fps_avg).size)
print(np.setdiff1d(fps_avg, fps).size)

# Truncate the SOAPs

In [None]:
# Keep only the FPS-selected columns of the environment SOAPs.
# NOTE: `soaps_avg` is rebound here. It no longer holds the averaged SOAPs but
# the environment SOAPs restricted to the columns picked on the averaged SOAPs
# (`fps_avg`).
# NOTE: statement order matters and the cell is not re-runnable: `soaps_avg`
# must be built from the full `soaps` before `soaps` itself is overwritten.
soaps_avg = [soap[:, fps_avg] for soap in soaps]
soaps = [soap[:, fps] for soap in soaps]

In [None]:
# Shapes of the stacked, truncated SOAPs for both feature selections
print(
    np.concatenate(soaps, axis=0).shape,
    np.concatenate(soaps_avg, axis=0).shape,
    sep='\n',
)

# Get representative environments

In [None]:
# FPS over the stacked training environments selects the representative points
n_representatives = 2000
representatives, _ = FPS(
    np.concatenate([soaps[i] for i in train_idxs], axis=0),
    n=n_representatives,
)
representatives_avg, _ = FPS(
    np.concatenate([soaps_avg[i] for i in train_idxs], axis=0),
    n=n_representatives,
)

In [None]:
# SOAP vectors of the representative training environments
soaps_rep = np.concatenate(
    [soaps[i] for i in train_idxs], axis=0
)[representatives, :]
soaps_rep_avg = np.concatenate(
    [soaps_avg[i] for i in train_idxs], axis=0
)[representatives_avg, :]

In [None]:
# Shapes of the two sets of representative SOAPs
print(soaps_rep.shape, soaps_rep_avg.shape, sep='\n')

# Center properties and build kernels

In [None]:
# Center the properties on their training-set means
avg_volume = np.mean(structure_volumes[train_idxs])
avg_energy = np.mean(structure_energies[train_idxs])

# Subtract out of place: structure_volumes / structure_energies are strided
# slices (views) of deem_10k_volumes / deem_10k_energies, so an in-place `-=`
# would silently shift the selected entries of those source arrays as well
structure_volumes = structure_volumes - avg_volume
structure_energies = structure_energies - avg_energy

In [None]:
# Sparse linear kernels: representative-representative (KMM) and
# structure-representative (KNM, train and test) blocks
zeta = 1

# Features selected by FPS on the environment SOAPs
KMM_linear = build_kernel(
    soaps_rep, soaps_rep, kernel='linear', zeta=zeta
)
KNM_train_linear = build_kernel(
    [soaps[i] for i in train_idxs], soaps_rep, kernel='linear', zeta=zeta
)
KNM_test_linear = build_kernel(
    [soaps[i] for i in test_idxs], soaps_rep, kernel='linear', zeta=zeta
)

# Features selected by FPS on the averaged SOAPs
KMM_linear_avg = build_kernel(
    soaps_rep_avg, soaps_rep_avg, kernel='linear', zeta=zeta
)
KNM_train_linear_avg = build_kernel(
    [soaps_avg[i] for i in train_idxs], soaps_rep_avg, kernel='linear', zeta=zeta
)
KNM_test_linear_avg = build_kernel(
    [soaps_avg[i] for i in test_idxs], soaps_rep_avg, kernel='linear', zeta=zeta
)

In [None]:
# Sparse Gaussian kernels: representative-representative (KMM) and
# structure-representative (KNM, train and test) blocks
gamma = 0.5

# Features selected by FPS on the environment SOAPs
KMM_gaussian = build_kernel(
    soaps_rep, soaps_rep, kernel='gaussian', gamma=gamma
)
KNM_train_gaussian = build_kernel(
    [soaps[i] for i in train_idxs], soaps_rep, kernel='gaussian', gamma=gamma
)
KNM_test_gaussian = build_kernel(
    [soaps[i] for i in test_idxs], soaps_rep, kernel='gaussian', gamma=gamma
)

# Features selected by FPS on the averaged SOAPs
KMM_gaussian_avg = build_kernel(
    soaps_rep_avg, soaps_rep_avg, kernel='gaussian', gamma=gamma
)
KNM_train_gaussian_avg = build_kernel(
    [soaps_avg[i] for i in train_idxs], soaps_rep_avg, kernel='gaussian', gamma=gamma
)
KNM_test_gaussian_avg = build_kernel(
    [soaps_avg[i] for i in test_idxs], soaps_rep_avg, kernel='gaussian', gamma=gamma
)

# Kernel speed

In [None]:
# Reference Gaussian kernel between the representative SOAPs at gamma=1.0
K = build_kernel(
    soaps_rep,
    soaps_rep,
    kernel='gaussian',
    gamma=1.0,
)

In [None]:
%%timeit
# Timed: building the Gaussian kernel from scratch with gamma=3.0
build_kernel(soaps_rep, soaps_rep, kernel='gaussian', gamma=3.0)

In [None]:
%%timeit
# Timed: raising the precomputed gamma=1.0 kernel to the power 3.0 instead
K**3.0

In [None]:
# Print both results to compare them by eye: the kernel built with gamma=3.0
# and the gamma=1.0 kernel raised to the power 3.0
print(build_kernel(soaps_rep, soaps_rep, kernel='gaussian', gamma=3.0))
print(K**3.0)

# Volume regression

## Linear

In [None]:
# Volumes -- linear kernel, features selected on the environment SOAPs
sigma = 0.1**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_volumes[train_idxs])
    * KMM_linear.shape[0]
    / np.trace(KMM_linear)
)

skrr = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr.fit(
    KNM_train_linear*delta,
    KMM_linear*delta,
    structure_volumes[train_idxs]*delta,
)
predicted_volumes = skrr.transform(KNM_test_linear)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes)))

In [None]:
# Volumes -- linear kernel, features selected on the averaged SOAPs
sigma = 0.1**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_volumes[train_idxs])
    * KMM_linear_avg.shape[0]
    / np.trace(KMM_linear_avg)
)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr_avg.fit(
    KNM_train_linear_avg*delta,
    KMM_linear_avg*delta,
    structure_volumes[train_idxs]*delta,
)
predicted_volumes_avg = skrr_avg.transform(KNM_test_linear_avg)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes_avg)))

## Gaussian

In [None]:
# Volumes -- Gaussian kernel, features selected on the environment SOAPs
sigma = 0.1**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_volumes[train_idxs])
    * KMM_gaussian.shape[0]
    / np.trace(KMM_gaussian)
)

skrr = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr.fit(
    KNM_train_gaussian*delta,
    KMM_gaussian*delta,
    structure_volumes[train_idxs]*delta,
)
predicted_volumes = skrr.transform(KNM_test_gaussian)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes)))

In [None]:
# Volumes -- Gaussian kernel, features selected on the averaged SOAPs
sigma = 0.1**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_volumes[train_idxs])
    * KMM_gaussian_avg.shape[0]
    / np.trace(KMM_gaussian_avg)
)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-1)
skrr_avg.fit(
    KNM_train_gaussian_avg*delta,
    KMM_gaussian_avg*delta,
    structure_volumes[train_idxs]*delta,
)
predicted_volumes_avg = skrr_avg.transform(KNM_test_gaussian_avg)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_volumes[test_idxs] - predicted_volumes_avg)))

# Energy regression

## Linear

In [None]:
# Energies -- linear kernel, features selected on the environment SOAPs
sigma = 0.01**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_energies[train_idxs])
    * KMM_linear.shape[0]
    / np.trace(KMM_linear)
)

skrr = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr.fit(
    KNM_train_linear*delta,
    KMM_linear*delta,
    structure_energies[train_idxs]*delta,
)
predicted_energies = skrr.transform(KNM_test_linear)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies)))

In [None]:
# Energies -- linear kernel, features selected on the averaged SOAPs
sigma = 0.01**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_energies[train_idxs])
    * KMM_linear_avg.shape[0]
    / np.trace(KMM_linear_avg)
)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr_avg.fit(
    KNM_train_linear_avg*delta,
    KMM_linear_avg*delta,
    structure_energies[train_idxs]*delta,
)
predicted_energies_avg = skrr_avg.transform(KNM_test_linear_avg)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies_avg)))

## Gaussian

In [None]:
# Energies -- Gaussian kernel, features selected on the environment SOAPs
sigma = 0.01**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_energies[train_idxs])
    * KMM_gaussian.shape[0]
    / np.trace(KMM_gaussian)
)

skrr = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr.fit(
    KNM_train_gaussian*delta,
    KMM_gaussian*delta,
    structure_energies[train_idxs]*delta,
)
predicted_energies = skrr.transform(KNM_test_gaussian)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies)))

In [None]:
# Energies -- Gaussian kernel, features selected on the averaged SOAPs
sigma = 0.01**2

# Scale factor: training-target variance times the KMM size over the KMM trace
delta = (
    np.var(structure_energies[train_idxs])
    * KMM_gaussian_avg.shape[0]
    / np.trace(KMM_gaussian_avg)
)

skrr_avg = SparseKRR(sigma=sigma, reg=1.0E-3)
skrr_avg.fit(
    KNM_train_gaussian_avg*delta,
    KMM_gaussian_avg*delta,
    structure_energies[train_idxs]*delta,
)
predicted_energies_avg = skrr_avg.transform(KNM_test_gaussian_avg)

# Mean absolute error on the test structures
print(np.mean(np.abs(structure_energies[test_idxs] - predicted_energies_avg)))

# Distance computation

In [None]:
# Two dense 10000 x 500 matrices of uniform random numbers in [0, 1)
XA = np.random.random_sample((10000, 500))
XB = np.random.random_sample((10000, 500))

## cdist

In [None]:
%%timeit
# Timed: cdist with its built-in squared Euclidean metric
D = cdist(XA, XB, metric='sqeuclidean')

In [None]:
%%timeit
# Timed: cdist with the Euclidean metric, squared afterwards
D = cdist(XA, XB, metric='euclidean')**2

In [None]:
# Print the cdist 'sqeuclidean' result for comparison with the other variants
D = cdist(XA, XB, metric='sqeuclidean')
print(D)

In [None]:
# Print the squared cdist 'euclidean' result for comparison with the other variants
D = cdist(XA, XB, metric='euclidean')**2
print(D)

## pairwise_distances

In [None]:
%%timeit
# Timed: pairwise_distances, 'sqeuclidean' metric, n_jobs=-1
D = pairwise_distances(XA, XB, metric='sqeuclidean', n_jobs=-1)

In [None]:
%%timeit
# Timed: pairwise_distances, 'sqeuclidean' metric, default n_jobs
D = pairwise_distances(XA, XB, metric='sqeuclidean')

In [None]:
%%timeit
# Timed: pairwise_distances, 'euclidean' metric with n_jobs=-1, squared afterwards
D = pairwise_distances(XA, XB, metric='euclidean', n_jobs=-1)**2

In [None]:
%%timeit
# Timed: pairwise_distances, 'euclidean' metric with default n_jobs, squared afterwards
D = pairwise_distances(XA, XB, metric='euclidean')**2

In [None]:
# Print the 'sqeuclidean', n_jobs=-1 result for comparison with the other variants
D = pairwise_distances(XA, XB, metric='sqeuclidean', n_jobs=-1)
print(D)

In [None]:
# Print the 'sqeuclidean', default n_jobs result for comparison with the other variants
D = pairwise_distances(XA, XB, metric='sqeuclidean')
print(D)

In [None]:
# Print the squared 'euclidean', n_jobs=-1 result for comparison with the other variants
D = pairwise_distances(XA, XB, metric='euclidean', n_jobs=-1)**2
print(D)

In [None]:
# Print the squared 'euclidean', default n_jobs result for comparison with the other variants
D = pairwise_distances(XA, XB, metric='euclidean')**2
print(D)

## Multiprocessing (unusably slow)

## Faster squared euclidean distance

In [None]:
def sqeuclidean_distance(XA, XB):
    """Squared Euclidean distances between the rows of XA and the rows of XB.

    Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, so that the
    bulk of the work is a single matrix product.

    Parameters
    ----------
    XA : ndarray of shape (n_A, n_features)
    XB : ndarray of shape (n_B, n_features)

    Returns
    -------
    D : ndarray of shape (n_A, n_B)
        D[i, j] is the squared distance between XA[i] and XB[j]; never negative.
    """
    XA2 = np.sum(XA**2, axis=1).reshape((-1, 1))
    XB2 = np.sum(XB**2, axis=1).reshape((1, -1))
    D = XA2 + XB2 - 2*np.matmul(XA, XB.T)

    # Round-off in the expansion can leave tiny negative entries for (nearly)
    # identical rows; a squared distance is never negative, so clip at zero
    np.maximum(D, 0, out=D)
    return D

In [None]:
%%timeit
# Timed: the matrix-product based squared Euclidean distance defined above
D = sqeuclidean_distance(XA, XB)

In [None]:
# Print the sqeuclidean_distance result for comparison with cdist / pairwise_distances
D = sqeuclidean_distance(XA, XB)
print(D)

# Kernel Computation

## Multiprocessing

In [None]:
# Gaussian kernel built on scipy's cdist
def g_kernel_cdist(XA, XB, gamma=1.0, row_mean=True, col_mean=True):
    """Gaussian kernel exp(-gamma * d^2) between the rows of XA and XB.

    The squared Euclidean distances d^2 come from cdist. The two flags pick
    the reduction applied to the kernel matrix:

    - row_mean and col_mean: mean over all entries (a scalar)
    - row_mean only: mean over axis 0 (one value per row of XB)
    - col_mean only: mean over axis 1 (one value per row of XA)
    - neither: the full kernel matrix
    """
    kernel = np.exp(-gamma*cdist(XA, XB, metric='sqeuclidean'))

    if not (row_mean or col_mean):
        return kernel
    if row_mean and col_mean:
        return np.mean(kernel)
    return np.mean(kernel, axis=0 if row_mean else 1)

# Gaussian kernel built on sklearn's pairwise_distances
def g_kernel_pairwise(XA, XB, gamma=1.0, row_mean=True, col_mean=True):
    """Gaussian kernel exp(-gamma * d^2) between the rows of XA and XB.

    The squared distances d^2 are obtained by squaring the Euclidean
    pairwise_distances. The two flags pick the reduction applied to the
    kernel matrix:

    - row_mean and col_mean: mean over all entries (a scalar)
    - row_mean only: mean over axis 0 (one value per row of XB)
    - col_mean only: mean over axis 1 (one value per row of XA)
    - neither: the full kernel matrix
    """
    sq_dist = pairwise_distances(XA, XB, metric='euclidean')**2
    kernel = np.exp(-gamma*sq_dist)

    if row_mean:
        if col_mean:
            return np.mean(kernel)
        return np.mean(kernel, axis=0)
    if col_mean:
        return np.mean(kernel, axis=1)
    return kernel

In [None]:
# Two 10000 x 500 matrices of uniform random numbers, scaled down by 500
XA = np.random.random_sample((10000, 500)) / 500
XB = np.random.random_sample((10000, 500)) / 500

In [None]:
# 100 random split positions in [0, 10000) per matrix, sorted ascending
split_A = np.sort(np.random.randint(0, 10000, 100))
split_B = np.sort(np.random.randint(0, 10000, 100))

In [None]:
# Cut each matrix into row chunks at the split positions and drop empty chunks;
# XA and XB are lists of arrays from here on
XA = [chunk for chunk in np.split(XA, split_A) if chunk.size > 0]
XB = [chunk for chunk in np.split(XB, split_B) if chunk.size > 0]

In [None]:
%%timeit
# Timed: one g_kernel_cdist task per (XA chunk, XB chunk) pair on a process
# pool; creating and tearing down the pool is part of the timed body
with multiprocessing.Pool() as pool:
    out = pool.starmap(g_kernel_cdist, itertools.product(XA, XB))

# With the default flags each task returns one mean value, so the results
# reshape into a (len(XA), len(XB)) grid
np.reshape(out, (len(XA), len(XB)))

In [None]:
%%timeit
# Timed: one g_kernel_pairwise task per (XA chunk, XB chunk) pair on a process
# pool; creating and tearing down the pool is part of the timed body
with multiprocessing.Pool() as pool:
    out = pool.starmap(g_kernel_pairwise, itertools.product(XA, XB))

# With the default flags each task returns one mean value, so the results
# reshape into a (len(XA), len(XB)) grid
np.reshape(out, (len(XA), len(XB)))

In [None]:
%%timeit
# Timed: the project's build_kernel on the same lists of chunks
build_kernel(XA, XB, kernel='gaussian', gamma=1.0) # 8s with sklearn

In [None]:
# Compute the chunk-pair kernel with g_kernel_cdist on a process pool
with multiprocessing.Pool() as pool:
    out = pool.starmap(g_kernel_cdist, itertools.product(XA, XB))

# Arrange the per-pair mean values as a (len(XA), len(XB)) grid and print it
out = np.reshape(out, (len(XA), len(XB)))
print(out)

In [None]:
# Compute the chunk-pair kernel with g_kernel_pairwise on a process pool
with multiprocessing.Pool() as pool:
    out = pool.starmap(g_kernel_pairwise, itertools.product(XA, XB))

# Arrange the per-pair mean values as a (len(XA), len(XB)) grid and print it
out = np.reshape(out, (len(XA), len(XB)))
print(out)

In [None]:
# Print build_kernel's result on the same chunk lists for comparison with the
# pool-based results (presumably also one value per chunk pair -- confirm in
# kernels.build_kernel)
out = build_kernel(XA, XB, kernel='gaussian', gamma=1.0)
print(out)