In [703]:
#!/usr/bin/env python3

%reload_ext autoreload
%autoreload 2

# Maths
import numpy as np

# PCovR utilities
from regression import LR, KRR, SparseKRR, IterativeSparseKRR, PCovR, KPCovR, SparseKPCovR
from decomposition import PCA, KPCA, SparseKPCA, IterativeSparseKPCA
from kernels import linear_kernel, gaussian_kernel, center_kernel
from selection import FPS
from split import simple_split

# ASE
from ase.io import read, write

# SOAP
from rascal.representations import SphericalInvariants as SOAP
from rascal.neighbourlist.structure_manager import mask_center_atoms_by_species

# Scikit learn
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA as sklPCA
from sklearn.decomposition import KernelPCA as sklKPCA

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt

# Make the plots look nicer
plot_parameters = {
    'lines.linewidth': 1.0,
    'lines.markersize': 2,
    'patch.linewidth': 1.0,
    'hatch.linewidth': 1.0,
    'axes.linewidth': 1.0,
    'xtick.top': True,
    'xtick.bottom': True,
    'xtick.direction': 'in',
    'xtick.minor.visible': True,
    'xtick.major.size': 4.0,
    'xtick.minor.size': 2.0,
    'xtick.major.pad': 5.0,
    'xtick.minor.pad': 5.0,
    'ytick.left': True,
    'ytick.right': True,
    'ytick.direction': 'in',
    'ytick.minor.visible': True,
    'ytick.major.size': 4.0,
    'ytick.minor.size': 2.0,
    'ytick.major.pad': 5.0,
    'ytick.minor.pad': 5.0   
}

for pp in plot_parameters.keys():
    mpl.rcParams[pp] = plot_parameters[pp]

In [704]:
# Read xyz files
s = read('/scratch/helfrech/Sync/Shared/KernelPCovR/datasets/DEEM_10000-prop-wrapped.xyz', index=':100')

# Extract local properties
v = []
e = []
for ss in s:
    mask_center_atoms_by_species(ss, species_select='Si')
    vv = ss.arrays['Si_volumes']
    vv = vv[~np.isnan(vv)]
    v.append(vv)
    ee = ss.arrays['Si_energies']
    ee = ee[~np.isnan(ee)]
    e.append(ee)

In [705]:
# Compute SOAPs (from librascal tutorial)
soap = SOAP(soap_type='PowerSpectrum',
           interaction_cutoff=6.0,
           max_radial=8,
           max_angular=6,
           gaussian_sigma_type='Constant',
           gaussian_sigma_constant=0.4,
           cutoff_smooth_width=0.5)

In [706]:
soap_rep = soap.transform(s)

In [707]:
X = soap_rep.get_features(soap)
Yv = np.concatenate(v)
Ye = np.concatenate(e)
Y = np.stack((Yv, Ye), axis=1)
Y[:, 0] *= 1.0E4

In [708]:
# Train-Test split
f_train = 0.80
X_train, X_test, Y_train, Y_test = simple_split(X, Y, f_train)

In [709]:
# Select FPS components from train set
n_FPS = 200
idxs, d = FPS(X_train.T, n_FPS)

X_train = X_train[:, idxs]
X_test = X_test[:, idxs]

In [710]:
# Center the data
X_mean = np.mean(X_train, axis=0)
Y_mean = np.mean(Y_train, axis=0)

X_train -= X_mean
X_test -= X_mean
Y_train -= Y_mean
Y_test -= Y_mean

In [711]:
# Normalize the data
X_scale = np.linalg.norm(X_train)/np.sqrt(X_train.shape[0])
Y_scale = np.linalg.norm(Y_train, axis=0)/np.sqrt(Y_train.shape[0] / Y_train.shape[1])
#Y_scale = np.std(Y_train, axis=0)
#X_scale = 1.0
#Y_scale = 1.0

X_train_scaled = X_train / X_scale
X_test_scaled = X_test / X_scale
Y_train_scaled = Y_train / Y_scale
Y_test_scaled = Y_test / Y_scale

In [712]:
X_train.shape, Y_train.shape

((1153, 200), (1153, 2))

In [713]:
np.var(X_train_scaled), np.var(X_train_scaled, axis=0)

(0.005000000000000002,
 array([0.00506361, 0.01088657, 0.02638088, 0.00017527, 0.02768523,
        0.03161461, 0.0490499 , 0.01527895, 0.00372817, 0.04449884,
        0.0187499 , 0.03217988, 0.02085518, 0.03626766, 0.0054388 ,
        0.01963719, 0.01012529, 0.04214018, 0.00516689, 0.02365363,
        0.00897477, 0.00859821, 0.00452602, 0.02534818, 0.00869003,
        0.0109094 , 0.00532316, 0.02454363, 0.02490946, 0.00669818,
        0.00042194, 0.00557891, 0.00460156, 0.00982135, 0.01471558,
        0.00964468, 0.00324133, 0.01072166, 0.00376222, 0.00656494,
        0.00221936, 0.00956662, 0.00203181, 0.00464341, 0.00515841,
        0.00828375, 0.00237123, 0.02392129, 0.00512468, 0.00754408,
        0.00873337, 0.00701287, 0.01579248, 0.00210025, 0.00173834,
        0.00154841, 0.00483503, 0.00436785, 0.00617153, 0.00216541,
        0.0044285 , 0.0012899 , 0.00531098, 0.00428513, 0.00982737,
        0.00382568, 0.00165086, 0.00297084, 0.00223697, 0.0054732 ,
        0.00370447, 0.002

In [714]:
np.var(Y_train_scaled), np.var(Y_train_scaled, axis=0)

(0.4999999999999997, array([0.5, 0.5]))

# PCovR terms

In [715]:
def G(X, Yhat, alpha=0.5):
    G_pca = np.matmul(X, X.T)
    G_lr = np.matmul(Yhat, Yhat.T)
    print(np.linalg.norm(G_pca))
    print(np.linalg.norm(G_lr))
    
def G_with_norm(X, Y, Yhat, alpha=0.5):
    G_pca = np.matmul(X, X.T)/np.linalg.norm(X)**2
    G_lr = np.matmul(Yhat, Yhat.T)/np.linalg.norm(Y)**2
    print(np.linalg.norm(G_pca))
    print(np.linalg.norm(G_lr))

In [716]:
# Compute LR solutions of train and test set
lr = LR()
lr.fit(X_train, Y_train)
Yhat_train = lr.transform(X_train)
Yhat_test = lr.transform(X_test)

In [717]:
# Compute scaled LR solutions of train and test set
lr_scale = LR()
lr_scale.fit(X_train_scaled, Y_train_scaled)
Yhat_train_scaled = lr_scale.transform(X_train_scaled)
Yhat_test_scaled = lr_scale.transform(X_test_scaled)

In [718]:
G(X_train, Yhat_train) # BAD

13.43792874831383
3894428172048.081


In [719]:
G(X_train_scaled, Yhat_train_scaled) # OK

520.1485784388886
780.631684558894


In [720]:
G_with_norm(X_train, Y_train, Yhat_train) # OK

0.4511262605714555
0.9381218510638325


In [721]:
G_with_norm(X_train_scaled, Y_train_scaled, Yhat_train_scaled) # OK

0.4511262605714555
0.6770439588542015


# KPCovR terms

In [722]:
def GK(K, Yhat, alpha=0.5):
    G_kpca = K
    G_krr = np.matmul(Yhat, Yhat.T)
    print(np.linalg.norm(G_kpca))
    print(np.linalg.norm(G_krr))
    
def GK_with_norm(K, Y, Yhat, alpha=0.5):
    G_kpca = K / np.trace(K)
    G_krr = np.matmul(Yhat, Yhat.T)/np.linalg.norm(Y)**2
    print(np.linalg.norm(G_kpca))
    print(np.linalg.norm(G_krr))

## Linear kernel

In [723]:
# Build linear kernel, non-normalized X
KL_train = linear_kernel(X_train, X_train)
KL_test = linear_kernel(X_test, X_train)

KL_test = center_kernel(KL_test, K_ref=KL_train)
KL_train = center_kernel(KL_train)

KL_scale = np.trace(KL_train)

KL_train_scaled = KL_train / KL_scale
KL_test_scaled = KL_test / KL_scale

In [724]:
# Build linear kernel, normalized X
KL_train_Xscaled = linear_kernel(X_train_scaled, X_train_scaled)
KL_test_Xscaled = linear_kernel(X_test_scaled, X_train_scaled)

KL_test_Xscaled = center_kernel(KL_test_Xscaled, K_ref=KL_train_Xscaled)
KL_train_Xscaled = center_kernel(KL_train_Xscaled)

In [725]:
np.trace(KL_train), np.trace(KL_train_scaled), np.trace(KL_train_Xscaled)

(29.78751166312418, 1.0, 1153.0)

In [726]:
# Compute KRR solutions of train and test set
krrL = KRR()
krrL.fit(KL_train, Y_train)
YhatKL_train = krrL.transform(KL_train)
YhatKL_test = krrL.transform(KL_test)

In [727]:
# Compute scaled KRR solutions of train and test set
krrL_scaled = KRR()
krrL_scaled.fit(KL_train_scaled, Y_train_scaled)
YhatKL_train_scaled = krrL_scaled.transform(KL_train_scaled)
YhatKL_test_scaled = krrL_scaled.transform(KL_test_scaled)

In [728]:
# Compute Xscaled KRR solutions of train and test set
krrL_Xscaled = KRR()
krrL_Xscaled.fit(KL_train_Xscaled, Y_train_scaled)
YhatKL_train_Xscaled = krrL_Xscaled.transform(KL_train_Xscaled)
YhatKL_test_Xscaled = krrL_Xscaled.transform(KL_test_Xscaled)

In [729]:
GK(KL_train, Y_train, YhatKL_train) # BAD

13.437928748313832
4151302978217.31


In [730]:
GK(KL_train_scaled, YhatKL_train_scaled) # BAD

0.4511262605714555
780.6315665540532


In [731]:
GK(KL_train_Xscaled, YhatKL_train_Xscaled) # OK

520.1485784388886
780.631683695793


In [732]:
GK_with_norm(KL_train, Y_train, YhatKL_train) # OK

0.4511262605714555
0.9381217995945569


In [733]:
GK_with_norm(KL_train_scaled, Y_train_scaled, YhatKL_train_scaled) # OK

0.4511262605714555
0.6770438565082859


In [734]:
GK_with_norm(KL_train_Xscaled, Y_train_scaled, YhatKL_train_Xscaled) # OK

0.4511262605714556
0.6770439581056316


## Gaussian kernel

In [735]:
# Build gaussian kernel
KG_train = gaussian_kernel(X_train, X_train)
KG_test = gaussian_kernel(X_test, X_train)

KG_test = center_kernel(KG_test, K_ref=KG_train)
KG_train = center_kernel(KG_train)

KG_scale = np.trace(KG_train)

KG_train_scaled = KG_train / KG_scale
KG_test_scaled = KG_test / KG_scale

In [736]:
# Build gaussian kernel, normalized X
KG_train_Xscaled = gaussian_kernel(X_train_scaled, X_train_scaled)
KG_test_Xscaled = gaussian_kernel(X_test_scaled, X_train_scaled)

KG_test_Xscaled = center_kernel(KG_test_Xscaled, K_ref=KG_train_Xscaled)
KG_train_Xscaled = center_kernel(KG_train_Xscaled)

In [737]:
np.trace(KG_train), np.trace(KG_train_scaled), np.trace(KG_train_Xscaled)

(57.53003488328018, 0.9999999999999999, 901.1200533701208)

In [738]:
# Compute KRR solutions of train and test set
krrG = KRR()
krrG.fit(KG_train, Y_train)
YhatKG_train = krrG.transform(KG_train)
YhatKG_test = krrG.transform(KG_test)

In [739]:
# Compute scaled KRR solutions of train and test set
krrG_scaled = KRR()
krrG_scaled.fit(KG_train_scaled, Y_train_scaled)
YhatKG_train_scaled = krrG_scaled.transform(KG_train_scaled)
YhatKG_test_scaled = krrG_scaled.transform(KG_test_scaled)

In [740]:
# Compute scaled KRR solutions of train and test set
krrG_Xscaled = KRR()
krrG_Xscaled.fit(KG_train_Xscaled, Y_train_scaled)
YhatKG_train_Xscaled = krrG_Xscaled.transform(KG_train_Xscaled)
YhatKG_test_Xscaled = krrG_Xscaled.transform(KG_test_Xscaled)

In [741]:
GK(KG_train, YhatKG_train) # BAD

25.027236374037745
4151302971682.5215


In [742]:
GK(KG_train_scaled, YhatKG_train_scaled) # BAD

0.43502904917082463
815.4638467082684


In [743]:
GK(KG_train_Xscaled, YhatKG_train_Xscaled) # OK

162.86806303586818
815.4639042617947


In [744]:
GK_with_norm(KG_train, Y_train, YhatKG_train) # OK

0.43502904917082463
0.999999963042973


In [745]:
GK_with_norm(KG_train_scaled, Y_train_scaled, YhatKG_train_scaled) # OK

0.4350290491708247
0.7072539867374407


In [746]:
GK_with_norm(KG_train_Xscaled, Y_train_scaled, YhatKG_train_Xscaled) # OK

0.18073958339596816
0.7072540366537688
