In [936]:
#!/usr/bin/env python3

%reload_ext autoreload
%autoreload 2

# Maths
import numpy as np

# PCovR utilities
from regression import LR, KRR, SparseKRR, IterativeSparseKRR, PCovR, KPCovR, SparseKPCovR
from decomposition import PCA, KPCA, SparseKPCA, IterativeSparseKPCA
from kernels import linear_kernel, gaussian_kernel, center_kernel
from selection import FPS
from split import simple_split

# ASE
from ase.io import read, write

# SOAP
from rascal.representations import SphericalInvariants as SOAP
from rascal.neighbourlist.structure_manager import mask_center_atoms_by_species

# Scikit learn
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA as sklPCA
from sklearn.decomposition import KernelPCA as sklKPCA

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt

# Make the plots look nicer
plot_parameters = {
    'lines.linewidth': 1.0,
    'lines.markersize': 2,
    'patch.linewidth': 1.0,
    'hatch.linewidth': 1.0,
    'axes.linewidth': 1.0,
    'xtick.top': True,
    'xtick.bottom': True,
    'xtick.direction': 'in',
    'xtick.minor.visible': True,
    'xtick.major.size': 4.0,
    'xtick.minor.size': 2.0,
    'xtick.major.pad': 5.0,
    'xtick.minor.pad': 5.0,
    'ytick.left': True,
    'ytick.right': True,
    'ytick.direction': 'in',
    'ytick.minor.visible': True,
    'ytick.major.size': 4.0,
    'ytick.minor.size': 2.0,
    'ytick.major.pad': 5.0,
    'ytick.minor.pad': 5.0   
}

for pp in plot_parameters.keys():
    mpl.rcParams[pp] = plot_parameters[pp]

In [937]:
# Read xyz files
s = read('/scratch/helfrech/Sync/Shared/KernelPCovR/datasets/DEEM_10000-prop-wrapped.xyz', index=':100')

# Extract local properties
v = []
e = []
for ss in s:
    mask_center_atoms_by_species(ss, species_select='Si')
    vv = ss.arrays['Si_volumes']
    vv = vv[~np.isnan(vv)]
    v.append(vv)
    ee = ss.arrays['Si_energies']
    ee = ee[~np.isnan(ee)]
    e.append(ee)

In [938]:
# Compute SOAPs (from librascal tutorial)
soap = SOAP(soap_type='PowerSpectrum',
           interaction_cutoff=6.0,
           max_radial=8,
           max_angular=6,
           gaussian_sigma_type='Constant',
           gaussian_sigma_constant=0.4,
           cutoff_smooth_width=0.5)

In [939]:
soap_rep = soap.transform(s)

In [940]:
X = soap_rep.get_features(soap)
Yv = np.concatenate(v)
Ye = np.concatenate(e)
Y = np.stack((Yv, Ye), axis=1)
Y[:, 0] *= 1.0E4

In [941]:
# Train-Test split
f_train = 0.80
X_train, X_test, Y_train, Y_test = simple_split(X, Y, f_train)

In [942]:
# Select FPS components from train set
n_FPS = 200
idxs, d = FPS(X_train.T, n_FPS)

X_train = X_train[:, idxs]
X_test = X_test[:, idxs]

In [943]:
# Center the data
X_mean = np.mean(X_train, axis=0)
Y_mean = np.mean(Y_train, axis=0)

X_train -= X_mean
X_test -= X_mean
Y_train -= Y_mean
Y_test -= Y_mean

In [944]:
# Normalize the data
X_scale = np.linalg.norm(X_train)/np.sqrt(X_train.shape[0])
Y_scale = np.linalg.norm(Y_train, axis=0)/np.sqrt(Y_train.shape[0] / Y_train.shape[1])
#Y_scale = np.std(Y_train, axis=0)
#X_scale = 1.0
#Y_scale = 1.0

X_train_scaled = X_train / X_scale
X_test_scaled = X_test / X_scale
Y_train_scaled = Y_train / Y_scale
Y_test_scaled = Y_test / Y_scale

In [945]:
X_train.shape, Y_train.shape

((1153, 200), (1153, 2))

In [946]:
np.var(X_train_scaled), np.var(X_train_scaled, axis=0)

(0.004999999999999999,
 array([7.46763370e-05, 1.06915131e-02, 4.16900648e-02, 2.44294053e-02,
        3.05598520e-02, 4.79795500e-02, 4.37099187e-02, 1.44969263e-02,
        2.65831287e-02, 3.72982124e-02, 9.77736481e-03, 1.88380157e-02,
        3.18420517e-02, 6.88702137e-03, 4.67502374e-03, 1.97688765e-02,
        5.12291212e-03, 2.31915466e-02, 9.01150867e-03, 1.75609111e-04,
        2.52376701e-02, 8.94572571e-03, 6.87692204e-03, 7.58053538e-03,
        2.79486580e-02, 9.08700557e-03, 2.42978660e-02, 2.34132960e-02,
        1.44886283e-02, 1.00748033e-02, 5.74681168e-03, 5.26204898e-03,
        2.25104237e-03, 4.97620664e-03, 4.05548087e-03, 2.22680618e-03,
        1.04691337e-02, 5.54076177e-03, 6.45946588e-03, 9.42018877e-03,
        4.49076547e-03, 9.84299173e-03, 5.53553758e-03, 7.71017801e-03,
        3.83382934e-03, 6.27254494e-03, 8.77049125e-03, 2.08319813e-02,
        4.64170109e-03, 2.02408413e-03, 1.10987046e-02, 4.51661732e-04,
        3.73030585e-03, 2.13349807e-03, 1

In [947]:
np.var(Y_train_scaled), np.var(Y_train_scaled, axis=0)

(0.5, array([0.5, 0.5]))

# PCovR terms

In [948]:
def G(X, Yhat, alpha=0.5):
    G_pca = np.matmul(X, X.T)
    G_lr = np.matmul(Yhat, Yhat.T)
    print(np.linalg.norm(G_pca))
    print(np.linalg.norm(G_lr))
    
def G_with_norm(X, Y, Yhat, alpha=0.5):
    G_pca = np.matmul(X, X.T)/np.linalg.norm(X)**2
    G_lr = np.matmul(Yhat, Yhat.T)/np.linalg.norm(Y)**2
    print(np.linalg.norm(G_pca))
    print(np.linalg.norm(G_lr))

In [949]:
# Compute LR solutions of train and test set
lr = LR()
lr.fit(X_train, Y_train)
Yhat_train = lr.transform(X_train)
Yhat_test = lr.transform(X_test)

In [950]:
# Compute scaled LR solutions of train and test set
lr_scale = LR()
lr_scale.fit(X_train_scaled, Y_train_scaled)
Yhat_train_scaled = lr_scale.transform(X_train_scaled)
Yhat_test_scaled = lr_scale.transform(X_test_scaled)

In [951]:
G(X_train, Yhat_train) # BAD

13.421202488608541
3598212417575.713


In [952]:
G(X_train_scaled, Yhat_train_scaled) # OK

517.1620254382669
779.8479957866109


In [953]:
G_with_norm(X_train, Y_train, Yhat_train) # OK

0.44853601512425567
0.9357393221128638


In [954]:
G_with_norm(X_train_scaled, Y_train_scaled, Yhat_train_scaled) # OK

0.4485360151242556
0.6763642634749442


# KPCovR terms

In [955]:
def GK(K, Yhat, alpha=0.5):
    G_kpca = K
    G_krr = np.matmul(Yhat, Yhat.T)
    print(np.linalg.norm(G_kpca))
    print(np.linalg.norm(G_krr))
    
def GK_with_norm(K, Y, Yhat, alpha=0.5):
    G_kpca = K / np.trace(K)
    G_krr = np.matmul(Yhat, Yhat.T)/np.linalg.norm(Y)**2
    print(np.linalg.norm(G_kpca))
    print(np.linalg.norm(G_krr))

## Linear kernel

In [956]:
# Build linear kernel, non-normalized X
KL_train = linear_kernel(X_train, X_train)
KL_test = linear_kernel(X_test, X_train)

KL_test = center_kernel(KL_test, K_ref=KL_train)
KL_train = center_kernel(KL_train)

KL_scale = np.trace(KL_train)

KL_train_scaled = KL_train / KL_scale
KL_test_scaled = KL_test / KL_scale

In [957]:
# Build linear kernel, normalized X
KL_train_Xscaled = linear_kernel(X_train_scaled, X_train_scaled)
KL_test_Xscaled = linear_kernel(X_test_scaled, X_train_scaled)

KL_test_Xscaled = center_kernel(KL_test_Xscaled, K_ref=KL_train_Xscaled)
KL_train_Xscaled = center_kernel(KL_train_Xscaled)

In [958]:
np.trace(KL_train), np.trace(KL_train_scaled), np.trace(KL_train_Xscaled)

(29.92224043567723, 1.0, 1152.9999999999998)

In [959]:
# Compute KRR solutions of train and test set
krrL = KRR()
krrL.fit(KL_train, Y_train)
YhatKL_train = krrL.transform(KL_train)
YhatKL_test = krrL.transform(KL_test)

In [960]:
# Compute scaled KRR solutions of train and test set
krrL_scaled = KRR()
krrL_scaled.fit(KL_train_scaled, Y_train_scaled)
YhatKL_train_scaled = krrL_scaled.transform(KL_train_scaled)
YhatKL_test_scaled = krrL_scaled.transform(KL_test_scaled)

In [961]:
# Compute Xscaled KRR solutions of train and test set
krrL_Xscaled = KRR()
krrL_Xscaled.fit(KL_train_Xscaled, Y_train_scaled)
YhatKL_train_Xscaled = krrL_Xscaled.transform(KL_train_Xscaled)
YhatKL_test_Xscaled = krrL_Xscaled.transform(KL_test_Xscaled)

In [962]:
GK(KL_train, Y_train, YhatKL_train) # BAD

13.421202488608541
3845314818371.949


In [963]:
GK(KL_train_scaled, YhatKL_train_scaled) # BAD

0.4485360151242558
779.8469912547172


In [964]:
GK(KL_train_Xscaled, YhatKL_train_Xscaled) # OK

517.1620254382669
779.8479948778796


In [965]:
GK_with_norm(KL_train, Y_train, YhatKL_train) # OK

0.4485360151242558
0.9357392840661494


In [966]:
GK_with_norm(KL_train_scaled, Y_train_scaled, YhatKL_train_scaled) # OK

0.4485360151242558
0.676363392241732


In [967]:
GK_with_norm(KL_train_Xscaled, Y_train_scaled, YhatKL_train_Xscaled) # OK

0.44853601512425584
0.6763642626867993


## Gaussian kernel

In [968]:
# Build gaussian kernel
KG_train = gaussian_kernel(X_train, X_train)
KG_test = gaussian_kernel(X_test, X_train)

KG_test = center_kernel(KG_test, K_ref=KG_train)
KG_train = center_kernel(KG_train)

KG_scale = np.trace(KG_train)

KG_train_scaled = KG_train / KG_scale
KG_test_scaled = KG_test / KG_scale

In [969]:
# Build gaussian kernel, normalized X
KG_train_Xscaled = gaussian_kernel(X_train_scaled, X_train_scaled)
KG_test_Xscaled = gaussian_kernel(X_test_scaled, X_train_scaled)

KG_test_Xscaled = center_kernel(KG_test_Xscaled, K_ref=KG_train_Xscaled)
KG_train_Xscaled = center_kernel(KG_train_Xscaled)

In [970]:
np.trace(KG_train), np.trace(KG_train_scaled), np.trace(KG_train_Xscaled)

(57.78637062623631, 1.0, 901.5848723918432)

In [971]:
# Compute KRR solutions of train and test set
krrG = KRR()
krrG.fit(KG_train, Y_train)
YhatKG_train = krrG.transform(KG_train)
YhatKG_test = krrG.transform(KG_test)

In [972]:
# Compute scaled KRR solutions of train and test set
krrG_scaled = KRR()
krrG_scaled.fit(KG_train_scaled, Y_train_scaled)
YhatKG_train_scaled = krrG_scaled.transform(KG_train_scaled)
YhatKG_test_scaled = krrG_scaled.transform(KG_test_scaled)

In [973]:
# Compute scaled KRR solutions of train and test set
krrG_Xscaled = KRR()
krrG_Xscaled.fit(KG_train_Xscaled, Y_train_scaled)
YhatKG_train_Xscaled = krrG_Xscaled.transform(KG_train_Xscaled)
YhatKG_test_Xscaled = krrG_Xscaled.transform(KG_test_Xscaled)

In [974]:
GK(KG_train, YhatKG_train) # BAD

24.996044100387873
3845314811253.022


In [975]:
GK(KG_train_scaled, YhatKG_train_scaled) # BAD

0.43255950892058803
815.3829491599207


In [976]:
GK(KG_train_Xscaled, YhatKG_train_Xscaled) # OK

162.43663465023985
815.383012547666


In [977]:
GK_with_norm(KG_train, Y_train, YhatKG_train) # OK

0.43255950892058803
0.9999999603182806


In [978]:
GK_with_norm(KG_train_scaled, Y_train_scaled, YhatKG_train_scaled) # OK

0.43255950892058803
0.7071838240762537


In [979]:
GK_with_norm(KG_train_Xscaled, Y_train_scaled, YhatKG_train_Xscaled) # OK

0.18016787950235522
0.7071838790526158
