In [1]:
#!/usr/bin/env python3

%reload_ext autoreload
%autoreload 2

# System
import os
import sys
from tqdm.notebook import tqdm

# Maths
import numpy as np
import scipy.sparse as sps

# PCovR utilities
from regression import LR, KRR, SparseKRR, PCovR, KPCovR, SparseKPCovR
from decomposition import PCA, KPCA, SparseKPCA
from kernels import linear_kernel, gaussian_kernel, center_kernel
from tools import FPS, simple_split, CUR

# ASE
from ase.io import read, write

# SOAP
from rascal.representations import SphericalInvariants as SOAP

# Scikit learn
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA as skPCA
from sklearn.decomposition import KernelPCA as skKPCA

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-wide matplotlib styling: uniform 1.0 line widths, small markers,
# and inward-pointing major/minor ticks drawn on all four sides of the axes.
plot_parameters = {
    'lines.linewidth': 1.0,
    'lines.markersize': 2,
    'patch.linewidth': 1.0,
    'hatch.linewidth': 1.0,
    'axes.linewidth': 1.0,
}

# The x and y ticks share every setting; only the names of the sides differ
for axis, sides in (('xtick', ('top', 'bottom')), ('ytick', ('left', 'right'))):
    plot_parameters.update({f'{axis}.{side}': True for side in sides})
    plot_parameters.update({
        f'{axis}.direction': 'in',
        f'{axis}.minor.visible': True,
        f'{axis}.major.size': 4.0,
        f'{axis}.minor.size': 2.0,
        f'{axis}.major.pad': 5.0,
        f'{axis}.minor.pad': 5.0,
    })

# Apply every setting to the global rcParams in one call
mpl.rcParams.update(plot_parameters)

In /home/helfrech/.config/matplotlib/stylelib/cosmo.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/helfrech/.config/matplotlib/stylelib/cosmoLarge.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


In [2]:
# Load the first five frames (index=':5') of the CSD-1000R xyz file
# NOTE(review): absolute, user-specific path -- this cell only runs where that file exists
s = read('/scratch/helfrech/Sync/GDrive/Projects/KPCovR/KernelPCovR/datasets/CSD-1000R.xyz', index=':5')

# Wrap every frame in place before any descriptors are computed
for ss in s:
    ss.wrap()

# Extract chemical shifts: one 'CS_local' array per frame
cs = [ss.arrays['CS_local'] for ss in s]

In [3]:
# SOAP power-spectrum hyperparameters (from librascal tutorial)
soap_hypers = dict(
    soap_type='PowerSpectrum',
    interaction_cutoff=3.5,
    max_radial=6,
    max_angular=6,
    gaussian_sigma_type='Constant',
    gaussian_sigma_constant=0.4,
    cutoff_smooth_width=0.5,
)

# Build the descriptor calculator from the hyperparameters above
soap = SOAP(**soap_hypers)

In [4]:
# Evaluate the SOAP calculator on the structures loaded into `s`
soap_rep = soap.transform(s)

In [5]:
# Feature matrix extracted from the SOAP representation
# NOTE(review): presumably one row per atomic environment, aligned row-for-row with Y -- confirm
X = soap_rep.get_features(soap)
# Join the per-structure 'CS_local' arrays into a single target array
Y = np.concatenate(cs)

In [6]:
# Train-Test split
# f_train is handed to simple_split as its third argument; presumably the training fraction -- confirm
# NOTE(review): whether simple_split shuffles, and how it is seeded, is not visible in this notebook
f_train = 0.80
X_train, X_test, Y_train, Y_test = simple_split(X, Y, f_train)

In [7]:
# Center features and targets using statistics of the training set only;
# the same offsets are then applied to the test set.
# All subtractions are in place, so the arrays are modified rather than rebound.

# Features: one mean per column
X_mean = X_train.mean(axis=0)
X_train -= X_mean
X_test -= X_mean

# Targets: a single scalar mean over all entries
Y_mean = Y_train.mean()
Y_train -= Y_mean
Y_test -= Y_mean

# Farthest point sampling (FPS)

In [8]:
# Select FPS components from train set
# FPS receives the transposed training matrix, so the rows it sees are the columns of X_train;
# presumably `idxs` are the selected column indices and `d` the associated distances -- verify against tools.FPS
n_FPS = 20
idxs, d = FPS(X_train.T, n_FPS)
print(idxs)

[2303    0 1008  924  280  294  378  196 1764  784   28  147 1029  448
  420  259  882   21  434 1680]


# CUR

In [9]:
# Ask the project's CUR helper for 20 columns and no rows (n_row=0)
# In the recorded run, idxs_r came back as slice(None, None, None) for n_row=0
idxs_c, idxs_r = CUR(X_train, n_col=20, n_row=0)
print(idxs_c)
print(idxs_r)

[   0 1008 2268  196  294  147  448  434  420  784  112  672  938  259
 1029  924  273  700 1540  378]
slice(None, None, None)


# CUR from KPCovR notebook

In [10]:
# Number of leading right singular vectors summed into each column's score `pi` in the cells below
k=1
# Total number of columns to select
nCUR = 20
print(nCUR)
# Work on a copy: the selection cells below modify A_copy in place
A_copy = X_train.copy()

20


In [11]:
%time
(U, sig, V) = np.linalg.svd(A_copy)
pi = (V[:k]**2.0).sum(axis=0)
j = pi.argmax()

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 4.29 µs


In [12]:
%time
(U, sig, V) = sps.linalg.svds(A_copy,k)
pi = (V[:k]**2.0).sum(axis=0)
j = pi.argmax()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [13]:
# Unit vector along the selected column j (a new array, not a view of A_copy)
col = A_copy[:, j]
v = col / np.sqrt(np.matmul(col, col))

# Deflate: remove the component along v from every column of A_copy, in place.
# This is the same rank-1 update as subtracting v * dot(v, column) column by column,
# done in one vectorised step; results can differ only by floating-point rounding.
A_copy -= np.outer(v, np.matmul(v, A_copy))

In [14]:
# Start from the column chosen above, then pick the remaining nCUR-1 columns greedily
# (this rebinds `idxs`, which earlier held the FPS selection)
idxs = [j]

for n in tqdm(range(nCUR-1)):
    # Leading k right singular vectors of the current (deflated) matrix; only V is used
    # NOTE(review): svds is called without a fixed start vector -- confirm run-to-run reproducibility
    (U, sig, V) = sps.linalg.svds(A_copy,k)
    # Score of each column: summed squares of its entries in those k vectors
    pi = (V[:k]**2.0).sum(axis=0)
    # NOTE(review): already-selected columns are not masked out of `pi`; after deflation
    # they should be ~zero and so not be picked again, but nothing enforces this.
    idxs.append(pi.argmax())

    # Unit vector along the newly selected column (a new array, not a view of A_copy)
    col = A_copy[:, idxs[-1]]
    v = col / np.sqrt(np.matmul(col, col))

    # Deflate every column of A_copy against v in one vectorised rank-1 update
    # (same arithmetic as the per-column loop, up to floating-point rounding)
    A_copy -= np.outer(v, np.matmul(v, A_copy))

HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))




In [15]:
# Convert the list of selected column indices to an array and show it
# (the recorded output matches the idxs_c printed for the CUR helper call above)
idxs = np.asarray(idxs)
print(idxs)

[   0 1008 2268  196  294  147  448  434  420  784  112  672  938  259
 1029  924  273  700 1540  378]
