In [None]:
import math
import sys
from functools import partial

sys.path.append("../")

import matplotlib.pyplot as plt
import numpy as np

from ase.io import read
import mpltex
from rascal.representations import SphericalInvariants as SOAP
from rascal.utils import get_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
from tqdm import tqdm

from pylode.projection_coeffs import Density_Projection_Calculator as LODE

# Colours of the active matplotlib property cycle (used for the train curves).
tab10 = plt.rcParams["axes.prop_cycle"].by_key()['color']
# Kernel applied between atomic feature vectors: sklearn's polynomial kernel
# with gamma=1 and degree=1 (coef0 is left at the sklearn default).
kernel_func = partial(polynomial_kernel, gamma=1.0, degree=1)

# General functions

In [None]:
def build_kernel_matrix(kernel_func, X, Y, n_atoms, desc="Build kernel"):
    """Build the structure-wise kernel matrix for kernel ridge regression.

    Every structure is represented by `n_atoms` consecutive rows of the
    feature array. The kernel between two structures is the sum of
    `kernel_func` over all pairs of their atoms.

    Parameters
    ----------
    kernel_func : callable
        Kernel function taking two 2D arrays and returning the pairwise
        kernel matrix, e.g. `sklearn.metrics.pairwise.linear_kernel`.
    X : ndarray of shape (n_samples_X * n_atoms, n_features)
        The first feature array.
    Y : ndarray of shape (n_samples_Y * n_atoms, n_features)
        The second feature array.
    n_atoms : int
        Number of atoms per structure. Used to infer the number of
        structures: `n_structures = len(X) / n_atoms`. Same for `Y`.
    desc : str
        Description for the progress bar.

    Returns
    -------
    K : ndarray of shape (n_samples_X, n_samples_Y)
        `K[n, m]` is the summed kernel between structure `n` of `X` and
        structure `m` of `Y`.

    Raises
    ------
    ValueError
        If `n_atoms` is smaller than one, or if the inferred number of
        structures times `n_atoms` is not equal to the length of one of
        the feature arrays.
    """
    if n_atoms < 1:
        raise ValueError("n_atoms must be a positive integer!")

    n_samples_X = len(X) // n_atoms
    if n_samples_X * n_atoms != len(X):
        raise ValueError("X's predicted number of structures does not match!")

    n_samples_Y = len(Y) // n_atoms
    if n_samples_Y * n_atoms != len(Y):
        raise ValueError("Y's predicted number of structures does not match!")

    # When both arguments are the very same array the kernel matrix is
    # symmetric: only the upper triangle is evaluated and mirrored afterwards.
    symmetric = X is Y
    if symmetric:
        row, col = np.triu_indices(n_samples_X)
    else:
        row, col = np.indices((n_samples_X, n_samples_Y))

    K = np.zeros([n_samples_X, n_samples_Y])

    # Materialise the index pairs so the progress bar knows its total length.
    indices = list(zip(row.ravel(), col.ravel()))
    for n, m in tqdm(indices, desc=desc):
        # Select all atoms of structure n in X and of structure m in Y.
        X_A = X[n * n_atoms:(n + 1) * n_atoms, :]
        Y_B = Y[m * n_atoms:(m + 1) * n_atoms, :]
        K[n, m] = np.sum(kernel_func(X_A, Y_B))

    if symmetric:
        # Mirror the strict upper triangle; the diagonal is already complete.
        K += np.triu(K, k=1).T

    return K

In [None]:
def train_predict_krr(X_reshape, n_features, regularisation=1):
    """Compute a kernel ridge regression learning curve for one feature set.

    For every training-set size in `r_train_structures` a model is fitted
    on a precomputed structure kernel and evaluated on the training subset
    and on the full test set.

    The function reads the following notebook-level variables:
    `i_train`, `i_test`, `n_atoms`, `n_test`, `r_train_structures`,
    `kernel_func`, `Y_train` and `Y_test`.

    Parameters
    ----------
    X_reshape : ndarray of shape (n_frames, n_atoms, n_features)
        Atomic features grouped per structure.
    n_features : int
        Number of features per atom (last axis of `X_reshape`).
    regularisation : float
        Ridge regularisation strength passed to `KernelRidge` as `alpha`.

    Returns
    -------
    rmse_train : ndarray of shape (len(r_train_structures),)
        Training RMSE in percent of the standard deviation of the training
        targets used for the respective fit.
    rmse_test : ndarray of shape (len(r_train_structures),)
        Test RMSE in percent of the standard deviation of the test targets.
    """
    X_train = X_reshape[i_train, :, :]

    X_test = X_reshape[i_test, :, :]
    X_test = X_test.reshape(n_atoms * n_test, n_features)

    rmse_train = np.zeros(len(r_train_structures))
    rmse_test = np.zeros(len(r_train_structures))

    for i, n_train_structure in enumerate(r_train_structures):
        # Flatten the first `n_train_structure` structures into atom rows.
        X_train_cur = X_train[:n_train_structure, :]
        X_train_cur = X_train_cur.reshape(n_atoms * n_train_structure,
                                          n_features)

        K_train = build_kernel_matrix(kernel_func,
                                      X=X_train_cur,
                                      Y=X_train_cur,
                                      n_atoms=n_atoms,
                                      desc=f"Build train kernel for"
                                      f" {n_train_structure} sets")

        K_test = build_kernel_matrix(kernel_func,
                                     X=X_test,
                                     Y=X_train_cur,
                                     n_atoms=n_atoms,
                                     desc=f"Build test kernel for"
                                     f" {n_train_structure} sets")

        Y_train_cur = Y_train[:n_train_structure]

        # `gamma` is not passed: it has no effect on a precomputed kernel.
        krr = KernelRidge(alpha=regularisation, kernel="precomputed")
        krr.fit(K_train, Y_train_cur)

        Y_train_pred = krr.predict(K_train)
        Y_test_pred = krr.predict(K_test)

        rmse_train[i] = get_score(Y_train_cur, Y_train_pred)["RMSE"]
        rmse_test[i] = get_score(Y_test, Y_test_pred)["RMSE"]

        # Convert to % RMSE: normalise by the standard deviation (same units
        # as the RMSE, unlike the variance) of the reference values of the
        # respective set, so that 100 % corresponds to the spread of the data.
        rmse_train[i] *= 100 / Y_train_cur.std()
        rmse_test[i] *= 100 / Y_test.std()

    return rmse_train, rmse_test

In [None]:
@mpltex.acs_decorator
def plot_rmsd(bunch_obj, method_key, prop_key, fname=None):
    """Plot train and test learning curves of one method on log-log axes.

    The x values are taken from the notebook-level `r_train_structures`
    and the train colour from the notebook-level `tab10`.

    Parameters
    ----------
    bunch_obj : mapping
        Results container; `bunch_obj[method_key]` maps realization names
        to objects providing `rmse_train`, `rmse_test` and `prop_key`.
    method_key : str
        Key of the method inside `bunch_obj`; also used in the legend.
    prop_key : str
        Key of the scanned property of each realization, shown in the
        legend label.
    fname : str, optional
        If given, the figure is saved to this path.
    """
    fmts = ["-", "--", ".-"]

    results_obj = bunch_obj[method_key]

    fig, ax = plt.subplots(constrained_layout=True)

    handles_test = []
    handles_train = []

    for i, realization in enumerate(results_obj.values()):
        # Cycle through the line styles so that more than len(fmts)
        # realizations do not raise an IndexError.
        fmt = fmts[i % len(fmts)]

        label_cutoff = fr"$r_\mathrm{{{prop_key}}}="
        label_cutoff += fr"{realization[prop_key]}\,\mathrm{{\AA}}$"

        handles_test.append(
            ax.plot(r_train_structures,
                    realization.rmse_test,
                    fmt,
                    c="k",
                    label=rf"Test, {method_key}, {label_cutoff}")[0])

        handles_train.append(
            ax.plot(r_train_structures,
                    realization.rmse_train,
                    fmt,
                    c=tab10[0],
                    label=rf"Train, {method_key}, {label_cutoff}")[0])

    ax.set_yscale("log")
    ax.set_xscale("log")

    ax.set_xlim(1e1, 2e3)
    ax.set_ylim(1e-2, 1e2)

    ax.set_xlabel("training structures")
    ax.set_ylabel("% RMSE")

    # List all test curves first and all train curves second in the legend.
    handles = handles_test + handles_train
    labels = [handle.get_label() for handle in handles]
    ax.legend(handles,
              labels,
              ncol=2,
              handlelength=1,
              frameon=True,
              edgecolor="None",
              fontsize=7)

    if fname is not None:
        fig.savefig(fname, transparent=True)

    # `Figure.show` only warns on non-GUI (e.g. inline notebook) backends;
    # `plt.show` works with every backend.
    plt.show()

# Import and preprocess data 

In [None]:
input_file = "../datasets/point_charges_Training_set.xyz"

# Read all structures stored in the file.
frames = read(input_file, index=':')
n_frames = len(frames)
if n_frames == 0:
    raise ValueError(f"No structures found in {input_file}")
n_atoms = len(frames[0])

# The features are later reshaped to (n_frames, n_atoms, n_features), which
# is only valid if every structure contains the same number of atoms.
if any(len(frame) != n_atoms for frame in frames):
    raise ValueError("All structures must contain the same number of atoms.")

# Move atoms in unitcell
for frame in frames:
    frame.wrap()

# extract energy
Y = np.array([frame.info["energy"] for frame in frames])

# Create an object/dictionary for storing results
results = Bunch()

# Get atomic species in dataset
global_species = set()
for frame in frames:
    global_species.update(frame.get_chemical_symbols())
# Sort the symbols: the iteration order of a set of strings is not
# reproducible between interpreter runs, the sorted order is.
species_dict = {k: i for i, k in enumerate(sorted(global_species))}

# Split Train and Test data

Only the targets `Y` are split here, because the features `X` depend on the model applied below (SOAP or LODE).

In [None]:
f_train = 0.75  # factor of the train set picked from the total set
n_subsets = 5  # number of subsets picked on a logspace for scaling test

# Fixed random_state keeps the split reproducible between runs.
i_train, i_test = train_test_split(np.arange(n_frames),
                                   train_size=f_train,
                                   random_state=0)
n_train = len(i_train)
n_test = len(i_test)

# Split energies
Y_train = Y[i_train]
Y_test = Y[i_test]

# Generate subsets numbers for training curve: log-spaced between 10 and
# n_train. `math.log10` is more accurate than `math.log(x, 10)`.
exp_max = math.log10(n_train)
r_train_structures = np.logspace(1, exp_max, num=n_subsets, endpoint=True)
r_train_structures = np.round(r_train_structures).astype(int)

# Never request more structures than the training set contains
# (relevant for datasets with fewer than 10 training structures).
r_train_structures = np.clip(r_train_structures, 1, n_train)

# Remove doubled values
r_train_structures = np.unique(r_train_structures)

# SOAP

In [None]:
# If True, the descriptors are recomputed and saved to the precomputed .npy
# files; if False, the descriptors are loaded from those files instead.
recalc = False

In [None]:
# Container for all SOAP results, one entry per cutoff.
results.soap = Bunch()
# Cutoff values to scan (printed and plotted with the unit Å).
r_cut = [3, 6, 9]
# NOTE(review): `10e-3` evaluates to 1e-2, not 1e-3 — confirm against the paper.
regularisation_soap = 10e-3  # From paper

In [None]:
# Compute (or load) the SOAP features for every cutoff and store them,
# both as a flat per-atom array and grouped per structure.
for cut in r_cut:
    results.soap[f"r_{cut}"] = Bunch()
    results.soap[f"r_{cut}"].cut = cut
    fname_precomputed = f"../datasets/precomputed/precomputed_soap_{cut}.npy"
    hypers_soap = dict(
        soap_type='PowerSpectrum',  # nu = 2
        max_radial=6,
        max_angular=6,
        gaussian_sigma_type='Constant',
        gaussian_sigma_constant=1.0,
        cutoff_smooth_width=0.5)

    if recalc:
        hypers_soap["interaction_cutoff"] = cut

        calculator = SOAP(**hypers_soap)
        soap_rep = calculator.transform(frames)
        X_raw = soap_rep.get_features(calculator)

        # Save the freshly computed array. The results entry has no `X_raw`
        # attribute yet at this point, so it cannot be read from there.
        np.save(fname_precomputed, X_raw)

    else:
        X_raw = np.load(fname_precomputed)

    results.soap[f"r_{cut}"].X_raw = X_raw

    # Group the per-atom rows by structure: (n_frames, n_atoms, n_features).
    n_features = X_raw.shape[1]
    X_reshape = X_raw.reshape(n_frames, n_atoms, n_features)
    results.soap[f"r_{cut}"].X_reshape = X_reshape
    # The key spelling `n_fetaures` is kept because the training cell reads it.
    results.soap[f"r_{cut}"].n_fetaures = n_features

## Train KRR model

In [None]:
# Learning curve for every SOAP cutoff; the errors are stored next to the
# features of the respective cutoff.
for cut in r_cut:
    print(f"cut = {cut} Å")

    soap_entry = results.soap[f"r_{cut}"]
    rmse_train, rmse_test = train_predict_krr(
        X_reshape=soap_entry.X_reshape,
        n_features=soap_entry.n_fetaures,
        regularisation=regularisation_soap)

    # Store copies so the results do not alias the returned arrays.
    soap_entry.rmse_train = rmse_train.copy()
    soap_entry.rmse_test = rmse_test.copy()

In [None]:
# Learning curves of all SOAP cutoffs; the figure is not written to disk.
plot_rmsd(results, "soap", "cut", fname=None)

# LODE model

In [None]:
# Container for all LODE results, one entry per smearing value.
results.lode = Bunch()
r_smearing = [3, 2, 1]  # computational cost scales cubically with 1/smearing
# NOTE(review): `10e-6` evaluates to 1e-5, not 1e-6 — confirm against the paper.
regularisation_lode = 10e-6  # From paper

# NOTE(review): the LODE feature loop rebuilds `hypers_lode` with
# max_angular=6 on every iteration, so the values set here are not used in
# the visible notebook — confirm which max_angular is intended.
hypers_lode = dict(
    max_angular=2,
    cutoff_radius=3,
    potential_exponent=1,  # currently, only the exponent p=1 is supported
    compute_gradients=False)

In [None]:
# This will take a LONG time if `recalc=True`!
# Compute (or load) the LODE features for every smearing value and store
# them, both as a flat per-atom array and grouped per structure.
for smear in r_smearing:
    results.lode[f"r_{smear}"] = Bunch()
    results.lode[f"r_{smear}"].smear = smear
    fname_precomputed = f"../datasets/precomputed/precomputed_lode_{smear}.npy"
    hypers_lode = dict(
        max_angular=6,
        cutoff_radius=3,
        potential_exponent=1,  # currently, only the exponent p=1 is supported
        compute_gradients=False)

    if recalc:
        # TODO: Import function from utilities/precompute_lode.py
        hypers_lode["smearing"] = smear

        # Get atomic species in dataset
        global_species = set()
        for frame in frames:
            global_species.update(frame.get_chemical_symbols())
        # Sort the symbols: the iteration order of a set of strings is not
        # reproducible between interpreter runs, the sorted order is.
        species_dict = {k: i for i, k in enumerate(sorted(global_species))}

        calculator = LODE(**hypers_lode)
        lode_rep = calculator.transform(frames, species_dict)
        X_raw = lode_rep.get_features(calculator)

        # Save the freshly computed array. The results entry has no `X_raw`
        # attribute yet at this point, so it cannot be read from there.
        np.save(fname_precomputed, X_raw)

    else:
        X_raw = np.load(fname_precomputed)

    # reshape lode features in a 2D array - this is common structure
    X_raw = X_raw.reshape(X_raw.shape[0], np.prod(X_raw.shape[1:]))
    results.lode[f"r_{smear}"].X_raw = X_raw

    # Group the per-atom rows by structure: (n_frames, n_atoms, n_features).
    n_features = X_raw.shape[1]
    X_reshape = X_raw.reshape(n_frames, n_atoms, n_features)
    results.lode[f"r_{smear}"].X_reshape = X_reshape
    # The key spelling `n_fetaures` is kept because the training cell reads it.
    results.lode[f"r_{smear}"].n_fetaures = n_features

In [None]:
# Learning curve for every LODE smearing; the errors are stored next to the
# features of the respective smearing.
for smearing in r_smearing:
    print(f"smearing = {smearing} Å")

    # Index with the loop variable. The stale `smear` left over from the
    # feature cell would send every result to one entry and take
    # `n_features` from the wrong feature set.
    lode_entry = results.lode[f"r_{smearing}"]

    rmse_train, rmse_test = train_predict_krr(
        X_reshape=lode_entry.X_reshape,
        n_features=lode_entry.n_fetaures,
        regularisation=regularisation_lode)

    lode_entry.rmse_train = rmse_train.copy()
    lode_entry.rmse_test = rmse_test.copy()

In [None]:
# Learning curves of all LODE smearings; the figure is not written to disk.
plot_rmsd(results, "lode", "smear", fname=None)