In [1]:
import math
import sys

sys.path.append("../")

import matplotlib.pyplot as plt
import numpy as np

from ase.io import read
import mpltex
from rascal.representations import SphericalInvariants as SOAP
from rascal.utils import get_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import PairwiseKernel
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
from tqdm.notebook import tqdm

from pylode.projection_coeffs import Density_Projection_Calculator as LODE

tab10 = plt.rcParams["axes.prop_cycle"].by_key()['color']
poly_kernel = PairwiseKernel(metric="polynomial")

# General functions

In [2]:
def sum_descriptor(X, n_atoms_per_frame):
    """Take the sum of a descriptor X over all atoms."""
    structure_idx = np.cumsum([0] + n_atoms_per_frame)
    X_sum = np.array([
        np.sum(X[structure_idx[i]:structure_idx[i + 1]], axis=0)
        for i in range(len(structure_idx) - 1)
    ])

    return X_sum

In [3]:
@mpltex.acs_decorator
def plot_rmsd(bunch_obj, method_key, prop_key, fname=None):
    fmts = ["-", "--", ".-"]

    results_obj = bunch_obj[method_key]
    method = method_key

    fig, ax = plt.subplots(constrained_layout=True)

    realiziations = results_obj.keys()
    handles_test = len(realiziations) * [None]
    handles_train = len(realiziations) * [None]

    for i, k in enumerate(realiziations):
        label_cutoff = fr"$r_\mathrm{{{prop_key}}}="
        label_cutoff += fr"{results_obj[k][prop_key]}\,\mathrm{{\AA}}$"

        handles_test[i] = ax.plot(r_train_structures,
                                  results_obj[k].rmse_test,
                                  fmts[i],
                                  c="k",
                                  label=rf"Test, {method}, {label_cutoff}")[0]

        handles_train[i] = ax.plot(
            r_train_structures,
            results_obj[k].rmse_train,
            fmts[i],
            c=tab10[0],
            label=rf"Train, {method}, {label_cutoff}")[0]

    ax.set_yscale("log")
    ax.set_xscale("log")

    ax.set_xlim(1e1, 2e3)
    ax.set_ylim(1e-2, 1e2)

    ax.set_xlabel("training structures")
    ax.set_ylabel("RMSE")

    handles = handles_test + handles_train
    labels = [handle.get_label() for handle in handles]
    ax.legend(handles,
              labels,
              ncol=2,
              handlelength=1,
              frameon=True,
              edgecolor="None",
              fontsize=7)

    if fname is not None:
        fig.savefig(fname, transparent=True)

    fig.show()

# Import and preprocess data 

In [4]:
input_file = "../datasets/point_charges_Training_set.xyz"

frames = read(input_file, index=':')
n_frames = len(frames)
n_atoms_per_frame = [len(frame) for frame in frames]

# Move atoms in unitcell
for frame in frames:
    frame.wrap()

# extract energy
Y = np.array([frame.info["energy"] for frame in frames])

# Create an object/dictionary for storing results
results = Bunch()

# Split Train and Test data

Here only for Y since the X depends on our applied model (SOAP, LODE)

In [5]:
f_train = 0.75  # factor of the train set picked from the total set
n_subsets = 10  # number of subsets picked on a logspace for scaling test

f_test = 1 - f_train
i_train, i_test = train_test_split(np.arange(n_frames),
                                   test_size=f_test,
                                   random_state=0)
n_train = len(i_train)
n_test = len(i_test)

# Split frames
frames_train = [frames[i] for i in i_train]
frames_test = [frames[i] for i in i_test]

# Split energies
Y_train = Y[i_train]
Y_test = Y[i_test]

# Generate subsets numbers for training curve
exp_max = math.log(n_train, 10)
r_train_structures = np.logspace(1, exp_max, num=n_subsets, endpoint=True)
r_train_structures = np.round(r_train_structures).astype(int)

# Remove doubled values
r_train_structures = np.unique(r_train_structures)

# SOAP

In [6]:
recalc = False

In [7]:
results.soap = Bunch()
r_cut = [3, 6, 9]
regularisation_soap = 10e-3  # From paper

In [8]:
for cut in r_cut:
    results.soap[f"r_{cut}"] = Bunch()
    results.soap[f"r_{cut}"].cut = cut
    fname_precomputed = f"../datasets/precomputed_soap_{cut}"
    hypers_soap = dict(
        soap_type='PowerSpectrum',  # nu = 2
        max_radial=2,
        max_angular=6,
        gaussian_sigma_type='Constant',
        gaussian_sigma_constant=1.0,
        cutoff_smooth_width=0.5)

    if recalc:
        hypers_soap["interaction_cutoff"] = cut

        calculator = SOAP(**hypers_soap)
        soap_rep = calculator.transform(frames)
        X_raw = soap_rep.get_features(calculator)

        results.soap[f"r_{cut}"].X_raw = X_raw
        np.save(fname_precomputed, results.soap[f"r_{cut}"].X_raw)

    else:
        results.soap[f"r_{cut}"].X_raw = np.load(fname_precomputed)

num_features_soap = results.soap[f"r_{r_cut[0]}.npy"].X_raw.shape[1]

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/precomputed_soap_3'

## GPR model

In [None]:
for cut in r_cut:
    X = sum_descriptor(results.soap[f"r_{cut}"].X_raw, n_atoms_per_frame)
    X_train = X[i_train, :]
    X_test = X[i_test, :]

    rmse_train = np.zeros(len(r_train_structures))
    rmse_test = np.zeros(len(r_train_structures))

    for i, n_train_structure in enumerate(r_train_structures):
        X_train_cur = X_train[:n_train_structure, :]
        Y_train_cur = Y_train[:n_train_structure]

        gpr = GaussianProcessRegressor(kernel=poly_kernel,
                                       random_state=0,
                                       alpha=regularisation_soap)
        gpr.fit(X_train_cur, Y_train_cur)

        Y_train_pred = gpr.predict(X_train_cur)
        Y_test_pred = gpr.predict(X_test)

        rmse_train[i] = get_score(Y_train_cur, Y_train_pred)["RMSE"]
        rmse_test[i] = get_score(Y_test, Y_test_pred)["RMSE"]

    results.soap[f"r_{cut}"].rmse_train = rmse_train.copy()
    results.soap[f"r_{cut}"].rmse_test = rmse_test.copy()

In [None]:
plot_rmsd(results, "soap", "cut", fname=None)

# LODE model

In [None]:
frames[0]

In [None]:
results.lode = Bunch()
r_smearing = [1, 0.5]  # computational cost scales cubically with 1/smearing
regularisation_lode = 10e-6  # From paper
species_dict = {'Na': 0, 'Cl': 1}

hypers_lode = dict(
    max_angular=6,
    cutoff_radius=3,
    potential_exponent=1,  # currently, only the exponent p=1 is supported
    compute_gradients=False)

In [None]:
# This will take a LONG time if `recalc=True`!
for smearing in r_smearing:
    results.soap[f"r_{smearing}"] = Bunch()
    results.soap[f"r_{smearing}"].smearing = smearing
    fname_precomputed = f"../datasets/precomputed_lode_{smearing}"
    hypers_lode = dict(
        max_angular=6,
        cutoff_radius=3,
        potential_exponent=1,  # currently, only the exponent p=1 is supported
        compute_gradients=False)

    if recalc:
        hypers_lode["smearing"] = smearing

        calculator = LODE(**hypers_lode)
        lode_rep = calculator.transform(frames, species_dict)
        X_raw = lode_rep.get_features(calculator)

        results.lode[f"r_{smearing}"].X_raw = X_raw
        np.save(fname_precomputed, results.lode[f"r_{smearing}"].X_raw)

    else:
        results.soap[f"r_{smearing}"].X_raw = np.load(fname_precomputed)

num_features_lode = results.lode[f"r_{r_smearing[0]}"].X_raw.shape[1]

In [None]:
for smearing in r_smearing:
    X = sum_descriptor(results.lode[f"r_{smearing}"].X_raw, n_atoms_per_frame)
    X_train = X[i_train, :]
    X_test = X[i_test, :]

    rmse_train = np.zeros(len(r_train_structures))
    rmse_test = np.zeros(len(r_train_structures))

    for i, n_train_structure in enumerate(r_train_structures):
        X_train_cur = X_train[:n_train_structure, :]
        Y_train_cur = Y_train[:n_train_structure]

        gpr = GaussianProcessRegressor(kernel=poly_kernel,
                                       random_state=0,
                                       alpha=regularisation_soap)
        gpr.fit(X_train_cur, Y_train_cur)

        Y_train_pred = gpr.predict(X_train_cur)
        Y_test_pred = gpr.predict(X_test)

        rmse_train[i] = get_score(Y_train_cur, Y_train_pred)["RMSE"]
        rmse_test[i] = get_score(Y_test, Y_test_pred)["RMSE"]

    results.lode[f"r_{smearing}"].rmse_train = rmse_train.copy()
    results.lodee[f"r_{smearing}"].rmse_test = rmse_test.copy()