# MBTR descriptor 

In this notebook we examine the Many-Body Tensor Representation (MBTR) as a descriptor and assess its performance.

In [None]:
! pip install dscribe  # Downloads needed modules
! pip install ase

# -------------------------------------------------------

import io
from datetime import datetime
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from ase.io import read
from sklearn.model_selection import train_test_split

def download_published_data():
    """Download the published data archive and unzip it into the working directory.

    The zip file is fetched over HTTP, held in memory, and extracted relative
    to the current working directory. The start time, finish time and elapsed
    time are printed.

    Raises:
        requests.HTTPError: if the server responds with an error status code.
    """
    start = datetime.now()
    print(start)
    r = requests.get('https://people.bath.ac.uk/crh53/m1507656.zip')
    # Fail here on a 4xx/5xx reply instead of handing an error page to ZipFile.
    r.raise_for_status()
    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(io.BytesIO(r.content)) as zipped:
        zipped.extractall('')
    finish = datetime.now()
    print(finish)
    print(finish - start)

def xyz_str_to_atoms(xyz_str):
    """Build an ASE atoms object from the text content of an XYZ file.

    The string is written into an in-memory text buffer (``io.StringIO``),
    the buffer is rewound, and ``ase.io.read`` parses it with ``format="xyz"``.
    The parsed object is returned.
    """
    buffer = io.StringIO()
    buffer.write(xyz_str)
    buffer.seek(0)  # rewind so the reader starts from the first line
    return read(buffer, format="xyz")

# -------------------------------------------------------

print('Downloading data...')
download_published_data()  # downloads needed data
% cd m1507656

from helpers import get_level  # needs to go here as wont have been downloaded earlier in

print('Loading data...')
df = pd.read_json('df_5k.json', orient='split')

print('Generating `ase.Atoms` objects...')
df['atoms'] = df['xyz_pbe_relaxed'].apply(xyz_str_to_atoms)

print('Extracting HOMO from data...')
df['HOMO'] = df.apply(lambda row: get_level(row, level_type='HOMO', subset='GOWO_at_PBE0_cbs'), axis=1)

print('Splitting data set...')
train, test = train_test_split(df, test_size=0.2, random_state=20210817)
train_atoms, test_atoms = train['atoms'].to_list(), test['atoms'].to_list()

print('Data Processing Complete')
print('#', '-'*119)

In [None]:
# `.numbers` of every structure in the data set, one entry per row of `df`.
atomic_numbers = df['atoms'].apply(lambda x: x.numbers)

# Distinct atomic numbers occurring anywhere in the data set.
unique_atomic_numbers = {z for numbers in atomic_numbers for z in numbers}

# Atom count per structure and the size of the largest one.
num_atoms = atomic_numbers.map(len)
max_num_atoms = num_atoms.max()

print(unique_atomic_numbers)
print(max_num_atoms)

MBTR Generation

We try the k2 and k3 terms separately for the moment to see how each performs.
Rough testing indicates that k2 is the more reliable feature (it is also faster to calculate and produces a smaller feature vector, which makes the resulting model training and assessment much quicker).
Do a quick t-SNE visualisation on a subsample.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from dscribe.descriptors import MBTR

In [None]:
# Number of grid points used to discretise the k2 term.
n = 35

# MBTR descriptor restricted to the two-body (k2) term for non-periodic systems.
feature_calc = MBTR(
    species=unique_atomic_numbers,
    k2=dict(
        # pairwise geometry function
        geometry=dict(function="inverse_distance"),
        # discretisation grid: `n` points between 0 and 1, broadened with sigma=0.1
        grid=dict(min=0, max=1, n=n, sigma=0.1),
        # exponential weighting of the pair contributions
        weighting=dict(function="exp", scale=0.5, threshold=1e-3),
    ),
    periodic=False,
    normalization="l2_each",
)

In [None]:
# Sanity check: build the descriptor for the first structure and inspect its size.
mol = df['atoms'].iat[0]
out = feature_calc.create(mol, n_jobs=-1)

# atom count, grid size --> descriptor shape
print(F'{len(mol)} {n} --> {out.shape}')

In [None]:
print('Generating features...')
# One descriptor per structure, stacked row-wise into a 2-D feature matrix per split.
X_train = np.vstack([feature_calc.create(atoms) for atoms in train_atoms])
X_test = np.vstack([feature_calc.create(atoms) for atoms in test_atoms])
print('Features generated.')

In [None]:
# Name of the column used as the regression target.
level = 'HOMO'

# Target arrays, in the same row order as the corresponding feature matrices.
y_train = train[level].values
y_test = test[level].values

In [None]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.dummy import DummyRegressor

# Standardise each feature, then fit a kernel ridge regressor.
pipe = Pipeline([
    ('x_norm', StandardScaler()),
    # `KernelRidge` is the class imported from sklearn.kernel_ridge;
    # the name `KernelRidgeRegressor` is not defined anywhere in this notebook.
    ('estimator', KernelRidge()),
])

# Hyper-parameter grid: regularisation strength, kernel width and kernel type
# (10 x 7 x 2 = 140 combinations).
parameters = [{
    'estimator__alpha': np.logspace(-6, 3, 10),
    'estimator__gamma': np.logspace(-3, 3, 7),
    'estimator__kernel': ['rbf', 'laplacian']
}]

# 3-fold cross-validated search, scored by (negated) mean absolute error.
grid = GridSearchCV(pipe, param_grid=parameters, scoring='neg_mean_absolute_error', n_jobs=-1, cv=3, verbose=4)
grid.fit(X_train, y_train)

In [None]:
# Tabulate the cross-validation results, best-ranked parameter combination first.
results = pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score')
results.head()

In [None]:
# Pipeline selected by the grid search; later cells refit this same object.
estimator = grid.best_estimator_
print(estimator)

In [None]:
# Hold out 20% of the training data as a validation ("cv") set; the seed is fixed for reproducibility.
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.2, random_state=20210820)

In [None]:
# Baseline regressor for reference.
dummy = DummyRegressor()
dummy.fit(X_tr, y_tr)
# Refits the selected pipeline on the reduced training split.
estimator.fit(X_tr, y_tr)

# Report train and validation metrics for each model.
for mdl, label in zip((estimator, dummy), ('KRR', 'Dummy')):
    print()
    print(label, '------------------------------------')

    # The metric loop must sit inside the model loop: when it is left at the
    # outer level it only ever evaluates the last model (the dummy).
    for s, X_ref, y_ref in zip(('train', 'cv'), (X_tr, X_cv), (y_tr, y_cv)):
        pred = mdl.predict(X_ref)
        mae = mean_absolute_error(y_ref, pred)
        mse = mean_squared_error(y_ref, pred)
        r2 = r2_score(y_ref, pred)

        print(F'{s} : mae={mae:.3f}, mse={mse:.3f}, r2={r2:.3f}')

In [None]:
# Training-set sizes at which the learning curve is sampled.
db_sizes = [10, 100, 1000, 2000, 3000]
# One list of MAE values per size, for the fitted subset ('train') and the remainder ('test').
scores = {'train': [[] for _ in db_sizes], 'test': [[] for _ in db_sizes]}

for index, size in enumerate(db_sizes):
    print(size)
    # The repeats must run inside the size loop; otherwise only the last size
    # receives scores and the per-size lists end up with unequal lengths.
    for repeat in range(3):
        print('\t', repeat)
        # Seeded by the repeat index so the curve is reproducible between runs.
        Xa, Xb, ya, yb = train_test_split(X_train, y_train, train_size=size, random_state=repeat)
        estimator.fit(Xa, ya)
        tr_pred = estimator.predict(Xa)
        cv_pred = estimator.predict(Xb)

        scores['train'][index].append(mean_absolute_error(ya, tr_pred))
        scores['test'][index].append(mean_absolute_error(yb, cv_pred))
     

In [None]:
# Rows follow db_sizes, columns hold the repeats.
train_scores = np.array(scores['train'])
test_scores = np.array(scores['test'])

plt.figure(figsize=(10, 5))
# Draw the mean MAE over repeats for each split: markers first, then a dashed connecting line.
for curve_scores, curve_label, curve_colour in (
    (train_scores, 'train', 'b'),
    (test_scores, 'test', 'r'),
):
    curve_mean = curve_scores.mean(1)
    plt.scatter(db_sizes, curve_mean, marker='o', label=curve_label, s=80, c=curve_colour)
    plt.plot(db_sizes, curve_mean, curve_colour + '--')

plt.legend()
plt.xlabel('Training Set Size (# entries)', size=15)
plt.ylabel('MAE (eV)', size=15)
plt.ylim(0, 0.65)
plt.grid()
plt.savefig('MBTR_krr_overfit.png', bbox_inches=None)
plt.show()

In [None]:
# data for final parity plot
estimator.fit(X_train, y_train)
y_pred_train = estimator.predict(X_train)
y_pred_test = estimator.predict(X_test)

In [None]:
# Final error metrics for the model fitted on the full training set.
for split_name, y_true, y_hat in (
    ('Train', y_train, y_pred_train),
    ('Test', y_test, y_pred_test),
):
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    # The name is left-padded to 5 characters so both rows line up.
    print(F'{split_name:<5} : mae={mae:.3f}, mse={mse:.3f}, r2={r2:.3f}')

In [None]:
# Parity plots (predicted vs reference) for the train and test sets, side by side.
plt.figure(figsize=(15, 5))

arrs = [[y_pred_train, y_train], [y_pred_test, y_test]]
labels = ['Train', 'Test']
colors = ['b', 'r']
# r^2 is computed from the plotted data so the legend cannot drift from the
# model actually fitted (previously these values were hard-coded).
r2_vals = [r2_score(ref, pred) for pred, ref in arrs]

for count, (pred, ref) in enumerate(arrs):
    plt.subplot(1, 2, count+1)
    plt.title(F'{labels[count]}', size=17)
    plt.plot(pred, ref, 'o', alpha=0.5, c=colors[count])
    # Diagonal marking perfect agreement; its legend entry carries the r^2 value.
    plt.plot([-12.5, -5.5], [-12.5, -5.5], 'k-', label=F'$r^2$ = {r2_vals[count]:.3f}')
    plt.xlabel(F'Predicted (eV)', size=15)
    plt.ylabel('Reference (eV)', size=15)
    plt.xlim(-13, -5)
    plt.ylim(-13, -5)
    plt.grid(linestyle='--')
    plt.legend(loc='upper left')
plt.savefig('parity_plots.png', bbox_inches=None)
plt.show()