# Atom Count Descriptor

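In this notebook we evaluate the atom count descriptor (the number of atoms in each molecule) and test the overall predictions of the G0W0 HOMO energy made from this feature descriptor with kernel ridge regression. We then tune the hyperparameters with a grid search to find the optimal settings.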

In [None]:
! pip install dscribe  # Downloads needed modules
! pip install ase

# -------------------------------------------------------

import io
from datetime import datetime
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from ase.io import read
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error, r2_score

def download_published_data(url='https://people.bath.ac.uk/crh53/m1507656.zip', extract_to=''):
    """Download the published data archive and unzip it.

    The archive is fetched over HTTP, held in memory and extracted member by
    member.  The start time, finish time and elapsed time are printed so the
    cost of the download is visible in the notebook output.

    Args:
        url: Location of the zip archive to download.
        extract_to: Directory the archive is extracted into; the default ``''``
            extracts relative to the current working directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
    """
    start = datetime.now()
    print(start)
    # A timeout keeps a dead connection from hanging the notebook forever.
    r = requests.get(url, timeout=120)
    # Fail with a clear HTTP error instead of feeding an error page to ZipFile.
    r.raise_for_status()
    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(io.BytesIO(r.content)) as zipped:
        zipped.extractall(extract_to)
    finish = datetime.now()
    print(finish)
    print(finish - start)

def xyz_str_to_atoms(xyz_str):
    """Convert the text of an xyz file to an ASE atoms object.

    The string is wrapped in an in-memory text buffer (``io.StringIO``) so that
    ``ase.io.read`` can parse it without touching the filesystem.

    Args:
        xyz_str: Full contents of an xyz file as a single string.

    Returns:
        The object produced by ``ase.io.read(..., format="xyz")``.
    """
    # StringIO(initial_value) starts positioned at offset 0, so no explicit
    # write()/seek(0) round trip is needed; `with` releases the buffer afterwards.
    with io.StringIO(xyz_str) as f:
        return read(f, format="xyz")

# -------------------------------------------------------

# --- 1. Fetch the published archive --------------------------------------------
print('Downloading data...')
download_published_data()  # downloads needed data
# % cd m1507656

# Deferred import: `helpers` is only importable after the download step above.
from helpers import get_level  # needs to go here as wont have been downloaded earlier in

# --- 2. Load the table and build one structure object per row -------------------
print('Loading data...')
df = pd.read_json('df_5k.json', orient='split')

print('Generating `ase.Atoms` objects...')
df['atoms'] = df['xyz_pbe_relaxed'].apply(xyz_str_to_atoms)

# --- 3. Pull the frontier-orbital energies and derive the gap -------------------
print('Extracting HOMO, LUMO, BANDGAP from data...')
for level in ('HOMO', 'LUMO'):
    # `level_type=level` binds the current loop value into the lambda.
    df[level] = df.apply(
        lambda row, level_type=level: get_level(row, level_type=level_type, subset='GOWO_at_PBE0_cbs'),
        axis=1,
    )
df['BG'] = df['LUMO'] - df['HOMO']
print('~ 2300 molecules do not have LUMO energy levels for this or any other `GOWO` level of theory.')

# --- 4. Fixed-seed 80/20 train/test split ---------------------------------------
print('Splitting data set...')
train, test = train_test_split(df, test_size=0.2, random_state=20210817)
train_atoms = train['atoms'].to_list()
test_atoms = test['atoms'].to_list()

print('Data Processing Complete')
print('#', '-'*119)

In [None]:
from sklearn.kernel_ridge import KernelRidge

class KernelRidgeRegressor(KernelRidge):
    """Kernel ridge regressor that normalises the target inside fit/predict.

    ``fit`` stores the mean of the training targets in ``self.mu`` and trains
    the underlying ``KernelRidge`` on ``(y - mu) / mu``.  ``predict`` applies
    the inverse transform, so callers always work in the original target units.

    Note:
        The transform divides by the target mean, so it is undefined when the
        training targets average to zero.
    """

    def fit(self, X, y, sample_weight=None):
        """Fit the model on normalised targets.

        Args:
            X: Training features, forwarded unchanged to ``KernelRidge.fit``.
            y: Training targets in their original units.
            sample_weight: Optional per-sample weights, forwarded unchanged.

        Returns:
            self, following the scikit-learn estimator convention.
        """
        y = np.asarray(y)
        # Use the targets passed to this call, not a notebook-level variable,
        # so the estimator also behaves correctly inside cross-validation.
        self.mu = np.mean(y)
        y_tr = (y - self.mu) / self.mu
        super().fit(X, y_tr, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Predict targets for ``X`` in the original (un-normalised) units."""
        pred = super().predict(X)
        # Inverse of the transform applied in fit().
        pred_tr = (pred * self.mu) + self.mu
        return pred_tr

In [None]:
# Quick look at the raw frame.  With IPython's default settings only the last
# expression of a cell is rendered, so these two calls produce no visible output.
df.columns.tolist()
df.head(n=10)

# Narrow view holding only the columns selected here; rendered as the cell output.
subset_columns = ['refcode_csd', 'canonical_smiles', 'number_of_atoms', 'total_energy_pbe', 'HOMO']
df_subset = df[subset_columns]
df_subset

In [None]:
# Scatter of molecule size against HOMO energy, drawn through an explicit Axes.
homo_fig, homo_ax = plt.subplots()

# Green guide line running from (-12, 0) to (-4, 100).
guide_x = (-12, -4)
guide_y = (0, 100)
homo_ax.plot(guide_x, guide_y, c='green')

homo_ax.scatter(df_subset["HOMO"], df_subset["number_of_atoms"], s=5)
homo_ax.set_xlabel("HOMO/ eV")
homo_ax.set_ylabel("Number of atoms")
homo_ax.set_title(" HOMO energy distribution in 5k set", fontsize=15)
plt.show()

In [None]:
dataframe = df
df_name = '5k'

count_abs_occ = dict()     # element symbol -> total number of atoms of that element
count_molecules = dict()   # element symbol -> number of molecules containing that element
occ_per_molecule = dict()  # not filled in this cell
na_array = []              # per-molecule atom count, kept as the raw header string
num_nonh_array = []        # per-molecule number of non-hydrogen atoms

for _row_idx, row in dataframe.iterrows():
    # xyz layout used here: line 0 = atom count, line 1 is skipped,
    # then one "symbol x y z" line per atom.
    xyz = row.xyz_pbe_relaxed.split("\n")
    na = xyz[0]
    na_array.append(na)

    types = []
    # A dedicated index name keeps this loop from overwriting the outer loop variable.
    for atom_idx in range(int(na)):
        # Underscored names avoid clobbering notebook-level x / y / z;
        # the 4-way unpack still rejects malformed atom lines.
        atom, _x, _y, _z = xyz[atom_idx + 2].split()
        types.append(atom)

    # count total number of times that a certain element occurs in dataset
    for element in types:
        count_abs_occ[element] = count_abs_occ.get(element, 0) + 1

    # count number of molecules that contain a certain element
    for element in set(types):
        count_molecules[element] = count_molecules.get(element, 0) + 1

    # count number of "heavy" (non-H) atoms in a molecule
    num_nonh = sum(symbol != 'H' for symbol in types)
    num_nonh_array.append(num_nonh)


In [None]:

## compute size distribution of molecules

# The atom counts were collected as strings; convert them to ints.
# Re-running this cell is safe: `na_array` is rebound to an int array below and
# converting ints to ints is a no-op.
a = np.array(na_array)
na_list = [int(n) for n in a.tolist()]
max_na = max(na_list)

na_array = np.asarray(na_list)
y = np.bincount(na_array)   # y[k] = number of molecules with exactly k atoms
ii = np.nonzero(y)[0]       # molecule sizes that actually occur
# Row 0: molecule sizes, row 1: how many molecules have that size.
number_of_atoms_index = np.vstack((ii, y[ii]))
num_atoms = number_of_atoms_index[0]
count = number_of_atoms_index[1]

## bins for histogram
# One unit-wide bin per size 1..max_na.  The edges must run to max_na + 1:
# the right-most histogram bin is closed on both sides, so stopping at max_na
# would merge molecules of size max_na - 1 and max_na into a single bar.
bins = np.arange(1, max_na + 2)


In [None]:
# Summary statistics of the molecular size distribution, computed up front.
mean_size = na_array.mean()
mean_heavy_atoms = np.asarray(num_nonh_array).mean()
size_std = na_array.std()
size_var = na_array.var()

print('The largest molecule of the 5k set has %i atoms.' % max_na)
print("On average, the molecular size is %f atoms, with an average of %f heavy (non-H) atoms."
      % (mean_size, mean_heavy_atoms))
print("The molecular size distribution has a standard deviation of %f and a variance of %f."
      % (size_std, size_var))

In [None]:
# Newer matplotlib releases ship the seaborn styles under a 'seaborn-v0_8-' prefix;
# pick whichever name this installation provides.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
# Apply the style BEFORE creating the figure so the first run of this cell looks
# the same as later runs.  The style stays active for subsequent figures.
plt.style.use(whitegrid_style)

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.hist(na_list, bins=bins, align='left', color='limegreen', edgecolor='red')
# Dashed vertical line marks the mean molecule size.
ax.axvline(x=na_array.mean(), linestyle='--', color='k')
ax.grid(linewidth=1)
# Single source of truth for the tick-label size on both axes.
ax.tick_params(labelsize=20)
ax.set_xlabel('Number of atoms', fontsize=25)
ax.set_ylabel('Number of molecules', fontsize=25)
ax.set_title("Molecular size distribution in 5k set", fontsize=25)
#ax.set_xlim(0,120)
#fig.savefig('size_distribution_%s.png' %df_name, dpi=200, bbox_inches="tight")
plt.show()

In [None]:
# Per-molecule arrays of atomic numbers, read from each structure's `numbers` attribute.
atomic_numbers = df['atoms'].apply(lambda atoms: atoms.numbers)

# Every distinct atomic number in the data set, and the size of the largest molecule.
unique_atomic_numbers = {z for numbers in atomic_numbers for z in numbers}
max_num_atoms = atomic_numbers.apply(len).max()

print(unique_atomic_numbers, max_num_atoms, sep='\n')

In [None]:
# Feature matrices with a single column (the atom count); reshaped to (n_samples, 1).
train_counts = train['number_of_atoms'].values
X_train = train_counts.reshape(-1, 1)
X_train  # not rendered under IPython's default settings (only the last expression is)

test_counts = test['number_of_atoms'].values
X_test = test_counts.reshape(-1, 1)
X_test
     

In [None]:
y_train = train['HOMO'].values  # extract target value from dataframe
# Hold out 20 % of the training data as a validation ("cv") split; the real
# test set (X_test) is not touched in this cell.
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

model = KernelRidge(kernel='laplacian', alpha=1.0, gamma=0.01)  # kernel: 'rbf' (Gaussian) or 'laplacian'
model.fit(X_tr, y_tr)

y_pred_tr = model.predict(X_tr)
y_pred_cv = model.predict(X_cv)

# Parity plot.  The title is built from the fitted model so it cannot drift
# out of sync with the hyperparameters actually used.
plt.title(f"G0W0 HOMO, KRR-{model.kernel}, alpha={model.alpha}, gamma={model.gamma}")
# Default (fully opaque) markers: matplotlib only accepts alpha within 0-1.
plt.plot(y_pred_tr, y_tr, '.', label='train')
plt.plot(y_pred_cv, y_cv, '.', label='cv')
plt.plot(y_tr, y_tr, 'k--', label='ref')  # y = x reference line
plt.xlabel('Predicted HOMO ')
plt.ylabel('Reference HOMO')
plt.legend()
plt.show()

In [None]:
# Per-split summary: mean squared error and coefficient of determination.
for s, pred, ref in zip(('train', 'cv'), (y_pred_tr, y_pred_cv), (y_tr, y_cv)):
    mse = mean_squared_error(ref, pred)
    r2 = r2_score(ref, pred)
    print(F'{s} : mse={mse:.3f}, r2={r2:.3f}')


# These figures refer to the validation (cv) split carved out of the training
# data, not to the held-out test set.  MSE is in squared target units (eV^2).
print("Mean square error on cv set: %0.3f eV^2" % mean_squared_error(y_cv, y_pred_cv))
print("Mean absolute error on cv set: %0.3f eV" % np.abs(y_pred_cv - y_cv).mean())

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 5 log-spaced values from 1e-2 to 1e2 for each of alpha and gamma.
alpha = np.logspace(-2, 2, 5)
gamma = np.logspace(-2, 2, 5)
kernel = 'rbf'
cv_number = 5
scoring_function = 'neg_mean_absolute_error'

param_grid = [{'kernel': [kernel], 'alpha': alpha, 'gamma': gamma}]
grid_search = GridSearchCV(
    estimator=KernelRidge(),
    param_grid=param_grid,
    cv=cv_number,
    scoring=scoring_function,
    verbose=1000,
)

# Wall-clock timing of the full search.
from datetime import datetime
start = datetime.now()
grid_search.fit(X_train, y_train)
finish = datetime.now()

total_time = finish - start
print("It took how long?", total_time)
    

In [None]:
# Best estimator found by the grid search, and the hyperparameters it carries.
optimum = grid_search.best_estimator_

print(f"Optimal alpha value is: {optimum.alpha}")
print(f"Optimal gamma value is: {optimum.gamma}")

In [None]:
# NOTE(review): grid_search was fitted on all of X_train, and X_cv is a subset of
# X_train.  With GridSearchCV's default refit the best estimator has therefore
# seen the "cv" points, so this plot is not a held-out evaluation - confirm
# whether X_test should be used here instead.
y_pred_tr = optimum.predict(X_tr)
y_pred_cv = optimum.predict(X_cv)

plt.figure(figsize=(10, 5))
# Title is built from the tuned estimator so it reports the hyperparameters
# the grid search actually selected.
plt.title(f"G0W0 HOMO, KRR-{optimum.kernel}, alpha={optimum.alpha}, gamma={optimum.gamma}")
plt.plot(y_pred_tr, y_tr, '.', label='train')
plt.plot(y_pred_cv, y_cv, '.', label='cv')
plt.plot(y_tr, y_tr, 'k--', label='ref')  # y = x reference line
plt.xlabel('Predicted')
plt.ylabel('Reference')
plt.legend()
plt.show()