In [None]:
# Download the Miniconda (py37, 4.8.2) installer, make it executable and run it
# against the /usr/local prefix.
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
import sys
# Add the new site-packages directory to the module search path of this kernel.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

# Third-party packages used by the rest of the notebook.
! conda install -yq -c rdkit rdkit   # need for fingerprints
! conda install -yq -c conda-forge xgboost  # sklearn implementation inefficient
! conda install -yq -c conda-forge tqdm


def download_published_data():
    """Download the published dataset archive and unpack it in the working directory.

    The zip archive is fetched over HTTP, held in memory, and every member is
    extracted relative to the current working directory. The start time, the
    finish time and the elapsed duration are printed.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status. Failing here gives a clear
        message instead of a later ``BadZipFile`` on an HTML error page.
    """
    import io
    from datetime import datetime
    import zipfile

    import requests

    start = datetime.now()
    print(start)
    r = requests.get('https://people.bath.ac.uk/crh53/m1507656.zip')
    r.raise_for_status()  # do not try to unzip an error response
    # The context manager closes the archive once extraction is finished.
    with zipfile.ZipFile(io.BytesIO(r.content)) as zipped:
        zipped.extractall('')
    finish = datetime.now()
    print(finish)
    print(finish - start)

download_published_data()
% cd m1507656/

In [None]:
import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.manifold import TSNE

from rdkit.Chem import MolFromSmiles
from rdkit.Chem import AllChem, DataStructs
from xgboost.sklearn import XGBRegressor

from helpers import get_level
     

In [None]:
# Load the dataset from JSON written in pandas' 'split' orientation, then
# preview the first rows.
df = pd.read_json('df_62k.json', orient='split')
df.head()

In [None]:
# Parse each canonical SMILES string into an RDKit molecule and drop the rows
# for which no molecule was produced, reporting how many were removed.
prior_len = len(df)

df['rdkit'] = df['canonical_smiles'].apply(MolFromSmiles)
# .copy() detaches the filtered frame from the original, so columns added to it
# later (e.g. the fingerprints) do not raise SettingWithCopyWarning.
df = df[df['rdkit'].notna()].copy()  # exclude invalid smiles codes

post_len = len(df)

print(F'{post_len} rdkit molecules generated')
print(F'Rows dropped = {prior_len - post_len}')

In [None]:
def mol_to_ecfp(mol, r=3, b=1024):
    """Return the Morgan (ECFP) bit fingerprint of an RDKit molecule as a numpy array.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to fingerprint.
    r : int
        Morgan radius. By convention a radius R is reported as ECFP<2R>, which
        is why the radius (not the diameter) is passed.
    b : int
        Number of bits to use.

    Returns
    -------
    numpy.ndarray
        int8 array filled from the fingerprint bit vector.
    """
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, r, nBits=b)
    # Start from an empty buffer; ConvertToNumpyArray fills it from the bit vector.
    bits = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vector, bits)
    return bits

# One fingerprint per molecule, using the defaults of mol_to_ecfp (r=3, b=1024).
df['fingerprint'] = df['rdkit'].apply(mol_to_ecfp)

In [None]:
# Feature matrix: the per-molecule fingerprints stacked row-wise.
X = np.vstack(df['fingerprint'].values)

# Target vector: get_level evaluated on every row for the HOMO level of the
# PBE+vdW_vacuum subset (keyword arguments are forwarded by DataFrame.apply).
y = df.apply(get_level, axis=1, level_type='HOMO', subset='PBE+vdW_vacuum').values

# Count of NaN targets.
print(np.isnan(y).sum())

In [None]:
# Hold out 10% of the data as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=20210817)

In [None]:
# Overview of the training data: target distribution (left panel) and number of
# set fingerprint bits per molecule (right panel).
plt.figure(figsize=(20, 5))

plt.subplot(1, 2, 1)
# density=True normalises the bars so that the 'PDF' axis label is accurate.
plt.hist(y_train, bins=50, color='r', edgecolor='k', density=True)
plt.title('Distribution of HOMO energy calculated using PBE+vdW_vacuum')
plt.xlabel('Energy (eV)')
plt.ylabel('PDF')

plt.subplot(1, 2, 2)
plt.hist(X_train.sum(1), bins=50, color='b', edgecolor='k', density=True)
plt.title('Histogram of `ON` bits')
plt.xlabel('# ON bits in molecule')
plt.ylabel('PDF')

plt.show()

In [None]:
# The pairwise similarity matrix grows quadratically, so work on a fixed-size
# random subset of the training set. The generator is seeded so that the subset
# (and every figure derived from it) is the same on each run.
sample_rng = np.random.default_rng(20210817)
sub_sample = sample_rng.choice(len(X_train), 5000, replace=False)
X_train_sample, y_train_sample = X_train[sub_sample], y_train[sub_sample]

# Tanimoto similarity = 1 - Jaccard distance. The metric works on boolean
# vectors; casting explicitly avoids scikit-learn's data-conversion warning.
similarities = 1 - pairwise_distances(X_train_sample.astype(bool), metric='jaccard', n_jobs=-1)

In [None]:
# Sample similarities between pairs of *distinct* molecules. Drawing from the
# flattened matrix would also pick diagonal entries (each molecule against
# itself, always 1), which are not inter-species similarities.
pair_rng = np.random.default_rng(20210817)  # seeded for a repeatable figure
n_mols = similarities.shape[0]
first = pair_rng.integers(0, n_mols, size=20000)
second = pair_rng.integers(0, n_mols - 1, size=20000)
second += second >= first  # skip over `first` so that second != first
sample = similarities[first, second]

plt.figure(figsize=(15, 7))
# density=True normalises the bars so that the 'PDF' axis label is accurate.
plt.hist(sample, bins=100, edgecolor="black", density=True)
plt.title('Inter-species Tanimoto Similarity w.r.t. ECFP6_1024')
plt.xlabel('Tanimoto Similarity')
plt.ylabel('PDF')
plt.show()
     

In [None]:

# Embed the sub-sampled fingerprints in 2-D with t-SNE (Jaccard metric) and
# colour each point by its HOMO energy. t-SNE is stochastic, so random_state is
# fixed to get the same embedding on every run.
reducer = TSNE(metric='jaccard', n_components=2, random_state=20210817)  # 2 output columns
embedding = reducer.fit_transform(X_train_sample)

plt.figure(figsize=(15, 7))
plt.title('t-SNE visualisation of ECFP6_1024')
p = plt.scatter(embedding[:, 0], embedding[:, 1], c=y_train_sample, cmap='RdYlGn_r')
cbar = plt.colorbar(p)
cbar.set_label('HOMO : PBE+vdW_vacuum (eV)')
plt.xlabel('X1')
plt.ylabel('X2', rotation=0)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor

# Baseline gradient-boosted regressor: squared-error objective, no other
# hyper-parameters set explicitly.
model = XGBRegressor(objective='reg:squarederror')

In [None]:
# Carve a 20% validation set out of the training data. random_state is fixed,
# as for the train/test split above, so the reported errors are repeatable.
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.2, random_state=20210817)

In [None]:
# Fit the baseline model on the training portion of the split.
model.fit(X_tr, y_tr)

In [None]:
# Mean absolute error on the validation set and, for comparison, on the data
# the model was fitted on.
y_pred_cv = model.predict(X_cv)
xgb_error_cv = mean_absolute_error(y_cv, y_pred_cv)
xgb_error_tr = mean_absolute_error(y_tr, model.predict(X_tr))

In [None]:
# Report training (TR) and validation (CV) error to three decimals.
print(F'TR : Baseline XGBoost MAE = {xgb_error_tr:.3f} eV')
print(F'CV : Baseline XGBoost MAE = {xgb_error_cv:.3f} eV')

In [None]:
# Reference point: a regressor using the 'mean' strategy, scored with the same
# metric and on the same split as the XGBoost model.
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_tr, y_tr)
y_pred_cv_dum = dummy.predict(X_cv)
dummy_error_cv = mean_absolute_error(y_cv, y_pred_cv_dum)
dummy_error_tr = mean_absolute_error(y_tr, dummy.predict(X_tr))

print(F'TR : Mean prediction MAE = {dummy_error_tr:.3f} eV')
print(F'CV : Mean prediction MAE = {dummy_error_cv:.3f} eV')

In [None]:
# Compare the distribution of true validation targets with the XGBoost
# predictions; the vertical line marks the constant dummy prediction.
plt.figure(figsize=(15,7))
plt.title('Distributions of base model predictions')
# density=True puts both histograms on the same scale despite different bin
# widths; alpha keeps the first histogram visible underneath the second.
plt.hist(y_cv, bins=50, color='g', edgecolor='k', label='y', density=True, alpha=0.6)
plt.hist(y_pred_cv, bins=50, color='b', edgecolor='k', label='XGB', density=True, alpha=0.6)
plt.axvline(y_pred_cv_dum[0], color='r', label='μpred')

# The energies are on the x-axis and the density on the y-axis.
plt.xlabel('HOMO (eV)')
plt.ylabel('PDF')

plt.legend()
plt.grid(linestyle='--', alpha=0.6, color='k')

plt.show()

In [None]:
# Learning curve: for each training-set size, fit a fresh model on a random
# subset of X_train (4 repeats per size) and score it on all remaining rows.
# Note that the 'test' key holds these validation scores, not test-set scores.
data_scores = {'train': [], 'test': []}

data_sizes = [10, 100, 1000, 10000, 20000, 40000]
potential_indices = np.arange(len(X_train))
curve_rng = np.random.default_rng(20210817)  # seeded so the curve is reproducible

for d in tqdm(data_sizes):
    for _ in range(4):

        tr_sample = curve_rng.choice(potential_indices, size=d, replace=False)
        # potential_indices is 0..n-1, so deleting the sampled positions leaves
        # exactly the rows that were not used for training.
        cv_sample = np.delete(potential_indices, tr_sample)

        X_tr, y_tr = X_train[tr_sample], y_train[tr_sample]
        X_cv, y_cv = X_train[cv_sample], y_train[cv_sample]

        model = XGBRegressor(objective='reg:squarederror')

        model.fit(X_tr, y_tr)
        tr_score = mean_absolute_error(y_tr, model.predict(X_tr))
        cv_score = mean_absolute_error(y_cv, model.predict(X_cv))

        data_scores['train'].append(tr_score)
        data_scores['test'].append(cv_score)

In [None]:
# x-values for the learning-curve plot: one entry per recorded score. The scores
# were appended 4 at a time in the order of data_sizes (already ascending), so
# sorting the repeated list lines the sizes up with the scores.
repeated_sizes = sorted(data_sizes * 4)  # shortcut: sort the repeated list rather than recording the size inside the loop
repeated_sizes_log = np.log10(repeated_sizes)  # the same sizes on a log10 scale

In [None]:
# Learning curves on a linear (left) and a log10 (right) training-size axis.
plt.figure(figsize=(25, 7))

# (x-values, scale tag used in the title, x-axis label) for each panel.
panels = [
    (repeated_sizes, 'Standard', 'Training Set Size'),
    (repeated_sizes_log, 'log10', 'Training Set Size (log10)'),
]

for position, (sizes, scale, x_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(F'TR / CV curve for XGBRegressor ({scale})')
    plt.plot(sizes, data_scores['train'], '-o', label='train')
    plt.plot(sizes, data_scores['test'], '-o', label='cv')
    plt.xlabel(x_label)
    plt.ylabel('MAE (eV)')
    plt.grid(alpha=0.6, color='k', linestyle='--')
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Sweep max_depth and record training / validation MAE for each value.
scores = {'train': [], 'test': []}

depths = list(range(1, 6))

# Use one fixed, seeded split for every depth. Drawing a new random split per
# depth would mix split-to-split noise into the comparison between depths.
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.2, random_state=20210817)

for d in tqdm(depths):
    model = XGBRegressor(objective='reg:squarederror', max_depth=d)
    model.fit(X_tr, y_tr)
    tr_score = mean_absolute_error(y_tr, model.predict(X_tr))
    cv_score = mean_absolute_error(y_cv, model.predict(X_cv))

    scores['train'].append(tr_score)
    scores['test'].append(cv_score)

In [None]:
# MAE against max_depth for both recorded score series, drawn on a single Axes.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Train / Test Curves for XGBRegressor')
for split in ('train', 'test'):
    # The legend label is the same string as the key in `scores`.
    ax.plot(depths, scores[split], '-o', label=split)
ax.set_xlabel('Max Depth')
ax.set_ylabel('MAE (eV)')
ax.grid(alpha=0.6, color='k', linestyle='--')
ax.legend()
ax.set_ylim(0.2, 0.4)
plt.show()
     


Conclusions

Generated ECFP6_1024 fingerprints for 61,000 molecules
Used these to try and predict the HOMO energy levels for these molecules using the PBE+vdW_vacuum level of theory
While initial prediction results were promising (i.e. some relationship was observed which allowed a better-than-"crude" approach), overall the XGBoost regressor was found to suffer from bias.
Solving this bias could likely be achieved by using more electrostatics-based descriptors.