In [1]:
# Start from a clean checkout: delete any previous copy of the repository so
# that re-running this cell does not fail on an already existing directory.
! rm -rf gpHSP
# Fetch the gpHSP repository into ./gpHSP; later cells read files from
# gpHSP/data and gpHSP/models.
!git clone https://github.com/aspuru-guzik-group/gpHSP

Cloning into 'gpHSP'...
remote: Enumerating objects: 140, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 140 (delta 60), reused 72 (delta 30), pack-reused 38[K
Receiving objects: 100% (140/140), 32.98 MiB | 34.25 MiB/s, done.
Resolving deltas: 100% (74/74), done.


# Assuming a Colab environment

In [2]:
import sys
if 'google.colab' in sys.modules:
    print('In colab!')
    sys.path.insert(0,'gpHSP')
    !pip install rdkit-pypi mordred ml_collections ngboost gpflow

In colab!


In [3]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import scipy.stats as stats

import ngboost
import tensorflow as tf
import gpflow as gpf
import gphsp

# Apply the project's notebook setup (see gphsp.notebook_context for details).
gphsp.notebook_context()

# Report the versions of the main modelling libraries for reproducibility.
gphsp.print_modules(
    [ngboost, tf, gpf]
)

ngboost    = 0.3.12
tensorflow = 2.8.0
gpflow     = 2.4.0


## Utilities

In [4]:
def at_data_dir(x):
    """Return the path of file ``x`` inside the cloned repo's data directory.

    Args:
        x: File name (or relative path) below ``gpHSP/data``.

    Returns:
        ``'gpHSP/data'`` joined with ``x`` via ``os.path.join``.
    """
    return os.path.join('gpHSP/data', x)


def at_model_dir(x):
    """Return the path of file ``x`` inside the cloned repo's models directory.

    Args:
        x: File name (or relative path) below ``gpHSP/models``.

    Returns:
        ``'gpHSP/models'`` joined with ``x`` via ``os.path.join``.
    """
    return os.path.join('gpHSP/models', x)

# Load data and get all smiles

In [5]:
# Read the molecule table and pass every SMILES string through
# gphsp.get_isomeric_smiles (presumably so they share one representation with
# the keys of the feature cache -- verify against gphsp).
df = (
    pd.read_csv('LMU_Molecules.csv')
    .assign(smiles=lambda frame: frame['smiles'].apply(gphsp.get_isomeric_smiles))
)
# Quick look at the columns, shape and first rows.
gphsp.peek_df(df)

Index(['names', 'smiles'], dtype='object')
(8, 2)


Unnamed: 0,names,smiles
0,PCBM,COC(=O)CCC[C@]1(c2ccccc2)[C@]23c4c5c6c7c8c9c5c...


Create proxy dataset

In [26]:
# Single-row stand-in dataset: one polymer (PTB7_Th) given by name and SMILES.
# NOTE: this replaces the `df` loaded from LMU_Molecules.csv above.
df = pd.DataFrame(
    {
        'names': ['PTB7_Th'],
        'smiles': ['C(C)CC[C@H](Cc1sc(c2c3c(sc(c4sc(C)c5sc(c(c45)F)C(=O)OC[C@@H](CCCC)CC)c3)c(c3ccc(C[C@@H](CCCC)CC)s3)c3cc(sc23)C)cc1)CC'],
    }
).assign(smiles=lambda frame: frame['smiles'].apply(gphsp.get_isomeric_smiles))
# Quick look at the columns, shape and first rows.
gphsp.peek_df(df)

Index(['names', 'smiles'], dtype='object')
(1, 2)


Unnamed: 0,names,smiles
0,PTB7_Th,CCCC[C@@H](CC)COC(=O)c1sc2c(C)sc(-c3cc4c(-c5cc...


Load features

In [27]:
# SMILES -> feature lookup backed by the descriptor file shipped in the
# repository's data directory.
features = gphsp.SmilesMap(
    at_data_dir('mordred_features.npz'),
)

In [28]:
# Compute Mordred descriptors only for the SMILES that are not yet present in
# the `features` lookup, then add them to it.
smi = df['smiles'].to_numpy(str)
# Explicit bool dtype keeps the mask boolean even when `smi` is empty.
needs_update = np.array([s not in features.index for s in smi], dtype=bool)
print(f'Found {needs_update.sum()} smiles not found in SmilesMap')
if needs_update.any():
    new_smi = smi[needs_update]
    new_values = gphsp.calculate_mordred(new_smi)
    features.update(new_smi, new_values)

# Validate on every run, not only after an update: the downstream models
# consume these values, so entries that were already stored must be NaN-free
# as well.
values = features(smi)
assert np.isnan(values).sum()==0, 'Found nan, recalcualte mask and model (colab 0 & 2)'

Found 1 smiles not found in SmilesMap


100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


# Apply trained gpHSP models to polymers

In [29]:
# One saved model per target column listed in gphsp.Y_COLS.
names = gphsp.Y_COLS

# Molecule-level models, stored as "<target>_HSP_ngboost_mol.pkl".
model_suffix = 'HSP_ngboost_mol.pkl'
models = dict(
    (name, gphsp.load_model(at_model_dir(f"{name}_{model_suffix}")))
    for name in names
)

# Polymer-level models, stored as "<target>_ngboost_polymer.pkl".
model_suffix = 'ngboost_polymer.pkl'
poly_models = dict(
    (name, gphsp.load_model(at_model_dir(f"{name}_{model_suffix}")))
    for name in names
)

In [30]:
# SMILES of the current dataset as a NumPy string array.
smi = df['smiles'].to_numpy(dtype=str)

# Inputs for the molecule-level models: the stored features of each SMILES.
mol_x = features(smi)

# Inputs for the polymer-level models, derived from the molecule-level models
# via gphsp.predictions_as_features.
poly_x = gphsp.predictions_as_features(mol_x, models)

In [31]:
# Add one prediction column per target and model family: "<target>_mol" from
# the molecule-level models, followed by "<target>_poly" from the polymer-level
# models. Each column holds the mean of the predicted distribution.
prediction_sets = (
    ('mol', models, mol_x),
    ('poly', poly_models, poly_x),
)
for suffix, fitted, inputs in prediction_sets:
    for name, model in fitted.items():
        df[f'{name}_{suffix}'] = model.pred_dist(inputs).mean()
df

Unnamed: 0,names,smiles,δd_mol,δp_mol,δh_mol,δd_poly,δp_poly,δh_poly
0,PTB7_Th,CCCC[C@@H](CC)COC(=O)c1sc2c(C)sc(-c3cc4c(-c5cc...,21.528,6.067,5.677,18.635,2.367,3.719


In [None]:
# Persist the table (names, SMILES and predicted columns) without the row index.
df.to_csv(
    path_or_buf='hsp_calculations.csv',
    index=False,
)