In [None]:
# Start from a clean slate: delete any existing local `gpHSP` directory so the
# clone below does not fail on a non-empty target when this cell is re-run.
! rm -rf gpHSP
# Fetch the gpHSP repository into ./gpHSP (later cells read gpHSP/data and gpHSP/models).
!git clone https://github.com/aspuru-guzik-group/gpHSP

Cloning into 'gpHSP'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 134 (delta 56), reused 69 (delta 29), pack-reused 38[K
Receiving objects: 100% (134/134), 32.98 MiB | 32.47 MiB/s, done.
Resolving deltas: 100% (70/70), done.


# Assuming a Colab environment

In [None]:
import sys
if 'google.colab' in sys.modules:
    print('In colab!')
    sys.path.insert(0,'gpHSP')
    !pip install rdkit-pypi mordred ml_collections ngboost gpflow

In colab!


In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import scipy.stats as stats

import ngboost
import tensorflow as tf
import gpflow as gpf
import gphsp

# Notebook-wide setup provided by gphsp (its effects are not visible in this notebook).
gphsp.notebook_context()

# Print a "name = version" line for each library listed here.
reported_modules = [ngboost, tf, gpf]
gphsp.print_modules(reported_modules)

ngboost    = 0.3.12
tensorflow = 2.7.0
gpflow     = 2.3.1


## Utilities

In [None]:
def at_data_dir(x: str) -> str:
    """Return the path of file ``x`` inside the cloned repo's data folder.

    Args:
        x: File name (or relative path) to place under ``gpHSP/data``.

    Returns:
        ``os.path.join('gpHSP/data', x)``.
    """
    return os.path.join('gpHSP/data', x)


def at_model_dir(x: str) -> str:
    """Return the path of file ``x`` inside the cloned repo's models folder.

    Args:
        x: File name (or relative path) to place under ``gpHSP/models``.

    Returns:
        ``os.path.join('gpHSP/models', x)``.
    """
    return os.path.join('gpHSP/models', x)

# Load data and get all smiles

In [None]:
# Read the sample molecules and pass every SMILES string through
# gphsp.get_isomeric_smiles, replacing the 'smiles' column with the result.
df = pd.read_csv(at_data_dir('sample_molecules.csv')).assign(
    smiles=lambda frame: frame['smiles'].apply(gphsp.get_isomeric_smiles)
)
# Quick look at the frame (columns, shape and first rows in the saved output).
gphsp.peek_df(df)

Index(['names', 'smiles'], dtype='object')
(11, 2)


Unnamed: 0,names,smiles
0,J52-Cl,CCCCCCCC[C@@H](CCCCCC)Cn1nc2c(-c3ccc(C)s3)c(F)...


# Load features

In [None]:
# Build the SMILES -> feature lookup from the feature archive shipped with the repo.
# `features` is called with an array of SMILES strings further down.
mordred_npz_path = at_data_dir('mordred_features.npz')
features = gphsp.SmilesMap(mordred_npz_path)

# Load the pretrained gpHSP models (molecule-level and polymer-level)

In [None]:
# One model is loaded per target name in gphsp.Y_COLS.
# NOTE(review): the files are .pkl -- presumably unpickled by gphsp.load_model,
# so only load model files from a trusted source.
names = gphsp.Y_COLS

# Molecule-level models, stored as "<name>_HSP_ngboost_mol.pkl".
model_suffix = 'HSP_ngboost_mol.pkl'
models = dict(
    (name, gphsp.load_model(at_model_dir(f"{name}_{model_suffix}")))
    for name in names
)

# Polymer-level models, stored as "<name>_ngboost_polymer.pkl".
model_suffix = 'ngboost_polymer.pkl'
poly_models = dict(
    (name, gphsp.load_model(at_model_dir(f"{name}_{model_suffix}")))
    for name in names
)

In [None]:
# SMILES column as a NumPy array of str, used as the lookup keys.
smi = df['smiles'].to_numpy(dtype=str)

# Inputs for the molecule-level models: features looked up per SMILES.
mol_x = features(smi)

# Inputs for the polymer-level models: derived from the molecule-level models'
# predictions on `mol_x`.
poly_x = gphsp.predictions_as_features(mol_x, models)

In [None]:
# Store the mean of each molecule-level predictive distribution as "<name>_mol".
for name in models:
    df[f'{name}_mol'] = models[name].pred_dist(mol_x).mean()

# Same for the polymer-level models, stored as "<name>_poly".
for name in poly_models:
    df[f'{name}_poly'] = poly_models[name].pred_dist(poly_x).mean()

# Display the frame with the new prediction columns.
df

Unnamed: 0,names,smiles,δd_mol,δp_mol,δh_mol,δd_poly,δp_poly,δh_poly
0,J52-Cl,CCCCCCCC[C@@H](CCCCCC)Cn1nc2c(-c3ccc(C)s3)c(F)...,21.534,6.556,6.884,18.754,7.06,3.679
1,BTA3,CCCCCCCCn1nc2c(/C=c3\sc(=C(C#N)C#N)n(CC)c3=O)c...,21.855,5.656,6.831,19.066,3.767,3.677
2,BTA1,CCCCCCCCn1nc2c(/C=C3\SC(=S)N(CC)C3=O)ccc(-c3cc...,21.302,5.108,6.222,18.893,4.576,4.074
3,Trichloromethane,ClC(Cl)Cl,18.014,3.659,5.687,19.041,4.254,4.536
4,THF (Tetrahydrofuran),C1CCOC1,16.989,5.062,7.558,17.969,5.088,5.292
5,2-MeTHF (2-Methyloxolane),C[C@H]1CCCO1,17.137,6.877,5.181,18.983,9.204,13.027
6,Methylbenzene,Cc1ccccc1,18.167,1.673,2.249,18.69,4.445,4.478
7,o-xy (Ortho-xylene),Cc1ccccc1C,18.042,1.756,2.899,18.851,4.445,4.478
8,A,CCCC[C@H](CC)Cc1sc(-c2c3cc(-c4ccc(-c5sc(-c6ccc...,21.728,5.535,5.819,19.024,4.518,4.074
9,B,CCCCCCCCCCCC[C@H](CCCCCCCCCC)Cn1c2c3sc(/C=C4\C...,20.448,3.951,4.178,18.987,4.243,2.452


In [None]:
# Write the frame, including the prediction columns, to the working directory
# without the row index.
df.to_csv(path_or_buf='hsp_calculations.csv', index=False)