In [4]:
# Start from a clean checkout: delete any existing `gpHSP` directory, then
# clone the repository into the current working directory.
! rm -rf gpHSP
!git clone https://github.com/aspuru-guzik-group/gpHSP

Cloning into 'gpHSP'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 57 (delta 7), reused 17 (delta 5), pack-reused 38[K
Unpacking objects: 100% (57/57), done.


# Assuming a Colab environment

In [None]:
import sys
if 'google.colab' in sys.modules:
    print('In colab!')
    !pip install rdkit-pypi mordred ml_collections ngboost gpflow
    sys.path.insert(0,'gpHSP')

In [20]:
import pandas as pd
import os
import numpy as np
import gphsp

## Utilities

In [9]:
def at_data_dir(x):
    """Return the path of file ``x`` inside the ``gpHSP/data`` directory.

    Args:
        x: File name (or relative sub-path) to append to ``gpHSP/data``.

    Returns:
        The joined path as a string, relative to the current working directory.
    """
    return os.path.join('gpHSP/data', x)

# Load data and get all SMILES

In [14]:
# Read the solvent table and pass every entry of its `smiles` column through
# gphsp.get_isomeric_smiles, replacing the column with the converted strings.
df = (
    pd.read_csv(at_data_dir('Solvents_exp.csv'))
    .assign(smiles=lambda frame: frame['smiles'].apply(gphsp.get_isomeric_smiles))
)

# Names of the three target columns present in this table.
y_cols = ['δd', 'δp', 'δh']

# Start the running collection of SMILES strings with the solvents.
all_smi = df['smiles'].tolist()

gphsp.peek_df(df)

Index(['key', 'Type', 'δd', 'δp', 'δh', 'smiles', 'ID_type', 'ID', 'Ref',
       'organic', 'n_electrons', 'n_atoms', 'charge', 'MolWt', 'label',
       'finished', 'job_name', 'homo', 'lumo', 'gap', 'd-moments', 'dipole',
       'polar', 'run_time', 'n_cores', 'compute_time', 'Area', 'Hba', 'Hbd',
       'Volume', 'sigma_mom_0', 'sigma_mom_1', 'sigma_mom_2', 'sigma_mom_3',
       'sigma_mom_4', 'sigma_mom_5', 'sigma_norm', 'sigma_profile', 'drug'],
      dtype='object')
(193, 39)


Unnamed: 0,key,Type,δd,δp,δh,smiles,ID_type,ID,Ref,organic,n_electrons,n_atoms,charge,MolWt,label,finished,job_name,homo,lumo,gap,d-moments,dipole,polar,run_time,n_cores,compute_time,Area,Hba,Hbd,Volume,sigma_mom_0,sigma_mom_1,sigma_mom_2,sigma_mom_3,sigma_mom_4,sigma_mom_5,sigma_norm,sigma_profile,drug
0,"1,1,1-Trichloroethane",Solvent,16.8,4.3,2.0,CC(Cl)(Cl)Cl,CAS,71-55-6,1,True,32,5,0,133.405,Exp-0,True,SOLSPE_Exp-0,-7.096702,-1.107639,5.989063,[-2.37335554 -0.11768285 0.07442233],2.377437,64.37432,94.214,4,376.856,1.3258,0.0,0.0,0.1262,0,16.6,-3.835673,4.240154,-2.020584,1.539736,41.428082,[ 0. 0. 0. 0. ...,False


In [16]:
# Read the polymer table and pass every entry of its `smiles` column through
# gphsp.get_isomeric_smiles, replacing the column with the converted strings.
poly_df = (
    pd.read_csv(at_data_dir('Polymers_exp.csv'))
    .assign(smiles=lambda frame: frame['smiles'].apply(gphsp.get_isomeric_smiles))
)

# Extend the running collection of SMILES strings with the polymer entries.
all_smi = all_smi + poly_df['smiles'].tolist()

gphsp.peek_df(poly_df)

Index(['label', 'δd', 'δp', 'δh', 'smiles', 'test', 'poly_label', 'n_copies',
       'organic', 'n_electrons', 'n_atoms', 'charge', 'MolWt', 'n_frags',
       'largest', 'finished', 'job_name', 'poly_smiles', 'homo', 'lumo', 'gap',
       'd-moments', 'dipole', 'polar', 'run_time', 'n_cores', 'compute_time',
       'Area', 'Hba', 'Hbd', 'Volume', 'sigma_mom_0', 'sigma_mom_1',
       'sigma_mom_2', 'sigma_mom_3', 'sigma_mom_4', 'sigma_mom_5',
       'sigma_norm', 'sigma_profile'],
      dtype='object')
(31, 39)


Unnamed: 0,label,δd,δp,δh,smiles,test,poly_label,n_copies,organic,n_electrons,n_atoms,charge,MolWt,n_frags,largest,finished,job_name,poly_smiles,homo,lumo,gap,d-moments,dipole,polar,run_time,n_cores,compute_time,Area,Hba,Hbd,Volume,sigma_mom_0,sigma_mom_1,sigma_mom_2,sigma_mom_3,sigma_mom_4,sigma_mom_5,sigma_norm,sigma_profile
0,Polyacrylonitrile-n5,20.0,15.1,7.9,CCC#N,True,Polyacrylonitrile,5,True,110,45,0,275.4,1,True,True,SOLSPE2_Polyacrylonitrile-n5,[C@@H](C#N)(CCC#N)C[C@@H](C#N)C[C@H](C#N)CCC#N,-7.879002,-1.120048,6.758954,[ 0.58302575 -8.49403297 -1.86749721],8.716425,211.62387,468.596,4,1874.384,3.047,1.5386,0.2196,0.3427,0,175.0,26.1,154.0,51.4,165.0,66.770346,[ 0. 0. 0. 0. ...


In [17]:
# Read the sample-molecule table and pass every entry of its `smiles` column
# through gphsp.get_isomeric_smiles, replacing the column with the result.
new_df = (
    pd.read_csv(at_data_dir('sample_molecules.csv'))
    .assign(smiles=lambda frame: frame['smiles'].apply(gphsp.get_isomeric_smiles))
)

# Extend the running collection of SMILES strings with the sample molecules.
all_smi = all_smi + new_df['smiles'].tolist()

gphsp.peek_df(new_df)

Index(['names', 'smiles'], dtype='object')
(11, 2)


Unnamed: 0,names,smiles
0,J52-Cl,CCCCCCCC[C@@H](CCCCCC)Cn1nc2c(-c3ccc(C)s3)c(F)...


Calculate Mordred values and a mask

In [21]:
# Convert every collected SMILES once more and de-duplicate; np.unique returns
# the distinct strings as a sorted array.
all_smi = np.unique(list(map(gphsp.get_isomeric_smiles, all_smi)))

# Compute the Mordred values for the unique SMILES, then a mask from those
# values (the mask's exact meaning is defined by gphsp.calculate_mask).
all_values = gphsp.calculate_mordred(all_smi)
mask = gphsp.calculate_mask(all_values)

# Write the three arrays to a compressed .npz in the data directory and build
# the SmilesMap from that same file.
np.savez_compressed(
    at_data_dir('mordred_features.npz'),
    smiles=all_smi,
    values=all_values,
    mask=mask,
)
features = gphsp.SmilesMap(at_data_dir('mordred_features.npz'))

  1%|          | 2/226 [00:02<05:54,  1.58s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 34%|███▎      | 76/226 [00:32<07:11,  2.88s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 226/226 [01:32<00:00,  2.43it/s]
