In [1]:
from rdkit.Chem import MolFromSmiles, Descriptors, MolFromSmarts


def create_esol_descriptors(smiles):
    """Compute ESOL-style descriptors for a single molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    tuple
        ``(mw, logp, rotb, arom_proportion, hbd, hba,
        non_carbon_proportion, psa, fsp3)``, in that order:

        * ``mw``    -- ``Descriptors.MolWt``
        * ``logp``  -- ``Descriptors.MolLogP``
        * ``rotb``  -- ``Descriptors.NumRotatableBonds``
        * ``arom_proportion`` -- atoms matching SMARTS ``a`` divided by
          the heavy-atom count
        * ``hbd`` / ``hba`` -- ``Descriptors.NumHDonors`` /
          ``Descriptors.NumHAcceptors``
        * ``non_carbon_proportion`` -- atoms matching SMARTS ``[!#6]``
          divided by the heavy-atom count
        * ``psa``   -- ``Descriptors.TPSA`` with ``includeSandP=True``
        * ``fsp3``  -- ``Descriptors.FractionCSP3``

    Raises
    ------
    ValueError
        If ``MolFromSmiles`` cannot build a molecule from ``smiles``
        (it returns ``None`` in that case).
    """
    mol = MolFromSmiles(smiles)
    if mol is None:
        # Fail early with the offending input instead of an opaque error
        # from the first descriptor call on None.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))

    # Shared denominator of both proportions; computed once.
    heavy_atom_count = Descriptors.HeavyAtomCount(mol)

    def _heavy_atom_fraction(smarts):
        """Matches of a one-atom SMARTS pattern per heavy atom (0.0 if none)."""
        if heavy_atom_count == 0:
            # No heavy atoms (e.g. '[H][H]'): the ratio is undefined, so
            # report 0.0 rather than raising ZeroDivisionError.
            return 0.0
        return len(mol.GetSubstructMatches(MolFromSmarts(smarts))) / heavy_atom_count

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    rotb = Descriptors.NumRotatableBonds(mol)
    arom_proportion = _heavy_atom_fraction("a")

    # Further descriptors from Delaney's publication.
    # Alternative donor/acceptor counts, kept for reference:
    # hbd = Descriptors.NHOHCount(mol)
    # hba = Descriptors.NOCount(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    non_carbon_proportion = _heavy_atom_fraction("[!#6]")
    psa = Descriptors.TPSA(mol, includeSandP=True)

    # One extra descriptor on top of the above.
    fsp3 = Descriptors.FractionCSP3(mol)

    return mw, logp, rotb, arom_proportion, hbd, hba, non_carbon_proportion, psa, fsp3

# Sanity check on a single molecule. The trailing expression is the cell's
# displayed result: a 9-tuple in the order
# (mw, logp, rotb, arom_proportion, hbd, hba, non_carbon_proportion, psa, fsp3).
smiles = 'OCCc1ccn2cnccc12'
create_esol_descriptors(smiles)

(162.192, 0.8691, 2, 0.75, 1, 3, 0.25, 37.53, 0.2222222222222222)

In [2]:
import pandas as pd


# Load the dataset from the working directory. Later cells read its 'SMILES'
# and 'measured log(solubility:mol/L)' columns.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# carries a saved row index. index_col=0 would absorb it, but the join in a later
# cell relies on the default positional index -- confirm before changing.
df = pd.read_csv('delaney.csv')
df.head()

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl


In [3]:
# Column labels, in the same order as the tuple returned by create_esol_descriptors.
col_names = ['mw', 'logp', 'rotb', 'ap', 'hbd', 'hba', 'non_cp', 'psa', 'fsp3']

# One descriptor tuple per row of df, in row order.
descriptors = list(map(create_esol_descriptors, df['SMILES']))

# Build the feature table, attach the measured solubility by index,
# and give the target column a short name.
final_df = (
    pd.DataFrame(descriptors, columns=col_names)
    .join(df['measured log(solubility:mol/L)'], how="left")
    .rename(columns={'measured log(solubility:mol/L)': 'logs'})
)

In [4]:
# Write the descriptor table (features plus the 'logs' target) to the working directory.
# NOTE(review): with to_csv's defaults the row index is written as an extra unnamed
# first column, which reads back as 'Unnamed: 0'. Consider index=False if no
# downstream reader depends on that column -- confirm first.
final_df.to_csv('delaney_descriptors.csv')