In [2]:
from rdkit.Chem import MolFromSmiles, Descriptors, MolFromSmarts
from rdkit.ML.Descriptors import MoleculeDescriptors


def create_descriptors(smiles):
    """Compute a fixed set of RDKit descriptors for each SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        One SMILES string per molecule. Pass a list even for a single
        molecule: a bare string would be iterated character by character.

    Returns
    -------
    list
        ``[desc, col_names]``. ``desc`` is a list holding one tuple of
        descriptor values per input SMILES, in input order. ``col_names`` is
        the sequence of descriptor names reported by the calculator, in the
        same order as the values inside each tuple. A SMILES that cannot be
        parsed contributes a tuple of NaN, so rows always line up one-to-one
        with the input.
    """
    # Names of the descriptors to keep. A frozenset gives constant-time
    # membership tests and makes an accidentally repeated name harmless.
    include = frozenset([
        'MolLogP', 'MaxPartialCharge', 'SlogP_VSA3',
        'PEOE_VSA6', 'BertzCT', 'BCUT2D_MRLOW',
        'SMR_VSA10', 'Kappa1', 'BCUT2D_MWHI',
        'VSA_EState9', 'FractionCSP3', 'SlogP_VSA2',
        'SMR_VSA7', 'VSA_EState2', 'PEOE_VSA9',
        'PEOE_VSA7', 'PEOE_VSA8', 'VSA_EState10',
        'VSA_EState7', 'PEOE_VSA3', 'EState_VSA4',
    ])

    # Filter the RDKit descriptor registry (rather than iterating `include`)
    # so the column order follows the registry's order, and a name the
    # registry does not list is simply left out.
    desc_to_calc = [entry[0] for entry in Descriptors._descList if entry[0] in include]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_to_calc)
    col_names = calc.GetDescriptorNames()

    # MolFromSmiles returns None for a string it cannot parse. Such entries
    # get an explicit all-NaN row instead of being handed to the calculator,
    # which keeps the output aligned with the input and lets pandas treat the
    # values as missing.
    missing_row = (float('nan'),) * len(desc_to_calc)
    desc = []
    for smi in smiles:
        mol = MolFromSmiles(smi)
        desc.append(missing_row if mol is None else calc.CalcDescriptors(mol))

    return [desc, col_names]

# Quick check on a single molecule. create_descriptors iterates over its
# argument, so the lone SMILES string is wrapped in a list.
smiles = 'OCCc1ccn2cnccc12'
create_descriptors([smiles])

[[(0.09901477588388696,
   16.24996920929941,
   0.2998301427760753,
   380.6639128453302,
   7.307223378584982,
   4.9839785209472085,
   0.0,
   24.117007251546223,
   19.00056910773907,
   11.844020792380803,
   5.516700717616262,
   36.41719284661092,
   21.098082486562625,
   6.4208216229260096,
   5.516700717616262,
   0.0,
   3.986792800453515,
   6.180808767951625,
   0.0,
   0.2222222222222222,
   0.8691)],
 ('MaxPartialCharge',
  'BCUT2D_MWHI',
  'BCUT2D_MRLOW',
  'BertzCT',
  'Kappa1',
  'PEOE_VSA3',
  'PEOE_VSA6',
  'PEOE_VSA7',
  'PEOE_VSA8',
  'PEOE_VSA9',
  'SMR_VSA10',
  'SMR_VSA7',
  'SlogP_VSA2',
  'SlogP_VSA3',
  'EState_VSA4',
  'VSA_EState10',
  'VSA_EState2',
  'VSA_EState7',
  'VSA_EState9',
  'FractionCSP3',
  'MolLogP')]

In [3]:
import pandas as pd


# Load the solubility dataset from the working directory and preview the
# first rows; the SMILES and measured log-solubility columns are used below.
df = pd.read_csv('delaney.csv')
df.head()

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl


In [6]:
# Descriptor values for every molecule in the dataset. create_descriptors
# returns [rows, column names].
trainDescrs = create_descriptors(list(df['SMILES']))

# Build the feature table and attach the measured solubility under the short
# name 'logs'. join() matches rows on the index.
final_df = (
    pd.DataFrame(trainDescrs[0], columns=trainDescrs[1])
    .join(df['measured log(solubility:mol/L)'], how="left")
    .rename(columns={'measured log(solubility:mol/L)': 'logs'})
)

In [7]:
# Persist the descriptor table plus the 'logs' target. With default arguments
# to_csv also writes the row index as the first column of the file.
final_df.to_csv('vsa_descriptors.csv')