In [20]:
import rdkit.Chem as Chem
import pandas as pd
import numpy as np

In [21]:
# Read the cleaned flavone set from the working directory, then show
# (rows, columns) as the cell output.
df = pd.read_csv("./flavone_clean_60K.csv")
df.shape

(60250, 2)

In [22]:
def RDkit_descriptors(smiles):
    """
    Compute every descriptor listed in RDKit's ``Descriptors._descList``
    for each SMILES string.

    The number of descriptors is whatever the installed RDKit registers in
    ``Descriptors._descList`` (208 in the run recorded in this notebook), so
    it is not hard-coded here.

    Parameters
    ----------
    smiles : iterable of str
        A pandas Series or a list of SMILES strings.

    Returns
    -------
    Mol_descriptors : list of tuple
        One entry per input element, in input order. Entries that are not
        strings (e.g. missing values) or that RDKit cannot parse are
        represented by a tuple of NaN of the same length, so the rows stay
        aligned with the input and can be filtered out afterwards.
    desc_names : sequence of str
        Descriptor names as returned by ``calc.GetDescriptorNames()``; usable
        directly as DataFrame column labels.

    Warns
    -----
    UserWarning
        If at least one input element could not be converted to a molecule.
    """
    import warnings

    from rdkit.ML.Descriptors import MoleculeDescriptors
    from rdkit.Chem import Descriptors

    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()

    # Placeholder row for inputs that cannot be turned into a molecule.
    missing_row = (float("nan"),) * len(desc_names)
    n_failed = 0

    Mol_descriptors = []
    # Parse one molecule at a time instead of materialising every Mol object
    # up front; only the descriptor tuples need to be kept.
    for smi in smiles:
        # MolFromSmiles returns None (it does not raise) on an unparsable
        # string; passing that None on to AddHs is what used to abort the run.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            n_failed += 1
            Mol_descriptors.append(missing_row)
            continue
        # add hydrogens to molecules
        mol = Chem.AddHs(mol)
        # Calculate all descriptors for this molecule
        Mol_descriptors.append(calc.CalcDescriptors(mol))

    if n_failed:
        warnings.warn(
            f"{n_failed} input(s) could not be parsed as SMILES; "
            "their descriptor rows are filled with NaN."
        )
    return Mol_descriptors, desc_names

In [23]:
# Compute the descriptor rows and their names for every SMILES in the input frame.
Mol_descriptors, desc_names = RDkit_descriptors(df.SMILES)

# Assemble the table: one row per molecule, one column per descriptor name.
df_rdkit = pd.DataFrame(data=Mol_descriptors, columns=desc_names)
df_rdkit.shape

(60250, 208)

In [24]:
# Put the SMILES strings in the first column so every descriptor row can be
# traced back to its molecule. DataFrame.insert raises ValueError when the
# column already exists, so the membership check keeps this cell re-runnable.
if "SMILES" not in df_rdkit.columns:
    df_rdkit.insert(0, "SMILES", df.SMILES)

In [25]:
# Preview the first five rows of the descriptor table.
df_rdkit.head(n=5)

Unnamed: 0,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CCCS(=O)(=O)c1cccc2c(=O)c(C(C)C)c(-c3ccccc3)oc12,14.24093,-6.332079,14.24093,1.077489,0.648982,370.47,348.294,370.12388,136,...,0,0,1,0,0,0,0,0,0,0
1,O=c1c(S(=O)(=O)[N+](=O)[O-])c(-c2ccccc2)oc2ccc...,13.024943,-5.772429,13.024943,0.881957,0.538577,331.305,322.233,331.015058,116,...,0,1,0,0,0,0,0,0,0,0
2,CC(=O)c1cccc2oc(-c3ccccc3C)cc(=O)c12,13.15552,-3.362113,13.15552,0.870761,0.666238,278.307,264.195,278.094294,104,...,0,0,0,0,0,0,0,0,0,0
3,CS(=O)(=O)c1c(-c2ccccc2C(=O)[N+](=O)[O-])oc2cc...,14.821051,-5.921292,14.821051,1.178626,0.495791,391.332,381.252,391.016201,138,...,0,0,1,0,0,0,0,0,0,0
4,CS(=O)(=O)c1cccc2oc(-c3ccccc3)cc(=O)c12,13.015239,-5.418128,13.015239,0.730454,0.729718,300.335,288.239,300.04563,106,...,0,0,1,0,0,0,0,0,0,0


In [26]:
# Write the SMILES + descriptor table to disk; the row index is not saved.
df_rdkit.to_csv(
    "flavone_clean_60K_smi_208f.csv",
    index=False,
)