**Import Necessary Libraries**

In [56]:
import numpy as np
import pandas as pd
import rdkit.Chem as Chem
from rdkit.Chem import rdMolDescriptors 

**Method 1: Get Molecular Properties with the rdMolDescriptors**

In [57]:

def smi_to_mol_prop(smi, property_names=None):
    """
    Compute RDKit molecular properties for a single SMILES string.

    Input (smi): a SMILES string
    Input (property_names): names of the properties to compute, as accepted by
        rdMolDescriptors.Properties. When None (the default), every name
        returned by rdMolDescriptors.Properties.GetAvailableProperties() is used.
    Output: np.array() holding the computed property values, or None when
        Chem.MolFromSmiles does not return a molecule for `smi`.
    eg. df['mol_prop'] = df.SMILES.apply(smi_to_mol_prop)
    output: each row of df['mol_prop'] will have a np.array with all molecular props.
    """
    # Resolve the default at call time so no list is built (and shared between
    # calls) while the function is being defined.
    if property_names is None:
        property_names = rdMolDescriptors.Properties.GetAvailableProperties()
    property_getter = rdMolDescriptors.Properties(list(property_names))

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # No molecule could be built from this SMILES; signal it with None.
        return None
    return np.array(property_getter.ComputeProperties(mol))

**Example**

In [58]:
# Five example molecules, written as SMILES strings.
smiles = [
    "CCCN1c2cc3nsnc3cc2N(CC)C1OC",
    "CCN1c2cc3nsnc3c(OC)c2N(C)C1OC(C)C",
    "COC1N(C)c2cc3nsnc3c(OCC(C)C)c2N1C",
    "CCCN1c2cc3nsnc3c(C)c2N(C)C1OC",
    "CCCCOc1c2c(cc3nsnc13)N(C)C(OC)N2C",
]
# One row per molecule, with the SMILES string in a single "SMILES" column.
df_smi = pd.DataFrame({"SMILES": smiles})
df_smi

Unnamed: 0,SMILES
0,CCCN1c2cc3nsnc3cc2N(CC)C1OC
1,CCN1c2cc3nsnc3c(OC)c2N(C)C1OC(C)C
2,COC1N(C)c2cc3nsnc3c(OCC(C)C)c2N1C
3,CCCN1c2cc3nsnc3c(C)c2N(C)C1OC
4,CCCCOc1c2c(cc3nsnc13)N(C)C(OC)N2C


**Use rdMolDescriptors to get Molecular Properties**

In [59]:
# Names of every property that rdMolDescriptors.Properties reports as available.
properties = list(
    rdMolDescriptors.Properties.GetAvailableProperties()
)

**Save properties of each SMILES as a numpy array in a pandas column "all_props"**

In [60]:
# Run smi_to_mol_prop on every SMILES and store its result in "all_props".
df_smi["all_props"] = df_smi["SMILES"].apply(
    lambda smi: smi_to_mol_prop(smi, property_names=properties)
)
df_smi

Unnamed: 0,SMILES,all_props
0,CCCN1c2cc3nsnc3cc2N(CC)C1OC,"[278.120132196, 278.38100000000003, 5.0, 0.0, ..."
1,CCN1c2cc3nsnc3c(OC)c2N(C)C1OC(C)C,"[308.13069687999996, 308.40700000000004, 6.0, ..."
2,COC1N(C)c2cc3nsnc3c(OCC(C)C)c2N1C,"[308.13069688, 308.40700000000004, 6.0, 0.0, 4..."
3,CCCN1c2cc3nsnc3c(C)c2N(C)C1OC,"[278.120132196, 278.38100000000003, 5.0, 0.0, ..."
4,CCCCOc1c2c(cc3nsnc13)N(C)C(OC)N2C,"[308.13069688, 308.40700000000004, 6.0, 0.0, 5..."


**Save each property in pandas columns**

In [61]:
# Expand the per-row property arrays into a frame with one column per property name.
df_props = pd.DataFrame(
    data=df_smi["all_props"].tolist(),
    columns=properties,
)
print(df_props.shape)
df_props

(5, 43)


Unnamed: 0,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,278.120132,278.381,5.0,0.0,4.0,0.0,6.0,19.0,37.0,6.0,...,11.458722,6.621294,3.51821,3.51821,2.728492,-1.4,12.624573,4.938843,1.861405,3.28162
1,308.130697,308.407,6.0,0.0,4.0,0.0,7.0,21.0,41.0,7.0,...,12.952757,7.062349,3.658888,3.658888,2.800392,-1.6,14.342004,5.306722,2.112588,3.624239
2,308.130697,308.407,6.0,0.0,4.0,0.0,7.0,21.0,41.0,7.0,...,12.952757,6.947449,3.62727,3.62727,2.675933,-1.6,14.342004,5.306722,2.202667,3.624239
3,278.120132,278.381,5.0,0.0,3.0,0.0,6.0,19.0,37.0,6.0,...,11.674265,6.467822,3.67578,3.67578,2.912419,-1.4,12.624573,4.610774,1.701704,3.063635
4,308.130697,308.407,6.0,0.0,5.0,0.0,7.0,21.0,41.0,7.0,...,12.78962,7.091607,3.810556,3.810556,2.744501,-1.6,14.342004,5.650112,2.202667,3.858758


RDKit generated 43 properties for each of the 5 given molecules.

In [62]:
# Display the column labels of df_props.
df_props.columns

Index(['exactmw', 'amw', 'lipinskiHBA', 'lipinskiHBD', 'NumRotatableBonds',
       'NumHBD', 'NumHBA', 'NumHeavyAtoms', 'NumAtoms', 'NumHeteroatoms',
       'NumAmideBonds', 'FractionCSP3', 'NumRings', 'NumAromaticRings',
       'NumAliphaticRings', 'NumSaturatedRings', 'NumHeterocycles',
       'NumAromaticHeterocycles', 'NumSaturatedHeterocycles',
       'NumAliphaticHeterocycles', 'NumSpiroAtoms', 'NumBridgeheadAtoms',
       'NumAtomStereoCenters', 'NumUnspecifiedAtomStereoCenters', 'labuteASA',
       'tpsa', 'CrippenClogP', 'CrippenMR', 'chi0v', 'chi1v', 'chi2v', 'chi3v',
       'chi4v', 'chi0n', 'chi1n', 'chi2n', 'chi3n', 'chi4n', 'hallKierAlpha',
       'kappa1', 'kappa2', 'kappa3', 'Phi'],
      dtype='object')

**Combine the two dataframes df_smi and df_props into one, df_com, and drop the "all_props" column**

In [63]:
# Join the SMILES frame and the property frame column-wise, then discard the
# array-valued "all_props" column.
df_com = (
    pd.concat([df_smi, df_props], axis=1)
    .drop(columns=["all_props"])
)
df_com.head()


Unnamed: 0,SMILES,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,CCCN1c2cc3nsnc3cc2N(CC)C1OC,278.120132,278.381,5.0,0.0,4.0,0.0,6.0,19.0,37.0,...,11.458722,6.621294,3.51821,3.51821,2.728492,-1.4,12.624573,4.938843,1.861405,3.28162
1,CCN1c2cc3nsnc3c(OC)c2N(C)C1OC(C)C,308.130697,308.407,6.0,0.0,4.0,0.0,7.0,21.0,41.0,...,12.952757,7.062349,3.658888,3.658888,2.800392,-1.6,14.342004,5.306722,2.112588,3.624239
2,COC1N(C)c2cc3nsnc3c(OCC(C)C)c2N1C,308.130697,308.407,6.0,0.0,4.0,0.0,7.0,21.0,41.0,...,12.952757,6.947449,3.62727,3.62727,2.675933,-1.6,14.342004,5.306722,2.202667,3.624239
3,CCCN1c2cc3nsnc3c(C)c2N(C)C1OC,278.120132,278.381,5.0,0.0,3.0,0.0,6.0,19.0,37.0,...,11.674265,6.467822,3.67578,3.67578,2.912419,-1.4,12.624573,4.610774,1.701704,3.063635
4,CCCCOc1c2c(cc3nsnc13)N(C)C(OC)N2C,308.130697,308.407,6.0,0.0,5.0,0.0,7.0,21.0,41.0,...,12.78962,7.091607,3.810556,3.810556,2.744501,-1.6,14.342004,5.650112,2.202667,3.858758


***Method 2: Generate 208 2D Descriptors with rdkit.Chem.Descriptors***

In [64]:
from rdkit.Chem import Descriptors

# Take element 0 of every entry in Descriptors._descList as the descriptor name.
desc_list = [entry[0] for entry in Descriptors._descList]
print(len(desc_list))
print(desc_list)

208
['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 

In [65]:
# Parse every SMILES string into an RDKit molecule object.
mols = list(map(Chem.MolFromSmiles, smiles))

In [66]:
from rdkit.ML.Descriptors import MoleculeDescriptors

# One calculator configured with all descriptor names, applied to each molecule.
calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_list)
rdkit_desc = list(map(calc.CalcDescriptors, mols))

In [67]:
# Tabulate the descriptor values: one row per molecule, one column per descriptor name.
df_2ddesc = pd.DataFrame(
    data=rdkit_desc,
    columns=desc_list,
)
print(df_2ddesc.shape)
df_2ddesc

(5, 208)


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,5.679135,-0.018399,5.679135,0.018399,0.859797,278.381,260.237,278.120132,102,0,...,0,0,0,0,0,0,0,0,0,0
1,6.070164,-0.130482,6.070164,0.130482,0.865358,308.407,288.247,308.130697,114,0,...,0,0,0,0,0,0,0,0,0,0
2,6.07376,-0.142056,6.07376,0.142056,0.864962,308.407,288.247,308.130697,114,0,...,0,0,0,0,0,0,0,0,0,0
3,5.642885,-0.038121,5.642885,0.038121,0.862945,278.381,260.237,278.120132,102,0,...,0,0,0,0,0,0,0,0,0,0
4,6.052742,-0.135459,6.052742,0.135459,0.791643,308.407,288.247,308.130697,114,0,...,0,0,0,0,0,0,0,0,1,0


**Remove all columns with only zeros**

In [68]:
# Print the shape before and after dropping descriptors that equal 0 in every row.
print(df_2ddesc.shape)
df_2ddesc = df_2ddesc.drop(
    columns=df_2ddesc.columns[(df_2ddesc == 0).all(axis=0)],
)
print(df_2ddesc.shape)

(5, 208)
(5, 112)


**Merge df_com and df_2ddesc**

In [69]:
# Place the Method 1 table and the filtered 2D descriptors side by side.
df_com2 = pd.concat((df_com, df_2ddesc), axis="columns")
print(df_com.shape, df_2ddesc.shape, df_com2.shape)
df_com2.head()


(5, 44) (5, 112) (5, 156)


Unnamed: 0,SMILES,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,...,MolMR,fr_Ar_N,fr_NH0,fr_aniline,fr_aryl_methyl,fr_benzene,fr_bicyclic,fr_ether,fr_methoxy,fr_unbrch_alkane
0,CCCN1c2cc3nsnc3cc2N(CC)C1OC,278.120132,278.381,5.0,0.0,4.0,0.0,6.0,19.0,37.0,...,78.869,2,4,2,0,1,2,1,1,0
1,CCN1c2cc3nsnc3c(OC)c2N(C)C1OC(C)C,308.130697,308.407,6.0,0.0,4.0,0.0,7.0,21.0,41.0,...,85.399,2,4,2,0,1,2,2,1,0
2,COC1N(C)c2cc3nsnc3c(OCC(C)C)c2N1C,308.130697,308.407,6.0,0.0,4.0,0.0,7.0,21.0,41.0,...,85.351,2,4,2,0,1,2,2,1,0
3,CCCN1c2cc3nsnc3c(C)c2N(C)C1OC,278.120132,278.381,5.0,0.0,3.0,0.0,6.0,19.0,37.0,...,78.989,2,4,2,1,1,2,1,1,0
4,CCCCOc1c2c(cc3nsnc13)N(C)C(OC)N2C,308.130697,308.407,6.0,0.0,5.0,0.0,7.0,21.0,41.0,...,85.421,2,4,2,0,1,2,2,1,1


**Drop duplicate columns**

In [70]:
# Keep only the first occurrence of each column label and store an independent copy.
df_com2 = df_com2.loc[
    :, ~df_com2.columns.duplicated(keep="first")
].copy()

**Method 3: Morgan fingerprints**

In [71]:
import rdkit.Chem as Chem
from rdkit.Chem import AllChem
def compute_ecfp_features(mol: Chem.Mol, ecfp_degree=2, ecfp_power=11) -> np.ndarray:
  """Build a fixed-length Morgan (ECFP-style) bit array for one RDKit molecule.

  source: https://www.programcreek.com/python/example/89502/rdkit.Chem.AllChem.GetMorganFingerprintAsBitVect

  Parameters:
  -----------
    mol: rdkit molecule
      Molecule to featurize.
    ecfp_degree: int
      Radius handed to the Morgan fingerprint call (default 2).
    ecfp_power: int
      Exponent of the fingerprint length: 2^ecfp_power bits are requested
      (default 11).
  Returns:
  --------
    ecfp_array: np.ndarray
      The fingerprint bit vector converted with np.array; position i holds 1
      when that bit is set for the molecule and 0 otherwise.
  """
  # NOTE(review): newer RDKit releases appear to favour rdFingerprintGenerator
  # over this call - confirm against the installed RDKit version before switching.
  n_bits = 2**ecfp_power
  fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_degree, nBits=n_bits)
  return np.array(fingerprint)

***When comparing ECFP/FCFP fingerprints with the Morgan fingerprints generated by RDKit, remember that the 4 in ECFP4 corresponds to the diameter of the atom environments considered, while the Morgan fingerprints take a radius parameter. So the function above, with its default radius of 2 (`ecfp_degree=2`), is roughly equivalent to ECFP4; the FCFP4 analogue would additionally require feature-based atom invariants.***

**Example**

In [72]:
# example: three molecules -> one fingerprint row each
smiles = [
    "CCCN1c2cc3nsnc3cc2N(CC)C1OC",
    "COC1N(C)c2cc3nsnc3c(OCC(C)C)c2N1C",
    "CCCN1c2cc3nsnc3c(C)c2N(C)C1OC",
]
mols = list(map(Chem.MolFromSmiles, smiles))
# Stack the per-molecule arrays into a single 2-D array.
fingerprints = np.stack(list(map(compute_ecfp_features, mols)))
print(fingerprints.shape)

(3, 2048)


In [73]:
# Label each fingerprint position as "mfp<index>" and wrap the array in a frame.
df_mfp = pd.DataFrame(
    fingerprints,
    columns=[f"mfp{bit}" for bit in range(fingerprints.shape[1])],
)

In [74]:
# Prepend the SMILES strings as the first column of the fingerprint table.
df_mfp_com = pd.concat(
    [pd.DataFrame({"SMILES": smiles}), df_mfp],
    axis=1,
)
df_mfp_com

Unnamed: 0,SMILES,mfp0,mfp1,mfp2,mfp3,mfp4,mfp5,mfp6,mfp7,mfp8,...,mfp2038,mfp2039,mfp2040,mfp2041,mfp2042,mfp2043,mfp2044,mfp2045,mfp2046,mfp2047
0,CCCN1c2cc3nsnc3cc2N(CC)C1OC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,COC1N(C)c2cc3nsnc3c(OCC(C)C)c2N1C,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CCCN1c2cc3nsnc3c(C)c2N(C)C1OC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
