## Importing libraries and reading data

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors

In [17]:
# `_dh` is not defined in this notebook; it is expected to be the directory
# history list that IPython injects, whose last entry is the current directory.
HERE = Path(_dh[-1])
# The CSV lives three directory levels above the notebook, under data/.
DATA_PATH = HERE.parent.parent.parent.joinpath(
    'data', 'fda_approved_datasets', 'fda_approved_drugs.csv'
)

fda_drugs = pd.read_csv(DATA_PATH)
print(fda_drugs.shape)
fda_drugs.head()

(1895, 14)


Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,natural_product_flag,oral,parenteral,topical
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,False,True,False,False
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,False,True,False,False
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,False,True,False,False
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,False,True,False,True
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,False,False,True,False


In [39]:
# Keep only the two columns that getMolDescriptors takes as input:
# the ChEMBL identifier and the cleaned SMILES string.
drug_smiles = fda_drugs[['chembl_id', 'clean_smiles']]

## Calculating descriptors

In [40]:
# Number of (name, function) entries in RDKit's underscore-prefixed
# Descriptors._descList, i.e. how many descriptors are computed per molecule.
print(len(Descriptors._descList))

208


In [54]:
def getMolDescriptors(smiles, chembl_id, missingVal=np.nan):
    """
    Calculate every descriptor in ``Descriptors._descList`` for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    chembl_id : str
        Identifier stored alongside the descriptors under the 'chembl_id' key.
    missingVal : any, default np.nan
        Value stored for any descriptor that cannot be computed.

    Returns
    -------
    dict
        The keys 'smiles' and 'chembl_id', followed by one entry per descriptor
        name. If the SMILES cannot be turned into a molecule, every descriptor
        entry is set to ``missingVal``.
    """
    import traceback
    import warnings

    res = {'smiles': smiles, 'chembl_id': chembl_id}

    # Parse the SMILES once and reuse the molecule for every descriptor,
    # instead of re-parsing the same string for each entry of _descList.
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Parsing itself failed (e.g. a non-string value was passed in).
        traceback.print_exc()
        mol = None

    if mol is None:
        # No molecule to work with: report it once and fill in the missing
        # value rather than failing separately inside every descriptor call.
        warnings.warn(
            f"Could not build a molecule from SMILES {smiles!r} "
            f"(chembl_id={chembl_id!r}); all descriptors set to missingVal.",
            stacklevel=2,
        )
        for nm, _ in Descriptors._descList:
            res[nm] = missingVal
        return res

    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Best effort: one failing descriptor must not abort the others.
            # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
            traceback.print_exc()
            val = missingVal

        res[nm] = val

    return res

In [55]:
# Single example molecule used to try out getMolDescriptors before running it on the dataset.
doravirine = 'Cn1c(n[nH]c1=O)Cn2ccc(c(c2=O)Oc3cc(cc(c3)Cl)C#N)C(F)(F)F'
# NOTE(review): 'CHEMBL10' looks like a placeholder rather than doravirine's actual ChEMBL ID — confirm.
chembl_id = 'CHEMBL10'

In [57]:
# Smoke test on the example molecule; the returned dict is displayed as the cell output.
getMolDescriptors(smiles=doravirine, chembl_id=chembl_id)

{'smiles': 'Cn1c(n[nH]c1=O)Cn2ccc(c(c2=O)Oc3cc(cc(c3)Cl)C#N)C(F)(F)F',
 'chembl_id': 'CHEMBL10',
 'MaxEStateIndex': 13.412553309006833,
 'MinEStateIndex': -4.871620672188628,
 'MaxAbsEStateIndex': 13.412553309006833,
 'MinAbsEStateIndex': 0.045220418860841605,
 'qed': 0.6914051268589834,
 'MolWt': 425.754,
 'HeavyAtomMolWt': 414.66600000000005,
 'ExactMolWt': 425.050251552,
 'NumValenceElectrons': 150,
 'NumRadicalElectrons': 0,
 'MaxPartialCharge': 0.4197525104273902,
 'MinPartialCharge': -0.45079941098947357,
 'MaxAbsPartialCharge': 0.45079941098947357,
 'MinAbsPartialCharge': 0.4197525104273902,
 'FpDensityMorgan1': 1.3103448275862069,
 'FpDensityMorgan2': 2.0344827586206895,
 'FpDensityMorgan3': 2.6206896551724137,
 'BCUT2D_MWHI': 35.495691906445956,
 'BCUT2D_MWLOW': 10.182401353178228,
 'BCUT2D_CHGHI': 2.363442602497932,
 'BCUT2D_CHGLO': -2.1532454345808123,
 'BCUT2D_LOGPHI': 2.362094239067197,
 'BCUT2D_LOGPLOW': -2.2620565247489415,
 'BCUT2D_MRHI': 6.30376236817795,
 'BCUT2D_MRLO