## Import libraries and read data

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ML
from sklearn.preprocessing import StandardScaler

# RDKit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdFingerprintGenerator, rdMolDescriptors, AllChem
from rdkit.Chem import PandasTools, rdDepictor, rdFMCS
from rdkit.Chem.Draw import IPythonConsole, rdMolDraw2D

Load FDA-approved drugs

In [6]:
# `_dh` is IPython's directory history; its last entry is used as the
# notebook's working directory.
HERE = Path(_dh[-1])
# The dataset folder sits two levels above the notebook directory.
DATA_FOLDER = HERE.parents[1] / 'data' / 'fda_approved_datasets'

# Read the drug table and preview its first three rows.
fda_drugs = pd.read_csv(DATA_FOLDER / 'fda_approved_drugs.csv')
fda_drugs.head(3)

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,natural_product_flag,oral,parenteral,topical
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,False,True,False,False
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,False,True,False,False
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,False,True,False,False


In [8]:
# Display the `clean_smiles` column as a plain Python list.
fda_drugs['clean_smiles'].tolist()

['N=C(N)N',
 'CC(=O)NO',
 'NC(=O)NO',
 'NCCS',
 'C[S+](C)[O-]',
 'Cc1cn[nH]c1',
 'C1CNCCN1',
 'CC(O)C(=O)O',
 'Nc1ccncc1',
 'N[C@@H]1CONC1=O',
 'O=C(O)CCCO',
 'Nc1ccncc1N',
 'Oc1cccc(O)c1',
 'NCCc1cc[nH]n1',
 'NCCc1c[nH]cn1',
 'Cn1ccnc1S',
 'CC(C)CCON=O',
 'C[N+](C)(C)CC(=O)O',
 'NC(CO)(CO)CO',
 'NC(=O)c1cnccn1',
 'OCC(S)CS',
 'COc1ccc(O)cc1',
 'O=C(O)P(=O)(O)O',
 'On1ccccc1=S',
 'Nc1nc(=O)[nH]cc1F',
 'C=CC(N)CCC(=O)O',
 'CN(C)C(=N)NC(=N)N',
 'O=c1[nH]cc(F)c(=O)[nH]1',
 'COC(=O)/C=C/C(=O)O',
 'NCC(=O)CCC(=O)O',
 'NCCCCCC(=O)O',
 'N[C@@H]1C[C@H]1c1ccccc1',
 'CC(N)Cc1ccccc1',
 'C[C@H](N)Cc1ccccc1',
 'Oc1ncnc2[nH]ncc12',
 'O=C(O)Cc1ccccc1',
 'CNCCc1ccccn1',
 'NNCCc1ccccc1',
 'Nc1ccc(C(=O)O)cc1',
 'COC(=O)c1cccnc1',
 'NNC(=O)c1ccncc1',
 'C[n+]1ccccc1C=NO',
 'C[C@@H]1O[C@@H]1P(=O)(O)O',
 'O=C(O)c1ccccc1O',
 'Cc1c(O)c(=O)ccn1C',
 'C1N2CN3CN1CN(C2)C3',
 'CCC1(C)CC(=O)NC1=O',
 'CNC(C)CCC=C(C)C',
 'NC(=O)CN1CCCC1=O',
 'CN1C(=O)OC(C)(C)C1=O',
 'C#CC(O)(/C=C/Cl)CC',
 'COC(=O)/C=C/C(=O)OC',
 'CCCC

In [None]:
def count_ring_systems(mol, includeSpiro=False):
    """Count the ring systems of a molecule.

    A ring system is a group of rings connected through shared atoms.
    Rings sharing two or more atoms are always merged into one system;
    rings sharing exactly one atom are merged only when ``includeSpiro``
    is true.

    Args:
        mol: Object exposing ``GetRingInfo().AtomRings()`` (e.g. an RDKit
            molecule), where each ring is an iterable of atom indices.
        includeSpiro: If true, rings that share a single atom are treated
            as belonging to the same ring system.

    Returns:
        int: Number of ring systems; ``0`` when the molecule has no rings.
    """
    systems = []
    for ring in mol.GetRingInfo().AtomRings():
        ring_atoms = set(ring)
        untouched = []
        for system in systems:
            n_shared = len(ring_atoms & system)
            if n_shared and (includeSpiro or n_shared > 1):
                # Absorb the overlapping system so that later systems are
                # compared against the merged atom set (handles a ring
                # that bridges two previously separate systems).
                ring_atoms |= system
            else:
                untouched.append(system)
        # These two statements must run once per ring, outside the inner
        # loop; otherwise the very first ring is never recorded and the
        # count is always zero.
        untouched.append(ring_atoms)
        systems = untouched
    return len(systems)
