In [2]:
import os
from pathlib import Path
import glob
import pandas as pd

In [3]:
# count number of molecules

In [4]:
# Read every CSV file found in drugInputDir and stack them into one frame.
drugInputDir = '/data/local/ringsys/202111_approveddrug/smiles_ids'
# glob returns matches in file-system order, which is not guaranteed to be
# stable; sorting keeps the row order of drugDF reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(drugInputDir, "*.csv")))
if not csv_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" message that hides the real cause.
    raise ValueError('no CSV files found in {}'.format(drugInputDir))
frames = [pd.read_csv(csv_path) for csv_path in csv_files]
# ignore_index gives the combined frame a single unique 0..n-1 index instead
# of repeating each file's own row labels.
drugDF = pd.concat(frames, ignore_index=True)

In [5]:
# number of distinct conID values in the combined frame
len(set(drugDF['conID']))

2225

In [6]:
# Collect every ID found in the mol_stereoIDs column. An entry containing
# '[' is treated as a stringified list: brackets, quotes and spaces are
# removed and the remainder is split on commas. Any other entry is taken
# verbatim as a single ID.
drug_idSet = {
    stereo_id
    for raw_ids in drugDF['mol_stereoIDs']
    for stereo_id in (
        raw_ids.translate(str.maketrans('', '', "[]' ")).split(',')
        if '[' in raw_ids
        else [raw_ids]
    )
}

In [7]:
# number of distinct IDs collected in drug_idSet
len(drug_idSet)

2238

In [10]:
# how many approved drugs have a ring

In [11]:
from rdkit import Chem

def has_ring(row):
    """Flag whether the molecule in ``row.preprocessedSmiles`` has a ring.

    Parameters
    ----------
    row : object with a ``preprocessedSmiles`` attribute
        The SMILES string is read from that attribute (for example a row
        handed over by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        1 if the parsed molecule contains at least one ring, otherwise 0.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES string.
    """
    smiles = row.preprocessedSmiles
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with
        # the offending input instead of an opaque argument error later on.
        raise ValueError('RDKit could not parse SMILES: {!r}'.format(smiles))
    # Count rings through RingInfo rather than comparing Chem.GetSSSR(mol)
    # with 0: newer RDKit releases return the ring collection from GetSSSR
    # instead of an int, which makes `!= 0` true for every molecule.
    return 1 if mol.GetRingInfo().NumRings() > 0 else 0

In [12]:
# Run has_ring on each row (axis=1) and store the 0/1 flag as a new column.
drugDF['has_ring'] = drugDF.apply(has_ring, axis=1)

In [13]:
# keep only the rows whose has_ring flag is 1
drugDF_has_ring = drugDF.loc[drugDF['has_ring'].eq(1)]

In [14]:
# number of rows flagged as ring-containing
drugDF_has_ring.shape[0]

2001

In [15]:
# (distinct conIDs with a ring, their share of all distinct conIDs)
(
    len(set(drugDF_has_ring['conID'])),
    len(set(drugDF_has_ring['conID'])) / len(set(drugDF['conID'])),
)

(1986, 0.8925842696629214)

In [16]:
# Collect every ID found in mol_stereoIDs of the ring-containing rows. An
# entry containing '[' is treated as a stringified list: brackets, quotes
# and spaces are removed and the remainder is split on commas. Any other
# entry is taken verbatim as a single ID.
drug_has_ring_idSet = {
    stereo_id
    for raw_ids in drugDF_has_ring['mol_stereoIDs']
    for stereo_id in (
        raw_ids.translate(str.maketrans('', '', "[]' ")).split(',')
        if '[' in raw_ids
        else [raw_ids]
    )
}

In [17]:
# (IDs from ring-containing rows, their share of all collected IDs)
(
    len(drug_has_ring_idSet),
    len(drug_has_ring_idSet) / len(drug_idSet),
)

(1996, 0.8918677390527256)

In [18]:
# count number of unique ring systems

In [19]:
# Locate the folder holding the unique-ring-system tables and load both
# tab-separated files (one without and one with stereo information,
# judging by the file names).
drugs_dir = '/data/local/ringsys/202111_approveddrug/'
drug_uniq_ring_Folder = Path(drugs_dir) / 'uniqueRingSystems'

# Join with pathlib instead of gluing strings with a hard-coded '/';
# read_csv accepts path-like objects directly.
drug_ring_noStereo = pd.read_csv(
    drug_uniq_ring_Folder / 'drug_uniqueRingSystems_noStereo.txt', sep='\t'
)
drug_ring_Stereo = pd.read_csv(
    drug_uniq_ring_Folder / 'drug_uniqueRingSystems_Stereo.txt', sep='\t'
)

In [20]:
# number of rows in the no-stereo ring table
drug_ring_noStereo.shape[0]

596

In [21]:
# number of rows in the stereo ring table
drug_ring_Stereo.shape[0]

602

In [23]:
# singletons

In [24]:
# rows of the no-stereo table whose nMol_conID equals 1
int(drug_ring_noStereo['nMol_conID'].eq(1).sum())

351

In [25]:
# rows of the stereo table whose nMol_stereoID equals 1
int(drug_ring_Stereo['nMol_stereoID'].eq(1).sum())

357

In [26]:
# no. macrocycles

In [27]:
from rdkit.Chem import PandasTools

# Build an RDKit molecule column named 'ringMolecule' in each ring table
# from that table's SMILES column.
PandasTools.AddMoleculeColumnToFrame(
    drug_ring_noStereo, smilesCol='ringSmiles_noStereo', molCol='ringMolecule'
)
PandasTools.AddMoleculeColumnToFrame(
    drug_ring_Stereo, smilesCol='RingSmiles', molCol='ringMolecule'
)

In [28]:
from rdkit.Chem import MolFromSmarts

def macrocycle(mol, min_ring_size=12):
    """Flag whether ``mol`` contains a macrocyclic ring atom.

    Parameters
    ----------
    mol : RDKit molecule
        Molecule to test; must provide ``HasSubstructMatch``.
    min_ring_size : int, optional
        Smallest ring size that counts as a macrocycle. The default of 12
        reproduces the previously hard-coded query
        ``[r;!r3;!r4;!r5;!r6;!r7;!r8;!r9;!r10;!r11]``.

    Returns
    -------
    int
        1 if at least one atom matches the query, otherwise 0.
    """
    # In SMARTS, `r<n>` refers to the size of the smallest ring an atom is
    # in, so excluding r3..r(min_ring_size - 1) leaves ring atoms whose
    # smallest ring has at least min_ring_size members.
    excluded_sizes = ['!r{}'.format(size) for size in range(3, min_ring_size)]
    smarts = '[' + ';'.join(['r'] + excluded_sizes) + ']'
    return int(mol.HasSubstructMatch(MolFromSmarts(smarts)))

In [29]:
# Flag each ring system as macrocyclic (1) or not (0) from its molecule object.
drug_ring_noStereo['macrocycle'] = drug_ring_noStereo['ringMolecule'].apply(macrocycle)
drug_ring_Stereo['macrocycle'] = drug_ring_Stereo['ringMolecule'].apply(macrocycle)

In [30]:
# rows of the no-stereo table flagged as macrocycle
int(drug_ring_noStereo['macrocycle'].eq(1).sum())

52

In [31]:
# rows of the stereo table flagged as macrocycle
int(drug_ring_Stereo['macrocycle'].eq(1).sum())

52

In [32]:
# %chiral

In [33]:
def get_nof_chiral_centers(mol):
    """Return how many chiral centers RDKit reports for ``mol``.

    ``includeUnassigned=True`` is passed to ``Chem.FindMolChiralCenters`` so
    that centers without an assigned label are included in the count.
    """
    centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    return len(centers)

In [34]:
# Store the chiral-center count of every ring molecule in an 'n_chiral' column.
drug_ring_noStereo['n_chiral'] = drug_ring_noStereo['ringMolecule'].apply(get_nof_chiral_centers)
drug_ring_Stereo['n_chiral'] = drug_ring_Stereo['ringMolecule'].apply(get_nof_chiral_centers)

In [36]:
# (stereo-table rows with a non-zero n_chiral, their share of all rows)
(
    int(drug_ring_Stereo['n_chiral'].ne(0).sum()),
    int(drug_ring_Stereo['n_chiral'].ne(0).sum()) / len(drug_ring_Stereo),
)

(186, 0.3089700996677741)

In [37]:
# (no-stereo-table rows with a non-zero n_chiral, their share of all rows)
(
    int(drug_ring_noStereo['n_chiral'].ne(0).sum()),
    int(drug_ring_noStereo['n_chiral'].ne(0).sum()) / len(drug_ring_noStereo),
)

(180, 0.30201342281879195)