In [1]:
import os,glob
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Lipinski

In [2]:
# Directory that holds the SDF files read below.
resultsFolder = '/home/ychen/projects/nps_ringsys/20210901_analysis/'

# np* tables come from the npRings_*.sdf files (one file per stereo variant).
npRings_noStereo = PandasTools.LoadSDF(os.path.join(resultsFolder, 'npRings_noStereo.sdf'))
npRings_Stereo = PandasTools.LoadSDF(os.path.join(resultsFolder, 'npRings_Stereo.sdf'))

# sc* tables come from the zincRings_*.sdf files.
scRings_noStereo = PandasTools.LoadSDF(os.path.join(resultsFolder, 'zincRings_noStereo.sdf'))
scRings_Stereo = PandasTools.LoadSDF(os.path.join(resultsFolder, 'zincRings_Stereo.sdf'))

In [3]:
# Bare expression: renders the frame loaded from 'npRings_noStereo.sdf' as this cell's output.
npRings_noStereo

Unnamed: 0,ringSmiles_noStereo,nMol_conID,pMol_conID,ID,ROMol
0,c1ccccc1,72297,31.31%,4,
1,C1CCOCC1,44113,19.10%,22,
2,O=c1ccoc2ccccc12,8879,3.85%,44,
3,C1CCOC1,7178,3.11%,18,
4,C1CCCCC1,6616,2.87%,54,
...,...,...,...,...,...
30998,O=C1C=CC=CC2C(C=CC=CC=CCCNC(=O)C1)C=CC1C(=O)CC...,1,0.00%,9131,
30999,O=C1C2C=CC3C=CCCCCCCC=CC=CC=CC3C2C2CCCN12,1,0.00%,9130,
31000,O=C1C=CC2CC(=O)CC2CC1,1,0.00%,9128,
31001,C1=CC2CCCCC3OC23CC1,1,0.00%,9127,


In [4]:
def get_molecule_composition(mol, requestedAtomicNum):
    '''
    Tally how many atoms of one element the ring system contains.

    :param mol: the ring system molecule; its atoms are visited via GetAtoms()
    :param requestedAtomicNum: atomic number of the element to count
    :return: the number of atoms whose GetAtomicNum() equals requestedAtomicNum
    '''
    return sum(
        1
        for ringAtom in mol.GetAtoms()
        if ringAtom.GetAtomicNum() == requestedAtomicNum
    )

In [5]:
def num_aromatic_atoms(mol):
    '''
    Count the atoms of the molecule that are flagged as aromatic.

    :param mol: the ring system molecule; its atoms are visited via GetAtoms()
    :return: the number of atoms for which GetIsAromatic() is truthy
    '''
    return sum(1 for ringAtom in mol.GetAtoms() if ringAtom.GetIsAromatic())

In [7]:
def num_aromatic_heteroatoms(mol):
    '''
    Count the aromatic atoms of the molecule that are not carbon.

    :param mol: the ring system molecule; its atoms are visited via GetAtoms()
    :return: the number of atoms that are aromatic and whose atomic number is not 6
    '''
    return sum(
        1
        for ringAtom in mol.GetAtoms()
        if ringAtom.GetIsAromatic() and ringAtom.GetAtomicNum() != 6
    )

In [8]:
def get_nof_chiral_centers(mol):
    '''
    Count the chiral centers that RDKit finds in the molecule.

    :param mol: the ring system molecule
    :return: length of the list returned by Chem.FindMolChiralCenters,
             called with includeUnassigned=True
    '''
    chiralCenters = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    return len(chiralCenters)

In [9]:
def cal_properties(df):
    '''
    Add element counts and simple structural descriptors as new columns.

    The frame is modified in place and the same object is returned, so callers
    may ignore the return value.

    Added columns, in this order: one count per element ('N', 'O', 'H', 'B',
    'C', 'F', 'Si', 'P', 'S', 'Cl', 'Se', 'Br', 'I'), 'a_aro', 'a_heteroaro',
    'chiral', 'element_except_C', 'element_except_C_O', 'element_except_C_N',
    'halogens' and 'has_C=C'.

    :param df: DataFrame with an 'ROMol' column holding the ring system molecules
    :return: df itself, with the descriptor columns added
    '''
    # (column name, atomic number) pairs; the order here fixes the column order.
    # NOTE(review): 'H' only counts hydrogens that GetAtoms() yields, i.e. atoms
    # present in the molecule object - confirm whether implicit H were expected.
    elementColumns = (
        ('N', 7), ('O', 8), ('H', 1), ('B', 5), ('C', 6), ('F', 9), ('Si', 14),
        ('P', 15), ('S', 16), ('Cl', 17), ('Se', 34), ('Br', 35), ('I', 53),
    )
    mols = df['ROMol']
    for columnName, atomicNum in elementColumns:
        df[columnName] = mols.apply(get_molecule_composition, args=(atomicNum,))

    df['a_aro'] = mols.apply(num_aromatic_atoms)
    df['a_heteroaro'] = mols.apply(num_aromatic_heteroatoms)
    df['chiral'] = mols.apply(get_nof_chiral_centers)

    # The aggregate columns only consider the elements listed here; 'H' and 'C'
    # are left out, as is any element without its own count column.
    halogenColumns = ['F', 'Cl', 'Br', 'I']
    otherHeteroColumns = ['B', 'F', 'Si', 'P', 'S', 'Cl', 'Se', 'Br', 'I']
    df['element_except_C'] = df[['N', 'O'] + otherHeteroColumns].sum(axis=1)
    df['element_except_C_O'] = df[['N'] + otherHeteroColumns].sum(axis=1)
    df['element_except_C_N'] = df[['O'] + otherHeteroColumns].sum(axis=1)
    df['halogens'] = df[halogenColumns].sum(axis=1)

    # Compile the SMARTS query once instead of once per row.
    doubleBondPattern = Chem.MolFromSmarts('C=C')
    df['has_C=C'] = mols.apply(lambda mol: mol.HasSubstructMatch(doubleBondPattern))

    return df

In [None]:
# cal_properties adds its descriptor columns to the frame it is given and
# returns that same frame, so the results are not reassigned here. The frame
# returned by the last call is the value of this cell.
cal_properties(npRings_noStereo)
cal_properties(npRings_Stereo)

In [None]:
# Three views of each table: the first 30 rows, the first 100 rows, and the full frame.
topSizes = (30, 100)
list_np_noStereo = [npRings_noStereo.head(size) for size in topSizes] + [npRings_noStereo]
list_np_Stereo = [npRings_Stereo.head(size) for size in topSizes] + [npRings_Stereo]

In [None]:
# One entry per printed line: (column, Series comparison method, value).
# 'eq' selects rows where the column equals the value, 'ne' rows where it differs.
countSpecs = [
    ('element_except_C', 'eq', 0),
    ('element_except_C_O', 'eq', 0),
    ('O', 'ne', 0),
    ('element_except_C_N', 'eq', 0),
    ('N', 'ne', 0),
    ('S', 'ne', 0),
    ('halogens', 'ne', 0),
    ('a_aro', 'ne', 0),
    ('a_heteroaro', 'ne', 0),
    ('chiral', 'ne', 0),
    ('has_C=C', 'eq', True),
]

# For every subset, print "<matching rows in df1> / <matching rows in df2>" per spec.
for df1, df2, top in zip(list_np_noStereo, list_np_Stereo, ['top-30', 'top-100', 'all']):
    print(top)
    for column, comparison, value in countSpecs:
        hitsNoStereo = int(getattr(df1[column], comparison)(value).sum())
        hitsStereo = int(getattr(df2[column], comparison)(value).sum())
        print(hitsNoStereo, '/', hitsStereo)

In [None]:
# One entry per printed line: (column, Series comparison method, value).
# 'eq' selects rows where the column equals the value, 'ne' rows where it differs.
fractionSpecs = [
    ('element_except_C', 'eq', 0),
    ('element_except_C_O', 'eq', 0),
    ('O', 'ne', 0),
    ('element_except_C_N', 'eq', 0),
    ('N', 'ne', 0),
    ('S', 'ne', 0),
    ('halogens', 'ne', 0),
    ('a_aro', 'ne', 0),
    ('a_heteroaro', 'ne', 0),
    ('chiral', 'ne', 0),
    ('has_C=C', 'eq', True),
]

# For every subset, print the matching fraction of df1 and of df2 per spec.
# The denominator is the actual row count of each frame rather than a
# hard-coded total, so the fractions stay correct when the input files change
# or a frame holds fewer rows than the requested top-N.
for df1, df2, top in zip(list_np_noStereo, list_np_Stereo, ['top-30', 'top-100', 'all']):
    print(top)
    for column, comparison, value in fractionSpecs:
        shares = []
        for frame in (df1, df2):
            total = len(frame)
            hits = int(getattr(frame[column], comparison)(value).sum())
            # An empty frame has no meaningful fraction; show 'nan' instead of raising.
            shares.append('{:0.2f}'.format(hits / total if total else float('nan')))
        print(shares[0], '/', shares[1])

In [None]:
# cal_properties adds its descriptor columns to the frame it is given and
# returns that same frame, so the results are not reassigned here. The frame
# returned by the last call is the value of this cell.
cal_properties(scRings_noStereo)
cal_properties(scRings_Stereo)

In [None]:
# Three views of each table: the first 30 rows, the first 100 rows, and the full frame.
topSizes = (30, 100)
list_sc_noStereo = [scRings_noStereo.head(size) for size in topSizes] + [scRings_noStereo]
list_sc_Stereo = [scRings_Stereo.head(size) for size in topSizes] + [scRings_Stereo]

In [None]:
# One entry per printed line: (column, Series comparison method, value).
# 'eq' selects rows where the column equals the value, 'ne' rows where it differs.
countSpecs = [
    ('element_except_C', 'eq', 0),
    ('element_except_C_O', 'eq', 0),
    ('O', 'ne', 0),
    ('element_except_C_N', 'eq', 0),
    ('N', 'ne', 0),
    ('S', 'ne', 0),
    ('halogens', 'ne', 0),
    ('a_aro', 'ne', 0),
    ('a_heteroaro', 'ne', 0),
    ('chiral', 'ne', 0),
    ('has_C=C', 'eq', True),
]

# For every subset, print "<matching rows in df1> / <matching rows in df2>" per spec.
for df1, df2, top in zip(list_sc_noStereo, list_sc_Stereo, ['top-30', 'top-100', 'all']):
    print(top)
    for column, comparison, value in countSpecs:
        hitsNoStereo = int(getattr(df1[column], comparison)(value).sum())
        hitsStereo = int(getattr(df2[column], comparison)(value).sum())
        print(hitsNoStereo, '/', hitsStereo)

In [None]:
# One entry per printed line: (column, Series comparison method, value).
# 'eq' selects rows where the column equals the value, 'ne' rows where it differs.
fractionSpecs = [
    ('element_except_C', 'eq', 0),
    ('element_except_C_O', 'eq', 0),
    ('O', 'ne', 0),
    ('element_except_C_N', 'eq', 0),
    ('N', 'ne', 0),
    ('S', 'ne', 0),
    ('halogens', 'ne', 0),
    ('a_aro', 'ne', 0),
    ('a_heteroaro', 'ne', 0),
    ('chiral', 'ne', 0),
    ('has_C=C', 'eq', True),
]

# For every subset, print the matching fraction of df1 and of df2 per spec.
# The denominator is the actual row count of each frame rather than a
# hard-coded total, so the fractions stay correct when the input files change
# or a frame holds fewer rows than the requested top-N.
for df1, df2, top in zip(list_sc_noStereo, list_sc_Stereo, ['top-30', 'top-100', 'all']):
    print(top)
    for column, comparison, value in fractionSpecs:
        shares = []
        for frame in (df1, df2):
            total = len(frame)
            hits = int(getattr(frame[column], comparison)(value).sum())
            # An empty frame has no meaningful fraction; show 'nan' instead of raising.
            shares.append('{:0.2f}'.format(hits / total if total else float('nan')))
        print(shares[0], '/', shares[1])