In [1]:
from time import time
import pandas as pd
import numpy as np
import SARpy
import operator
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from sklearn.model_selection import train_test_split
from pprint import pprint
from rdkit.Chem import AllChem as Chem
from matplotlib import pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import scipy.stats as stats
import statsmodels.stats.multitest as smm

In [2]:
structures = pd.read_csv("./Most_No_DILIConcern_Dataset_for_SAs.csv")
structures

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PubChem_CID,Compound Name,SMILES,MW,vDILIConcern
0,0,0,34869,amineptine,O=C(O)CCCCCCNC1c2ccccc2CCc2ccccc21,337.204179,vMost-DILI-Concern
1,1,1,2717,chlormezanone,CN1C(=O)CCS(=O)(=O)C1c1ccc(Cl)cc1,273.022642,vMost-DILI-Concern
2,2,2,65679,droxicam,CN1c2c(oc(=O)n(-c3ccccn3)c2=O)-c2ccccc2S1(=O)=O,357.041941,vMost-DILI-Concern
3,3,3,65869,ebrotidine,NC(N)=Nc1nc(CSCCN=CNS(=O)(=O)c2ccc(Br)cc2)cs1,475.975849,vMost-DILI-Concern
4,4,4,4495,nimesulide,CS(=O)(=O)Nc1ccc([N+](=O)[O-])cc1Oc1ccccc1,308.046692,vMost-DILI-Concern
5,5,5,5591,troglitazone,Cc1c(C)c2c(c(C)c1O)CCC(C)(COc1ccc(CC3SC(=O)NC3...,441.160994,vMost-DILI-Concern
6,6,6,62959,trovafloxacin,NC1[C@@H]2CN(c3nc4c(cc3F)c(=O)c(C(=O)O)cn4-c3c...,416.109625,vMost-DILI-Concern
7,7,7,2081,alaproclate,CC(N)C(=O)OC(C)(C)Cc1ccc(Cl)cc1,255.102606,vMost-DILI-Concern
8,8,8,3086677,alatrofloxacin mesylate,C[C@H](N)C(=O)N[C@@H](C)C(=O)NC1[C@@H]2CN(c3nc...,558.183853,vMost-DILI-Concern
9,9,9,30951,alclofenac,C=CCOc1ccc(CC(=O)O)cc1Cl,226.039672,vMost-DILI-Concern


In [3]:
Compound_Name = structures['Compound Name'].tolist()
SMILES = structures.loc[:,['SMILES']]

In [4]:
def label (x):
    if x == 'vMost-DILI-Concern':
        return 1
    if x == 'vNo-DILI-Concern':
        return 0

In [5]:
SMILES['DILI_LABEL'] = structures['vDILIConcern'].apply(label)
SMILES.to_csv("Data_for_SARpy.csv")

In [6]:
data = pd.read_csv("Data_for_SARpy.csv")
x = data[['SMILES','DILI_LABEL']]
y = data['DILI_LABEL']
x.to_csv("Train_Data_for_SARpy.csv")

In [7]:
x.head()

Unnamed: 0,SMILES,DILI_LABEL
0,O=C(O)CCCCCCNC1c2ccccc2CCc2ccccc21,1
1,CN1C(=O)CCS(=O)(=O)C1c1ccc(Cl)cc1,1
2,CN1c2c(oc(=O)n(-c3ccccn3)c2=O)-c2ccccc2S1(=O)=O,1
3,NC(N)=Nc1nc(CSCCN=CNS(=O)(=O)c2ccc(Br)cc2)cs1,1
4,CS(=O)(=O)Nc1ccc([N+](=O)[O-])cc1Oc1ccccc1,1


In [8]:
filt1 = SARpy.Filter('DILI_LABEL',0, operator.eq)
filt2 = SARpy.Filter('DILI_LABEL',1, operator.eq)
dictionary = {'INACTIVE':filt1, 'ACTIVE':filt2}
dataset = SARpy.loadDataset("Train_Data_for_SARpy.csv",'csv',dictionary,'SMILES')



Loading dataset...

 Read 401 molecular structures
('', 174, 'ACTIVE')
('', 227, 'INACTIVE')


In [9]:
SARpy.fragmentize(dataset,2,15)
rules = SARpy.extract(dataset, minHits = 5,minLR = 1, minPrecision = 0) # 'OPTIMAL'
SARpy.saveSmarts(rules,'ruleset.txt')



Fragmenting...	

 2095	substructures found...
 4480	substructures found...
 4806	substructures found...
 3325	substructures found...
 1654	substructures found...
 664	substructures found...
 185	substructures found...
 37	substructures found...
 5	substructures found...
 0	substructures found...

FRAGMENTS: 17251

Evaluating fragments on the training set...

    -> elapsed time: 35.77 seconds
         fragmentation 33.94 seconds
              matching 1.83 seconds


Extracting rules...	

 8798 ACTIVE substructures
  233 of which are potential alerts

 11401 INACTIVE substructures
  1639 of which are potential alerts

 Extracted:
 20	ACTIVE
 27	INACTIVE

RULES: 47

 -> time: 1.86 seconds


47 RULES have been saved


In [10]:
## Way to get SARpy substructures for DILI (not for noDILI)

accuracy_list = []
sensitivity_list = []
specificity_list = []
SMARTS_list = []
LR_list = []
Target_list = []
test_comp_matched_list = []
for rule in rules:
    myrules = [rule,rule]
    SARpy.saveSmarts(myrules,'ruleset.txt')
    alerts = pd.read_csv("ruleset.txt", sep='\t')
    alerts = alerts.ix[1]
    #print(alerts)
    SMARTS = alerts['SMARTS']
    TARGET = alerts['Target']
    Target_list.append(TARGET)
#    if TARGET == 'ACTIVE':
        
    
    LR = alerts['Training LR']
    pred = SARpy.predict(myrules,dataset) #dataset_test
    if pred >= 1:
        accuracy,sensitivity,specificity = SARpy.validate(dataset) #
            #print(accuracy,sensitivity,specificity,LR, SMARTS)
        pred = (pred/167.0)*100
        test_comp_matched_list.append((pred))
        accuracy_list.append(accuracy)
        sensitivity_list.append(sensitivity)
        specificity_list.append(specificity)
        SMARTS_list.append(SMARTS)
        LR_list.append(LR)
            
            
        
    else:
        continue
        
        
df = pd.DataFrame()
df['SMARTS'] = SMARTS_list
df['TYPE'] = Target_list
df['Accuracy'] = accuracy_list
df['Sensitivity'] = sensitivity_list
df['Specificity'] = specificity_list
df['LR_training'] = LR_list
df



2 RULES have been saved


Predicting...

 23 structures matched


Validating...
lol

 Binary classification:
  INACTIVE = POSITIVE
  otherwise = NEGATIVE

 ACCURACY:	0.49
 sensitivity:	0.10
 specificity:	1.00

CONFUSION MATRIX:
YES	NO	<-any alert?
23	204	POSITIVES
0	174	NEGATIVES

 ACCURACY:	0.49
 sensitivity:	0.10
 specificity:	1.00

CONFUSION MATRIX:
YES	NO	<-any alert?
23	204	POSITIVES
0	174	NEGATIVES
  0.491271820449 = ss_accuracy


2 RULES have been saved


Predicting...

 18 structures matched


Validating...
lol

 Binary classification:
  INACTIVE = POSITIVE
  otherwise = NEGATIVE

 ACCURACY:	0.48
 sensitivity:	0.08
 specificity:	1.00

CONFUSION MATRIX:
YES	NO	<-any alert?
18	209	POSITIVES
0	174	NEGATIVES

 ACCURACY:	0.48
 sensitivity:	0.08
 specificity:	1.00

CONFUSION MATRIX:
YES	NO	<-any alert?
18	209	POSITIVES
0	174	NEGATIVES
  0.478802992519 = ss_accuracy


2 RULES have been saved


Predicting...

 10 structures matched


Validating...
lol

 Binary classification:
  ACTIV

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  





Validating...
lol

 Binary classification:
  ACTIVE = POSITIVE
  otherwise = NEGATIVE

 ACCURACY:	0.58
 sensitivity:	0.03
 specificity:	1.00

CONFUSION MATRIX:
YES	NO	<-any alert?
5	169	POSITIVES
1	226	NEGATIVES

 ACCURACY:	0.58
 sensitivity:	0.03
 specificity:	1.00

CONFUSION MATRIX:
YES	NO	<-any alert?
5	169	POSITIVES
1	226	NEGATIVES
  0.576059850374 = ss_accuracy


2 RULES have been saved


Predicting...

 15 structures matched


Validating...
lol

 Binary classification:
  INACTIVE = POSITIVE
  otherwise = NEGATIVE

 ACCURACY:	0.45
 sensitivity:	0.05
 specificity:	0.98

CONFUSION MATRIX:
YES	NO	<-any alert?
11	216	POSITIVES
4	170	NEGATIVES

 ACCURACY:	0.45
 sensitivity:	0.05
 specificity:	0.98

CONFUSION MATRIX:
YES	NO	<-any alert?
11	216	POSITIVES
4	170	NEGATIVES
  0.451371571072 = ss_accuracy


2 RULES have been saved


Predicting...

 26 structures matched


Validating...
lol

 Binary classification:
  INACTIVE = POSITIVE
  otherwise = NEGATIVE

 ACCURACY:	0.48
 sensitivity:	

Predicting...

 34 structures matched


Validating...
lol

 Binary classification:
  INACTIVE = POSITIVE
  otherwise = NEGATIVE

 ACCURACY:	0.49
 sensitivity:	0.13
 specificity:	0.97

CONFUSION MATRIX:
YES	NO	<-any alert?
29	198	POSITIVES
5	169	NEGATIVES

 ACCURACY:	0.49
 sensitivity:	0.13
 specificity:	0.97

CONFUSION MATRIX:
YES	NO	<-any alert?
29	198	POSITIVES
5	169	NEGATIVES
  0.493765586035 = ss_accuracy


2 RULES have been saved


Predicting...

 41 structures matched


Validating...
lol

 Binary classification:
  ACTIVE = POSITIVE
  otherwise = NEGATIVE

 ACCURACY:	0.55
 sensitivity:	0.10
 specificity:	0.90

CONFUSION MATRIX:
YES	NO	<-any alert?
18	156	POSITIVES
23	204	NEGATIVES

 ACCURACY:	0.55
 sensitivity:	0.10
 specificity:	0.90

CONFUSION MATRIX:
YES	NO	<-any alert?
18	156	POSITIVES
23	204	NEGATIVES
  0.5536159601 = ss_accuracy


2 RULES have been saved


Predicting...

 36 structures matched


Validating...
lol

 Binary classification:
  INACTIVE = POSITIVE
  otherwise = N

Unnamed: 0,SMARTS,TYPE,Accuracy,Sensitivity,Specificity,LR_training
0,CC(c1ccccc1)CCN(C)C,INACTIVE,0.491272,0.101322,1.0,inf
1,CC[N+](C)(C),INACTIVE,0.478803,0.079295,1.0,inf
2,c1ccco1,ACTIVE,0.591022,0.057471,1.0,inf
3,O=CNCc1ccccc1,ACTIVE,0.586035,0.045977,1.0,inf
4,N(CCN(C)CC(=O))CC,INACTIVE,0.458853,0.048458,0.994253,8.43
5,CC1CCCCN1C,INACTIVE,0.488778,0.101322,0.994253,17.63
6,NC(=O)N,ACTIVE,0.591022,0.063218,0.995595,14.35
7,CNCCc1cccc(O)c1,INACTIVE,0.476309,0.07489,1.0,inf
8,C1CCC(O)CC1O,INACTIVE,0.46384,0.052863,1.0,inf
9,Nc1c(CC(=O))cccc1,ACTIVE,0.581047,0.034483,1.0,inf


In [11]:
df['Source'] = 'SARpy'

In [12]:
# Add MOSS substructures (from KNIME workflow)
MOSS_SS_Active = pd.read_csv("./moss_results.csv")

In [13]:
MOSS_SS_Active['Fragment'].tolist()

['o1:c(:c:c:c:1)-C',
 'N(-N)-C',
 'n1:c:n:c:c:c:1-N-C',
 'o1:c2:c(:c(:c:1-C-C)-C(-c1:c:c:c(:c:c:1)-O)=O):c:c:c:c:2',
 'n1:c:n:c:c:c:1-N-c1:c:c:c:c:c:1',
 'N1(-C-C-N-C-C-1)-c1:c:c:c:c:c:1',
 'O=C-c1:c:c:c(:c:c:1)-C',
 'F-c1:c(:c:c:c:c:1)-N',
 'F-c1:c:c(:c:c:c:1)-F',
 'N-c1:c:c:c(:c:c:1)-N',
 'n1:c(:n:c:c:c:1)-N-C',
 'N1-C(-N-C-C-1=O)=O',
 'N-C1-C-C-1',
 'N(-N)-C-C',
 'N(-N)=C',
 'O1-C(-C-C-C-C-C-C-C(-C-C-C-C(-C-1)-C)-C)=O',
 'n1(:c(:n:c:c:c:1)=O)-C1-O-C(-C-C-1)-C-O',
 'N1(-C-C-N(-C-C-1)-C)-c1:c:c:c:c:c:1',
 'S(-N-C)(-c1:c:c:c(:c:c:1)-N)(=O)=O',
 'N(-C(-C)=O)-C-c1:c:c:c:c:c:1',
 'N(-O)(-c1:c:c:c(:c:c:1)-N)=O',
 'O(-c1:c(:c:c:c(:c:1)-C)-C)-C',
 'n1:c:c(:c(:c:c:1)=O)-C(-O)=O',
 'Cl-c1:c(:c:c:c:c:1)-O-C',
 'Cl-c1:c(:c:c:c:c:1)-N',
 'Cl-c1:c:c(:c:c:c:1)-N',
 'F-c1:c:c:c(:c:c:1)-N',
 'n1(:n:c:n:c:1)-C-C-C',
 'n1:c:c:c(:c:c:1)-C=O',
 'n1(:c:c(:c:c:1)-C)-C',
 'n1:c(:c(:c:c:1)-C)-C',
 'n1:c(:c:c:c:c:1)-N',
 'o1:c(:c:c:c:1-C)-C',
 'O(-C(-C-C)-C)-C',
 'N(-C1-C(-C(-O-C(-C-1)-C)-O-C1-C(-C-C(-C(-C(-C

In [14]:

MOSS_SS_Active = MOSS_SS_Active['Fragment'].tolist()

MOSS_SS_Active = dict(zip(MOSS_SS_Active, ['ACTIVE'] * len(MOSS_SS_Active)))


MOSS_SS_Active = pd.DataFrame(MOSS_SS_Active.items(), columns=['SMARTS', 'TYPE'])


In [15]:
#MOSS_SS = pd.concat([MOSS_SS_Active,MOSS_SS_Inactive])
MOSS_SS = MOSS_SS_Active
MOSS_SS['Source'] = 'MOSS'
df = pd.concat([MOSS_SS,df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [16]:
df

Unnamed: 0,Accuracy,LR_training,SMARTS,Sensitivity,Source,Specificity,TYPE
0,,,N(-C-C(-O)-c1:c:c:c:c:c:1)-C-C-C-C,,MOSS,,ACTIVE
1,,,o1:c(:c:c:c:1)-C,,MOSS,,ACTIVE
2,,,N-C1-C-C-1,,MOSS,,ACTIVE
3,,,O(-c1:c:c:c(:c:c:1)-C=C)-C,,MOSS,,ACTIVE
4,,,O(-c1:c:c(:c:c:c:1)-C(-C)-C)-C,,MOSS,,ACTIVE
5,,,N(-C1-C(-C(-O-C(-C-1)-C)-O-C1-C(-C-C(-C(-C(-C-...,,MOSS,,ACTIVE
6,,,O(-c1:c:c:c(:c:c:1)-C(-c1:c:c:c:c:c:1)=C)-C-C,,MOSS,,ACTIVE
7,,,S(-c1:c:c:c:c:c:1)-C-C-C,,MOSS,,ACTIVE
8,,,n1:c(:c(:c:c:1)-C=O)-C,,MOSS,,ACTIVE
9,,,N(-N)=C,,MOSS,,ACTIVE


In [17]:
# Add literature SMARTS

Literature_SMARTS = pd.read_csv("./Literature_SMARTS.csv")

In [18]:
Literature_SMARTS['Source'] = 'Hewitt et al. (2013)'
Literature_SMARTS['Source'][16:] = "Liu et al. (2015)"
Literature_SMARTS

Unnamed: 0,SMARTS,TYPE,Paper,Source
0,C(=C(c1ccccc1)c2ccc(cc2)OCCN(C)C)c3ccccc3,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875763,Hewitt et al. (2013)
1,c1nc2c(n1)c(ncn2)[NH2],ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875764,Hewitt et al. (2013)
2,O=CC=CC=CC=C,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875765,Hewitt et al. (2013)
3,C1C(=O)NC1,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875766,Hewitt et al. (2013)
4,C1C(=O)NC(=O)NC1(=O),ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875767,Hewitt et al. (2013)
5,c1ccc2c(c1)N(CCCN(C)C)c3ccccc3S2,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875768,Hewitt et al. (2013)
6,c1(N)nc(=O)ncc1,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875769,Hewitt et al. (2013)
7,c1([OH])c([OH])cccc1,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875770,Hewitt et al. (2013)
8,O=CC[NH]CC(=O)N(C)C,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875771,Hewitt et al. (2013)
9,C12CCC3C(C1CCC2)CCc4c3ccc(c4)O,ACTIVE,https://www.ncbi.nlm.nih.gov/pubmed/23875772,Hewitt et al. (2013)


In [19]:
df = pd.concat([Literature_SMARTS,df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [20]:
df.reset_index(inplace=True)

In [21]:
x

Unnamed: 0,SMILES,DILI_LABEL
0,O=C(O)CCCCCCNC1c2ccccc2CCc2ccccc21,1
1,CN1C(=O)CCS(=O)(=O)C1c1ccc(Cl)cc1,1
2,CN1c2c(oc(=O)n(-c3ccccn3)c2=O)-c2ccccc2S1(=O)=O,1
3,NC(N)=Nc1nc(CSCCN=CNS(=O)(=O)c2ccc(Br)cc2)cs1,1
4,CS(=O)(=O)Nc1ccc([N+](=O)[O-])cc1Oc1ccccc1,1
5,Cc1c(C)c2c(c(C)c1O)CCC(C)(COc1ccc(CC3SC(=O)NC3...,1
6,NC1[C@@H]2CN(c3nc4c(cc3F)c(=O)c(C(=O)O)cn4-c3c...,1
7,CC(N)C(=O)OC(C)(C)Cc1ccc(Cl)cc1,1
8,C[C@H](N)C(=O)N[C@@H](C)C(=O)NC1[C@@H]2CN(c3nc...,1
9,C=CCOc1ccc(CC(=O)O)cc1Cl,1


In [22]:
## Tabulate results (throughout this script PPV is used as a synomym for precision)



df2 = pd.DataFrame(index = x['SMILES'])
df2['DILI_LABEL'] = x['DILI_LABEL'].to_list()
df2['Compound_Name'] = Compound_Name

substructure_stats = pd.DataFrame()

#generate FPs
smis = list(df2.index)

mols = [Chem.MolFromSmiles(smile) for smile in smis]

fps_bit = [list(Chem.GetMorganFingerprintAsBitVect(mol,2, nBits=2048)) for mol in mols]


h = 0 # count TPs
l = 0 # count FPs
smart_list = []
PPV_list = []
percent_hits = []
labels = []
sources = []
odds_ratio_list = []
p_value_list = []
perc_hit_list = []
type_list = []
DB_count_list = []
for index, row in df.iterrows():
    i=0
    j=0
    k=0
    z=0
    t=0
    c=0
    v=0
    DB_count = 0
    hit_list = []
    smart = row['SMARTS']
    TYPE = row['TYPE']
    source = row['Source']
    
 
    if TYPE == 'ACTIVE':
#        for DB in Drugbank_comps:
#            m = Chem.MolFromSmiles(DB)
#            patt = Chem.MolFromSmarts(smart)
#            if m.HasSubstructMatch(patt):
#                DB_count+=1
#            else:
#                continue
#        DB_count_list.append(DB_count)
    
    
        for index, row in x.iterrows():
            label = row['DILI_LABEL']
            a = row['SMILES']   
            m = Chem.MolFromSmiles(a)
            patt = Chem.MolFromSmarts(smart)
            if m.HasSubstructMatch(patt) is True:
                i+=1
            if m.HasSubstructMatch(patt) is True and label == 1: # True positive
                #print('TP')
                j+=1
            if m.HasSubstructMatch(patt) is True and label == 0: # False positive
                #print('FP')
                k+=1
            if m.HasSubstructMatch(patt) is True:
                hit_list.append(1)
            if m.HasSubstructMatch(patt) is False:
                hit_list.append('0')
                
                
            if m.HasSubstructMatch(patt) is True and label == 1: # compound is DILI and has SS == TP
                z+=1
            if m.HasSubstructMatch(patt) is False and label == 1: # compound is DILI and does not have SS == FN
                t+=1
            if m.HasSubstructMatch(patt) is True and label == 0: # compound is noDILI and has SS == FP
                c+=1
            if m.HasSubstructMatch(patt) is False and label == 0: # compound is noDILI and does not have SS == TN
                v+=1    
                
        #if source == 'Hewitt et al. (2013)':
           # sources.append(0)
        if source == 'Liu et al. (2015)':
            sources.append(1)
        elif source == 'SARpy':
            sources.append(2)
        elif source == 'MOSS':
            sources.append(3)
        else:
            continue
                
            
        #print(j)
        #print(len(x.loc[x['DILI_LABEL']==1]))
        perc_hits = (j*1.0/len(x.loc[x['DILI_LABEL']==1]))*100.0
        percent_hits.append(perc_hits)
        labels.append(0)

    
        if ((j+k) == 0):
            PPV = 0
        else:

            PPV = (j/(j+k*1.0))
            print('{0:.3f}'.format(PPV), smart, perc_hits) # positive predictive value (most important for substructures to classify DILI)


        #PPV_other = z/(z+c)
        print(PPV,j,z,k,c)

        PPV_list.append(PPV)

        # Fisher one-sided test

        oddsratio, pvalue = stats.fisher_exact([[z, c], [t, v]], alternative='greater')

        odds_ratio_list.append(oddsratio)
        p_value_list.append(pvalue)
        perc_hit_list.append(perc_hits)
        type_list.append(TYPE)
        smart_list.append(smart)


        if PPV >= 0.0: # filter for PPV

            #df2[smart+'_'+TYPE] = hit_list
            print(TYPE,i,j,k,pvalue,oddsratio,source,len(x['SMILES']),len(x.loc[x['DILI_LABEL']==1]),len(x.loc[x['DILI_LABEL']==0]))
            #df2[smart] = hit_list

            h+=j
            l+=k
            #PPV_list[smart] = PPV
        else:
            continue
        
substructure_stats = pd.DataFrame(index = smart_list)
substructure_stats['PPV'] = PPV_list
substructure_stats['Source'] = sources
substructure_stats['odds_ratio'] = odds_ratio_list
substructure_stats['p_value'] = p_value_list
substructure_stats['perc_hits'] = perc_hit_list
substructure_stats['type'] = type_list


print(h,l) # number of TPs and FPs

('0.353', 'C12CCCCC1C3C(CCC3)CC2', 3.4482758620689653)
(0.35294117647058826, 6, 6, 11, 11)
('ACTIVE', 17, 6, 11, 0.8254202650543838, 0.7012987012987013, 'Liu et al. (2015)', 401, 174, 227)
('1.000', 'NN', 7.471264367816093)
(1.0, 13, 13, 0, 0)
('ACTIVE', 13, 13, 0, 1.4863150037132446e-05, inf, 'Liu et al. (2015)', 401, 174, 227)
('0.917', 'a[C!R]C(=O)[OH]', 6.321839080459771)
(0.9166666666666666, 11, 11, 1, 1)
('ACTIVE', 12, 11, 1, 0.0006321274182821177, 15.251533742331288, 'Liu et al. (2015)', 401, 174, 227)
('0.706', '[#6]S(=O)(=O)N[#6]', 6.896551724137931)
(0.7058823529411765, 12, 12, 5, 5)
('ACTIVE', 17, 12, 5, 0.01974107969931541, 3.2888888888888888, 'Liu et al. (2015)', 401, 174, 227)
('0.857', 'c1ccccc1[NH2]', 3.4482758620689653)
(0.8571428571428571, 6, 6, 1, 1)
('ACTIVE', 7, 6, 1, 0.028254275541412608, 8.071428571428571, 'Liu et al. (2015)', 401, 174, 227)
('0.431', 'O\xe2\x80\x89=\xe2\x80\x89[S;X3]', 89.08045977011494)
(0.4305555555555556, 155, 155, 205, 205)
('ACTIVE', 360, 1

('1.000', 'O(-c1:c:c:c:c:c:c:c:c:c:1)-C', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('ACTIVE', 2, 2, 0, 0.18766832917695445, inf, 'MOSS', 401, 174, 227)
('1.000', 'N(-N)-C(-C)-C', 1.7241379310344827)
(1.0, 3, 3, 0, 0)
('ACTIVE', 3, 3, 0, 0.08089963062270986, inf, 'MOSS', 401, 174, 227)
('0.182', 'N(-C-C1-C-C-C-C-C-1)-C-C', 1.1494252873563218)
(0.18181818181818182, 2, 2, 9, 9)
('ACTIVE', 11, 2, 9, 0.9831334224951945, 0.28165374677002586, 'MOSS', 401, 174, 227)
('1.000', 'n1:c(:n:c:c:c:1)-N-c1:c:c:c:c:c:1', 1.7241379310344827)
(1.0, 3, 3, 0, 0)
('ACTIVE', 3, 3, 0, 0.08089963062270986, inf, 'MOSS', 401, 174, 227)
('1.000', 'N(-C(-N)=O)-C(-C)-C', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('ACTIVE', 2, 2, 0, 0.18766832917695445, inf, 'MOSS', 401, 174, 227)
('1.000', 'n1:c(:c2:c(:c:c:1):c:c:c:c:2)-C', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('ACTIVE', 2, 2, 0, 0.18766832917695445, inf, 'MOSS', 401, 174, 227)
('1.000', 'F-C(-F)(-F)-c1:c(:c:c:c(:c:1)-N)-N(-O)=O', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('

('1.000', 'O-c1:c(:c:c:c:c:1)-C(-O)=O', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('ACTIVE', 2, 2, 0, 0.18766832917695445, inf, 'MOSS', 401, 174, 227)
('1.000', 'n1(:n:c:n:c:1)-C-C-C', 2.2988505747126435)
(1.0, 4, 4, 0, 0)
('ACTIVE', 4, 4, 0, 0.03475838401124628, inf, 'MOSS', 401, 174, 227)
('1.000', 'N(-O)=C(-N)-c1:c:c:c(:c:c:1)-C', 0.5747126436781609)
(1.0, 1, 1, 0, 0)
('ACTIVE', 1, 1, 0, 0.43391521196993144, inf, 'MOSS', 401, 174, 227)
('1.000', 'F-c1:c(:c:c:c(:c:1)-F)-C(-O)(-C-n1:n:c:n:c:1)-C', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('ACTIVE', 2, 2, 0, 0.18766832917695445, inf, 'MOSS', 401, 174, 227)
('1.000', 'S(-N-C)(-c1:c:c:c(:c:c:1)-N)(=O)=O', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('ACTIVE', 2, 2, 0, 0.18766832917695445, inf, 'MOSS', 401, 174, 227)
('1.000', 'F-c1:c(:c:c2:n:c:c(:c(:c:2:c:1)=O)-C(-O)=O)-N1-C-C-N-C-C-1', 1.1494252873563218)
(1.0, 2, 2, 0, 0)
('ACTIVE', 2, 2, 0, 0.18766832917695445, inf, 'MOSS', 401, 174, 227)
('1.000', 'O-C(-C=C(-C=C-C=C(-C=C-C)-C)-C)=O', 0.5747126436

('ACTIVE', 12, 11, 1, 0.0006321274182821177, 15.251533742331288, 'SARpy', 401, 174, 227)
('1.000', 'Nc1c(CC(=O))cccc1', 3.4482758620689653)
(1.0, 6, 6, 0, 0)
('ACTIVE', 6, 6, 0, 0.006351985679484426, inf, 'SARpy', 401, 174, 227)
('1.000', 'Nc4cccc(Cl)c4', 4.022988505747127)
(1.0, 7, 7, 0, 0)
('ACTIVE', 7, 7, 0, 0.002701604035832382, inf, 'SARpy', 401, 174, 227)
('0.846', 'C(=O)c1ccc(O)cc1', 6.321839080459771)
(0.8461538461538461, 11, 11, 2, 2)
('ACTIVE', 13, 11, 2, 0.002537967580802621, 7.5920245398773005, 'SARpy', 401, 174, 227)
('1.000', 'c3ccc(F)cc3F', 2.8735632183908044)
(1.0, 5, 5, 0, 0)
('ACTIVE', 5, 5, 0, 0.014883942775598891, inf, 'SARpy', 401, 174, 227)
('0.833', 'c1ccc(SC)cc1', 2.8735632183908044)
(0.8333333333333334, 5, 5, 1, 1)
('ACTIVE', 6, 5, 1, 0.05754372825614781, 6.686390532544379, 'SARpy', 401, 174, 227)
('0.625', 'OC(=O)C(CC(C)C)', 2.8735632183908044)
(0.625, 5, 5, 3, 3)
('ACTIVE', 8, 5, 3, 0.22833692016517843, 2.2090729783037477, 'SARpy', 401, 174, 227)
('0.818', 'N

In [23]:
print(len(smart_list),len(PPV_list),len(sources))

(233, 233, 233)


In [24]:
substructure_stats.loc[substructure_stats['Source']==3]['PPV'].describe()

count    201.000000
mean       0.855948
std        0.281823
min        0.000000
25%        0.833333
50%        1.000000
75%        1.000000
max        1.000000
Name: PPV, dtype: float64

In [25]:
len(type_list)

233

In [26]:
rej, pval_corr = smm.multipletests(p_value_list, method='fdr_bh')[:2]
substructure_stats['BH_Adjusted_p_value'] = list(pval_corr)

In [27]:
substructure_stats

Unnamed: 0,PPV,Source,odds_ratio,p_value,perc_hits,type,BH_Adjusted_p_value
C12CCCCC1C3C(CCC3)CC2,0.352941,1,0.701299,0.825420,3.448276,ACTIVE,0.898705
NN,1.000000,1,inf,0.000015,7.471264,ACTIVE,0.003463
a[C!R]C(=O)[OH],0.916667,1,15.251534,0.000632,6.321839,ACTIVE,0.018411
[#6]S(=O)(=O)N[#6],0.705882,1,3.288889,0.019741,6.896552,ACTIVE,0.170358
c1ccccc1[NH2],0.857143,1,8.071429,0.028254,3.448276,ACTIVE,0.218884
O = [S;X3],0.430556,1,0.875481,0.716293,89.080460,ACTIVE,0.798546
[S;X2&!R],0.500000,1,1.315476,0.426868,3.448276,ACTIVE,0.515828
a[C!R](=O)a,0.916667,1,15.251534,0.000632,6.321839,ACTIVE,0.018411
"C[F,Cl,Br,I]",0.565217,1,1.752174,0.137635,7.471264,ACTIVE,0.275011
C1CC1N,0.800000,1,5.317647,0.114256,2.298851,ACTIVE,0.275011


In [28]:
substructure_stats.to_csv("./Structural_alerts.csv")