# 9. Analysis of the enumeration results on the DrugBank molecules

In this notebook we analyse the enumerated molecules on the drug molecules from the DrugBank dataset.

In [1]:
import ast
import math
import pandas as pd
from itertools import chain

from rdkit import Chem
from rdkit.Chem import AllChem
fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=2048, includeChirality=True)

from molsig.enumerate_signature import save_mol_plot

# Results path

In [None]:
path_results = "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/results/"

# Load drugbank enumeration results

In [2]:
df_full = pd.ExcelFile(path_results + "RevSig_drugs.xlsx")
df = pd.read_excel(df_full, "signature")
df2 = pd.read_excel(df_full, "xref")

# Suppress duplicates

In [None]:
df_no_duplicates = df.drop_duplicates(subset="smi", keep="first")
print(df.shape, df_no_duplicates.shape)

In [None]:
with pd.ExcelWriter(path + 'RevSig_drugs_screened.xlsx', mode='a', if_sheet_exists='replace') as writer:
    df1_no_duplicates.to_excel(writer, sheet_name='signature', index=False)
    df2.to_excel(writer, sheet_name='xref', index=False)  # Optional, to keep the second sheet intact

In [None]:
print(df2.shape)
df2.drop_duplicates(inplace=True)
print(df2.shape)

In [None]:
len(list(df2["cid"]))

In [None]:
PCid_list_full = []
for i in range(df.shape[0]):
    PCid = df["PubchemCid"][i]
    PCid_list = ast.literal_eval(PCid)
    PCid_list_flat = list(chain.from_iterable(PCid_list))
    PCid_list_full = PCid_list_full + PCid_list_flat

In [None]:
for x in PCid_list_full:
    if PCid_list_full.count(x) == 2:
        print(x)

In [None]:
len(PCid_list_full), len(list(set(PCid_list_full)))

# Statistics

In [3]:
df.shape

(9516, 19)

In [4]:
nb_new_mols = 0
nb_input_ecfp_producing_new_mols = 0
nb_input_ecfp_producing_no_new_mols = 0
for i in range(df.shape[0]):
    new_smiles = list(df["NewSmiles"])[i]
    new_smiles_list = ast.literal_eval(new_smiles)
    nb_new = len(new_smiles_list)
    nb_new_mols += nb_new
    if len(new_smiles_list) > 0:
        nb_input_ecfp_producing_new_mols += 1
    else:
        nb_input_ecfp_producing_no_new_mols += 1
print(f"Nb new mol: {nb_new_mols}. Nb ecfp producing new mol: {nb_input_ecfp_producing_new_mols}. Nb ecfp not producing new mol: {nb_input_ecfp_producing_no_new_mols}")

Nb new mol: 3691. Nb ecfp producing new mol: 1313. Nb ecfp not producing new mol: 8203


In [5]:
nb_new_mol_not_ref_in_pubchem = 0
nb_new_mol_ref_in_pubchem = 0
for i in range(df.shape[0]):
#    PCid = df["PubchemCidFilter"][i]
    PCid = df["PubchemCid"][i]
    PCid_list = ast.literal_eval(PCid)
    nb_new_mol_not_ref_in_pubchem += PCid_list.count([])
    nb_new_mol_ref_in_pubchem += len(PCid_list) - PCid_list.count([])

    #print(i, PCid_list, PCid_list.count([]), len(PCid_list) - PCid_list.count([]))

print(f"Nb new mol ref in pubchem: {nb_new_mol_ref_in_pubchem}. Nb new mol not ref in pubchem: {nb_new_mol_not_ref_in_pubchem}.")
print(100 * nb_new_mol_ref_in_pubchem / nb_new_mols, "%")

Nb new mol ref in pubchem: 778. Nb new mol not ref in pubchem: 2913.
21.078298564074778 %


- patents

In [6]:
def check_if_id_has_patent(pcid):
    nb_patents = sum(list(df2[df2["cid"] == pcid]["patent_manual"]))
    return nb_patents > 0

def check_if_id_has_assay(pcid):
    nb_patents = sum(list(df2[df2["cid"] == pcid]["assay_count"]))
    return nb_patents > 0

def check_if_id_has_positive_assay(pcid):
    nb_patents = sum(list(df2[df2["cid"] == pcid]["assay_count_active"]))
    return nb_patents > 0

In [7]:
patents_list_full = []
assays_list_full = []
pos_assays_list_full = []
for i in range(df.shape[0]):
#    PCid = df["PubchemCidFilter"][i]
    PCid = df["PubchemCid"][i]
    PCid_list = ast.literal_eval(PCid)
    #print(i, PCid_list)
    patents_list = []
    assays_list = []
    pos_assays_list = []
    for mols in PCid_list:
        #print("mols", mols)
        has_patent_or_not = 0
        has_assay_or_not = 0
        has_pos_assay_or_not = 0
        if mols != []:
            for pcid in mols:
                if check_if_id_has_patent(pcid):
                    has_patent_or_not = 1
                if check_if_id_has_assay(pcid):
                    has_assay_or_not = 1
                if check_if_id_has_positive_assay(pcid):
                    has_pos_assay_or_not = 1
        patents_list.append(has_patent_or_not)
        assays_list.append(has_assay_or_not)
        pos_assays_list.append(has_pos_assay_or_not)
    patents_list_full = patents_list_full + patents_list
    assays_list_full = assays_list_full + assays_list
    pos_assays_list_full = pos_assays_list_full + pos_assays_list

print(f"Nb patented: {sum(patents_list_full)} out of {len(patents_list_full)}")
print(f"Nb assays: {sum(assays_list_full)} out of {len(assays_list_full)}")
print(f"Nb positive assays: {sum(pos_assays_list_full)} out of {len(pos_assays_list_full)}")

Nb patented: 469 out of 3691
Nb assays: 217 out of 3691
Nb positive assays: 170 out of 3691


- new molecules enumerated with new sigs has intermediate

In [10]:
def has_single_atom(smiles: str) -> bool:
    mol = Chem.MolFromSmiles(smiles)
    return mol.GetNumAtoms() == 1

nb_new_mol_through_intermediate_new_sigs = 0
nb_new_mol_through_intermediate_new_sigs_not_single_atomed = 0

nb_new_mol_through_no_intermediate_new_sigs = 0

for i in range(df.shape[0]):
    smi_initial = list(df["smi"])[i]
    name = list(df["Name"])[i]
    new_smiles = list(df["NewSmiles"])[i]
    new_smiles_list = ast.literal_eval(new_smiles)
    if new_smiles_list != []:
        if list(df["NsigTrue"])[i] > 1:
            nb_new_mol_through_intermediate_new_sigs += 1
            if has_single_atom(smi_initial) == False:
                nb_new_mol_through_intermediate_new_sigs_not_single_atomed += 1
                print(i, name, new_smiles_list, list(df["NsigTrue"])[i], list(df["DRUGBANK_ID"])[i])
        else:
            nb_new_mol_through_no_intermediate_new_sigs += 1

print(f"Nb new mol through intermediate new sigs: {nb_new_mol_through_intermediate_new_sigs}")
print(f"Nb new mol through intermediate new sigs not single atomed: {nb_new_mol_through_intermediate_new_sigs_not_single_atomed}")
print(f"Nb new mol through no intermediate new sigs: {nb_new_mol_through_no_intermediate_new_sigs}")

3693 Estrone {'C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]1[C@@H]3CCC2=O'} 2 DB00655
5351 Alcaftadine {'CN1CCC2=C(c3ccccc3CC1)c1ncc(C=O)n1CC2'} 2 DB06766
5836 Estrone sulfate {'C[C@]12CC[C@@H]3c4ccc(OS(=O)(=O)O)cc4CC[C@H]1[C@@H]3CCC2=O'} 2 DB04574
7481 Paxalisib {'CC1(C)OCCN2CCOCCn3c1nc1c2nc(-c2cnc(N)nc2)nc13'} 2 DB15186
7920 Emicerfont {'COc1ccc(N2CCNC(=O)N3CCc4c(cc(C)nc42)-n2ccc3n2)c(C)c1'} 2 DB12910
Nb new mol through intermediate new sigs: 19
Nb new mol through intermediate new sigs not single atomed: 5
Nb new mol through no intermediate new sigs: 9497


- plot molecules with new sigs

In [14]:
df[df["NsigTrue"] > 1 ]

Unnamed: 0,ID,smi,Name,wt,Nsig,NsigTrue,FoundSig,Nmol,Foundmol,CT ecfp_sig,CT sig_mol,CT ecfp_mol,CT solve_partitions,ThresholdPart,ThresholdRec,NewSmiles,NewSmilesList,NewInchi,PubchemCid
1,1,[Cu],Copper,62.929597,2,2,1,2,1,4.33657,0.001355,4.337929,0.0,False,False,{'[Ru+4]'},['[Ru+4]'],['InChI=1S/Ru/q+4'],[[5461100]]
4,4,[Gd],Gadolinium,157.924104,2,2,1,2,1,4.370022,0.001389,4.371415,0.0,False,False,{'[NaH2-]'},['[NaH2-]'],['InChI=1S/Na.2H/q-1;;'],[[]]
5,5,[InH3],Indium In-111,117.927353,2,2,1,2,1,4.357245,0.001381,4.358631,0.0,False,False,{'[U+5]'},['[U+5]'],['InChI=1S/U/q+5'],[[]]
17,17,[Tc],Technetium Tc-99m,96.906365,2,2,1,2,1,4.453004,0.001452,4.45446,0.0,False,False,{'[Rb+]'},['[Rb+]'],['InChI=1S/Rb/q+1'],[[105153]]
35,35,[OH-],Hydroxide ion,17.003288,2,2,1,2,1,4.43446,0.001404,4.435869,0.0,False,False,{'[AsH2-]'},['[AsH2-]'],['InChI=1S/AsH2/h1H2/q-1'],[[]]
42,42,[Ca],Calcium,39.962591,2,2,1,2,1,4.426381,0.001928,4.428313,0.0,False,False,{'[Mo+2]'},['[Mo+2]'],['InChI=1S/Mo/q+2'],[[185498]]
48,48,[Cu+2],Cupric cation,62.9285,2,2,1,2,1,4.353535,0.001386,4.354926,0.0,False,False,{'[Ru+2]'},['[Ru+2]'],['InChI=1S/Ru/q+2'],[[3792939]]
49,49,[Cr+3],Chromic cation,51.938862,2,2,1,2,1,4.314448,0.001404,4.315856,0.0,False,False,{'[FH2+]'},['[FH2+]'],['InChI=1S/FH2/h1H2/q+1'],[[]]
51,51,[Si],Silicon,27.976927,3,3,1,3,1,4.312748,0.002046,4.314799,0.0,False,False,"{'[SiH4]', '[O+6]'}","['[SiH4]', '[O+6]']","['InChI=1S/H4Si/h1H4', 'InChI=1S/O/q+6']","[[23953], []]"
52,52,[Ag],Silver,106.905097,2,2,1,2,1,4.410785,0.001714,4.412504,0.0,False,False,{'[No]'},['[No]'],['InChI=1S/No'],[[24822]]


# Plots

In [35]:
def get_patent_id(pcid):
    nb_patents = sum(list(df2[df2["cid"] == pcid]["patent_manual"]))
    return nb_patents

def get_bioassay_id(pcid):
    nb_assays = sum(list(df2[df2["cid"] == pcid]["assay_count"]))
    return nb_assays

def get_positive_bioassay_id(pcid):
    nb_positive_assays = sum(list(df2[df2["cid"] == pcid]["assay_count_active"]))
    return nb_positive_assays

In [47]:
## paper
input_drug_id = "DB15026" #CXA-10
#input_drug_id = "DB01083" #orlistat

In [55]:
## supplementary
#fig1
#input_drug_id = "DB01204" #Mitoxantrone
#input_drug_id = "DB14104" #Linoleic acid
#input_drug_id = "DB09298" #Silybin

#fig2
#input_drug_id = "DB12174"
#input_drug_id = "DB00942"
#input_drug_id = "DB00580"
#input_drug_id = "DB05316"

In [56]:
smi_init = list(df[df["DRUGBANK_ID"] == input_drug_id]["smi"])[0]
input_drug_name = list(df[df["DRUGBANK_ID"] == input_drug_id]["Name"])[0]
new_smis = list(ast.literal_eval(list(df[df["DRUGBANK_ID"] == input_drug_id]["NewSmilesList"])[0]))
cids = ast.literal_eval(list(df[df["DRUGBANK_ID"] == input_drug_id]["PubchemCid"])[0])

for i in range(len(new_smis)):
    smi = new_smis[i]
    cid = cids[i]
    print(cid)
    if len(cid) == 0:
        file_name = input_drug_name + "_new_enum_" + str(i + 1) + "_unref"
    else:
        pcid = cid[0]
        nb_patents = get_patent_id(pcid)
        nb_assays = get_bioassay_id(pcid)
        nb_positive_assays = get_positive_bioassay_id(pcid)
        file_name = input_drug_name + "_new_enum_" + str(i + 1) + "_ref_" + str(pcid) + "_patents_" + str(nb_patents) + "_bioassays_" + str(nb_assays) + "_posassays_" + str(nb_positive_assays)
    print(file_name)
    mol = Chem.MolFromSmiles(smi)
    #save_mol_plot(mol, file_name)

[69466671]
Pimavanserin_new_enum_1_ref_69466671_patents_6_bioassays_0_posassays_0
<?xml version='1.0' encoding='iso-8859-1'?>
<svg version='1.1' baseProfile='full'
              xmlns='http://www.w3.org/2000/svg'
                      xmlns:rdkit='http://www.rdkit.org/xml'
                      xmlns:xlink='http://www.w3.org/1999/xlink'
                  xml:space='preserve'
width='300px' height='300px' viewBox='0 0 300 300'>
<!-- END OF HEADER -->
<path class='bond-0 atom-0 atom-1' d='M 15.0,166.7 L 36.1,176.3' style='fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1' />
<path class='bond-1 atom-1 atom-2' d='M 36.1,176.3 L 38.4,199.4' style='fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1' />
<path class='bond-2 atom-1 atom-3' d='M 36.1,176.3 L 55.0,162.8' style='fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt

# Molecules enumerating new molecular signatures

In [5]:
df_new_sig = df[df["NsigTrue"] > 1]

def count_atoms(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0
    return mol.GetNumAtoms()
df_new_sig = df_new_sig[df_new_sig['smi'].apply(lambda x: count_atoms(x) > 1)]

df_new_sig

Unnamed: 0,ID,DRUGBANK_ID,smi,Name,wt,Nsig,NsigTrue,FoundSig,Nmol,Foundmol,CT ecfp_sig,CT sig_mol,CT ecfp_mol,CT solve_partitions,ThresholdPart,ThresholdRec,NewSmiles,NewSmilesList,NewInchi,PubchemCid
3693,3693,DB00655,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CCC2=O,Estrone,270.16198,2,2,1,2,1,11.739378,0.129637,11.869092,0.013598,False,False,{'C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]1[C@@H]3CC...,['C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]1[C@@H]3CC...,['InChI=1S/C18H22O2/c1-18-9-8-14-13-4-3-12(19)...,[[]]
5351,5351,DB06766,CN1CCC(=C2c3ccccc3CCn3c(C=O)cnc32)CC1,Alcaftadine,307.168462,2,2,1,2,1,9.137461,0.140328,9.277812,0.034125,False,False,{'CN1CCC2=C(c3ccccc3CC1)c1ncc(C=O)n1CC2'},['CN1CCC2=C(c3ccccc3CC1)c1ncc(C=O)n1CC2'],['InChI=1S/C19H21N3O/c1-21-9-6-14-4-2-3-5-17(1...,[[]]
5836,5836,DB04574,C[C@]12CC[C@@H]3c4ccc(OS(=O)(=O)O)cc4CC[C@H]3[...,Estrone sulfate,350.118795,2,2,1,2,1,18.128103,0.158547,18.286711,0.015228,False,False,{'C[C@]12CC[C@@H]3c4ccc(OS(=O)(=O)O)cc4CC[C@H]...,['C[C@]12CC[C@@H]3c4ccc(OS(=O)(=O)O)cc4CC[C@H]...,['InChI=1S/C18H22O5S/c1-18-9-8-14-13-4-3-12(23...,[[]]
7481,7481,DB15186,CC1(C)OCCn2c1nc1c(N3CCOCC3)nc(-c3cnc(N)nc3)nc12,Paxalisib,382.186572,2,2,1,2,1,23.138091,1.068226,24.206429,0.042745,False,False,{'CC1(C)OCCN2CCOCCn3c1nc1c2nc(-c2cnc(N)nc2)nc13'},['CC1(C)OCCN2CCOCCn3c1nc1c2nc(-c2cnc(N)nc2)nc13'],['InChI=1S/C18H22N8O2/c1-18(2)16-22-12-14-23-1...,[[]]
7920,7920,DB12910,COc1ccc(N2CCc3c(-n4ccc(N5CCNC5=O)n4)cc(C)nc32)...,Emicerfont,404.196074,2,2,1,2,1,18.077431,0.252096,18.329646,0.263739,False,False,{'COc1ccc(N2CCNC(=O)N3CCc4c(cc(C)nc42)-n2ccc3n...,['COc1ccc(N2CCNC(=O)N3CCc4c(cc(C)nc42)-n2ccc3n...,['InChI=1S/C22H24N6O2/c1-14-12-16(30-3)4-5-18(...,[[]]


In [10]:
for i in range(df_new_sig.shape[0]):
    smi = list(df_new_sig["smi"])[i]
    new_smiles_list = ast.literal_eval(list(df_new_sig["NewSmilesList"])[i])
    new_smi = new_smiles_list[0]
    drug_id = list(df_new_sig["DRUGBANK_ID"])[i]
    print(smi, new_smis)

    file_name = drug_id
    mol = Chem.MolFromSmiles(smi)
    #save_mol_plot(mol, file_name)
    
    file_name = drug_id + "_new_smi"
    mol = Chem.MolFromSmiles(new_smi)
    #save_mol_plot(mol, file_name)

C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CCC2=O ['COc1ccc(N2CCNC(=O)N3CCc4c(cc(C)nc42)-n2ccc3n2)c(C)c1']
<?xml version='1.0' encoding='iso-8859-1'?>
<svg version='1.1' baseProfile='full'
              xmlns='http://www.w3.org/2000/svg'
                      xmlns:rdkit='http://www.rdkit.org/xml'
                      xmlns:xlink='http://www.w3.org/1999/xlink'
                  xml:space='preserve'
width='300px' height='300px' viewBox='0 0 300 300'>
<!-- END OF HEADER -->
<path class='bond-0 atom-1 atom-0' d='M 75.4,165.1 L 58.5,193.3 L 54.7,190.6 Z' style='fill:#000000;fill-rule:evenodd;fill-opacity:1;stroke:#000000;stroke-width:0.5px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;' />
<path class='bond-1 atom-1 atom-2' d='M 75.4,165.1 L 92.3,193.2' style='fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2.0px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1' />
<path class='bond-2 atom-2 atom-3' d='M 92.3,193.2 L 125.0,192.7' style='fill:none;fill-rule:ev