In [1]:
import pubchempy

import os
import gzip
import collections
import xml.etree.ElementTree as ET
import pandas as pd

# Load drugbank database

In [2]:
xml_path = os.path.join('../../PhD_study/DDI_corpus/download', 'full database.xml.gz')
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

In [15]:
ns = '{http://www.drugbank.ca}'
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

rows = list()
not_inchi=0
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    #row['inchi'] = drug.findtext(inchi_template.format(ns = ns))
    if drug.findtext(inchi_template.format(ns = ns))==None:
        not_inchi+=1
        continue
    try:
        compounds = pubchempy.get_compounds(drug.findtext(inchi_template.format(ns = ns)), namespace='inchi')
    except pubchempy.BadRequestError:
        print('BadRequestError', row)
        continue
    compound, = compounds
    row['pubchem_id'] = compound.cid
    rows.append(row)
    #break

BadRequestError OrderedDict([('drugbank_id', 'DB00350'), ('name', 'Minoxidil')])
BadRequestError OrderedDict([('drugbank_id', 'DB00475'), ('name', 'Chlordiazepoxide')])
BadRequestError OrderedDict([('drugbank_id', 'DB02524'), ('name', "2',3'-O-{4-[Hydroxy(oxido)-λ5-azanylidene]-2,6-dinitro-2,5-cyclohexadiene-1,1-diyl}adenosine 5'-(tetrahydrogen triphosphate)")])
BadRequestError OrderedDict([('drugbank_id', 'DB03382'), ('name', 'S-oxy-L-cysteine')])
BadRequestError OrderedDict([('drugbank_id', 'DB03629'), ('name', "Pyridoxal-5'-Phosphate-N-Oxide")])
BadRequestError OrderedDict([('drugbank_id', 'DB04722'), ('name', '2-[3-chloro-6-[2,2-difluoro-2-(1-oxidopyridin-1-ium-2-yl)ethyl]imino-1-hydroxypyridin-2-yl]-N-[(1R)-1-(3-chlorophenyl)ethyl]acetamide')])
BadRequestError OrderedDict([('drugbank_id', 'DB04858'), ('name', 'Tirapazamine')])
BadRequestError OrderedDict([('drugbank_id', 'DB05025'), ('name', 'Arimoclomol')])
BadRequestError OrderedDict([('drugbank_id', 'DB06635'), ('name', 'Otamix

In [16]:
not_inchi

3294

In [24]:
columns = ['drugbank_id', 'name', 'pubchem_id']
drugbank_df = pd.DataFrame.from_dict(rows)[columns]
drugbank_df = drugbank_df[-drugbank_df.pubchem_id.isnull()]
drugbank_df['pubchem_id'] = drugbank_df['pubchem_id'].astype(int)
drugbank_df.to_csv('dataset/drugbank_cid.csv', index=False)
display(drugbank_df.shape, drugbank_df.head())

(11091, 3)

Unnamed: 0,drugbank_id,name,cid
0,DB00006,Bivalirudin,101041682
1,DB00007,Leuprolide,657181
2,DB00014,Goserelin,5311128
4,DB00035,Desmopressin,5311065
5,DB00050,Cetrorelix,25074887


# STITCH to DrugBank mapping

In [25]:
def stitch_flat_to_pubchem(cid):
    assert cid.startswith('CID')
    return int(cid[3:]) - 1e8


def stitch_stereo_to_pubchem(cid):
    assert cid.startswith('CID')
    return int(cid[3:])

### meddra_freq.tsv.gz

In [45]:
columns = ['stitch_id_flat', 'stitch_id_sterio', 'umls_cui_from_label', 'placebo', 'frequency', 'lower', 'upper',
           'meddra_type', 'umls_cui_from_meddra', 'side_effect_name',]
freq_df = pd.read_table('../../PhD_study/DDI_corpus/SIDER_data/meddra_freq.tsv.gz', names=columns)
freq_df.head(2)

Unnamed: 0,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,placebo,frequency,lower,upper,meddra_type,umls_cui_from_meddra,side_effect_name
0,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,LLT,C0000737,Abdominal pain
1,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,PT,C0000737,Abdominal pain


In [46]:
freq_df['pubchem_id'] = freq_df.stitch_id_sterio.map(stitch_stereo_to_pubchem)
freq_df = drugbank_df.merge(freq_df)
freq_df.to_csv('dataset/drugbank_side_effect.csv', index=False)
display(freq_df.shape, freq_df.head())

(203703, 13)

Unnamed: 0,drugbank_id,name,cid,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,placebo,frequency,lower,upper,meddra_type,umls_cui_from_meddra,side_effect_name
0,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,,0.65%,0.0065,0.0065,LLT,C0000737,Abdominal pain
1,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,,0.65%,0.0065,0.0065,PT,C0000737,Abdominal pain
2,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,,0.65%,0.0065,0.0065,PT,C0687713,Gastrointestinal pain
3,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,,1%,0.01,0.01,LLT,C0000737,Abdominal pain
4,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,,1%,0.01,0.01,PT,C0000737,Abdominal pain


In [41]:
len(freq_df.drugbank_id.unique())

663

In [30]:
len(freq_df.stitch_id_sterio.unique())

1035

In [47]:
freq_df.stitch_id_sterio.unique()

array(['CID005311065', 'CID016131215', 'CID000006322', 'CID000005961',
       'CID005280453', 'CID000000750', 'CID000054687', 'CID000060846',
       'CID005362129', 'CID000082153', 'CID000002284', 'CID000089594',
       'CID000003958', 'CID000059768', 'CID000387447', 'CID000034359',
       'CID000002369', 'CID000003365', 'CID000005591', 'CID000065028',
       'CID000002519', 'CID000071329', 'CID000447043', 'CID000060164',
       'CID000004195', 'CID000004679', 'CID000041781', 'CID000002771',
       'CID000077993', 'CID000003476', 'CID005362440', 'CID000053232',
       'CID005486971', 'CID000005391', 'CID000004197', 'CID000004463',
       'CID000020279', 'CID000056959', 'CID000004075', 'CID000054746',
       'CID000002955', 'CID000441383', 'CID000001775', 'CID054671203',
       'CID000002578', 'CID000004171', 'CID000005095', 'CID000003784',
       'CID005284627', 'CID000158781', 'CID000002179', 'CID000002153',
       'CID000005920', 'CID000003114', 'CID000003676', 'CID000005656',
      

In [43]:
freq_df.meddra_type.unique()

array(['LLT', 'PT', nan], dtype=object)

In [48]:
freq_side_effect = freq_df.drop(columns=['placebo', 'frequency', 'meddra_type', 'umls_cui_from_meddra'])
freq_side_effect.drop_duplicates(keep='first', inplace=True)
freq_side_effect

Unnamed: 0,drugbank_id,name,cid,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,lower,upper,side_effect_name
0,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,0.006500,0.006500,Abdominal pain
2,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,0.006500,0.006500,Gastrointestinal pain
3,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,0.010000,0.010000,Abdominal pain
5,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,0.010000,0.010000,Gastrointestinal pain
6,DB00035,Desmopressin,5311065,CID100027991,CID005311065,C0000737,0.020000,0.020000,Abdominal pain
...,...,...,...,...,...,...,...,...,...
203693,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C1536009,0.080000,0.080000,Oesophageal oedema
203695,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C3263723,0.144928,0.144928,Injury
203697,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C3263723,0.191781,0.191781,Injury
203699,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C3263723,0.191824,0.191824,Injury


In [52]:
freq_side_effect = freq_side_effect.sort_values('upper', ascending=False).drop_duplicates(['umls_cui_from_label'])
freq_side_effect

Unnamed: 0,drugbank_id,name,cid,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,lower,upper,side_effect_name
118850,DB00938,Salmeterol,5152,CID100005152,CID000005152,C0040822,0.01,1.0000,Tremor
119470,DB00949,Felbamate,3331,CID100003331,CID000003331,C0085631,0.01,1.0000,Agitation
119404,DB00949,Felbamate,3331,CID100003331,CID000003331,C0039231,0.01,1.0000,Tachycardia
119383,DB00949,Felbamate,3331,CID100003331,CID000003331,C0033774,0.01,1.0000,Pruritus
119369,DB00949,Felbamate,3331,CID100003331,CID000003331,C0030252,0.01,1.0000,Palpitations
...,...,...,...,...,...,...,...,...,...
59021,DB00537,Ciprofloxacin,2764,CID100002764,CID000002764,C0852388,0.00,0.0001,Visual colour distortions
62579,DB00564,Carbamazepine,2554,CID100002554,CID000002554,C0005944,0.00,0.0001,Bone metabolism disorder
59032,DB00537,Ciprofloxacin,2764,CID100002764,CID000002764,C0857751,0.00,0.0001,Erythema multiforme minor
110724,DB00864,Tacrolimus,445643,CID100005372,CID000445643,C0476369,0.00,0.0001,Echocardiogram abnormal


In [53]:
freq_side_effect.to_csv('dataset/drugbank_side_effect.csv', index=False)