In [1]:
import os
import gzip
import collections
import xml.etree.ElementTree as ET
import pandas as pd

# Load drugbank database

In [2]:
xml_path = os.path.join('../../PhD_study/DDI_corpus/download', 'full database.xml.gz')
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

# Extract 'pubchem_id' from each drug

In [3]:
ns = '{http://www.drugbank.ca}'
pubchem_id_template = "{ns}external-identifiers/{ns}external-identifier[{ns}resource='PubChem Compound']/{ns}identifier"

rows = list()
not_inchi=0
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")    
    row['pubchem_id'] = drug.findtext(pubchem_id_template.format(ns = ns))
    if row['pubchem_id'] == None:
        continue
    rows.append(row)
    
columns = ['drugbank_id', 'name', 'pubchem_id']
drugbank_df = pd.DataFrame.from_dict(rows)[columns]
drugbank_df['pubchem_id'] = drugbank_df['pubchem_id'].astype(int)
drugbank_df.to_csv('dataset/drugbank_pubchem_id.csv', index=False)
display(drugbank_df.shape, drugbank_df.head())

(8725, 3)

Unnamed: 0,drugbank_id,name,pubchem_id
0,DB00006,Bivalirudin,16129704
1,DB00014,Goserelin,5311128
2,DB00027,Gramicidin D,45267103
3,DB00050,Cetrorelix,25074887
4,DB00080,Daptomycin,16134395


## Load dataset of drug with 'pubchem_id' extracted via pubchempy.get_compounds(inchi)

In [4]:
drugbank_cid = pd.read_csv('dataset/drugbank_cid.csv')

drug_pubchem_id = pd.concat([drugbank_df, drugbank_cid])
drug_pubchem_id.drop_duplicates(keep='first', inplace=True)
drug_pubchem_id = drug_pubchem_id.sort_values(["drugbank_id"], ascending = (True))
drug_pubchem_id

Unnamed: 0,drugbank_id,name,pubchem_id
0,DB00006,Bivalirudin,16129704
0,DB00006,Bivalirudin,101041682
1,DB00007,Leuprolide,657181
1,DB00014,Goserelin,5311128
2,DB00027,Gramicidin D,45267103
...,...,...,...
11086,DB16735,PAT-1251,122536283
11087,DB16737,MM3122,155925845
11088,DB16739,MK-886,3651377
11089,DB16741,Bortezomib D-mannitol,12990536


# STITCH to DrugBank mapping

In [5]:
def stitch_flat_to_pubchem(cid):
    assert cid.startswith('CID')
    return int(cid[3:]) - 1e8


def stitch_stereo_to_pubchem(cid):
    assert cid.startswith('CID')
    return int(cid[3:])

## meddra_freq.tsv.gz

In [6]:
columns = ['stitch_id_flat', 'stitch_id_sterio', 'umls_cui_from_label', 'placebo', 'frequency', 'lower', 'upper',
           'meddra_type', 'umls_cui_from_meddra', 'side_effect_name',]
freq_df = pd.read_table('../../PhD_study/DDI_corpus/SIDER_data/meddra_freq.tsv.gz', names=columns)
freq_df.head(2)

Unnamed: 0,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,placebo,frequency,lower,upper,meddra_type,umls_cui_from_meddra,side_effect_name
0,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,LLT,C0000737,Abdominal pain
1,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,PT,C0000737,Abdominal pain


In [7]:
freq_df['pubchem_id'] = freq_df.stitch_id_sterio.map(stitch_stereo_to_pubchem)
freq_df = drug_pubchem_id.merge(freq_df)
display(freq_df.shape, freq_df.head())

(214269, 13)

Unnamed: 0,drugbank_id,name,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,placebo,frequency,lower,upper,meddra_type,umls_cui_from_meddra,side_effect_name
0,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,,0.8%,0.008,0.008,LLT,C0000737,Abdominal pain
1,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,,0.8%,0.008,0.008,PT,C0000737,Abdominal pain
2,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,,0.8%,0.008,0.008,PT,C0687713,Gastrointestinal pain
3,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,,1%,0.01,0.01,LLT,C0000737,Abdominal pain
4,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,,1%,0.01,0.01,PT,C0000737,Abdominal pain


In [8]:
len(freq_df.drugbank_id.unique())

712

In [9]:
len(freq_df.stitch_id_sterio.unique())

713

In [10]:
freq_df = freq_df.drop(columns=['placebo', 'frequency', 'meddra_type', 'umls_cui_from_meddra'])
freq_df.drop_duplicates(keep='first', inplace=True)
freq_df

Unnamed: 0,drugbank_id,name,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,lower,upper,side_effect_name
0,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,0.008000,0.008000,Abdominal pain
2,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,0.008000,0.008000,Gastrointestinal pain
3,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,0.010000,0.010000,Abdominal pain
5,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,0.010000,0.010000,Gastrointestinal pain
6,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0000737,0.047663,0.047663,Abdominal pain
...,...,...,...,...,...,...,...,...,...
214259,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C1536009,0.080000,0.080000,Oesophageal oedema
214261,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C3263723,0.144928,0.144928,Injury
214263,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C3263723,0.191781,0.191781,Injury
214265,DB15351,Dihematoporphyrin ether,57166,CID100057166,CID000057166,C3263723,0.191824,0.191824,Injury


In [11]:
freq_df = freq_df.sort_values('upper', ascending=False).drop_duplicates(['drugbank_id', 'pubchem_id', 'umls_cui_from_label'])
freq_df

Unnamed: 0,drugbank_id,name,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,lower,upper,side_effect_name
21264,DB00268,Ropinirole,5095,CID100005095,CID000005095,C0018772,0.01,1.0,Hearing impaired
124612,DB00915,Amantadine,2130,CID100002130,CID000002130,C0085649,0.10,1.0,Oedema peripheral
124638,DB00915,Amantadine,2130,CID100002130,CID000002130,C0235198,0.01,1.0,Cerebration impaired
124633,DB00915,Amantadine,2130,CID100002130,CID000002130,C0234518,0.01,1.0,Dysarthria
124631,DB00915,Amantadine,2130,CID100002130,CID000002130,C0233571,0.01,1.0,Agitation
...,...,...,...,...,...,...,...,...,...
198916,DB06769,Bendamustine,65628,CID100065628,CID000065628,C0027424,0.00,0.0,Nasal congestion
198840,DB06769,Bendamustine,65628,CID100065628,CID000065628,C0013395,0.00,0.0,Dyspepsia
182868,DB01440,gamma-Hydroxybutyric acid,10413,CID100010413,CID000010413,C0700590,0.00,0.0,Sweating increased
182872,DB01440,gamma-Hydroxybutyric acid,10413,CID100010413,CID000010413,C0947912,0.00,0.0,Myasthenia


In [12]:
freq_df.to_csv('dataset/drugbank_freq_side_effect.csv', index=False)