In [1]:
import os
import gzip
import collections
import xml.etree.ElementTree as ET
import pandas as pd

# Load the DrugBank database

In [2]:
# Location of the gzip-compressed DrugBank XML dump, relative to this notebook.
xml_path = os.path.join('../../PhD_study/DDI_corpus/download', 'full database.xml.gz')

# Parse straight from the compressed stream ('rb' is gzip.open's default mode),
# then keep the document root for the cells that follow.
with gzip.open(xml_path, 'rb') as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

# Extract 'pubchem_id' from each drug

In [3]:
# Build a DrugBank -> PubChem lookup table from the parsed XML and save it as CSV.
ns = '{http://www.drugbank.ca}'
pubchem_id_template = "{ns}external-identifiers/{ns}external-identifier[{ns}resource='PubChem Compound']/{ns}identifier"
# The namespace never changes, so expand the search path once instead of once per drug.
pubchem_id_path = pubchem_id_template.format(ns=ns)

rows = list()
for drug in root:
    # Every direct child of the root is expected to be a <drug> element.
    assert drug.tag == ns + 'drug'
    pubchem_id = drug.findtext(pubchem_id_path)
    # findtext returns None when the drug has no 'PubChem Compound' identifier; skip those drugs.
    if pubchem_id is None:
        continue
    row = collections.OrderedDict()
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['pubchem_id'] = pubchem_id
    rows.append(row)

columns = ['drugbank_id', 'name', 'pubchem_id']
# Passing columns= fixes the column order and still yields a valid (empty) frame when no drug matched.
drugbank_df = pd.DataFrame(rows, columns=columns)
drugbank_df['pubchem_id'] = drugbank_df['pubchem_id'].astype(int)
drugbank_df.to_csv('dataset/drugbank_pubchem_id.csv', index=False)
display(drugbank_df.shape, drugbank_df.head())

(8725, 3)

Unnamed: 0,drugbank_id,name,pubchem_id
0,DB00006,Bivalirudin,16129704
1,DB00014,Goserelin,5311128
2,DB00027,Gramicidin D,45267103
3,DB00050,Cetrorelix,25074887
4,DB00080,Daptomycin,16134395


## Load the dataset of drugs whose 'pubchem_id' was extracted via pubchempy.get_compounds(inchi)

In [4]:
# Second id table, read from dataset/drugbank_cid.csv.
drugbank_cid = pd.read_csv('dataset/drugbank_cid.csv')

# Stack both tables, drop fully identical rows (keeping the first occurrence)
# and order the result by DrugBank id.
drug_pubchem_id = (
    pd.concat([drugbank_df, drugbank_cid])
    .drop_duplicates(keep='first')
    .sort_values(["drugbank_id"], ascending=True)
)
drug_pubchem_id

Unnamed: 0,drugbank_id,name,pubchem_id
0,DB00006,Bivalirudin,16129704
0,DB00006,Bivalirudin,101041682
1,DB00007,Leuprolide,657181
1,DB00014,Goserelin,5311128
2,DB00027,Gramicidin D,45267103
...,...,...,...
11086,DB16735,PAT-1251,122536283
11087,DB16737,MM3122,155925845
11088,DB16739,MK-886,3651377
11089,DB16741,Bortezomib D-mannitol,12990536


# STITCH to DrugBank mapping

In [5]:
def stitch_flat_to_pubchem(cid):
    """Convert a flat STITCH compound id to an integer PubChem CID.

    The digits after the 'CID' prefix are parsed as an integer and the
    offset 100000000 is subtracted, e.g. 'CID100216416' -> 216416.

    Raises AssertionError if ``cid`` does not start with 'CID'.
    """
    assert cid.startswith('CID')
    # Subtract an integer offset: the float literal 1e8 would turn the result into a float.
    return int(cid[3:]) - 10 ** 8


def stitch_stereo_to_pubchem(cid):
    """Convert a stereo STITCH compound id to an integer PubChem CID.

    The 'CID' prefix is stripped and the remaining digits are parsed as an
    integer, e.g. 'CID000216416' -> 216416.

    Raises AssertionError if ``cid`` does not start with 'CID'.
    """
    prefix = 'CID'
    assert cid.startswith(prefix)
    digits = cid[len(prefix):]
    return int(digits)

## meddra_all_label_se.tsv.gz

In [6]:
# The TSV is read without a header row; these names are applied to its columns in order.
columns = ['url', 'stitch_id_flat', 'stitch_id_sterio', 'umls_cui_from_label', 'meddra_type', 'umls_cui_from_meddra', 'side_effect_name']
# Columns that are discarded right after loading.
dropped_columns = ['url', 'meddra_type', 'umls_cui_from_meddra']

# Load, discard the unused columns, then collapse rows that became identical.
meddra_all_label = (
    pd.read_table('../../PhD_study/DDI_corpus/SIDER_data/meddra_all_label_se.tsv.gz', names=columns)
    .drop(columns=dropped_columns)
    .drop_duplicates(keep='first')
)
meddra_all_label

Unnamed: 0,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,side_effect_name
0,CID100216416,CID000216416,C0000737,Abdominal pain
2,CID100216416,CID000216416,C0000737,Gastrointestinal pain
3,CID100216416,CID000216416,C0002170,Alopecia
5,CID100216416,CID000216416,C0002395,Dementia Alzheimer's type
7,CID100216416,CID000216416,C0002622,Amnesia
...,...,...,...,...
4753079,CID100119830,CID005481350,C0033771,Prurigo
4753117,CID100119830,CID005481350,C0221201,Rash macular
4753119,CID100119830,CID005481350,C0221203,Rash vesicular
4753130,CID100119830,CID005481350,C0497365,Rash generalised


In [7]:
# Derive the numeric PubChem id from the stereo STITCH id so both tables share a join column.
sider_with_pubchem = meddra_all_label.assign(
    pubchem_id=meddra_all_label.stitch_id_sterio.map(stitch_stereo_to_pubchem)
)
# No explicit key is given: merge joins on every column name the two frames have in common.
meddra_all_label = drug_pubchem_id.merge(sider_with_pubchem)
display(meddra_all_label.shape, meddra_all_label.head(10))

(131839, 7)

Unnamed: 0,drugbank_id,name,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,side_effect_name
0,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0002792,Anaphylactic shock
1,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0002871,Anaemia
2,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0002962,Angina pectoris
3,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0003855,Arteriovenous fistula
4,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0004604,Back pain
5,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0005778,Clotting
6,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0005778,Coagulopathy
7,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0007177,Cardiac tamponade
8,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0008031,Chest pain
9,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0009492,Compartment syndrome


In [8]:
# Keep one row per (drug, PubChem id, label concept) combination; the first occurrence wins.
dedup_keys = ['drugbank_id', 'pubchem_id', 'umls_cui_from_label']
meddra_all_label = meddra_all_label.drop_duplicates(subset=dedup_keys)
display(meddra_all_label.shape, meddra_all_label.head(10))

(106102, 7)

Unnamed: 0,drugbank_id,name,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,side_effect_name
0,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0002792,Anaphylactic shock
1,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0002871,Anaemia
2,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0002962,Angina pectoris
3,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0003855,Arteriovenous fistula
4,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0004604,Back pain
5,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0005778,Clotting
7,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0007177,Cardiac tamponade
8,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0008031,Chest pain
9,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0009492,Compartment syndrome
10,DB00006,Bivalirudin,16129704,CID116129704,CID016129704,C0009782,Connective tissue disorder


In [9]:
# Number of distinct DrugBank ids; dropna=False counts a missing value as its own entry, like len(unique()).
meddra_all_label['drugbank_id'].nunique(dropna=False)

1072

In [10]:
# Number of distinct stereo STITCH ids; dropna=False counts a missing value as its own entry, like len(unique()).
meddra_all_label['stitch_id_sterio'].nunique(dropna=False)

1072

In [11]:
# Write the merged drug / side-effect table to disk without the row index.
meddra_all_label.to_csv(path_or_buf='dataset/drugbank_all_side_effect.csv', index=False)

In [33]:
# Count the rows recorded for each drug, then list the drugs from most to fewest rows.
rows_per_drug = meddra_all_label.groupby(['drugbank_id']).size()
df2 = rows_per_drug.sort_values(ascending=False).reset_index(name='counts')
df2

Unnamed: 0,drugbank_id,counts
0,DB01238,769
1,DB00230,727
2,DB01175,673
3,DB00268,622
4,DB00734,607
...,...,...
1067,DB00259,2
1068,DB06691,1
1069,DB09563,1
1070,DB11256,1


In [34]:
# Number of distinct side-effect names; dropna=False counts a missing value as its own entry, like len(unique()).
meddra_all_label['side_effect_name'].nunique(dropna=False)

5375