In [2]:
import numpy as np
import pandas as pd
import json
import xml.etree.ElementTree as ET
import re
from copy import deepcopy


In [20]:
# Load the target/family table and keep only the rows whose Type is 'gpcr'.
targets = pd.read_csv('data/targets_and_families.csv')

# Source column -> column name used downstream; dict order fixes the output column order.
gpcr_column_names = {
    'Family name': 'target_family_name',
    'Target name': 'target_name',
    'HGNC symbol': 'target_gene_symbol',
    'Human SwissProt': 'target_uniprot_id',
}
gpcr_targets = (
    targets
    .loc[targets['Type'] == 'gpcr', list(gpcr_column_names)]
    .rename(columns=gpcr_column_names)
)
gpcr_targets

Unnamed: 0,target_family_name,target_name,target_gene_symbol,target_uniprot_id
0,5-Hydroxytryptamine receptors,5-HT<sub>1A</sub> receptor,HTR1A,P08908
1,5-Hydroxytryptamine receptors,5-HT<sub>1B</sub> receptor,HTR1B,P28222
2,5-Hydroxytryptamine receptors,5-HT<sub>1D</sub> receptor,HTR1D,P28221
3,5-Hydroxytryptamine receptors,5-ht<sub>1e</sub> receptor,HTR1E,P28566
4,5-Hydroxytryptamine receptors,5-HT<sub>1F</sub> receptor,HTR1F,P30939
...,...,...,...,...
406,Vasopressin and oxytocin receptors,V<sub>2</sub> receptor,AVPR2,P30518
407,Vasopressin and oxytocin receptors,OT receptor,OXTR,P30559
408,VIP and PACAP receptors,PAC<sub>1</sub> receptor,ADCYAP1R1,P41586
409,VIP and PACAP receptors,VPAC<sub>1</sub> receptor,VIPR1,P32241


In [85]:
# Load the ATC classification table and build code -> label lookups.
atc = pd.read_csv('data/ATC.csv')

# .copy() gives an independent frame, so the 'code' assignment below does not
# write into a slice of the raw table (avoids SettingWithCopyWarning).
atc = atc[['Preferred Label','Class ID','ATC LEVEL']].copy()
atc.columns = ['label','code','level']
# 'Class ID' values are '/'-separated; only the last segment is kept as the code.
atc['code'] = atc.code.apply(lambda x: x.split('/')[-1])

# Level 1: the broadest ATC groups.
top_level_atc = atc[atc['level'] == 1]
top_level_atc_dict = dict(zip(top_level_atc.code, top_level_atc.label))

# Level 5: the most specific ATC entries. Filtering on level 1 here would make
# this lookup an exact duplicate of the top-level one.
bottom_level_atc = atc[atc['level'] == 5]
bottom_level_atc_dict = dict(zip(bottom_level_atc.code, bottom_level_atc.label))


In [None]:
# Stream-parse the DrugBank XML export and collect one record per top-level
# <drug>: primary id, name, type, groups, ATC codes and target annotations.
# The records are written to CSV with the list-valued fields JSON-encoded.
ip = ET.iterparse('drugbank.xml', events=("start", "end"))
max_lines = 100000000  # upper bound on the number of parse events handled

path = []  # namespace-stripped tags from the root down to the current element
template = {'target_ids':[],'target_names':[],'modes_of_action':[]}
outputs = []


def strip_tag(tag):
    """Return `tag` with the DrugBank namespace prefix removed."""
    return re.sub('{http://www.drugbank.ca}','',tag)


for i, (event, elem) in enumerate(ip):
    if event == 'start':
        path.append(strip_tag(elem.tag))
        rel_path = path[2:]  # location relative to the enclosing <drugbank>/<drug>

        # Containers are created on 'start' so that children can fill them on 'end'.
        if path == ['drugbank','drug']:
            output_ = {}
        elif rel_path == ['groups']:
            output_['groups'] = []
        elif rel_path == ['atc-codes']:
            output_['atc-codes'] = []
        elif rel_path == ['targets']:
            output_['targets']  = []
        elif rel_path == ['targets','target']:
            output_['targets'].append({})
        elif rel_path == ['targets','target','actions']:
            output_['targets'][-1]['actions'] = []

    if event == 'end':
        rel_path = path[2:]

        # Element text is only read on 'end', when the element is fully parsed.
        if rel_path == ['drugbank-id'] and 'primary' in elem.attrib.keys():
            output_['drugbank_id'] = elem.text
        if rel_path == ['name']:
            output_['name'] = elem.text
        if rel_path == ['groups','group']:
            output_['groups'].append(elem.text)
        if rel_path == ['atc-codes','atc-code']:
            output_['atc-codes'].append(elem.attrib['code'])
        if rel_path == ['targets','target','id']:
            output_['targets'][-1]['target_id'] = elem.text
        if rel_path == ['targets','target','name']:
            output_['targets'][-1]['target_name'] = elem.text
        if rel_path == ['targets','target','polypeptide']:
            output_['targets'][-1]['uniprot_id'] = elem.attrib['id']
        if rel_path == ['targets','target','actions','action']:
            output_['targets'][-1]['actions'].append(elem.text)
        if rel_path == ['targets','target','known-action']:
            output_['targets'][-1]['known-action'] = elem.text

        if path == ['drugbank','drug']:
            output_['type'] = elem.attrib['type']
            outputs.append(deepcopy(output_))
            # iterparse keeps every parsed element attached to the tree; drop the
            # finished drug's subtree so memory does not grow with the file size.
            elem.clear()
        path.pop()
    if i > max_lines:
        break

# Lists/dicts are JSON-encoded so they survive the CSV round trip.
output = pd.DataFrame(outputs)
output['groups'] = output.groups.map(json.dumps)
output['atc-codes'] = output['atc-codes'].map(json.dumps)
output['targets'] = output.targets.map(json.dumps)
output.to_csv('data/drugbank_extracted.csv')


In [10]:
# Load the extracted DrugBank table and decode its JSON-encoded list columns.
drugbank_drugs = pd.read_csv('../data/drug_targets/drugbank_extracted.csv',index_col=0)
for json_column in ('groups', 'atc-codes', 'targets'):
    drugbank_drugs[json_column] = drugbank_drugs[json_column].map(json.loads)

# One row per drug-target pair; drugs with no targets explode to NaN and are dropped.
exploded_drugs = drugbank_drugs.explode('targets')
exploded_drugs = exploded_drugs[exploded_drugs['targets'].notna()]
exploded_drugs = exploded_drugs.reset_index().drop(columns='index')

# Expand each target dict into its own columns, aligned row-by-row with the drug columns.
target_info = pd.json_normalize(exploded_drugs['targets'])
drugbank_drug_targets = pd.concat(
    [exploded_drugs.drop(columns='targets'), target_info],
    axis=1,
).rename(columns={'uniprot_id': 'target_uniprot_id'})

# Keep pairs that list at least one action and are flagged known-action == 'yes'.
has_listed_action = drugbank_drug_targets['actions'].map(lambda actions: len(actions) > 0)
has_known_action = drugbank_drug_targets['known-action'] == 'yes'
check_actions = has_listed_action & has_known_action

kept_columns = ['name','groups','atc-codes','type','actions','target_uniprot_id']
drugbank_drug_targets = (
    drugbank_drug_targets
    .loc[check_actions, kept_columns]
    .rename(columns={
        'name': 'drug_name',
        'groups': 'approval_status_groups',
        'type': 'drug_type',
    })
)


def _in_group(group_lists, group):
    """Boolean Series: does each row's group list contain `group`?"""
    return group_lists.map(lambda groups: {group}.issubset(groups))


# Classify by approval: nutraceuticals are excluded from both classes, and
# 'approved' takes precedence over 'investigational'.
is_approved = _in_group(drugbank_drug_targets.approval_status_groups, 'approved')
is_investigational = _in_group(drugbank_drug_targets.approval_status_groups, 'investigational')
is_nutraceutical = _in_group(drugbank_drug_targets.approval_status_groups, 'nutraceutical')

approved_only = is_approved & ~is_nutraceutical
investigational_only = is_investigational & ~is_approved & ~is_nutraceutical
drugbank_drug_targets['approval_status'] = np.select(
    [approved_only, investigational_only],
    ['approved','investigational'],
    default='other',
)

drugbank_drug_targets = drugbank_drug_targets.drop(columns='approval_status_groups')
is_retained_status = drugbank_drug_targets['approval_status'].isin(('approved','investigational'))
drugbank_drug_targets = drugbank_drug_targets[is_retained_status]
drugbank_drug_targets


# Lookup toplevel atc codes
# atc = pd.read_csv('../data/drug_targets/ATC.csv')
# atc = atc[['Preferred Label','Class ID','ATC LEVEL']]
# atc.columns = ['label','code','level']
# atc['code'] = atc.code.apply(lambda x: x.split('/')[-1])
# top_level_atc = atc[atc['level'] == 1]
# top_level_atc_dict = dict(zip(top_level_atc.code, top_level_atc.label))
# bottom_level_atc = atc[atc['level'] == 5]
# bottom_level_atc_dict = dict(zip(bottom_level_atc.code, bottom_level_atc.label))

# drugbank_drug_targets['atc_disease_areas'] = drugbank_drug_targets['atc-codes'].map(lambda x: [top_level_atc_dict[x_[0]] for x_ in x])
# drugbank_drug_targets['atc_diseases'] = drugbank_drug_targets['atc-codes'].map(lambda x: [bottom_level_atc_dict[x_] for x_ in x])

#drugbank_drug_targets.merge(gpcr_targets,on='target_uniprot_id')

Unnamed: 0,drug_name,atc-codes,drug_type,actions,target_uniprot_id,approval_status
0,Lepirudin,[B01AE02],biotech,[inhibitor],P00734,approved
1,Cetuximab,[L01XC06],biotech,[antagonist],P00533,approved
10,Denileukin diftitox,[L01XX29],biotech,[binder],P01589,approved
11,Denileukin diftitox,[L01XX29],biotech,[agonist],P14784,approved
13,Etanercept,[L04AB01],biotech,[antibody],P01375,approved
...,...,...,...,...,...,...
18861,Berotralstat,[],small molecule,[inhibitor],P03952,approved
18862,Gallium Ga-68 gozetotide,[],small molecule,[binder],P07288,approved
18863,Razuprotafib,[],small molecule,[inhibitor],P23467,investigational
18864,Mezagitamab,[],biotech,[inhibitor],P28907,investigational


In [37]:
# Flatten the Open Targets drug evidence file (one JSON object per line) into
# a table of target / drug / disease fields and save it as CSV.
ot_drugs = []
with open('data/opentargets_drugs_20.06.json','r') as fid:
    for line in fid:
        # Skip blank lines (e.g. a trailing newline) rather than failing in json.loads.
        if not line.strip():
            continue
        OT_drug = json.loads(line)
        output = dict(
            target_gene_symbol = OT_drug['target']['gene_info']['symbol'],
            drug_name = OT_drug['drug']['molecule_name'],
            # The drug id is '/'-separated; keep only the final segment.
            drug_chembl_id = OT_drug['drug']['id'].split('/')[-1],
            disease_area = OT_drug['disease']['efo_info']['therapeutic_area']['labels'],
            disease_name = OT_drug['disease']['efo_info']['label'],
            disease_id = OT_drug['disease']['id']
        )
        ot_drugs.append(output)

ot_drugs = pd.DataFrame(ot_drugs)
# disease_area holds a list per row; JSON-encode it so it survives the CSV round trip.
ot_drugs['disease_area'] = ot_drugs.disease_area.map(json.dumps)
ot_drugs.to_csv('data/ot_drugs_extracted.csv')




In [49]:
# Reload the extracted Open Targets table, decode the list column and
# lower-case the drug names.
ot_drugs = pd.read_csv('data/ot_drugs_extracted.csv',index_col=0)
ot_drugs['disease_area'] = ot_drugs['disease_area'].map(json.loads)
ot_drugs['drug_name'] = ot_drugs.drug_name.str.lower()


def _concat_lists(list_values):
    """Concatenate an iterable of lists into one flat list, preserving order."""
    return [item for sublist in list_values for item in sublist]


# Restrict to GPCR targets and collect disease areas / names per (target, drug).
# An explicit concatenation replaces the builtin `sum`: passing builtin
# callables to agg is deprecated, and a direct sum() over lists would start
# from 0 and raise TypeError.
(
    ot_drugs
    .merge(gpcr_targets, on='target_gene_symbol')
    .groupby(['target_gene_symbol','drug_name'])
    .agg({'disease_area': _concat_lists, 'disease_name': list})
    .reset_index()
)

Unnamed: 0,target_gene_symbol,drug_name,disease_area,disease_name
0,ADORA1,adenosine,"[cardiovascular disease, respiratory or thorac...","[paroxysmal tachycardia, Supraventricular tach..."
1,ADORA1,bay1067197,"[cardiovascular disease, respiratory or thorac...","[heart failure, heart failure, heart failure, ..."
2,ADORA1,caffeine,"[cardiovascular disease, nervous system diseas...","[migraine disorder, migraine disorder, migrain..."
3,ADORA1,capadenoson,"[phenotype, cardiovascular disease, respirator...","[angina pectoris, atrial fibrillation]"
4,ADORA1,derenofylline,"[cardiovascular disease, respiratory or thorac...",[congestive heart failure]
...,...,...,...,...
1161,TBXA2R,seratrodast,[phenotype],[Obstructive lung disease]
1162,TBXA2R,terbogrel,"[cardiovascular disease, respiratory or thorac...","[pulmonary arterial hypertension, pulmonary ar..."
1163,TRHR,protirelin,"[respiratory or thoracic disease, cell prolife...","[acute respiratory distress syndrome, cancer]"
1164,TSHR,thyrotropin,"[cell proliferation disorder, endocrine system...","[head and neck malignant neoplasia, papillary ..."


In [45]:
# Load the Congreve 2020 drug-by-GPCR-target table.
congreve_drugs = pd.read_csv('results/congreve2020_drugs_by_gpcr_target.csv')


def _to_human_entry_name(raw_name):
    """Strip surrounding whitespace and append the '_human' suffix."""
    return raw_name.strip() + '_human'


congreve_drugs['uniprot_name'] = congreve_drugs['uniprot_name'].map(_to_human_entry_name)
congreve_drugs

Unnamed: 0,drug_name,uniprot_name,action,action_class,drug_type
0,abaloparatide,pth1r_human,agonist,activating,peptide
1,abarelix,gnrhr_human,antagonist,inactivating,peptide
2,acebutolol,adrb1_human,antagonist,inactivating,SME
3,acepromazine,5ht1a_human,antagonist,inactivating,SME
4,"acepromazine, aceprometazine",5ht2a_human,antagonist,inactivating,SME
...,...,...,...,...,...
902,zolmitriptan,5ht1d_human,agonist,activating,SME
903,zolmitriptan,5ht1f_human,agonist,activating,SME
904,zuclopenthixol,drd1_human,antagonist,inactivating,SME
905,zuclopenthixol,drd2_human,antagonist,inactivating,SME


In [5]:
# Join the DrugBank drug-target pairs onto the GPCR table by UniProt accession
# and export the merged rows.
drugbank_gpcr_drug_targets = drugbank_drug_targets.merge(gpcrs, on='uniprot_accession')

export_columns = [
    'drugbank_id', 'name', 'type', 'uniprot_name',
    'target_name', 'actions', 'groups', 'atc-codes',
]
drugbank_gpcr_drug_targets[export_columns].to_csv('drugbank_gpcr_drug_targets.csv')

# Report how many distinct drugs and targets the merged table contains.
n_gpcr_drugs = len(drugbank_gpcr_drug_targets['drugbank_id'].unique())
n_gpcr_targets = len(drugbank_gpcr_drug_targets['target_id'].unique())
print(n_gpcr_drugs, 'approved GPCR drugs in drugbank')
print(n_gpcr_targets, 'approved GPCR drug targets in drugbank')

476 approved GPCR drugs in drugbank
115 approved GPCR drug targets in drugbank


In [6]:
# Concatenate every row's action list, then count how often each action occurs.
all_gpcr_actions = drugbank_gpcr_drug_targets['actions'].sum()
pd.Series(all_gpcr_actions).value_counts()

antagonist                                   466
agonist                                      362
partial agonist                               33
ligand                                         9
inhibitor                                      7
binder                                         5
inverse agonist                                5
modulator                                      4
multitarget                                    4
stimulator                                     2
partial antagonist                             2
activator                                      2
potentiator                                    1
weak inhibitor                                 1
downregulator                                  1
inducer                                        1
negative modulator                             1
suppressor                                     1
inhibits downstream inflammation cascades      1
dtype: int64

In [9]:
# Map individual DrugBank action labels onto two coarse classes; any label not
# listed here is classed as 'other' further down.
action_class_mappings = {
    'antagonist':'inactivating',
    'inhibitor':'inactivating',
    'partial antagonist':'inactivating',
    'inverse agonist':'inactivating',
    'agonist':'activating',
    'activator':'activating',
    'partial agonist':'activating'
}


def _concat_action_lists(action_lists):
    """Concatenate an iterable of action lists into one flat list."""
    return [action for actions in action_lists for action in actions]


# Collect all actions seen per receptor. An explicit concatenation replaces the
# builtin `sum`: passing builtin callables to aggregate is deprecated, and a
# direct sum() over lists would start from 0 and raise TypeError.
drugbank_gpcr_targets = (
    drugbank_gpcr_drug_targets
    .groupby('uniprot_name')
    .aggregate({'actions': _concat_action_lists})
    .reset_index()
)
# De-duplicate the actions per receptor, then go to one row per (receptor, action).
drugbank_gpcr_targets['actions'] = drugbank_gpcr_targets.actions.map(set)
drugbank_gpcr_targets = drugbank_gpcr_targets.explode('actions')
drugbank_gpcr_targets['action_class'] = drugbank_gpcr_targets.actions.map(action_class_mappings).fillna('other')
drugbank_gpcr_targets['approved'] = 'Y'
drugbank_gpcr_targets = drugbank_gpcr_targets[['uniprot_name','action_class','approved']].drop_duplicates()

# Wide layout: one row per receptor, one column per action class, 'Y' or ''.
drugbank_gpcr_targets = drugbank_gpcr_targets.pivot(index='uniprot_name',columns='action_class',values='approved').fillna('')
# 'any' is 'Y' when at least one action-class column is non-empty for the row.
drugbank_gpcr_targets['any'] = np.where(drugbank_gpcr_targets.ne('').any(axis=1),'Y','')
drugbank_gpcr_targets = drugbank_gpcr_targets.reset_index()
# Right-join so receptors from `gpcrs` without any matching row still appear, with blanks.
drugbank_gpcr_targets = drugbank_gpcr_targets.merge(gpcrs[['uniprot_name']], how='right',on='uniprot_name').fillna('')
drugbank_gpcr_targets.to_csv('drugbank_gpcr_targets_with_approved_moa.csv')
drugbank_gpcr_targets


Unnamed: 0,uniprot_name,activating,inactivating,other,any
0,ACKR1_HUMAN,,,,
1,ACKR2_HUMAN,,,,
2,ACKR3_HUMAN,,,,
3,ACKR4_HUMAN,,,,
4,PACR_HUMAN,,,,
...,...,...,...,...,...
388,TSHR_HUMAN,Y,,,Y
389,UR2R_HUMAN,,,,
390,VIPR1_HUMAN,,,,
391,VIPR2_HUMAN,,,,


In [12]:
# Display the merged DrugBank x GPCR drug-target table for inspection.
drugbank_gpcr_drug_targets

Unnamed: 0,drugbank_id,name,groups,atc-codes,type,target_id,target_name,actions,known-action,uniprot_accession,receptor_name,uniprot_name
0,DB00007,Leuprolide,"[approved, investigational]","[L02AE51, L02AE02]",small molecule,BE0000203,Gonadotropin-releasing hormone receptor,[agonist],yes,P30968,gonadotropin releasing hormone receptor,GNRHR_HUMAN
1,DB00014,Goserelin,[approved],[L02AE03],small molecule,BE0000203,Gonadotropin-releasing hormone receptor,[agonist],yes,P30968,gonadotropin releasing hormone receptor,GNRHR_HUMAN
2,DB00050,Cetrorelix,"[approved, investigational]",[H01CC02],small molecule,BE0000203,Gonadotropin-releasing hormone receptor,[antagonist],yes,P30968,gonadotropin releasing hormone receptor,GNRHR_HUMAN
3,DB00106,Abarelix,"[approved, investigational, withdrawn]",[L02BX01],small molecule,BE0000203,Gonadotropin-releasing hormone receptor,[antagonist],yes,P30968,gonadotropin releasing hormone receptor,GNRHR_HUMAN
4,DB00644,Gonadorelin,"[approved, investigational, vet_approved]","[V04CM01, H01CA01]",small molecule,BE0000203,Gonadotropin-releasing hormone receptor,[agonist],yes,P30968,gonadotropin releasing hormone receptor,GNRHR_HUMAN
...,...,...,...,...,...,...,...,...,...,...,...,...
870,DB11700,Setmelanotide,"[approved, investigational]",[],small molecule,BE0009682,Melanocortin receptor 4,[agonist],yes,P32245,melanocortin 4 receptor,MC4R_HUMAN
871,DB12457,Rimegepant,"[approved, investigational]",[],small molecule,BE0009009,Calcitonin gene-related peptide type 1 receptor,[antagonist],yes,Q16602,calcitonin receptor like receptor,CALRL_HUMAN
872,DB15328,Ubrogepant,"[approved, investigational]",[],small molecule,BE0009009,Calcitonin gene-related peptide type 1 receptor,[antagonist],yes,Q16602,calcitonin receptor like receptor,CALRL_HUMAN
873,DB12498,Mogamulizumab,"[approved, investigational]",[L01XC25],biotech,BE0009787,C-C chemokine receptor type 4,[antagonist],yes,P51679,C-C motif chemokine receptor 4,CCR4_HUMAN


In [3]:
# Load the ATC table and keep the level-5 entries as (label, code) pairs.
atc = pd.read_csv('ATC.csv')
atc_drugs = (
    atc
    .loc[atc['ATC LEVEL'] == 5, ['Class ID', 'Preferred Label']]
    # The code is whatever follows the last '/' in 'Class ID'.
    .assign(Code=lambda frame: frame['Class ID'].apply(lambda class_id: class_id.rsplit('/', 1)[-1]))
    .drop(columns='Class ID')
)
atc_drugs

Unnamed: 0,Preferred Label,Code
0,silicones,A03AX13
1,cefatrizine,J01DB07
3,urofollitropin,G03GA04
4,promethazine,D04AA10
5,pinacidil and diuretics,C02LX01
...,...,...
6560,fluorometholone,D07AB06
6561,cortivazol,H02AB17
6562,poldine,A03AB11
6563,"eritrityl tetranitrate, combinations",C01DA63
