In [53]:
import pandas as pd
import numpy as np


In [54]:
path_to_file = "/Users/aditgupta/Google Drive/Research/T2D/Summer_2018/drug_names.tsv" #this is the path to the file for all the drug names and corresponding ID's
df5 = pd.read_csv(path_to_file,delimiter="\t", header=None) #reads file, converts to csv, and explains to functionthat the delimeter are tabs
df5.rename(columns={0:'ID'}, inplace=True) #renames index zero to 'ID'
df5.rename(columns={1:'drug_name'}, inplace=True) #renames index one to drug name
#OVERALL: df5 contains the drug name based on the 'CID' ID 


In [55]:
df5.head(10)


Unnamed: 0,ID,drug_name
0,CID100000085,carnitine
1,CID100000119,gamma-aminobutyric
2,CID100000137,5-aminolevulinic
3,CID100000143,leucovorin
4,CID100000146,5-methyltetrahydrofolate
5,CID100000158,PGE2
6,CID100000159,prostacyclin
7,CID100000160,prostaglandin
8,CID100000175,acetate
9,CID100000187,acetylcholine


In [56]:
df5.loc[df5["drug_name"].isin(["repaglinide"])] #the drug, repaglinide, which targets the PPARG gene (which variants are associated with T2D(Majithia et al PNAS 2014)) is present in the database.
#However its side effects aren't one of the four traits linked to type 2 diabetes, so the drug and samples don't get included in the final sample

Unnamed: 0,ID,drug_name
561,CID100004547,repaglinide


In [57]:
import xml.etree.ElementTree as ET
import lxml.html
#importing libraries that parse the xml file with detailed drug information 

In [58]:
path_to_file = '/Users/aditgupta/Google Drive/Research/T2D/Summer_2018/drugbank_full_database.xml' #file location of the drugbank database

In [59]:
tree = ET.parse(path_to_file) # function that parses the xml file from drugbank

In [60]:
def find_categories_matching_string(drug_element, category_name):
    all_categories = drug_element.findall(".//*{http://www.drugbank.ca}category")
    categories_wanted = list(filter(lambda category: category.text == category_name, all_categories))
    return categories_wanted

In [61]:
def get_id_and_name_from_target(target):
    id = target.find("{http://www.drugbank.ca}id").text
    name = target.find("{http://www.drugbank.ca}name").text
    effect = target.find("*{http://www.drugbank.ca}category") #Blood Glucose Lowering
    gene = target.find("*{http://www.drugbank.ca}gene-name")
    #glucose_categories = find_categories_matching_string(target, "Blood Glucose Lowering Agents")
    #diabetes_categories = find_categories_matching_string(target, "Drugs Used in Diabetes")

    
    if gene is None:
        gene_found = None
        
    else:
        gene_found= gene.text
    return id, name, gene_found
#function that returns the target id, target name, and gene name of a particular sample

In [62]:
all_drugs = tree.findall("{http://www.drugbank.ca}drug")

In [63]:
new_results = []
total_found = 0
for drug in all_drugs:
    drug_name = drug.find("{http://www.drugbank.ca}name").text
    drug_id = drug.find("{http://www.drugbank.ca}drugbank-id[@primary]").text
    element_containing_all_targets = drug.find("{http://www.drugbank.ca}targets")
    list_of_targets = element_containing_all_targets.findall("{http://www.drugbank.ca}target")
    glucose_categories = find_categories_matching_string(drug, "Blood Glucose Lowering Agents")
    diabetes_categories = find_categories_matching_string(drug, "Drugs Used in Diabetes")
    if glucose_categories == []:
        glucose_categories = None
    if diabetes_categories == []:
        diabetes_categories = None
    for target in list_of_targets:
        target_id, target_name, target_gene = get_id_and_name_from_target(target)
        new_results.append([drug_name, drug_id, target_id, target_name, target_gene, glucose_categories, diabetes_categories])
    total_found += 1
#creates an array and list that contains the drug name, drug id, target id, target name, and target gene as extracted from drug bank

In [64]:
second_results = []


In [65]:
total_found = 0
for drug in all_drugs:
    drug_name = drug.find("{http://www.drugbank.ca}name").text
    drug_id = drug.find("{http://www.drugbank.ca}drugbank-id[@primary]").text
    element_containing_all_targets = drug.find("{http://www.drugbank.ca}targets")
    list_of_targets = element_containing_all_targets.findall("{http://www.drugbank.ca}target")
    for target in list_of_targets:
        target_id, target_name, target_gene = get_id_and_name_from_target(target)
        second_results.append([drug_name, drug_id, target_id, target_name, target_gene])
    total_found += 1
#creates an array and list that contains the drug name, drug id, target id, target name, and target gene as extracted from drug bank

In [66]:
test_results = []
test_found = 0
for drug in all_drugs:
    drug_name = drug.find("{http://www.drugbank.ca}name").text
    drug_id = drug.find("{http://www.drugbank.ca}drugbank-id[@primary]").text
    element_containing_all_targets = drug.find("{http://www.drugbank.ca}targets")
    list_of_targets = element_containing_all_targets.findall("{http://www.drugbank.ca}target")
    diabetes_categories = find_categories_matching_string(drug, "Drugs Used in Diabetes")
    if diabetes_categories == []:
        diabetes_categories = None
    for target in list_of_targets:
        target_id, target_name, target_gene = get_id_and_name_from_target(target)
        test_results.append([drug_name, drug_id, target_id, target_name, target_gene, diabetes_categories])
    test_found += 1

In [67]:
drug = all_drugs[0]
element_containing_all_targets = drug.find("{http://www.drugbank.ca}targets")
list_of_targets = element_containing_all_targets.findall("{http://www.drugbank.ca}target")
target = list_of_targets[0]


In [68]:
drug

<Element '{http://www.drugbank.ca}drug' at 0x23efd1d68>

In [69]:
df = pd.DataFrame(new_results) #df contains the results from the parsing of the xml file from drugbank

In [70]:
df.columns = ["drug_name", "drug_id", "target_id", "target_name", "target_gene","target_effect_glucose_?","target_effect_diabetes"] #renaming the columns of the dataframe
#"target_effect_glucose_?"


In [71]:
df50 = df

In [72]:
df.shape

(18495, 7)

In [73]:
genes_wanted = df.dropna()['target_gene'].unique()


In [74]:
df50.dropna().head()

Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene,target_effect_glucose_?,target_effect_diabetes
102,Insulin Human,DB00030,BE0000033,Insulin receptor,INSR,[[]],[[]]
103,Insulin Human,DB00030,BE0000858,Insulin-like growth factor 1 receptor,IGF1R,[[]],[[]]
104,Insulin Human,DB00030,BE0002123,Retinoblastoma-associated protein,RB1,[[]],[[]]
105,Insulin Human,DB00030,BE0000941,Cathepsin D,CTSD,[[]],[[]]
106,Insulin Human,DB00030,BE0001183,Insulin-degrading enzyme,IDE,[[]],[[]]


In [75]:
string_to_write = "\n".join(genes_wanted)


In [76]:
with open("output.txt", "w") as f:
    f.write(string_to_write)
    

In [77]:
df.head()


Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene,target_effect_glucose_?,target_effect_diabetes
0,Lepirudin,DB00001,BE0000048,Prothrombin,F2,,
1,Cetuximab,DB00002,BE0000767,Epidermal growth factor receptor,EGFR,,
2,Cetuximab,DB00002,BE0000901,Low affinity immunoglobulin gamma Fc region re...,FCGR3B,,
3,Cetuximab,DB00002,BE0002093,Complement C1r subcomponent,C1R,,
4,Cetuximab,DB00002,BE0002094,Complement C1q subcomponent subunit A,C1QA,,


In [78]:
df.dropna(subset = ['target_effect_glucose_?']) #only drops nans for whether they effect glucose. There are more drugs that target glucose than target diabetes


Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene,target_effect_glucose_?,target_effect_diabetes
102,Insulin Human,DB00030,BE0000033,Insulin receptor,INSR,[[]],[[]]
103,Insulin Human,DB00030,BE0000858,Insulin-like growth factor 1 receptor,IGF1R,[[]],[[]]
104,Insulin Human,DB00030,BE0002123,Retinoblastoma-associated protein,RB1,[[]],[[]]
105,Insulin Human,DB00030,BE0000941,Cathepsin D,CTSD,[[]],[[]]
106,Insulin Human,DB00030,BE0001183,Insulin-degrading enzyme,IDE,[[]],[[]]
107,Insulin Human,DB00030,BE0002124,Neuroendocrine convertase 2,PCSK2,[[]],[[]]
108,Insulin Human,DB00030,BE0001123,Carboxypeptidase E,CPE,[[]],[[]]
109,Insulin Human,DB00030,BE0002125,Neuroendocrine convertase 1,PCSK1,[[]],[[]]
110,Insulin Human,DB00030,BE0001147,Protein NOV homolog,NOV,[[]],[[]]
111,Insulin Human,DB00030,BE0000942,Low-density lipoprotein receptor-related prote...,LRP2,[[]],[[]]


In [79]:
df2 = pd.DataFrame(second_results)
df2.columns = ["drug_name", "drug_id", "target_id", "target_name", "target_gene"] #renaming the columns of the dataframe
df2.head()


Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene
0,Lepirudin,DB00001,BE0000048,Prothrombin,F2
1,Cetuximab,DB00002,BE0000767,Epidermal growth factor receptor,EGFR
2,Cetuximab,DB00002,BE0000901,Low affinity immunoglobulin gamma Fc region re...,FCGR3B
3,Cetuximab,DB00002,BE0002093,Complement C1r subcomponent,C1R
4,Cetuximab,DB00002,BE0002094,Complement C1q subcomponent subunit A,C1QA


In [80]:
df2["drug_name"] = df2["drug_name"].str.lower() #all drug names in the dataframe are made lower case for simplicity 


In [81]:
indexed_df = df2.set_index("drug_name") #new dataframe where the index is drug_name


In [82]:
indexed_df.head()


Unnamed: 0_level_0,drug_id,target_id,target_name,target_gene
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
lepirudin,DB00001,BE0000048,Prothrombin,F2
cetuximab,DB00002,BE0000767,Epidermal growth factor receptor,EGFR
cetuximab,DB00002,BE0000901,Low affinity immunoglobulin gamma Fc region re...,FCGR3B
cetuximab,DB00002,BE0002093,Complement C1r subcomponent,C1R
cetuximab,DB00002,BE0002094,Complement C1q subcomponent subunit A,C1QA


In [83]:
#Find all drugs that have at most two targets and all targets that have at least two drugs targeting it
df3 = df['drug_name'].value_counts() 
df4 = df3.to_frame(name=None)
df5 = df4.reset_index()
df6 = df5.rename(columns={'drug_name':'number_of_targets'}, inplace=False)


In [84]:
df7 = df6.rename(columns={'index':'drug_name'}, inplace=False)
df7.head()
#df7 is a dataframe that displays the number of targets given a drug 


Unnamed: 0,drug_name,number_of_targets
0,Fostamatinib,300
1,Artenimol,192
2,Copper,147
3,NADH,144
4,Zinc,124


In [85]:
#purpose of this cell is to find the number of drugs that target a specific target. Only targets that have more than two drugs targeting it are desired 
target_freq = df['target_id'].value_counts() #number of drugs that target a given target ID in a series
df8 = target_freq.to_frame(name=None)
df9 = df8.reset_index()
df10 = df9.rename(columns={'target_id':'target_freq'}, inplace=False)
df11 = df10.rename(columns={'index':'target_id'}, inplace=False)
df11.head(10)
#df20 is a datagrame that shows the total number of drugs that target a given target ID


Unnamed: 0,target_id,target_freq
0,BE0001072,137
1,BE0000048,112
2,BE0000123,111
3,BE0000756,108
4,BE0000451,105
5,BE0000442,104
6,BE0004796,101
7,BE0000322,100
8,BE0001739,99
9,BE0000092,97


In [86]:
df12 = df7[(df7['number_of_targets'])<3]  
df12.head()
#df10 is a dataframe that only contains drugs that have a maximum of two targets


Unnamed: 0,drug_name,number_of_targets
1508,"Interferon Alfa-2b, Recombinant",2
1509,N-(4-(2-((3-Chlorophenylmethyl)Amino)Ethyl)Phe...,2
1510,Befunolol,2
1511,Progabide,2
1512,"2,4-Diamino-5-(3,4,5-Trimethoxy-Benzyl)-Pyrimi...",2


In [87]:
df12['drug_name'].unique() #displays all unique genes that at most two targets. There are 5891 genes that fit this criteria

array(['Interferon Alfa-2b, Recombinant',
       'N-(4-(2-((3-Chlorophenylmethyl)Amino)Ethyl)Phenyl)-2-Thiophecarboxamidine',
       'Befunolol', ..., 'Prasugrel', 'Secukinumab',
       '(3Z,5S,6R,7S,8R,8aR)-3-(octylimino)hexahydro[1,3]oxazolo[3,4-a]pyridine-5,6,7,8-tetrol'],
      dtype=object)

In [88]:
df13 = df11[(df11['target_freq'])>1]  #df21 is a dataframe that contains all targets that have at least two drugs targeting it
df13.head(15) #displays first 15 targets 


Unnamed: 0,target_id,target_freq
0,BE0001072,137
1,BE0000048,112
2,BE0000123,111
3,BE0000756,108
4,BE0000451,105
5,BE0000442,104
6,BE0004796,101
7,BE0000322,100
8,BE0001739,99
9,BE0000092,97


In [89]:
df14 = pd.merge(df, df12, on='drug_name', how='inner')
df15 = df14.dropna(subset= ['target_effect_glucose_?', 'target_gene'])
df15.head()


Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene,target_effect_glucose_?,target_effect_diabetes,number_of_targets
35,Insulin Lispro,DB00046,BE0000033,Insulin receptor,INSR,[[]],[[]],2
36,Insulin Lispro,DB00046,BE0000858,Insulin-like growth factor 1 receptor,IGF1R,[[]],[[]],2
37,Insulin Glargine,DB00047,BE0000033,Insulin receptor,INSR,[[]],[[]],2
38,Insulin Glargine,DB00047,BE0000858,Insulin-like growth factor 1 receptor,IGF1R,[[]],[[]],2
154,Sulfisoxazole,DB00263,BE0000803,Dihydropteroate synthase,folP,[[]],,1


In [90]:
df16 = pd.merge(df15, df13, on='target_id', how='outer') #merges df21(which contains all targets with more than two drugs targetin it) and df27 (the current master dataframe)
df16.dropna().head()
#there are several Nan values for targets that had less than two drugs targeting it or targets that werent in the previous master dataframe (like BE0001072). 
#These must be removed (and it is done in df29)

Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene,target_effect_glucose_?,target_effect_diabetes,number_of_targets,target_freq
0,Insulin Lispro,DB00046,BE0000033,Insulin receptor,INSR,[[]],[[]],2.0,16.0
1,Insulin Glargine,DB00047,BE0000033,Insulin receptor,INSR,[[]],[[]],2.0,16.0
2,Insulin Aspart,DB01306,BE0000033,Insulin receptor,INSR,[[]],[[]],1.0,16.0
3,Insulin Detemir,DB01307,BE0000033,Insulin receptor,INSR,[[]],[[]],1.0,16.0
4,Insulin Glulisine,DB01309,BE0000033,Insulin receptor,INSR,[[]],[[]],1.0,16.0


In [91]:
genes_desired = df16.dropna()['target_gene'].unique()
string_to_write = "\n".join(genes_desired)
with open("genes_desired.txt", "w") as f:
    f.write(string_to_write)
    

In [92]:
drugs_used = df16.dropna()['drug_name'].unique()
string_to_write = "\n".join(drugs_used)
with open("drugs_used.txt", "w")as f:
    f.write(string_to_write)
    

In [93]:
df17 = pd.DataFrame(test_results)
df17.columns = ["drug_name","drug_id","target_id","target_name","target_gene", "target_effect_diabetes"]
df18 = df17.dropna() #displays all drugs used in diabetes


In [94]:
df18['drug_name'].unique()


array(['Insulin Human', 'Insulin Lispro', 'Insulin Glargine',
       'Insulin Pork', 'Troglitazone', 'Glimepiride', 'Acarbose',
       'Metformin', 'Rosiglitazone', 'Acetohexamide', 'Miglitol',
       'Simvastatin', 'Chlorpropamide', 'Nateglinide', 'Tolazamide',
       'Repaglinide', 'Phenformin', 'Glyburide', 'Glipizide',
       'Gliclazide', 'Tolbutamide', 'Pioglitazone', 'Gliquidone',
       'Mitiglinide', 'Sitagliptin', 'Exenatide', 'Pramlintide',
       'Glisoxepide', 'Insulin Aspart', 'Insulin Detemir',
       'Insulin Glulisine', 'Glycodiazine', 'Tolrestat', 'Vildagliptin',
       'Voglibose', 'Alogliptin', 'Dapagliflozin', 'Saxagliptin',
       'Liraglutide', 'Linagliptin', 'Canagliflozin', 'Empagliflozin',
       'Albiglutide', 'Dulaglutide', 'Lixisenatide', 'Insulin Degludec'],
      dtype=object)

In [95]:
df29 = pd.merge(df18, df12, on='drug_name', how='inner')
df30 = df29.dropna(subset= ['target_effect_diabetes', 'target_gene'])


In [96]:
df31 = pd.merge(df30, df13, on='target_id', how='outer')
df32 = df31.dropna()


In [97]:
df32['target_gene'].unique()


array(['INSR', 'IGF1R', 'PRKAB1', 'KCNJ1', 'ABCC8', 'PPARG', 'PRKAA1',
       'KCNJ8', 'VEGFA', 'DPP4', 'GLP1R', 'MGAM', 'SLC5A2'], dtype=object)

In [98]:
df33 = df.dropna(subset = ['target_effect_glucose_?', 'target_effect_diabetes']) #only displays samples of drugs that effect both glucose and diabetes


In [99]:
df34 = pd.merge(df33, df12, on='drug_name', how='inner')
df35 = df34.dropna()
df35.head()

Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene,target_effect_glucose_?,target_effect_diabetes,number_of_targets
0,Insulin Lispro,DB00046,BE0000033,Insulin receptor,INSR,[[]],[[]],2
1,Insulin Lispro,DB00046,BE0000858,Insulin-like growth factor 1 receptor,IGF1R,[[]],[[]],2
2,Insulin Glargine,DB00047,BE0000033,Insulin receptor,INSR,[[]],[[]],2
3,Insulin Glargine,DB00047,BE0000858,Insulin-like growth factor 1 receptor,IGF1R,[[]],[[]],2
4,Metformin,DB00331,BE0000254,5'-AMP-activated protein kinase subunit beta-1,PRKAB1,[[]],[[]],1


In [100]:
df36 = pd.merge(df35, df13, on='target_id', how='outer')
df37 = df36.dropna()
df37.loc[df37['target_gene'].isin(["ABCC8"])]
#df37.head()
#df37 includes all samples that affects both diabetes and glucose levels and fits within the criteria of max two target & >two drugs per target

Unnamed: 0,drug_name,drug_id,target_id,target_name,target_gene,target_effect_glucose_?,target_effect_diabetes,number_of_targets,target_freq
12,Chlorpropamide,DB00672,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],1.0,11.0
13,Nateglinide,DB00731,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
14,Repaglinide,DB00912,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
15,Glipizide,DB01067,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
16,Gliclazide,DB01120,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
17,Tolbutamide,DB01124,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
18,Gliquidone,DB01251,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
19,Mitiglinide,DB01252,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
20,Glycodiazine,DB01382,BE0000207,ATP-binding cassette sub-family C member 8,ABCC8,[[]],[[]],2.0,11.0
25,Tolazamide,DB00839,BE0008670,"Sulfonylurea receptor 1, Kir6.2",ABCC8,[[]],[[]],1.0,2.0


In [101]:
df37['target_gene'].unique() #unique genes that effect diabetes AND glucose levels

array(['INSR', 'IGF1R', 'PRKAB1', 'KCNJ1', 'ABCC8', 'PPARG', 'PRKAA1',
       'KCNJ8', 'VEGFA', 'DPP4', 'GLP1R', 'MGAM', 'SLC5A2'], dtype=object)

In [102]:
df37['drug_name'].unique()

array(['Insulin Lispro', 'Insulin Glargine', 'Insulin Aspart',
       'Insulin Detemir', 'Insulin Glulisine', 'Insulin Degludec',
       'Metformin', 'Acetohexamide', 'Tolbutamide', 'Glycodiazine',
       'Chlorpropamide', 'Nateglinide', 'Repaglinide', 'Glipizide',
       'Gliclazide', 'Gliquidone', 'Mitiglinide', 'Tolazamide',
       'Phenformin', 'Glisoxepide', 'Sitagliptin', 'Vildagliptin',
       'Alogliptin', 'Saxagliptin', 'Linagliptin', 'Exenatide',
       'Liraglutide', 'Albiglutide', 'Dulaglutide', 'Lixisenatide',
       'Voglibose', 'Dapagliflozin', 'Canagliflozin', 'Empagliflozin'],
      dtype=object)

In [103]:
df38 = df36.dropna(subset = ['target_effect_glucose_?'])
df38['target_gene'].unique() #Genes Specific to glucose only (othere are overlaps with diabetes-affecting target)

array(['INSR', 'IGF1R', 'PRKAB1', 'KCNJ1', 'ABCC8', 'PPARG', 'PRKAA1',
       'KCNJ8', 'VEGFA', 'DPP4', 'GLP1R', 'MGAM', 'SLC5A2', 'SLC5A1'],
      dtype=object)

In [104]:
df39 = df36.dropna(subset = ['target_effect_diabetes'])
df39['target_gene'].unique() #genes specific to diabetes only (there are overlaps with glucose-affecting targets)

array(['INSR', 'IGF1R', 'PRKAB1', 'KCNJ1', 'ABCC8', 'PPARG', 'PRKAA1',
       'KCNJ8', 'VEGFA', 'DPP4', 'GLP1R', 'MGAM', 'SLC5A2', 'SLC5A1'],
      dtype=object)