# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
# Identifier of the disease whose result files are analysed. Every per-disease
# input file name starts with it, so it is defined once and reused instead of
# being repeated inside each path.
disease_id = "0007079"

# Per-disease inputs (tab-separated text files in Input/).
seeds_file = "Input/{}.txt".format(disease_id)
betweenness_file = "Input/{}_added_200_dmd_betweenness_hub_0.01.txt".format(disease_id)
significance_file = "Input/{}_added_200_dmd_significance_hub_1.txt".format(disease_id)

# Inputs that do not depend on the selected disease.
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
# Disease whose result files are loaded in this cell.
disease_id = "0007079"

# Every input is tab-separated. The seed file has no header row, so the seed
# IDs are taken from its first column (label 0 when header=None).
seeds = pd.read_csv(seeds_file, sep="\t", header=None).iloc[:, 0]

# The two ranking files carry a header row; only their 'node' column is kept.
betweenness = pd.read_csv(betweenness_file, sep="\t").loc[:, 'node']
significance = pd.read_csv(significance_file, sep="\t").loc[:, 'node']

# Header-less disease tables, kept as complete DataFrames.
diseases = pd.read_csv(diseases_file, header=None, sep="\t")
disease_clusters = pd.read_csv(disease_clusters_file, header=None, sep="\t")

In [4]:
import timeit

# Stopwatch scaffold: place the code to be measured between the two timer reads.
start = timeit.default_timer()

stop = timeit.default_timer()
elapsed = stop - start  # seconds between the two reads
print('Time: ', elapsed)

Time:  4.699100099969655e-05


## Network from NeDReX

In [44]:
from urllib import request, parse
import json

In [45]:
# NeDReX API endpoint that receives the graph-build request posted further down.
url = 'https://api.nedrex.net/graph_builder'

In [46]:
# Graph specification that is serialised to JSON and posted to the graph builder.
# NOTE(review): the meaning of each option is defined by the NeDReX API, not by
# this notebook; confirm against the API documentation before changing values.
myobj = dict(
    nodes=["protein"],                         # node collections to include
    edges=["protein_interacts_with_protein"],  # edge collections to include
    iid_evidence=["exp"],
    ppi_self_loops=True,
    taxid=[9606],                              # presumably NCBI taxonomy ID for human
    concise=True,
    include_omim=True,
    disgenet_threshold=0,
    use_omim_ids=False,
)

In [47]:
# Serialise the graph specification to UTF-8 encoded JSON for the request body.
data = json.dumps(myobj).encode('utf8')

# Supplying `data` makes urllib issue a POST. The payload is declared as JSON
# explicitly: without a Content-Type header urllib labels the body as
# application/x-www-form-urlencoded, which does not describe what is sent.
req = request.Request(url, data=data, headers={'Content-Type': 'application/json'})

# Bound the wait so an unresponsive server cannot block the notebook forever.
resp = request.urlopen(req, timeout=60)

In [48]:
# Consume the reply body once and show it as text. In the recorded run the
# server answered with a quoted UUID string (presumably the ID of the graph
# build; verify against the NeDReX API).
raw_reply = resp.read()
print(raw_reply.decode('utf8'))

"e1e15418-2c09-4c9e-8ad5-ed02aed4d5a8"


## Enriched gene values

In [4]:
import gseapy

In [9]:
# Short identifier-type name -> column of gene_id_mapping holding that identifier.
ID_TYPE_KEY = {
    'entrez': 'entrezgene',
    'ensembl': 'ensembl.gene',
    'symbol': 'symbol',
    'uniprot': 'uniprot.Swiss-Prot',
}

In [51]:
# Enrichr gene-set library name -> short label used as key of the result dict.
# The library names (keys) are what gets passed to gseapy.enrichr as gene_sets.
ENRICH_KEY = {
    'GO_Molecular_Function_2015': 'go.MF',
    'GO_Biological_Process_2015': 'go.BP',
    'GO_Cellular_Component_2015': 'go.CC',
    'KEGG_2016': 'pathway.kegg',
}

In [5]:
# Display the seed IDs read from seeds_file (the recorded output shows
# UniProt-style accessions such as P28223).
seeds

0    P28223
1    P00325
2    P00326
3    P47869
Name: 0, dtype: object

In [8]:
# Lookup table between gene identifier systems; its columns are the ones
# referenced through ID_TYPE_KEY (entrezgene, symbol, uniprot.Swiss-Prot,
# ensembl.gene in the recorded output).
gene_id_mapping = pd.read_csv("../mapping_files/gene_id_mapping.csv")

# Preview the first rows (head() shows 5 rows by default).
gene_id_mapping.head()

Unnamed: 0,entrezgene,symbol,uniprot.Swiss-Prot,ensembl.gene
0,1,A1BG,P04217,ENSG00000121410
1,2,A2M,P01023,ENSG00000175899
2,9,NAT1,P18440,ENSG00000171428
3,10,NAT2,P11245,ENSG00000156006
4,12,SERPINA3,P01011,ENSG00000196136


In [11]:
# Rows of the mapping table whose UniProt accession is one of the seeds ...
is_seed_row = gene_id_mapping[ID_TYPE_KEY['uniprot']].isin(seeds)
# ... and the gene symbols recorded for those rows.
gene_id_mapping.loc[is_seed_row, 'symbol']

73       ADH1B
74       ADH1C
1502    GABRA2
1924     HTR2A
Name: symbol, dtype: object

In [30]:
# Translate the seed UniProt accessions into gene symbols, which is the
# identifier type handed to Enrichr here.
seed_row_mask = gene_id_mapping[ID_TYPE_KEY['uniprot']].isin(seeds)
seed_symbols = gene_id_mapping.loc[seed_row_mask, 'symbol'].tolist()

# Query every library listed in ENRICH_KEY in a single call.
enrichr_run = gseapy.enrichr(
    gene_list=seed_symbols,
    description='atts',
    gene_sets=list(ENRICH_KEY),
    cutoff=0.05,
)

# Keep only the terms whose adjusted p-value is below 0.05.
# NOTE(review): the explicit filter is applied in addition to `cutoff`;
# gseapy's docs describe `cutoff` as affecting the figure only, so confirm
# for the installed version before removing either.
all_terms = enrichr_run.results
enrichr_df_target = all_terms[all_terms['Adjusted P-value'] < 0.05]
enrichr_df_target



Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,GO_Molecular_Function_2015,alcohol dehydrogenase (NAD) activity (GO:0004022),2/8,8.396276e-07,0.000022,0,0,3331.666667,46611.040785,ADH1C;ADH1B
1,GO_Molecular_Function_2015,"oxidoreductase activity, acting on the CH-OH g...",2/115,1.951745e-04,0.002195,0,0,175.955752,1502.946605,ADH1C;ADH1B
2,GO_Molecular_Function_2015,"oxidoreductase activity, acting on CH-OH group...",2/131,2.532625e-04,0.002195,0,0,154.007752,1275.351143,ADH1C;ADH1B
3,GO_Molecular_Function_2015,serotonin binding (GO:0051378),1/8,1.599128e-03,0.009354,0,0,951.857143,6128.338527,HTR2A
4,GO_Molecular_Function_2015,amine binding (GO:0043176),1/9,1.798886e-03,0.009354,0,0,832.833333,5263.996189,HTR2A
...,...,...,...,...,...,...,...,...,...,...
189,KEGG_2016,Inflammatory mediator regulation of TRP channe...,1/98,1.945771e-02,0.023521,0,0,68.381443,269.389509,HTR2A
190,KEGG_2016,Retrograde endocannabinoid signaling Homo sapi...,1/101,2.004884e-02,0.023521,0,0,66.320000,259.283608,GABRA2
191,KEGG_2016,Metabolic pathways Homo sapiens hsa01100,2/1239,2.115550e-02,0.023521,0,0,15.164915,58.473719,ADH1C;ADH1B
192,KEGG_2016,Serotonergic synapse Homo sapiens hsa04726,1/112,2.221403e-02,0.023521,0,0,59.714715,227.335771,HTR2A


In [31]:
# Term names of the significant hits that come from the KEGG_2016 library.
from_kegg = enrichr_df_target["Gene_set"] == "KEGG_2016"
enrichr_df_target.loc[from_kegg, "Term"]

176            Tyrosine metabolism Homo sapiens hsa00350
177         Fatty acid degradation Homo sapiens hsa00071
178             Retinol metabolism Homo sapiens hsa00830
179    Glycolysis / Gluconeogenesis Homo sapiens hsa0...
180    Drug metabolism - cytochrome P450 Homo sapiens...
181    Metabolism of xenobiotics by cytochrome P450 H...
182        Chemical carcinogenesis Homo sapiens hsa05204
183    Neuroactive ligand-receptor interaction Homo s...
184             Nicotine addiction Homo sapiens hsa05033
185             Taste transduction Homo sapiens hsa04742
186              GABAergic synapse Homo sapiens hsa04727
187                   Gap junction Homo sapiens hsa04540
188             Morphine addiction Homo sapiens hsa05032
189    Inflammatory mediator regulation of TRP channe...
190    Retrograde endocannabinoid signaling Homo sapi...
191             Metabolic pathways Homo sapiens hsa01100
192           Serotonergic synapse Homo sapiens hsa04726
193      Calcium signaling path

In [37]:
# Add a 'Term_ID' column (position 2) holding the GO or KEGG identifier that is
# embedded in each term name, e.g. 'GO:0004022' or 'hsa00350'.
#
# * The column-existence check keeps the cell re-runnable: DataFrame.insert
#   raises ValueError when the column is already present.
# * '+' (not '*') requires at least one digit, so a bare 'GO:' or 'hsa'
#   fragment inside a term name is never taken for an identifier.
# * Terms without any identifier get NaN.
if len(enrichr_df_target) > 0 and 'Term_ID' not in enrichr_df_target.columns:
    term_ids = enrichr_df_target['Term'].str.extract(r'(GO:[0-9]+|hsa[0-9]+)')[0]
    enrichr_df_target.insert(2, 'Term_ID', term_ids)

In [50]:
# Spread the term IDs into one column per gene-set library. Rows keep the index
# of enrichr_df_target, so each row is filled only in the column of the library
# its term belongs to (NaN elsewhere).
library_and_id = enrichr_df_target[['Gene_set', 'Term_ID']]
pivot = library_and_id.pivot(columns='Gene_set')

# Show the per-library table stored under the 'Term_ID' column level.
pivot['Term_ID']

Gene_set,GO_Biological_Process_2015,GO_Molecular_Function_2015,KEGG_2016
0,,GO:0004022,
1,,GO:0016616,
2,,GO:0016614,
3,,GO:0051378,
4,,GO:0043176,
...,...,...,...
189,,,hsa04750
190,,,hsa04723
191,,,hsa01100
192,,,hsa04726


In [49]:
# Unique KEGG pathway IDs among the significant terms (NaN placeholders dropped).
kegg_ids = pivot['Term_ID']['KEGG_2016'].dropna()
set(kegg_ids)

{'hsa00010',
 'hsa00071',
 'hsa00350',
 'hsa00830',
 'hsa00980',
 'hsa00982',
 'hsa01100',
 'hsa04020',
 'hsa04080',
 'hsa04540',
 'hsa04723',
 'hsa04726',
 'hsa04727',
 'hsa04742',
 'hsa04750',
 'hsa05032',
 'hsa05033',
 'hsa05204'}

In [54]:
# For every library in ENRICH_KEY collect the set of enriched term IDs, keyed by
# the library's short label. A library with no column in the pivot table (no
# significant term) is represented by an empty set.
ids_by_library = pivot['Term_ID']
dicto = {
    label: set(ids_by_library[library].dropna()) if library in ids_by_library else set()
    for library, label in ENRICH_KEY.items()
}
dicto

{'go.MF': {'GO:0001965',
  'GO:0004022',
  'GO:0004890',
  'GO:0004993',
  'GO:0005230',
  'GO:0005253',
  'GO:0005254',
  'GO:0008144',
  'GO:0008227',
  'GO:0008270',
  'GO:0015103',
  'GO:0015108',
  'GO:0015276',
  'GO:0016614',
  'GO:0016616',
  'GO:0016917',
  'GO:0022834',
  'GO:0030594',
  'GO:0043176',
  'GO:0051378'},
 'go.BP': {'GO:0001505',
  'GO:0001659',
  'GO:0003013',
  'GO:0003018',
  'GO:0006066',
  'GO:0006067',
  'GO:0006069',
  'GO:0006109',
  'GO:0006110',
  'GO:0006805',
  'GO:0006821',
  'GO:0006836',
  'GO:0006939',
  'GO:0007200',
  'GO:0007202',
  'GO:0007204',
  'GO:0007210',
  'GO:0007214',
  'GO:0007268',
  'GO:0007613',
  'GO:0009266',
  'GO:0010517',
  'GO:0010518',
  'GO:0010675',
  'GO:0010676',
  'GO:0010863',
  'GO:0014059',
  'GO:0014065',
  'GO:0014820',
  'GO:0014824',
  'GO:0014829',
  'GO:0014848',
  'GO:0015698',
  'GO:0016048',
  'GO:0019229',
  'GO:0030431',
  'GO:0030534',
  'GO:0032845',
  'GO:0034308',
  'GO:0035150',
  'GO:0042220',
  'GO

In [None]:
import timeit

# Stopwatch scaffold: place the code to be measured between the two timer reads.
start = timeit.default_timer()

stop = timeit.default_timer()
elapsed = stop - start  # seconds between the two reads
print('Time: ', elapsed)