# Testing environment for DoSE

## Setup

### Load libraries

In [33]:
import timeit

import gseapy
import pandas as pd
from biothings_client import get_client

### Define data

In [3]:
# Input files, given as paths relative to the notebook's working directory.
# All of them are read as tab-separated tables in the "Load data" cell.
seeds_file = "Input/0007079.txt"
# The two files below are read with a header row and must contain a 'node' column.
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
# Disease tables; both are read without a header row.
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [5]:
# Every input is tab-separated; pd.read_table uses "\t" as its default separator.

# Headerless seed file: keep only the first column as a Series.
seeds = pd.read_table(seeds_file, header=None)[0]

# Files with a header row: keep only the 'node' column.
betweenness = pd.read_table(betweenness_file)['node']
significance = pd.read_table(significance_file)['node']

# Headerless disease tables, kept as full DataFrames.
diseases = pd.read_table(diseases_file, header=None)
disease_clusters = pd.read_table(disease_clusters_file, header=None)

In [6]:
# `timeit` is imported in the import cell at the top of the notebook.
# NOTE(review): nothing runs between `start` and `stop`, so the printed value
# is only the cost of two consecutive timer reads. Put the code to be timed
# between the two calls.
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  2.2973000000092725e-05


## Mapper

In [34]:
# Gene client from biothings_client; passed as `mg` to the gene-mapping helpers below.
mg = get_client("gene")

In [65]:
def get_gene_mapping(mg, gene_set):
    """
    Look up the gene symbol for each identifier in ``gene_set``.

    The identifiers are sent in one ``querymany`` call that searches the
    UniProt Swiss-Prot, symbol, Ensembl gene and Entrez gene scopes for
    human and requests only the ``symbol`` field.

    :param mg: Client object exposing ``querymany``
    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    query_options = {
        'scopes': 'uniprot.Swiss-Prot,symbol,ensembl.gene,entrezgene',
        'fields': 'symbol',
        'species': 'human',
        'returnall': False,
        'as_dataframe': True,
        'df_index': False,
    }
    return mg.querymany(gene_set, **query_options)

def get_gene_to_keggpathway(mg, gene_set):
    """
    Look up KEGG pathway ids for each identifier in ``gene_set``.

    The identifiers are sent in one ``querymany`` call that searches the
    UniProt Swiss-Prot, symbol, Ensembl gene and Entrez gene scopes for
    human and requests only the ``pathway.kegg.id`` field.

    :param mg: Client object exposing ``querymany``
    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    query_options = {
        'scopes': 'uniprot.Swiss-Prot,symbol,ensembl.gene,entrezgene',
        'fields': 'pathway.kegg.id',
        'species': 'human',
        'returnall': False,
        'as_dataframe': True,
        'df_index': False,
    }
    return mg.querymany(gene_set, **query_options)

In [66]:
# Time the symbol lookups: seeds give the reference mapping, the
# significance node list gives the target mapping.
start = timeit.default_timer()
reference_mapping = get_gene_mapping(mg,seeds)
target_mapping = get_gene_mapping(mg,significance)
stop = timeit.default_timer()
# Elapsed wall-clock seconds for both lookups together.
print('Time: ', stop - start)

querying 1-4...done.
Finished.
querying 1-200...done.
Finished.
1 input query terms found dup hits:
	[('P34998', 2)]
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
Time:  3.1941767309999705


In [67]:
# Show every row returned for P34998, the query the client reported as having duplicate hits.
target_mapping.loc[target_mapping['query'] == 'P34998']

Unnamed: 0,query,_id,_score,symbol
43,P34998,1394,16.838882,CRHR1
44,P34998,104909134,16.356878,LINC02210-CRHR1


In [68]:
# Time the KEGG pathway lookups for the seed list (reference) and the
# significance node list (target).
start = timeit.default_timer()
reference_kegg_mapping = get_gene_to_keggpathway(mg,seeds)
target_kegg_mapping = get_gene_to_keggpathway(mg,significance)
stop = timeit.default_timer()
# Elapsed wall-clock seconds for both lookups together.
print('Time: ', stop - start)

querying 1-4...done.
Finished.
querying 1-200...done.
Finished.
1 input query terms found dup hits:
	[('P34998', 2)]
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
Time:  3.4341397289999804


In [69]:
# Show every row returned for P34998 in the KEGG lookup; the client reported duplicate hits for it.
target_kegg_mapping.loc[target_kegg_mapping['query'] == 'P34998']

Unnamed: 0,query,_id,_score,pathway.kegg,pathway.kegg.id
43,P34998,104909134,16.818409,,
44,P34998,1394,16.644955,"[{'id': 'hsa04080'}, {'id': 'hsa04730'}, {'id'...",


In [38]:
# Disease client from biothings_client; passed as `md` to get_disease_mapping below.
md = get_client("disease")

In [58]:
def get_disease_mapping(md, diseases):
    """
    Request DisGeNET gene ids and variant rsids for a collection of diseases.

    :param md: Client object exposing ``getdiseases``
    :param diseases: Disease identifiers to look up
    :return: Dataframe returned by the client (``as_dataframe=True``)
    """
    # ``fields`` is a comma-separated list of field paths. The two paths were
    # previously fused into one non-existent field name because the separating
    # comma was missing.
    fields = ('disgenet.genes_related_to_disease.gene_id,'
              'disgenet.variants_related_to_disease.rsid')
    # NOTE(review): getdiseases presumably resolves records by their primary id;
    # confirm that the identifiers passed in are of that type, otherwise every
    # term comes back as not found.
    return md.getdiseases(diseases,
                          fields=fields,
                          species='human', returnall=False, as_dataframe=True, df_index=False)

In [59]:
# Look up the first column of the (headerless) diseases table.
# NOTE(review): the recorded output reports no hit for any of the 25 query
# terms (e.g. 'E10'), so this call currently yields only `notfound` rows.
get_disease_mapping(md,diseases[0])

querying 1-25...done.
Finished.
25 input query terms found no hit:
	['E10', 'E11', 'E12', 'E13', 'E14', 'E66', 'F00', 'F01', 'F02', 'F03', 'G20', 'G30', 'G43', 'I10', '
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


Unnamed: 0,query,notfound
0,E10,True
1,E11,True
2,E12,True
3,E13,True
4,E14,True
5,E66,True
6,F00,True
7,F01,True
8,F02,True
9,F03,True
