# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import timeit

import pandas as pd
import gseapy
from biothings_client import get_client

### Define data

In [2]:
# Disease-level inputs (raw table and its clustered counterpart).
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"

# Gene-level inputs: the seed list plus the two result files derived from it.
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
seeds_file = "Input/0007079.txt"

### Load data

In [3]:
# Gene lists: the seeds file is read without a header and column 0 is kept;
# the two result files are read with a header row and their 'node' column is kept.
seeds = pd.read_csv(seeds_file, header=None, sep="\t").loc[:, 0]
betweenness = pd.read_csv(betweenness_file, sep="\t").loc[:, 'node']
significance = pd.read_csv(significance_file, sep="\t").loc[:, 'node']

# Disease tables are read without a header and kept as full dataframes.
disease_clusters = pd.read_csv(disease_clusters_file, header=None, sep="\t")
diseases = pd.read_csv(diseases_file, header=None, sep="\t")

In [4]:
# Timing scaffold: put the statements to measure between `start` and `stop`.
# (`timeit` is imported together with the other libraries in the imports cell.)
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  1.1541999995756669e-05


## Mapper

In [90]:
# Load the cached mapping explicitly: `prev_mapping` was otherwise only
# available as leftover kernel state, so this cell failed on a fresh kernel.
prev_mapping = pd.read_csv('gene_mapping.csv')

# Cached rows whose Swiss-Prot accession appears in the significance list.
pre_df = prev_mapping[prev_mapping['uniprot.Swiss-Prot'].isin(significance)]
pre_df

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,64131,ENSG00000103489,XYLT1,Q86Y38
1,64131,ENSG00000285395,XYLT1,Q86Y38
2,1268,ENSG00000118432,CNR1,P21554
3,2770,ENSG00000127955,GNAI1,P63096
4,2864,ENSG00000126266,FFAR1,O14842
...,...,...,...,...
208,5795,ENSG00000149177,PTPRJ,Q12913
209,2690,ENSG00000112964,GHR,P10912
210,2322,ENSG00000122025,FLT3,P36888
211,27040,ENSG00000213658,LAT,O43561


In [91]:
# Short id-type name -> column name used in the mapping tables.
id_type_key = {
    'entrez': 'entrezgene',
    'ensembl': 'ensembl.gene',
    'name': 'symbol',
    'uniprot': 'uniprot.Swiss-Prot',
}

In [93]:
# Ids from the significance list that the cached mapping does not cover.
id_type = 'uniprot'
known_ids = set(prev_mapping[id_type_key[id_type]])
missing = list(set(significance) - known_ids)
missing

['O95467', 'P63092']

In [116]:
# All id columns except the one that serves as the query key.
gene_ids = [
    column
    for column in ['uniprot.Swiss-Prot', 'symbol', 'ensembl.gene', 'entrezgene']
    if column != 'uniprot.Swiss-Prot'
]
gene_ids

['symbol', 'ensembl.gene', 'entrezgene']

In [140]:
def get_gene_mapping(gene_set, id_type, mapping_file='gene_mapping.csv'):
    """
    Map gene ids to the other supported id types.

    Ids found in the local mapping file are served from there; only the
    remaining ids are queried through the "gene" client from ``get_client``.

    :param gene_set: Iterable (e.g. list or pandas Series) of gene ids
    :param id_type: Type of the given ids: 'entrez', 'ensembl', 'name' or 'uniprot'
    :param mapping_file: Path of the local CSV mapping file (default 'gene_mapping.csv')
    :return: Dataframe with the cached rows followed by the freshly queried rows;
             queried rows carry the columns entrezgene, ensembl.gene, symbol
             and uniprot.Swiss-Prot
    :raises KeyError: If ``id_type`` is not one of the supported types
    """
    # Kept local so the function does not depend on notebook-level state.
    id_columns = {'entrez': 'entrezgene', 'ensembl': 'ensembl.gene',
                  'name': 'symbol', 'uniprot': 'uniprot.Swiss-Prot'}
    output_columns = ['entrezgene', 'ensembl.gene', 'symbol', 'uniprot.Swiss-Prot']
    id_column = id_columns[id_type]

    # ===== Get mapping from local mapping file =====
    prev_mapping = pd.read_csv(mapping_file)
    pre_df = prev_mapping[prev_mapping[id_column].isin(gene_set)]

    # ===== Get mapping for missing values =====
    missing = list(set(gene_set) - set(prev_mapping[id_column]))
    if len(missing) > 0:
        gene_ids = [column for column in output_columns if column != id_column]
        mg = get_client("gene")
        mapping = mg.querymany(missing, scopes=id_column, fields=','.join(gene_ids),
                               species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.rename(columns={'query': id_column})

        # ===== Split if there are multiple ensembl ids =====
        if 'ensembl' in mapping:
            # explode() repeats index labels; reset them so the column
            # assignments below align row by row.
            mapping = mapping.explode('ensembl').reset_index(drop=True)
            exploded_ids = mapping['ensembl'].map(
                lambda entry: entry.get('gene') if isinstance(entry, dict) else entry)
            if 'ensembl.gene' in mapping:
                # Assign the result instead of a chained in-place fillna,
                # which is not guaranteed to write back into the frame.
                mapping['ensembl.gene'] = mapping['ensembl.gene'].fillna(exploded_ids)
            else:
                # Every hit had several Ensembl ids, so the flat column is absent.
                mapping['ensembl.gene'] = exploded_ids

        # reindex() adds any column the query did not return (e.g. when ids
        # were not found) as NaN instead of raising a KeyError.
        mapping = mapping.reindex(columns=output_columns)
        pre_df = pd.concat([pre_df, mapping]).reset_index(drop=True)
    # TODO: Override old local data
    return pre_df

def get_gene_to_keggpathway(mg, gene_set):
    """
    Look up KEGG pathway ids for a collection of genes.

    :param mg: Client object exposing ``querymany``
    :param gene_set: Gene ids; matched against the Swiss-Prot, symbol,
                     Ensembl gene and Entrez gene scopes
    :return: Result of ``mg.querymany`` requested as a dataframe
    """
    query_options = {
        'scopes': 'uniprot.Swiss-Prot,symbol,ensembl.gene,entrezgene',
        'fields': 'pathway.kegg.id',
        'species': 'human',
        'returnall': False,
        'as_dataframe': True,
        'df_index': False,
    }
    return mg.querymany(gene_set, **query_options)

In [141]:
# Time the id mapping of both gene lists (seeds first, then the significance list).
start = timeit.default_timer()
reference_mapping = get_gene_mapping(seeds, 'uniprot')
target_mapping = get_gene_mapping(significance, 'uniprot')
stop = timeit.default_timer()

elapsed = stop - start
print('Time: ', elapsed)

querying 1-4...done.
Finished.
querying 1-2...done.
Finished.
Time:  2.9009012730002723


In [142]:
# Display the mapping obtained for the seed genes.
reference_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,3356,ENSG00000102468,HTR2A,P28223
1,125,ENSG00000196616,ADH1B,P00325
2,126,ENSG00000248144,ADH1C,P00326
3,2555,ENSG00000151834,GABRA2,P47869


In [143]:
# Display the mapping obtained for the genes of the significance list.
target_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,64131,ENSG00000103489,XYLT1,Q86Y38
1,64131,ENSG00000285395,XYLT1,Q86Y38
2,1268,ENSG00000118432,CNR1,P21554
3,2770,ENSG00000127955,GNAI1,P63096
4,2864,ENSG00000126266,FFAR1,O14842
...,...,...,...,...
208,2322,ENSG00000122025,FLT3,P36888
209,27040,ENSG00000213658,LAT,O43561
210,3702,ENSG00000113263,ITK,Q08881
211,2778,ENSG00000087460,GNAS,O95467


In [98]:
# Disabled: would write the current mapping to 'gene_mapping.csv',
# the file that get_gene_mapping reads as its local cache.
#target_mapping.to_csv('gene_mapping.csv', index=False)

# Print every accession without pandas truncating rows or columns
# (further display options can be appended to the tuple).
display_options = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*display_options):
    print(target_mapping['uniprot.Swiss-Prot'])

0                                Q86Y38
0                                Q86Y38
1                                P21554
2                                P63096
3                                O14842
4                                P50148
5                                Q99500
6                                Q9NS28
7                                O15539
7                                O15539
8                                P08754
9                                P04899
10                               O15492
11                               P50150
11                               P50150
12                               P63215
13                               P09471
14                               Q9HAV0
15                               P16520
16                               O60262
17                               P50151
18                               P62873
19                               P63218
20                               P62879
21     [Q5JWF2, P84996, P63092, O95467]


In [20]:
# Show the rows that carry a raw 'ensembl' entry, i.e. hits that came back
# with several Ensembl ids. The previous test `!= 'nan'` only worked after
# converting every cell to a string; on the raw frame it was True for every
# row, so nothing was filtered. `notna()` works on the raw values directly.
if 'ensembl' in target_mapping:  # column is only present when multiple Ensembl ids were returned
    print(target_mapping[target_mapping['ensembl'].notna()])

      query        _id     _score  \
0    Q86Y38      64131  16.311846   
7    O15539       8490  16.356878   
11   P50150       2786  15.875501   
43   P34998       1394  16.644955   
44   P34998  104909134  16.356878   
76   P01112       3265  15.305527   
80   Q53EQ6      84948   16.78657   
115  Q9NYW3      50837  16.818409   
165  P08575       5788  15.352335   
171  Q92835       3635   16.31735   

                                               ensembl entrezgene  \
0    [{'gene': 'ENSG00000103489'}, {'gene': 'ENSG00...      64131   
7    [{'gene': 'ENSG00000143248'}, {'gene': 'ENSG00...       8490   
11   [{'gene': 'ENSG00000168243'}, {'gene': 'ENSG00...       2786   
43   [{'gene': 'ENSG00000120088'}, {'gene': 'ENSG00...       1394   
44   [{'gene': 'ENSG00000263715'}, {'gene': 'ENSG00...  104909134   
76   [{'gene': 'ENSG00000174775'}, {'gene': 'ENSG00...       3265   
80   [{'gene': 'ENSG00000179886'}, {'gene': 'ENSG00...      84948   
115  [{'gene': 'ENSG00000121377'}, {'gen

In [9]:
# Create the gene client here: `mg` was never defined at notebook level
# (only as a local inside get_gene_mapping), so this cell raised a NameError
# on a fresh kernel. It is created before the timer so only the queries are timed.
mg = get_client("gene")

start = timeit.default_timer()
reference_kegg_mapping = get_gene_to_keggpathway(mg,seeds)
target_kegg_mapping = get_gene_to_keggpathway(mg,significance)
stop = timeit.default_timer()
print('Time: ', stop - start)

querying 1-4...done.
Finished.
querying 1-200...done.
Finished.
1 input query terms found dup hits:
	[('P34998', 2)]
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
Time:  3.357143184999984


In [10]:
# Inspect the rows returned for P34998, the query term reported with duplicate hits.
is_p34998 = target_kegg_mapping['query'] == 'P34998'
target_kegg_mapping[is_p34998]

Unnamed: 0,query,_id,_score,pathway.kegg,pathway.kegg.id
43,P34998,1394,16.838882,"[{'id': 'hsa04080'}, {'id': 'hsa04730'}, {'id'...",
44,P34998,104909134,16.356878,,


In [11]:
# Client for disease queries; passed to get_disease_mapping below.
md = get_client("disease")

In [16]:
def get_disease_mapping(md, diseases):
    """
    Fetch the DisGeNET gene ids and variant rsids related to the given diseases.

    :param md: Client object exposing ``getdiseases``
    :param diseases: Disease ids to look up
    :return: Result of ``md.getdiseases`` requested as a dataframe
    """
    request_options = {
        'fields': 'disgenet.genes_related_to_disease.gene_id,disgenet.variants_related_to_disease.rsid',
        'species': 'human',
        'returnall': False,
        'as_dataframe': True,
        'df_index': False,
    }
    return md.getdiseases(diseases, **request_options)

In [19]:
# Example MONDO disease ids used to try out the disease mapping.
mondos = [
    'MONDO:0004979',
    'MONDO:0016264',
    'MONDO:0012996',
]

In [20]:
# Disabled alternative: query with column 0 of the loaded `diseases` table instead.
#get_disease_mapping(md,diseases[0])
# Query the example MONDO ids and display the resulting dataframe.
get_disease_mapping(md,mondos)

querying 1-3...done.


Unnamed: 0,query,_id,_version,disgenet._license,disgenet.genes_related_to_disease.gene_id,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease
0,MONDO:0004979,MONDO:0004979,1,https://creativecommons.org/licenses/by/4.0/,7040.0,,
1,MONDO:0016264,MONDO:0016264,1,https://creativecommons.org/licenses/by/4.0/,,"[{'gene_id': 58}, {'gene_id': 60}, {'gene_id':...","[{'rsid': 'rs11065904'}, {'rsid': 'rs121434254..."
2,MONDO:0012996,MONDO:0012996,1,https://creativecommons.org/licenses/by/4.0/,,"[{'gene_id': 2628}, {'gene_id': 6535}, {'gene_...","[{'rsid': 'rs1566842679'}, {'rsid': 'rs3975147..."
