# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  5.153799975232687e-05


## Mapper

In [5]:
id_type_key = {'entrez':'entrezgene','ensembl':'ensembl.gene','symbol':'symbol','uniprot':'uniprot.Swiss-Prot'}
gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']

In [77]:
def preprocess_results(mapping, multicol, singlecol, key, explode=False):
    
    def convert_to_string(cell, key):
        if str(cell) != 'nan':
            extracted_ids = [val.get(key) for val in cell]
            return ';'.join(extracted_ids)
        return cell
    mapping[multicol] = mapping[multicol].apply(lambda x: convert_to_string(x, key)) if multicol in mapping else np.nan
    if singlecol in mapping:
        mapping[multicol].fillna(mapping[singlecol], inplace=True)
        mapping = mapping.drop(columns=[singlecol])
    if explode:
        mapping = mapping[multicol].split(';').explode(multicol)
        mapping.rename(columns={multicol: singlecol}, inplace = True)
    return mapping

def get_prev_mapping(gene_set, id_type, file):
    # ===== Get mapping from local mapping file =====
    prev_mapping = pd.read_csv(file, header=0, dtype=str)
    df = prev_mapping[prev_mapping[id_type_key[id_type]].isin(gene_set)]
    # ===== Get missing values =====
    missing = list(set(gene_set)-set(prev_mapping[id_type_key[id_type]]))
    return (df, missing, prev_mapping)
    

def get_gene_mapping(gene_set, id_type):
    """
    Simple converter.

    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(gene_set=gene_set, id_type=id_type, file='gene_id_mapping.csv')
    # ===== Get mapping for missing values =====
    if len(missing) > 0:
        mg = get_client("gene")
        mapping = mg.querymany(missing, scopes=id_type_key[id_type], fields=','.join(gene_ids),
                     species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.drop(columns=[id_type_key[id_type]])
        mapping.rename(columns={'query': id_type_key[id_type]}, inplace = True)
        # ===== Split if there are multiple ensembl ids =====
        if 'ensembl' in mapping:
            mapping = preprocess_results(mapping=mapping, multicol='ensembl', singlecol='ensembl.gene', key='gene', explode=True)
        mapping = mapping.drop(columns=['_id','_score'])
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_id_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    return df

def get_gene_to_attributes(gene_set, id_type):
    """
    Simple converter.

    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    # ===== Get gene ID mappings =====
    gene_mapping, _, _ = get_prev_mapping(gene_set=gene_set, id_type=id_type, file='gene_id_mapping.csv')
    df, missing, prev_mapping = get_prev_mapping(gene_set=set(gene_mapping['entrezgene']), id_type='entrez', file='gene_att_mapping.csv')
    if len(missing) > 0:
        mg = get_client("gene")
        gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']
        mapping = mg.querymany(missing, scopes=','.join(gene_ids),
                            fields='pathway.kegg.id, go.BP.id, go.CC.id, go.MF.id',
                            species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping.rename(columns={'query': 'entrezgene'}, inplace = True)
        for column in ['go.BP','go.CC','go.MF','pathway.kegg']:
            mapping = preprocess_results(mapping=mapping, multicol=column, singlecol=column+'.id', key='id')
        mapping = mapping.drop(columns=['_id','_score'])
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_att_mapping.csv', index=False)        
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # work with not unique values...
    mapping_subset = gene_mapping[['entrezgene', id_type_key[id_type]]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['entrezgene'], how = 'outer')
    df = df.drop(columns=['entrezgene'])
    df = df.fillna('').groupby(['uniprot.Swiss-Prot'], as_index = False).agg({'go.BP': ';'.join, 'go.CC': ';'.join, 'go.MF': ';'.join, 'pathway.kegg': ';'.join})
    return df

In [78]:
start = timeit.default_timer()
reference_mapping = get_gene_mapping(seeds, 'uniprot')
target_mapping = get_gene_mapping(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.00626673900114838


In [79]:
reference_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
191,125,ENSG00000196616,ADH1B,P00325
192,2555,ENSG00000151834,GABRA2,P47869
193,126,ENSG00000248144,ADH1C,P00326
194,3356,ENSG00000102468,HTR2A,P28223


In [80]:
target_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,1394,ENSG00000120088,CRHR1,P34998
1,1394,ENSG00000276191,CRHR1,P34998
2,104909134,ENSG00000263715,LINC02210-CRHR1,P34998
3,104909134,ENSG00000278232,LINC02210-CRHR1,P34998
4,104909134,ENSG00000282456,LINC02210-CRHR1,P34998
...,...,...,...,...
212,1742,ENSG00000132535,DLG4,P78352
213,4684,ENSG00000149294,NCAM1,P13591
214,2778,ENSG00000087460,GNAS,O95467
215,2776,ENSG00000156052,GNAQ,P50148


In [81]:
start = timeit.default_timer()
reference_kegg_mapping = get_gene_to_attributes(seeds, 'uniprot')
target_kegg_mapping = get_gene_to_attributes(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.02240458199958084


In [82]:
reference_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,P00325,GO:0001523;GO:0006069;GO:0006069;GO:0006069;GO...,GO:0005654;GO:0005829;GO:0005829;GO:0005829;GO...,GO:0004024;GO:0004024;GO:0004745;GO:0004745;GO...,hsa00010;hsa00071;hsa00350;hsa00620;hsa00830;h...
1,P00326,GO:0006069;GO:0006069;GO:0006069;GO:0042572;GO...,GO:0005654;GO:0005829;GO:0005829;GO:0005829;GO...,GO:0004022;GO:0004024;GO:0004745;GO:0008270,hsa00010;hsa00071;hsa00350;hsa00620;hsa00830;h...
2,P28223,GO:0001659;GO:0006874;GO:0007186;GO:0007187;GO...,GO:0005829;GO:0005886;GO:0005886;GO:0005887;GO...,GO:0001587;GO:0001618;GO:0001965;GO:0004993;GO...,hsa04020;hsa04080;hsa04540;hsa04726;hsa04750
3,P47869,GO:0001505;GO:0006836;GO:0007165;GO:0007214;GO...,GO:0005886;GO:0005887;GO:0030285;GO:0030424;GO...,GO:0004890;GO:0005237;GO:0005254;GO:0008503;GO...,hsa04080;hsa04723;hsa04727;hsa04742;hsa05032;h...


In [83]:
target_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,O00459,GO:0000165;GO:0001678;GO:0006661;GO:0008286;GO...,GO:0005634;GO:0005829;GO:0005942,GO:0001784;GO:0005515;GO:0019903;GO:0030971;GO...,hsa04012;hsa04014;hsa04015;hsa04024;hsa04062;h...
1,O14492,GO:0001922;GO:0007399;GO:0007596;GO:0008286;GO...,GO:0001725;GO:0001726;GO:0005737;GO:0005829;GO...,GO:0005068;GO:0005515;GO:0035591;GO:0042169;GO...,hsa04722;hsa04910
2,O14610,GO:0007186;GO:0007186;GO:0007602,GO:0005834;GO:0005834;GO:0005834,GO:0003924;GO:0003924;GO:0031681,hsa04014;hsa04062;hsa04151;hsa04371;hsa04713;h...
3,O14775,GO:0006457;GO:0007165;GO:0007186;GO:0007212;GO...,GO:0005634;GO:0005737;GO:0005829;GO:0005829;GO...,GO:0003924;GO:0005096;GO:0005096;GO:0005515;GO...,hsa04014;hsa04062;hsa04151;hsa04371;hsa04713;h...
4,O14842,GO:0007186;GO:0007204;GO:0030073;GO:0032024;GO...,GO:0005886;GO:0005886;GO:0005887,GO:0004930;GO:0008289;GO:0045125,hsa04911
...,...,...,...,...,...
195,Q9UN70,GO:0007155;GO:0007156;GO:0016339;GO:0050808,GO:0005887;GO:0016020,GO:0005509,
196,Q9UNN8,GO:0007596;GO:0050819;GO:0050819,GO:0005576;GO:0005615;GO:0005813;GO:0005886;GO...,GO:0005515;GO:0038023,hsa04610
197,Q9UQC2,GO:0007169;GO:0007411;GO:0008284;GO:0019221;GO...,GO:0005737;GO:0005829;GO:0005886,GO:0005068;GO:0005515;GO:0005547;GO:0043325,hsa04014;hsa04071;hsa04072;hsa04380;hsa04664;h...
198,Q9Y2G0,GO:0046854;GO:0072659;GO:0072659,GO:0005829;GO:0005886;GO:0005886;GO:0015629,GO:0005515,


querying 1-4...done.
Finished.
querying 1-200...done.
Finished.
1 input query terms found dup hits:
	[('P34998', 2)]
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
Time:  3.357143184999984


In [10]:
target_kegg_mapping[target_kegg_mapping['query']=='P34998']

Unnamed: 0,query,_id,_score,pathway.kegg,pathway.kegg.id
43,P34998,1394,16.838882,"[{'id': 'hsa04080'}, {'id': 'hsa04730'}, {'id'...",
44,P34998,104909134,16.356878,,


In [11]:
md = get_client("disease")

In [16]:
def get_disease_mapping(md, diseases):
    """
    Simple converter.

    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    return md.getdiseases(diseases,
                          fields='disgenet.genes_related_to_disease.gene_id,disgenet.variants_related_to_disease.rsid',
                          species='human', returnall=False, as_dataframe=True, df_index=False)

In [19]:
mondos=['MONDO:0004979','MONDO:0016264','MONDO:0012996']

In [20]:
#get_disease_mapping(md,diseases[0])
get_disease_mapping(md,mondos)

querying 1-3...done.


Unnamed: 0,query,_id,_version,disgenet._license,disgenet.genes_related_to_disease.gene_id,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease
0,MONDO:0004979,MONDO:0004979,1,https://creativecommons.org/licenses/by/4.0/,7040.0,,
1,MONDO:0016264,MONDO:0016264,1,https://creativecommons.org/licenses/by/4.0/,,"[{'gene_id': 58}, {'gene_id': 60}, {'gene_id':...","[{'rsid': 'rs11065904'}, {'rsid': 'rs121434254..."
2,MONDO:0012996,MONDO:0012996,1,https://creativecommons.org/licenses/by/4.0/,,"[{'gene_id': 2628}, {'gene_id': 6535}, {'gene_...","[{'rsid': 'rs1566842679'}, {'rsid': 'rs3975147..."
