# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
# Paths of the input files, relative to the notebook's working directory.
seeds_file = "Input/0007079.txt"  # headerless; first column holds the seed ids
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"  # has a 'node' column
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"  # has a 'node' column
diseases_file = "Input/ICD10_commROCG_raw.txt"  # headerless
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"  # headerless

### Load data

In [3]:
# All inputs are tab-separated.
# seeds: first column of the headerless seed file, as a Series.
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
# betweenness / significance: only the 'node' column of each result file is kept.
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
# diseases / disease_clusters: headerless tables, so columns are addressed by position (0, 1, ...).
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
# NOTE: `timeit` is imported in this cell rather than in the "Load libraries" cell;
# the later timing cells call timeit.default_timer() and therefore need this cell to have run.
import timeit
start = timeit.default_timer()

# Nothing is executed between the two timer reads, so the printed value is only
# the cost of reading the timer twice.
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  3.7575000021661253e-05


## Mapper

In [47]:
# Short id-type name -> field name. The field name is used both as the column name in the
# local mapping files and as the query scope passed to the gene client.
id_type_key = {'entrez':'entrezgene','ensembl':'ensembl.gene','symbol':'symbol','uniprot':'uniprot.Swiss-Prot','mondo':'mondo'}
# Gene id fields requested from the gene client when ids are missing from the local mapping.
gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']

In [118]:
def preprocess_results(mapping, multicol, singlecol, key, explode=False):
    """
    Merge the dict-valued and the flat variant of one result field into one column.

    Every non-missing cell of ``multicol`` is treated as a collection of dicts
    (a single dict is accepted as well). The value stored under ``key`` is
    extracted from each dict and the distinct values are joined with ';' in
    sorted order. Cells that are still missing afterwards are filled from
    ``singlecol`` when that column exists; ``singlecol`` is then dropped.

    :param mapping: Dataframe to process; ``multicol`` is (re)assigned on it
    :param multicol: Column holding the dict entries (created as NaN if absent)
    :param singlecol: Column holding the already flat value, if present
    :param key: Dict key to extract from every entry of ``multicol``
    :param explode: If True, return one row per extracted id and rename
                    ``multicol`` to ``singlecol``
    :return: Dataframe
    """

    def convert_to_string(cell, key):
        # Missing cells are passed through untouched so fillna() can fill them below.
        if cell is None or str(cell) == 'nan':
            return cell
        # A lone dict would otherwise be iterated key by key.
        if isinstance(cell, dict):
            cell = [cell]
        extracted_ids = {str(val.get(key)) for val in cell}
        # Sorted so the joined string does not depend on set iteration order.
        return ';'.join(sorted(extracted_ids))

    if multicol in mapping:
        mapping[multicol] = mapping[multicol].apply(lambda x: convert_to_string(x, key))
    else:
        mapping[multicol] = np.nan
    if singlecol in mapping:
        # Assign the result back: fillna(inplace=True) on a column selection is
        # chained assignment and does not reliably update the frame.
        mapping[multicol] = mapping[multicol].fillna(mapping[singlecol])
        mapping = mapping.drop(columns=[singlecol])
    if explode:
        # Turn 'a;b' into ['a', 'b']; non-string cells (NaN) stay as they are.
        split_ids = mapping[multicol].apply(lambda ids: ids.split(';') if isinstance(ids, str) else ids)
        mapping = (mapping.assign(**{multicol: split_ids})
                          .explode(multicol)
                          .rename(columns={multicol: singlecol}))
    return mapping


def get_prev_mapping(in_set, id_type, file, sep):
    """
    Look up ids in a local mapping file.

    :param in_set: Ids to look up
    :param id_type: Id type; translated through ``id_type_key`` when listed there,
                    otherwise used directly as the column name
    :param file: Path of the delimited mapping file (every column is read as str)
    :param sep: Field separator of ``file``
    :return: Tuple of (rows whose id is in ``in_set``, list of ids from ``in_set``
             that do not occur in the file, the full mapping table)
    """
    known = pd.read_csv(file, sep=sep, header=0, dtype=str)
    if id_type == "ICD-10":
        # One row per comma-separated code, followed by a second copy of those rows
        # in which every code is cut at its first '.'.
        known = split_and_expand_column(data=known, split_string=",", column_name="ICD-10")
        parents = known.assign(**{'ICD-10': known['ICD-10'].str.split('.', expand=True)[0]})
        known = pd.concat([known, parents], ignore_index=True)
    # Resolve the column that holds the requested id type.
    id_type = id_type_key.get(id_type, id_type)
    id_column = known[id_type]
    hits = known.loc[id_column.isin(in_set)]
    # Ids that the file does not contain at all.
    absent = list(set(in_set) - set(id_column))
    return hits, absent, known
    

def get_gene_mapping(gene_set, id_type):
    """
    Map gene ids of one type to the other gene id types listed in ``gene_ids``.

    Ids already present in the local file ``gene_id_mapping.csv`` are taken from
    there. The remaining ids are queried through the gene client, and the file is
    rewritten with the previous rows plus the new results.

    :param gene_set: Set of gene ids
    :param id_type: Key of ``id_type_key`` naming the id type of ``gene_set``
    :return: Dataframe
    """
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    # ===== Get mapping for missing values =====
    if len(missing) > 0:
        id_column = id_type_key[id_type]
        mg = get_client("gene")
        mapping = mg.querymany(missing, scopes=id_column, fields=','.join(gene_ids),
                     species='human', returnall=False, as_dataframe=True, df_index=False)
        # The submitted id ('query') replaces the returned id column. That column is
        # not guaranteed to be in the response, so its absence must not raise.
        mapping = mapping.drop(columns=[id_column], errors='ignore')
        mapping = mapping.rename(columns={'query': id_column})
        # ===== Split if there are multiple ensembl ids =====
        if 'ensembl' in mapping:
            mapping = preprocess_results(mapping=mapping, multicol='ensembl', singlecol='ensembl.gene', key='gene', explode=True)
        # Bookkeeping columns; tolerate their absence as well.
        mapping = mapping.drop(columns=['_id','_score'], errors='ignore')
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_id_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    return df

def get_gene_to_attributes(gene_set, id_type):
    """
    Collect GO (BP, CC, MF) and KEGG pathway ids for a set of genes.

    The genes are translated to Entrez ids via ``gene_id_mapping.csv``. Attributes
    already present in ``gene_att_mapping.csv`` are taken from there; the remaining
    Entrez ids are queried through the gene client and the file is rewritten with
    the previous rows plus the new results. The result has one row per input id
    with one set of ids per attribute.

    :param gene_set: Set of gene ids
    :param id_type: Key of ``id_type_key`` naming the id type of ``gene_set``
    :return: Dataframe
    """
    # ===== Get gene ID mappings =====
    gene_mapping, _, _ = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    # Rows without an Entrez id hold NaN; NaN must not end up in the query list.
    entrez_ids = set(gene_mapping['entrezgene'].dropna())
    df, missing, prev_mapping = get_prev_mapping(in_set=entrez_ids, id_type='entrez', file='gene_att_mapping.csv', sep=",")
    if len(missing) > 0:
        mg = get_client("gene")
        # The queried ids are Entrez ids, so only that scope is searched.
        mapping = mg.querymany(missing, scopes=id_type_key['entrez'],
                            fields='pathway.kegg.id, go.BP.id, go.CC.id, go.MF.id',
                            species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.rename(columns={'query': 'entrezgene'})
        for column in ['go.BP','go.CC','go.MF','pathway.kegg']:
            mapping = preprocess_results(mapping=mapping, multicol=column, singlecol=column+'.id', key='id')
        # Bookkeeping columns; tolerate their absence.
        mapping = mapping.drop(columns=['_id','_score'], errors='ignore')
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_att_mapping.csv', index=False)        
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # work with not unique values...
    # One input id can map to several Entrez ids; their attributes are pooled per input id.
    mapping_subset = gene_mapping[['entrezgene', id_type_key[id_type]]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['entrezgene'], how = 'outer')
    df = df.drop(columns=['entrezgene'])
    df = df.fillna('').groupby([id_type_key[id_type]], as_index=False).agg({'go.BP': combine_rows, 'go.CC': combine_rows,
                                                                            'go.MF': combine_rows, 'pathway.kegg': combine_rows})
    return df

In [119]:
# Time the UniProt id mapping of the seed ids and of the 'node' ids from the significance file.
start = timeit.default_timer()
reference_mapping = get_gene_mapping(seeds, 'uniprot')
target_mapping = get_gene_mapping(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.012357924002571963


In [120]:
# Gene id mapping obtained for the seed ids.
reference_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
191,125,ENSG00000196616,ADH1B,P00325
192,2555,ENSG00000151834,GABRA2,P47869
193,126,ENSG00000248144,ADH1C,P00326
194,3356,ENSG00000102468,HTR2A,P28223


In [121]:
# Gene id mapping obtained for the 'node' ids of the significance file.
target_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,1394,ENSG00000120088,CRHR1,P34998
1,1394,ENSG00000276191,CRHR1,P34998
2,104909134,ENSG00000263715,LINC02210-CRHR1,P34998
3,104909134,ENSG00000278232,LINC02210-CRHR1,P34998
4,104909134,ENSG00000282456,LINC02210-CRHR1,P34998
...,...,...,...,...
212,1742,ENSG00000132535,DLG4,P78352
213,4684,ENSG00000149294,NCAM1,P13591
214,2778,ENSG00000087460,GNAS,O95467
215,2776,ENSG00000156052,GNAQ,P50148


In [122]:
# Time the GO / KEGG attribute lookup for the seed ids and for the significance 'node' ids.
start = timeit.default_timer()
reference_kegg_mapping = get_gene_to_attributes(seeds, 'uniprot')
target_kegg_mapping = get_gene_to_attributes(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.03708536599879153


In [123]:
# Attribute sets (go.BP, go.CC, go.MF, pathway.kegg) per seed id.
reference_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,P00325,"{GO:0001523, GO:0042573, GO:0042572, GO:0006069}","{GO:0005886, GO:0005829, GO:0005654}","{GO:0004745, GO:0004024, GO:0008270}","{hsa00980, hsa00620, hsa00010, hsa00350, hsa05..."
1,P00326,"{GO:0042573, GO:0042572, GO:0006069}","{GO:0005886, GO:0005829, GO:0005654}","{GO:0004745, GO:0004024, GO:0004022, GO:0008270}","{hsa00980, hsa00620, hsa00010, hsa00350, hsa05..."
2,P28223,"{GO:0006874, GO:2000300, GO:0007187, GO:000761...","{GO:0030424, GO:0099056, GO:0005886, GO:009866...","{GO:0001587, GO:0044877, GO:0001965, GO:000551...","{hsa04750, hsa04020, hsa04540, hsa04726, hsa04..."
3,P47869,"{GO:0042391, GO:0051932, GO:0050877, GO:000683...","{GO:0030424, GO:1902711, GO:0045202, GO:004300...","{GO:0004890, GO:0008503, GO:0005254, GO:190431...","{hsa05032, hsa04723, hsa04742, hsa04080, hsa04..."


In [124]:
# Attribute sets (go.BP, go.CC, go.MF, pathway.kegg) per significance 'node' id.
target_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,O00459,"{GO:0050900, GO:0001678, GO:0006661, GO:003809...","{GO:0005942, GO:0005634, GO:0005829}","{GO:0030971, GO:0019903, GO:0005515, GO:000178...","{hsa05020, hsa04211, hsa04360, hsa05010, hsa05..."
1,O14492,"{GO:0019221, GO:0035556, GO:0050873, GO:005085...","{GO:0005886, GO:0005884, GO:0001725, GO:000582...","{GO:0042169, GO:0005515, GO:0005068, GO:004280...","{hsa04722, hsa04910}"
2,O14610,"{GO:0007186, GO:0007602}",{GO:0005834},"{GO:0031681, GO:0003924}","{hsa04371, hsa04713, hsa04926, hsa05032, hsa04..."
3,O14775,"{GO:0043547, GO:0007165, GO:1901386, GO:000718...","{GO:0098793, GO:0005834, GO:0005829, GO:000573...","{GO:0051087, GO:0031682, GO:0005515, GO:000509...","{hsa04371, hsa04713, hsa04926, hsa05032, hsa04..."
4,O14842,"{GO:0032691, GO:0050796, GO:0051928, GO:003007...","{GO:0005886, GO:0005887}","{GO:0008289, GO:0045125, GO:0004930}",{hsa04911}
...,...,...,...,...,...
195,Q9UN70,"{GO:0007155, GO:0007156, GO:0016339, GO:0050808}","{GO:0016020, GO:0005887}",{GO:0005509},{}
196,Q9UNN8,"{GO:0050819, GO:0007596}","{GO:0005615, GO:0005576, GO:0048471, GO:000588...","{GO:0005515, GO:0038023}",{hsa04610}
197,Q9UQC2,"{GO:0051897, GO:0019221, GO:0007169, GO:004801...","{GO:0005886, GO:0005829, GO:0005737}","{GO:0005068, GO:0005515, GO:0043325, GO:0005547}","{hsa05220, hsa04380, hsa04072, hsa04666, hsa04..."
198,Q9Y2G0,"{GO:0046854, GO:0072659}","{GO:0005886, GO:0015629, GO:0005829}",{GO:0005515},{}


In [13]:
# Load the disorder id table with every column as str and add 'parent ICD-10',
# which is the part of 'ICD-10' in front of the first '.'.
full_ids_mapping = pd.read_csv("../disorders.map", sep="\t", dtype=str).assign(
    **{'parent ICD-10': lambda table: table['ICD-10'].str.split('.', expand=True)[0]}
)
full_ids_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,ncit,doid,meddra,medgen,ICD-10,parent ICD-10
0,0008118,164330,716180009,C1834013,2724,C537740,,,,,,
1,0010439,300829,,C1853577,,C543241,,,,,,
2,0008117,164310,763829004,C1834014,98897,C563508,,,,,G71.0,G71
3,0009448,242600,84121007,C0268654,42062,C536285,,,,,E72.0,E72
4,0008119,164400,715748006,C0752120,98755,,C129982,0050954,,,G11.8,G11
...,...,...,...,...,...,...,...,...,...,...,...,...
24115,0009507,245550,732961003,C1855551,1296,C538396,,,,,Q87.8,Q87
24116,0009508,245552,,C1855550,,C537549,,,,,,
24117,0009501,245340,766715000,C1855577,171690,C565449,,,,,G72.8,G72
24118,0009502,245348,,C1855565,79244,C565448,,,,,E74.4,E74


In [14]:
# NOTE(review): shows the same table as the previous cell — possibly a leftover; confirm.
full_ids_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,ncit,doid,meddra,medgen,ICD-10,parent ICD-10
0,0008118,164330,716180009,C1834013,2724,C537740,,,,,,
1,0010439,300829,,C1853577,,C543241,,,,,,
2,0008117,164310,763829004,C1834014,98897,C563508,,,,,G71.0,G71
3,0009448,242600,84121007,C0268654,42062,C536285,,,,,E72.0,E72
4,0008119,164400,715748006,C0752120,98755,,C129982,0050954,,,G11.8,G11
...,...,...,...,...,...,...,...,...,...,...,...,...
24115,0009507,245550,732961003,C1855551,1296,C538396,,,,,Q87.8,Q87
24116,0009508,245552,,C1855550,,C537549,,,,,,
24117,0009501,245340,766715000,C1855577,171690,C565449,,,,,G72.8,G72
24118,0009502,245348,,C1855565,79244,C565448,,,,,E74.4,E74


In [109]:
def split_and_expand_column(data, split_string, column_name):
    """
    Give every delimited value in a column its own row.

    Each string in ``column_name`` is split on ``split_string`` and the row is
    repeated once per piece, keeping its original index label. Rows whose value
    in ``column_name`` is missing are left out of the result.

    :param data: Dataframe to expand (not modified)
    :param split_string: Separator between the values inside a cell
    :param column_name: Column holding the delimited strings
    :return: Dataframe
    """
    pieces = data[column_name].str.split(split_string)
    # explode() plus an explicit filter instead of stack(): the result then does not
    # depend on stack() silently discarding padding values, and a non-unique index
    # no longer causes a length mismatch.
    expanded = data.assign(**{column_name: pieces}).explode(column_name)
    return expanded[expanded[column_name].notna()].copy()

def combine_rows(x):
    """Return the set of distinct, non-empty ';'-separated tokens found across the strings in ``x``."""
    joined = ';'.join(x)
    return {token for token in joined.split(';') if token}

def get_disease_mapping(disease_set, id_type):
    """
    Collect DisGeNET gene ids and variant rsids for a set of diseases.

    The disease ids are translated to Mondo ids via ``../disorders.map``. Results
    already present in ``disease_disgenet_mapping.csv`` are taken from there; the
    remaining Mondo ids are queried through the disease client and the file is
    rewritten with the previous rows plus the new results. The result has one row
    per input id with one set of gene ids and one set of rsids.

    :param disease_set: Set of disease ids
    :param id_type: Name of the column of ``../disorders.map`` that ``disease_set`` refers to
    :return: Dataframe
    """
    # ==== Get Mondo IDs ====
    disease_id_set,_,_ = get_prev_mapping(in_set=disease_set, id_type=id_type, file="../disorders.map", sep="\t")
    # Missing Mondo ids are skipped so that NaN is never looked up or queried.
    mondo_set = list(set('MONDO:'+disease_id_set['mondo'].dropna()))
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=mondo_set, id_type='mondo', file='disease_disgenet_mapping.csv', sep=",")
    # ==== Get disgenet values ====
    if len(missing) > 0:
        md = get_client("disease")
        mapping = md.getdiseases(missing,
                                 fields='disgenet.genes_related_to_disease.gene_id,disgenet.variants_related_to_disease.rsid',
                                 species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.rename(columns={'query': 'mondo'})
        # transform dataframe to combine single and multiple results
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.genes_related_to_disease', 
                                     singlecol='disgenet.genes_related_to_disease.gene_id', key='gene_id')
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.variants_related_to_disease', 
                                     singlecol='disgenet.variants_related_to_disease.rsid', key='rsid')
        # Bookkeeping columns; they are not guaranteed to be in the response,
        # so their absence must not raise.
        mapping = mapping.drop(columns=['_id','_version','disgenet._license'], errors='ignore')
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('disease_disgenet_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # ==== Map back to previous ids ====
    # assign() builds a new frame instead of writing into a filtered slice of the cached table.
    df = df.assign(mondo=df["mondo"].str.replace("MONDO:", "", regex=False))
    # work with not unique values...
    # One input id can map to several Mondo ids; their results are pooled per input id.
    mapping_subset = disease_id_set[['mondo', id_type]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['mondo'], how = 'outer')
    df = df.drop(columns=['mondo'])
    df = df.fillna('').groupby(id_type, as_index = False).agg({'disgenet.genes_related_to_disease': combine_rows, 'disgenet.variants_related_to_disease': combine_rows})
    return df

In [110]:
# First column of the headerless diseases table; passed to get_disease_mapping below as ICD-10 ids.
diseases[0]

0     E10
1     E11
2     E12
3     E13
4     E14
5     E66
6     F00
7     F01
8     F02
9     F03
10    G20
11    G30
12    G43
13    I10
14    I11
15    I12
16    I13
17    I15
18    I21
19    I22
20    I50
21    I63
22    I64
23    I70
24    J45
Name: 0, dtype: object

In [125]:
# Time the DisGeNET lookup for the ICD-10 codes in the first column of `diseases`.
start = timeit.default_timer()
df = get_disease_mapping(disease_set=diseases[0], id_type='ICD-10')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.24783846200080006


In [126]:
# One row per ICD-10 code with its set of gene ids and its set of variant rsids.
df

Unnamed: 0,ICD-10,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease
0,E10,"{2067, 613, 83854, 2194, 252995, 2354, 3856, 9...","{rs2233580, rs11066280, rs2111485, rs145590578..."
1,E11,"{2067, 613, 83854, 2194, 252995, 2354, 3856, 9...","{rs2233580, rs11066280, rs2111485, rs156867070..."
2,E13,"{5459, 284106, 491, 10247, 4513, 4540, 2081, 6...","{rs746923441, rs121913148, rs781007453, rs5877..."
3,E66,"{27125, 5155, 59343, 79140, 2194, 83854, 25299...","{rs1695, rs4836133, rs9956279, rs10082248, rs1..."
4,F00,"{100038247, 5155, 933, 3084, 811, 473, 4790, 3...","{rs886424, rs669, rs2049161, rs9470080, rs4916..."
5,F01,"{79890, 6850, 3383, 9370, 1437, 6855, 25897, 8...","{rs1236699193, rs1554952291, rs775836288, rs18..."
6,F02,{},{}
7,G20,"{23205, 4287, 673, 5730, 4891, 1861, 26503, 50...","{rs71799110, rs1228608709, rs6280, rs4986791, ..."
8,G30,"{2067, 5155, 5628, 54802, 574503, 25902, 4909,...","{rs316341, rs63750900, rs847012, rs1365504296,..."
9,G43,"{2776, 494324, 3120, 317773, 3952, 23327, 3403...","{rs745738344, rs6724624, rs17301853, rs2234693..."


In [129]:
# Nested dict of the form {column name: {ICD-10 code: set of ids}}.
df2 = df.set_index('ICD-10').to_dict()
df2

{'disgenet.genes_related_to_disease': {'E10': {'2067',
   '613',
   '83854',
   '2194',
   '252995',
   '2354',
   '3856',
   '9641',
   '23456',
   '1504',
   '6374',
   '2710',
   '120227',
   '81033',
   '2267',
   '10747',
   '5265',
   '100132074',
   '5741',
   '488',
   '130271',
   '2201',
   '4593',
   '7350',
   '23175',
   '6093',
   '10257',
   '29126',
   '6785',
   '9314',
   '5021',
   '5716',
   '23682',
   '1869',
   '83856',
   '22915',
   '463',
   '1499',
   '117195',
   '6744',
   '6786',
   '3670',
   '6037',
   '24149',
   '6173',
   '3483',
   '8862',
   '7424',
   '8989',
   '1991',
   '3037',
   '55856',
   '3934',
   '84061',
   '6888',
   '9510',
   '6404',
   '63924',
   '490',
   '847',
   '5291',
   '410',
   '1445',
   '4627',
   '9031',
   '6964',
   '2309',
   '1666',
   '404672',
   '4852',
   '1483',
   '632',
   '10928',
   '2078',
   '26003',
   '10598',
   '114253699',
   '788',
   '225',
   '4137',
   '407033',
   '9361',
   '958',
   '1103',
   

In [133]:
# Gene id sets only, one per row of df.
df['disgenet.genes_related_to_disease']

0     {2067, 613, 83854, 2194, 252995, 2354, 3856, 9...
1     {2067, 613, 83854, 2194, 252995, 2354, 3856, 9...
2     {5459, 284106, 491, 10247, 4513, 4540, 2081, 6...
3     {27125, 5155, 59343, 79140, 2194, 83854, 25299...
4     {100038247, 5155, 933, 3084, 811, 473, 4790, 3...
5     {79890, 6850, 3383, 9370, 1437, 6855, 25897, 8...
6                                                    {}
7     {23205, 4287, 673, 5730, 4891, 1861, 26503, 50...
8     {2067, 5155, 5628, 54802, 574503, 25902, 4909,...
9     {2776, 494324, 3120, 317773, 3952, 23327, 3403...
10    {5155, 54558, 25833, 79140, 2194, 83854, 25299...
11    {5465, 183, 4879, 185, 1585, 26291, 728264, 43...
12    {11346, 7040, 1113, 183, 23417, 2247, 7827, 33...
13    {5155, 54558, 25833, 79140, 2194, 83854, 25299...
14    {4070, 84830, 25902, 25833, 252995, 11216, 357...
15    {4070, 84830, 25902, 25833, 252995, 11216, 357...
16    {2921, 2194, 3572, 284361, 2021, 5265, 5741, 4...
17    {11332, 4513, 23621, 1628, 140809, 3383, 9

# Do the comparison now

In [33]:
 # NOTE(review): `lst` is not defined in any visible cell of this notebook; this looks like leftover scratch — confirm or remove.
 filter(None, lst)