# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  3.8177000078576384e-05


## Mapper

In [5]:
id_type_key = {'entrez':'entrezgene','ensembl':'ensembl.gene','symbol':'symbol','uniprot':'uniprot.Swiss-Prot','mondo':'mondo'}
gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']

In [66]:
def preprocess_results(mapping, multicol, singlecol, key, explode=False):
    
    def convert_to_string(cell, key):
        if str(cell) != 'nan':
            extracted_ids = [val.get(key) for val in cell]
            return ';'.join(str(e) for e in list(set(extracted_ids)))
        return cell
    mapping[multicol] = mapping[multicol].apply(lambda x: convert_to_string(x, key)) if multicol in mapping else np.nan
    if singlecol in mapping:
        mapping[multicol].fillna(mapping[singlecol], inplace=True)
        mapping = mapping.drop(columns=[singlecol])
    if explode:
        mapping = mapping[multicol].split(';').explode(multicol)
        mapping.rename(columns={multicol: singlecol}, inplace = True)
    return mapping


def get_prev_mapping(in_set, id_type, file, sep):
    # ===== Get mapping from local mapping file =====
    mapping = pd.read_csv(file, sep=sep, header=0, dtype=str)
    if id_type == "ICD-10":
        mapping = split_and_expand_column(data=mapping, split_string=",", column_name="ICD-10")
    # ==== Map given disease set ====
    id_type = id_type_key[id_type] if id_type in id_type_key else id_type
    mapped_set = mapping[mapping[id_type].isin(in_set)]
    # ===== Get missing values =====
    missing = list(set(in_set) - set(mapping[id_type]))
    return mapped_set, missing, mapping
    

def get_gene_mapping(gene_set, id_type):
    """
    Simple converter.

    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    # ===== Get mapping for missing values =====
    if len(missing) > 0:
        mg = get_client("gene")
        mapping = mg.querymany(missing, scopes=id_type_key[id_type], fields=','.join(gene_ids),
                     species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.drop(columns=[id_type_key[id_type]])
        mapping.rename(columns={'query': id_type_key[id_type]}, inplace = True)
        # ===== Split if there are multiple ensembl ids =====
        if 'ensembl' in mapping:
            mapping = preprocess_results(mapping=mapping, multicol='ensembl', singlecol='ensembl.gene', key='gene', explode=True)
        mapping = mapping.drop(columns=['_id','_score'])
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_id_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    return df

def get_gene_to_attributes(gene_set, id_type):
    """
    Simple converter.

    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    # ===== Get gene ID mappings =====
    gene_mapping, _, _ = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    df, missing, prev_mapping = get_prev_mapping(in_set=set(gene_mapping['entrezgene']), id_type='entrez', file='gene_att_mapping.csv', sep=",")
    if len(missing) > 0:
        mg = get_client("gene")
        gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']
        mapping = mg.querymany(missing, scopes=','.join(gene_ids),
                            fields='pathway.kegg.id, go.BP.id, go.CC.id, go.MF.id',
                            species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping.rename(columns={'query': 'entrezgene'}, inplace = True)
        for column in ['go.BP','go.CC','go.MF','pathway.kegg']:
            mapping = preprocess_results(mapping=mapping, multicol=column, singlecol=column+'.id', key='id')
        mapping = mapping.drop(columns=['_id','_score'])
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_att_mapping.csv', index=False)        
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # work with not unique values...
    mapping_subset = gene_mapping[['entrezgene', id_type_key[id_type]]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['entrezgene'], how = 'outer')
    df = df.drop(columns=['entrezgene'])
    df = df.fillna('').groupby([id_type_key[id_type]], as_index=False).agg({'go.BP': combine_rows, 'go.CC': combine_rows,
                                                                            'go.MF': combine_rows, 'pathway.kegg': combine_rows})
    return df


def combine_rows(x):
    return set(filter(None,';'.join(x).split(';')))

In [7]:
start = timeit.default_timer()
reference_mapping = get_gene_mapping(seeds, 'uniprot')
target_mapping = get_gene_mapping(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.008130250000021988


In [8]:
reference_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
191,125,ENSG00000196616,ADH1B,P00325
192,2555,ENSG00000151834,GABRA2,P47869
193,126,ENSG00000248144,ADH1C,P00326
194,3356,ENSG00000102468,HTR2A,P28223


In [9]:
target_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,1394,ENSG00000120088,CRHR1,P34998
1,1394,ENSG00000276191,CRHR1,P34998
2,104909134,ENSG00000263715,LINC02210-CRHR1,P34998
3,104909134,ENSG00000278232,LINC02210-CRHR1,P34998
4,104909134,ENSG00000282456,LINC02210-CRHR1,P34998
...,...,...,...,...
212,1742,ENSG00000132535,DLG4,P78352
213,4684,ENSG00000149294,NCAM1,P13591
214,2778,ENSG00000087460,GNAS,O95467
215,2776,ENSG00000156052,GNAQ,P50148


In [10]:
start = timeit.default_timer()
reference_kegg_mapping = get_gene_to_attributes(seeds, 'uniprot')
target_kegg_mapping = get_gene_to_attributes(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.0377670540001418


In [11]:
reference_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,P00325,"{GO:0001523, GO:0042572, GO:0006069, GO:0042573}","{GO:0005829, GO:0005654, GO:0005886}","{GO:0004024, GO:0008270, GO:0004745}","{hsa00620, hsa05204, hsa00830, hsa00982, hsa00..."
1,P00326,"{GO:0042572, GO:0006069, GO:0042573}","{GO:0005829, GO:0005654, GO:0005886}","{GO:0004024, GO:0004022, GO:0008270, GO:0004745}","{hsa00620, hsa05204, hsa00830, hsa00982, hsa00..."
2,P28223,"{GO:0030431, GO:0046718, GO:0014832, GO:000720...","{GO:0070852, GO:0005887, GO:0030425, GO:004302...","{GO:0001587, GO:0051378, GO:0004993, GO:007188...","{hsa04020, hsa04080, hsa04750, hsa04540, hsa04..."
3,P47869,"{GO:0051932, GO:0050877, GO:1904862, GO:006007...","{GO:0005887, GO:0034707, GO:0032590, GO:009879...","{GO:0004890, GO:0005254, GO:0030594, GO:000523...","{hsa04080, hsa04727, hsa04723, hsa04742, hsa05..."


In [12]:
target_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,O00459,"{GO:0032869, GO:0010506, GO:0015031, GO:003809...","{GO:0005634, GO:0005829, GO:0005942}","{GO:0046982, GO:0005515, GO:0019903, GO:003097...","{hsa04211, hsa05226, hsa05169, hsa04066, hsa04..."
1,O14492,"{GO:0019221, GO:0050851, GO:0007399, GO:001922...","{GO:0005884, GO:0005737, GO:0005886, GO:000172...","{GO:0005068, GO:0005515, GO:0042802, GO:003559...","{hsa04910, hsa04722}"
2,O14610,"{GO:0007186, GO:0007602}",{GO:0005834},"{GO:0031681, GO:0003924}","{hsa04727, hsa04926, hsa04728, hsa04371, hsa04..."
3,O14775,"{GO:0007212, GO:0043547, GO:0007165, GO:000645...","{GO:0005834, GO:0005737, GO:0005634, GO:009879...","{GO:0003924, GO:0051087, GO:0031682, GO:000551...","{hsa04727, hsa04926, hsa04728, hsa04371, hsa04..."
4,O14842,"{GO:0030073, GO:0051928, GO:0050796, GO:000720...","{GO:0005887, GO:0005886}","{GO:0004930, GO:0045125, GO:0008289}",{hsa04911}
...,...,...,...,...,...
195,Q9UN70,"{GO:0007156, GO:0016339, GO:0007155, GO:0050808}","{GO:0005887, GO:0016020}",{GO:0005509},{}
196,Q9UNN8,"{GO:0050819, GO:0007596}","{GO:0005887, GO:0005576, GO:0009986, GO:000592...","{GO:0005515, GO:0038023}",{hsa04610}
197,Q9UQC2,"{GO:0019221, GO:0030316, GO:0007169, GO:004330...","{GO:0005886, GO:0005829, GO:0005737}","{GO:0005068, GO:0005515, GO:0043325, GO:0005547}","{hsa05220, hsa04072, hsa04664, hsa04071, hsa04..."
198,Q9Y2G0,"{GO:0046854, GO:0072659}","{GO:0015629, GO:0005886, GO:0005829}",{GO:0005515},{}


In [64]:
full_ids_mapping = pd.read_csv("../new_disorder.map", sep="\t", dtype=str)
full_ids_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,ncit,doid,meddra,medgen,ICD-10
0,0008118,164330,716180009,C1834013,2724,C537740,,,,,
1,0010439,300829,,C1853577,,C543241,,,,,
2,0008117,164310,763829004,C1834014,98897,C563508,,,,,"G71,G71.0"
3,0009448,242600,84121007,C0268654,42062,C536285,,,,,"E72,E72.0"
4,0008119,164400,715748006,C0752120,98755,,C129982,0050954,,,"G11,G11.8"
...,...,...,...,...,...,...,...,...,...,...,...
24115,0009507,245550,732961003,C1855551,1296,C538396,,,,,"Q87,Q87.8"
24116,0009508,245552,,C1855550,,C537549,,,,,
24117,0009501,245340,766715000,C1855577,171690,C565449,,,,,"G72,G72.8"
24118,0009502,245348,,C1855565,79244,C565448,,,,,"E74.4,E74"


In [65]:
full_ids_mapping.count()

mondo       24120
omim         8841
snomedct     8962
umls        16234
orpha        9363
mesh         8075
ncit         6953
doid         8944
meddra       1144
medgen          1
ICD-10       9561
dtype: int64

In [67]:
def split_and_expand_column(data, split_string, column_name):
    s = data[column_name].str.split(split_string, expand=True).stack()
    i = s.index.get_level_values(0)
    df2 = data.loc[i].copy()
    df2[column_name] = s.values
    return df2

def get_disease_mapping(disease_set, id_type):
    # ==== Get Mondo IDs ====
    disease_id_set,_,_ = get_prev_mapping(in_set=disease_set, id_type=id_type, file="../disorders.map", sep="\t")
    mondo_set = list(set('MONDO:'+disease_id_set['mondo']))
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=mondo_set, id_type='mondo', file='disease_disgenet_mapping.csv', sep=",")
    # ==== Get disgenet values ====
    if len(missing) > 0:
        md = get_client("disease")
        mapping = md.getdiseases(missing,
                                 fields='disgenet.genes_related_to_disease.gene_id,disgenet.variants_related_to_disease.rsid,ctd.pathway_related_to_disease.kegg_pathway_id',
                                 species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping.rename(columns={'query': 'mondo'}, inplace = True)
        # transform dataframe to combine single and multiple results
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.genes_related_to_disease', 
                                     singlecol='disgenet.genes_related_to_disease.gene_id', key='gene_id')
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.variants_related_to_disease', 
                                     singlecol='disgenet.variants_related_to_disease.rsid', key='rsid')
        mapping = preprocess_results(mapping=mapping, multicol='ctd.pathway_related_to_disease', 
                                     singlecol='ctd.pathway_related_to_disease.kegg_pathway_id', key='kegg_pathway_id')
        mapping = mapping.drop(columns=['_id','_version','disgenet._license'])      
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('disease_disgenet_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # ==== Map back to previous ids ====
    df["mondo"] = df["mondo"].str.replace("MONDO:", "")
    # work with not unique values...
    mapping_subset = disease_id_set[['mondo', id_type]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['mondo'], how = 'outer')
    df = df.drop(columns=['mondo'])
    df = df.fillna('').groupby(id_type, as_index = False).agg({'disgenet.genes_related_to_disease': combine_rows, 'disgenet.variants_related_to_disease': combine_rows, 'ctd.pathway_related_to_disease': combine_rows})
    return df

In [68]:
diseases[0]

0     E10
1     E11
2     E12
3     E13
4     E14
5     E66
6     F00
7     F01
8     F02
9     F03
10    G20
11    G30
12    G43
13    I10
14    I11
15    I12
16    I13
17    I15
18    I21
19    I22
20    I50
21    I63
22    I64
23    I70
24    J45
Name: 0, dtype: object

In [69]:
start = timeit.default_timer()
df = get_disease_mapping(disease_set=diseases[0], id_type='ICD-10')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.16399298299984366


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mondo"] = df["mondo"].str.replace("MONDO:", "")


In [70]:
df

Unnamed: 0,ICD-10,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease,ctd.pathway_related_to_disease
0,E10,"{4760, 3412, 3119, 1234.0, 3407, 283460, 23118...","{rs121908261, rs1566092470, rs587776825, rs237...","{hsa04720, hsa04211, hsa04020, hsa00630, hsa_M..."
1,E11,"{5122, 8277, 268, 6542, 7075, 128229, 221547, ...","{rs749877032, rs1272388614, rs8026743, rs93189...","{hsa04720, hsa04211, hsa04020, hsa04512, hsa00..."
2,E13,"{84919, 5078, 6833, 2688, 3952, 3479, 3643, 94...","{rs938519025, rs869025179, rs886037750, rs1219...","{hsa04211, hsa04520, hsa05164, hsa04913, hsa04..."
3,F01,"{4318, 728, 4828, 55504, 1071, 5653, 171558, 1...","{rs10491487, rs17501010, rs1217691063, rs10512...",{}
4,G20,"{714, 4318, 10725, 63982, 5886, 3439, 6908, 18...","{rs912601230, rs2230288, rs652438, rs150562946...","{hsa04211, hsa04020, hsa04940, hsa00760, hsa04..."
5,G30,"{5122, 1718, 3126, 2521, 83660, 3579, 421, 506...","{rs573167, rs71377714, rs62117160, rs6834555, ...","{hsa04720, hsa04211, hsa04020, hsa04512, hsa04..."
6,G43,{},{},"{hsa04020, hsa05220, hsa04520, hsa04915, hsa05..."
7,I10,"{5122, 268, 54457, 1161, 3579, 50616, 2994, 65...","{rs9943291, rs12632110, rs2230288, rs4147064, ...","{hsa04720, hsa04211, hsa04020, hsa04512, hsa00..."
8,I11,"{4318, 400550, 104564225, 26548, 7077, 1585, 4...",{},{}
9,I12,"{80216, 80350, 7170, 1241, 7422, 285, 1113, 63...",{rs11739136},"{hsa05220, hsa05414, hsa04512, hsa04110, hsa04..."


In [21]:
df2 = df.set_index('ICD-10').to_dict()
df2

{'disgenet.genes_related_to_disease': {'E10': {'7071',
   '84076',
   '3670',
   '56606',
   '8854',
   '3240',
   '2875',
   '11169',
   '2643',
   '25970',
   '407025',
   '25820',
   '9790',
   '5579',
   '51237',
   '3358',
   '590',
   '5269',
   '1968',
   '54414',
   '2354',
   '7291',
   '473',
   '23576',
   '1952',
   '10452',
   '374',
   '1048',
   '10950',
   '10087',
   '2903',
   '314',
   '4158',
   '19',
   '109',
   '5313',
   '54209',
   '10864',
   '3402',
   '6194',
   '343045',
   '407050',
   '5110',
   '4286',
   '9770',
   '54832',
   '23219',
   '717',
   '2524',
   '2312',
   '6812',
   '7128',
   '133522',
   '7750',
   '4084',
   '4049',
   '3454',
   '9370',
   '9398',
   '4513',
   '29933',
   '5799',
   '116844',
   '79447',
   '250',
   '114086',
   '1435',
   '11266',
   '9394',
   '3037',
   '3410',
   '22808',
   '6446',
   '5105',
   '23387',
   '4922',
   '84833',
   '653108',
   '3727',
   '3190',
   '1020',
   '257019',
   '11122',
   '27190',
  

In [22]:
df['disgenet.genes_related_to_disease']

0     {7071, 84076, 3670, 56606, 8854, 3240, 2875, 1...
1     {7071, 84076, 3670, 56606, 8854, 3240, 2875, 1...
2     {5459, 9451, 7054, 4477, 4512, 340061, 3172, 4...
3     {339, 3670, 56606, 3240, 2875, 375612, 10888, ...
4     {8379, 27185, 3060, 338340, 64478, 3587, 6387,...
5     {5476, 4682, 351, 2875, 2099, 12, 2534, 2308, ...
6                                                    {}
7     {2146, 254428, 23435, 126, 3851, 1030, 8398, 5...
8     {79731, 389203, 27185, 64231, 8854, 3240, 2875...
9     {7442, 1909, 3120, 3060, 2099, 11169, 7306, 79...
10    {1187, 56606, 3240, 2875, 11169, 26119, 9319, ...
11    {4306, 51573, 10753, 4879, 183, 406980, 400550...
12    {3605, 23417, 59, 2261, 6387, 183, 1277, 1579,...
13    {1187, 56606, 3240, 2875, 11169, 26119, 9319, ...
14    {339, 27185, 3670, 56606, 3240, 100126336, 287...
15    {27185, 3670, 56606, 3240, 100126336, 2875, 25...
16    {1187, 64231, 3240, 100126336, 2875, 11169, 26...
17    {1909, 7057, 4016, 241, 255738, 23435, 324

# do the comparisson now

In [33]:
 filter(None, lst)