# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [76]:
disease_id = "0007079"
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  3.149899930576794e-05


## Mapper

In [5]:
id_type_key = {'entrez':'entrezgene','ensembl':'ensembl.gene','symbol':'symbol','uniprot':'uniprot.Swiss-Prot','mondo':'mondo'}
gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']

In [224]:
def preprocess_results(mapping, multicol, singlecol, key, explode=False):
    
    def convert_to_string(cell, key):
        if str(cell) != 'nan':
            extracted_ids = [val.get(key) for val in cell]
            return ';'.join(str(e) for e in list(set(extracted_ids)))
        return cell
    mapping[multicol] = mapping[multicol].apply(lambda x: convert_to_string(x, key)) if multicol in mapping else np.nan
    if singlecol in mapping:
        mapping[multicol].fillna(mapping[singlecol], inplace=True)
        mapping = mapping.drop(columns=[singlecol])
    if explode:
        mapping = mapping[multicol].split(';').explode(multicol)
        mapping.rename(columns={multicol: singlecol}, inplace = True)
    return mapping


def get_prev_mapping(in_set, id_type, file, sep):
    # ===== Get mapping from local mapping file =====
    mapping = pd.read_csv(file, sep=sep, header=0, dtype=str)
    if id_type == "ICD-10":
        mapping = split_and_expand_column(data=mapping, split_string=",", column_name="ICD-10")
    # ==== Map given disease set ====
    id_type = id_type_key[id_type] if id_type in id_type_key else id_type
    mapped_set = mapping[mapping[id_type].isin(in_set)]
    # ===== Get missing values =====
    missing = list(set(in_set) - set(mapping[id_type]))
    return mapped_set, missing, mapping
    

def get_gene_mapping(gene_set, id_type):
    """
    Simple converter.

    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    # ===== Get mapping for missing values =====
    if len(missing) > 0:
        mg = get_client("gene")
        mapping = mg.querymany(missing, scopes=id_type_key[id_type], fields=','.join(gene_ids),
                     species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.drop(columns=[id_type_key[id_type]])
        mapping.rename(columns={'query': id_type_key[id_type]}, inplace = True)
        # ===== Split if there are multiple ensembl ids =====
        if 'ensembl' in mapping:
            mapping = preprocess_results(mapping=mapping, multicol='ensembl', singlecol='ensembl.gene', key='gene', explode=True)
        mapping = mapping.drop(columns=['_id','_score'])
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_id_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    return df

def get_gene_to_attributes(gene_set, id_type):
    """
    Simple converter.

    :param gene_set: Set of gene ids
    :return: Dataframe
    """
    # ===== Get gene ID mappings =====
    gene_mapping, _, _ = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    df, missing, prev_mapping = get_prev_mapping(in_set=set(gene_mapping['entrezgene']), id_type='entrez', file='gene_att_mapping.csv', sep=",")
    if len(missing) > 0:
        mg = get_client("gene")
        gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']
        mapping = mg.querymany(missing, scopes=','.join(gene_ids),
                            fields='pathway.kegg.id, go.BP.id, go.CC.id, go.MF.id',
                            species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping.rename(columns={'query': 'entrezgene'}, inplace = True)
        for column in ['go.BP','go.CC','go.MF','pathway.kegg']:
            mapping = preprocess_results(mapping=mapping, multicol=column, singlecol=column+'.id', key='id')
        mapping = mapping.drop(columns=['_id','_score'])
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_att_mapping.csv', index=False)        
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # work with not unique values...
    mapping_subset = gene_mapping[['entrezgene', id_type_key[id_type]]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['entrezgene'], how = 'outer')
    df = df.drop(columns=['entrezgene'])
    df = df.fillna('').groupby([id_type_key[id_type]], as_index=False).agg({'go.BP': combine_rows, 'go.CC': combine_rows,
                                                                            'go.MF': combine_rows, 'pathway.kegg': combine_rows})
    return df


def combine_rows(x):
    return set(filter(None,';'.join(x).split(';')))

In [7]:
start = timeit.default_timer()
reference_mapping = get_gene_mapping(seeds, 'uniprot')
target_mapping = get_gene_mapping(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.010089086000334646


In [8]:
reference_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
191,125,ENSG00000196616,ADH1B,P00325
192,2555,ENSG00000151834,GABRA2,P47869
193,126,ENSG00000248144,ADH1C,P00326
194,3356,ENSG00000102468,HTR2A,P28223


In [9]:
target_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,1394,ENSG00000120088,CRHR1,P34998
1,1394,ENSG00000276191,CRHR1,P34998
2,104909134,ENSG00000263715,LINC02210-CRHR1,P34998
3,104909134,ENSG00000278232,LINC02210-CRHR1,P34998
4,104909134,ENSG00000282456,LINC02210-CRHR1,P34998
...,...,...,...,...
212,1742,ENSG00000132535,DLG4,P78352
213,4684,ENSG00000149294,NCAM1,P13591
214,2778,ENSG00000087460,GNAS,O95467
215,2776,ENSG00000156052,GNAQ,P50148


In [10]:
start = timeit.default_timer()
reference_kegg_mapping = get_gene_to_attributes(seeds, 'uniprot')
target_kegg_mapping = get_gene_to_attributes(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.05635279499983881


In [11]:
reference_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,P00325,"{GO:0001523, GO:0006069, GO:0042573, GO:0042572}","{GO:0005886, GO:0005829, GO:0005654}","{GO:0004024, GO:0008270, GO:0004745}","{hsa05204, hsa00620, hsa00010, hsa00071, hsa00..."
1,P00326,"{GO:0006069, GO:0042573, GO:0042572}","{GO:0005886, GO:0005829, GO:0005654}","{GO:0004024, GO:0008270, GO:0004022, GO:0004745}","{hsa05204, hsa00620, hsa00010, hsa00071, hsa00..."
2,P28223,"{GO:0001659, GO:0007613, GO:0045821, GO:004814...","{GO:0005886, GO:0099055, GO:0043198, GO:003042...","{GO:0001587, GO:0044877, GO:0001965, GO:000499...","{hsa04540, hsa04020, hsa04750, hsa04726, hsa04..."
3,P47869,"{GO:0007268, GO:0001505, GO:0034220, GO:004239...","{GO:0043005, GO:0005886, GO:1902711, GO:003470...","{GO:0005254, GO:1904315, GO:0008503, GO:002285...","{hsa05032, hsa04727, hsa04723, hsa05033, hsa04..."


In [12]:
target_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,O00459,"{GO:0008286, GO:0034976, GO:0050852, GO:005090...","{GO:0005829, GO:0005942, GO:0005634}","{GO:0019903, GO:0005515, GO:0030971, GO:004698...","{hsa04072, hsa05418, hsa05160, hsa04012, hsa04..."
1,O14492,"{GO:0050851, GO:0008286, GO:0007399, GO:003003...","{GO:0005886, GO:0005884, GO:0001725, GO:000582...","{GO:0005515, GO:0035591, GO:0042802, GO:000506...","{hsa04910, hsa04722}"
2,O14610,"{GO:0007602, GO:0007186}",{GO:0005834},"{GO:0031681, GO:0003924}","{hsa04926, hsa05200, hsa05032, hsa04727, hsa05..."
3,O14775,"{GO:0007186, GO:0007165, GO:0006457, GO:004354...","{GO:0005834, GO:0098793, GO:1902773, GO:000582...","{GO:0051087, GO:0003924, GO:0005515, GO:000509...","{hsa04926, hsa05200, hsa05032, hsa04727, hsa05..."
4,O14842,"{GO:0007204, GO:0007186, GO:0032691, GO:003202...","{GO:0005886, GO:0005887}","{GO:0008289, GO:0045125, GO:0004930}",{hsa04911}
...,...,...,...,...,...
195,Q9UN70,"{GO:0007155, GO:0050808, GO:0016339, GO:0007156}","{GO:0016020, GO:0005887}",{GO:0005509},{}
196,Q9UNN8,"{GO:0007596, GO:0050819}","{GO:0048471, GO:0005615, GO:0005886, GO:000557...","{GO:0005515, GO:0038023}",{hsa04610}
197,Q9UQC2,"{GO:0007169, GO:0038095, GO:0051897, GO:003031...","{GO:0005737, GO:0005829, GO:0005886}","{GO:0005515, GO:0005068, GO:0005547, GO:0043325}","{hsa04072, hsa05220, hsa04014, hsa04071, hsa04..."
198,Q9Y2G0,"{GO:0072659, GO:0046854}","{GO:0005886, GO:0005829, GO:0015629}",{GO:0005515},{}


In [172]:
full_ids_mapping = pd.read_csv("../new_disorders.map", sep="\t", dtype=str)
full_ids_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,ncit,doid,meddra,medgen,ICD-10
0,0008118,164330,716180009,C1834013,2724,C537740,,,,,
1,0010439,300829,,C1853577,,C543241,,,,,
2,0008117,164310,763829004,C1834014,98897,C563508,,,,,"G71,G71.0"
3,0009448,242600,84121007,C0268654,42062,C536285,,,,,"E72,E72.0"
4,0008119,164400,715748006,C0752120,98755,,C129982,0050954,,,"G11,G11.8"
...,...,...,...,...,...,...,...,...,...,...,...
24115,0009507,245550,732961003,C1855551,1296,C538396,,,,,"Q87,Q87.8"
24116,0009508,245552,,C1855550,,C537549,,,,,
24117,0009501,245340,766715000,C1855577,171690,C565449,,,,,"G72,G72.8"
24118,0009502,245348,,C1855565,79244,C565448,,,,,"E74,E74.4"


In [14]:
full_ids_mapping.count()

mondo       24120
omim         8841
snomedct     8962
umls        16234
orpha        9363
mesh         8075
ncit         6953
doid         8944
meddra       1144
medgen          1
ICD-10       9561
dtype: int64

In [280]:
def split_and_expand_column(data, split_string, column_name):
    s = data[column_name].str.split(split_string, expand=True).stack()
    i = s.index.get_level_values(0)
    df2 = data.loc[i].copy()
    df2[column_name] = s.values
    return df2

def get_disease_mapping(disease_set, id_type):
    # ==== Get Mondo IDs ====
    disease_id_set,_,_ = get_prev_mapping(in_set=disease_set, id_type=id_type, file="../new_disorders.map", sep="\t")
    mondo_set = list(set('MONDO:'+disease_id_set['mondo']))
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=mondo_set, id_type='mondo', file='disease_disgenet_mapping.csv', sep=",")
    # ==== Get disgenet values ====
    if len(missing) > 0:
        md = get_client("disease")
        mapping = md.getdiseases(missing,
                                 fields='disgenet.genes_related_to_disease.gene_id,disgenet.variants_related_to_disease.rsid,ctd.pathway_related_to_disease.kegg_pathway_id',
                                 species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping.rename(columns={'query': 'mondo'}, inplace = True)
        # transform dataframe to combine single and multiple results
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.genes_related_to_disease', 
                                     singlecol='disgenet.genes_related_to_disease.gene_id', key='gene_id')
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.variants_related_to_disease', 
                                     singlecol='disgenet.variants_related_to_disease.rsid', key='rsid')
        mapping = preprocess_results(mapping=mapping, multicol='ctd.pathway_related_to_disease', 
                                     singlecol='ctd.pathway_related_to_disease.kegg_pathway_id', key='kegg_pathway_id')
        mapping = mapping.drop(columns=['_id','_version','disgenet._license']) 
        # ==== Get pathways from file ====
        mondo_to_pathway = pd.read_csv('mondo_to_pathways.csv')
        mapping = mapping.merge(mondo_to_pathway, on='mondo', how='left')
        #  work with nan float values
        mapping = mapping.fillna('')
        mapping = mapping.astype(str)
        # combine with ctd pathway mapping 
        mapping['ctd.pathway_related_to_disease'] = mapping['ctd.pathway_related_to_disease'] + ";" + mapping['pathways']
        mapping = mapping.drop(columns=['pathways'])
        mapping = mapping.drop_duplicates()
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('disease_disgenet_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # ==== Map back to previous ids ====
    df["mondo"] = df["mondo"].str.split(':').str[1]
    # work with not unique values...
    columns = ['mondo', id_type] if id_type != 'mondo' else ['mondo']
    mapping_subset = disease_id_set[columns].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['mondo'], how = 'outer')
    df = df.drop(columns=['mondo']) if id_type != 'mondo' else df
    df = df.fillna('').groupby(id_type, as_index = False).agg({'disgenet.genes_related_to_disease': combine_rows, 'disgenet.variants_related_to_disease': combine_rows, 'ctd.pathway_related_to_disease': combine_rows})
    return df

In [281]:
diseases[0]

0     E10
1     E11
2     E12
3     E13
4     E14
5     E66
6     F00
7     F01
8     F02
9     F03
10    G20
11    G30
12    G43
13    I10
14    I11
15    I12
16    I13
17    I15
18    I21
19    I22
20    I50
21    I63
22    I64
23    I70
24    J45
Name: 0, dtype: object

In [282]:
start = timeit.default_timer()
disease_df = get_disease_mapping(disease_set=diseases[0], id_type='ICD-10')
stop = timeit.default_timer()
print('Time: ', stop - start)

querying 1-117...done.
Time:  2.994510136999452


In [283]:
disease_df

Unnamed: 0,ICD-10,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease,ctd.pathway_related_to_disease
0,E10,"{7432, 7351, 10240, 8691, 9882, 8570, 5806, 12...","{rs268, rs773661614, rs10757283, rs1333049, rs...","{hsa01212, hsa04072, hsa01200, hsa05418, hsa05..."
1,E11,"{640, 3767, 1056, 3670, 3953, 5335, 222546, 10...","{rs773661614, rs529294719, rs149703259, rs7477...","{hsa01212, hsa04072, hsa04727, hsa00591, hsa04..."
2,E13,"{84447, 3479, 23413, 107075310, 10247, 5078, 5...","{rs1560408865, rs781007453, rs35932623, rs7469...","{hsa04932, hsa05164, hsa04072, hsa04141, hsa04..."
3,E14,"{7432, 7351, 10240, 8691, 9882, 8570, 5806, 12...","{rs268, rs773661614, rs10757283, rs1333049, rs...","{hsa01212, hsa04072, hsa01200, hsa05418, hsa04..."
4,E66,"{653702, 203238, 280, 7351, 7432, 9882, 5806, ...","{rs12970134, rs9947301, rs268, rs17081231, rs7...","{hsa04932, hsa00230, hsa04020, hsa00740, hsa00..."
5,F00,"{617, 7432, 5021, 4804, 6285, 5267, 3699, 46, ...","{rs324981, rs2509843, rs10812227, rs802568, rs...",{}
6,F01,"{2335, 57096, 5021, 3303, 5743, 7296, 5468, 83...","{rs1555729510, rs1333049, rs113993969, rs77222...","{hsa05200, hsa04919, hsa_M00682, hsa04658, hsa..."
7,G20,"{2335, 116442, 7351, 6530, 3303, 5743, 5468, 5...","{rs72470545, rs9347683, rs33949390, rs39751848...","{hsa04072, hsa00591, hsa01200, hsa_M00177, hsa..."
8,G30,"{7432, 7351, 56971, 8570, 26526, 57630, 10005,...","{rs12637471, rs1314386070, rs268, rs12721109, ...","{hsa04072, hsa01200, hsa05418, hsa05160, hsa04..."
9,G43,"{3578, 23063, 83667, 773, 288, 3508, 2099, 181...","{rs761597771, rs2234693, rs10504861, rs1227336...","{hsa04020, hsa04659, hsa04520, hsa04390, hsa04..."


In [199]:
def make_setup():
    def convert_to_string(elements):
        if str(elements) != 'nan':
            return ';'.join(str(e) for e in elements)
        return cell
    
    def get_df_from_url(content, column_names, header=None):
        df = pd.read_csv(io.StringIO(content), sep='\t', names=column_names, header=header, dtype=str)
        df.fillna('NULL', inplace=True)
        return df

    omim_to_hsa = get_df_from_url(
            content=io.TextIOWrapper(urlopen("http://rest.genome.jp/link/omim/hsa"), encoding="UTF-8").read(),
            column_names=['hsa', 'omim', 'reverse'])
    omim_to_hsa = omim_to_hsa[['hsa','omim']]
    hsa_to_pathway = get_df_from_url(
            content=io.TextIOWrapper(urlopen("http://rest.kegg.jp/link/pathway/hsa"), encoding="UTF-8").read(),
                                             column_names=['hsa', 'pathways'])
    hsa_to_pathway['pathways'] = hsa_to_pathway['pathways'].str.split(':').str[1]
    hsa_to_pathway = hsa_to_pathway.groupby('hsa', as_index = False).agg({'pathways': convert_to_string})

    omim_to_pathway = omim_to_hsa.merge(hsa_to_pathway, on='hsa', how='left')[['omim','pathways']]
    omim_to_pathway['omim'] = omim_to_pathway['omim'].str.split(':').str[1]
    
    full_ids_mapping = pd.read_csv("../new_disorders.map", sep="\t", dtype=str)
    mondo_to_pathway = full_ids_mapping[['mondo','omim']].merge(omim_to_pathway, on='omim')
    mondo_to_pathway['mondo'] = 'MONDO:'+mondo_to_pathway['mondo']
    
    mondo_to_pathway[['mondo','pathways']].to_csv('mondo_to_pathways.csv', index=False)
    return mondo_to_pathway

In [200]:
make_setup()

Unnamed: 0,mondo,omim,pathways
0,MONDO:0008117,164310,
1,MONDO:0009448,242600,hsa04974
2,MONDO:0009448,242600,hsa04974;hsa04978
3,MONDO:0009448,242600,
4,MONDO:0008119,164400,hsa04330;hsa05017;hsa05022
...,...,...,...
6343,MONDO:0009504,245400,hsa00020;hsa00640;hsa01100;hsa01200
6344,MONDO:0009509,245570,hsa04014;hsa04015;hsa04020;hsa04024;hsa04080;h...
6345,MONDO:0009501,245340,
6346,MONDO:0009502,245348,hsa00010;hsa00020;hsa00620;hsa01100;hsa01200


# do the comparisson now

### set to set

In [35]:
reference_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,P00325,"{GO:0042572, GO:0001523, GO:0042573, GO:0006069}","{GO:0005654, GO:0005886, GO:0005829}","{GO:0004745, GO:0004024, GO:0008270}","{hsa00620, hsa00830, hsa00071, hsa00350, hsa00..."
1,P00326,"{GO:0042572, GO:0042573, GO:0006069}","{GO:0005654, GO:0005886, GO:0005829}","{GO:0004745, GO:0004022, GO:0004024, GO:0008270}","{hsa00620, hsa00830, hsa00071, hsa00350, hsa00..."
2,P28223,"{GO:0006874, GO:0046718, GO:0030431, GO:000821...","{GO:0043025, GO:0030425, GO:0005886, GO:000588...","{GO:0044877, GO:0030594, GO:0051378, GO:000196...","{hsa04726, hsa04020, hsa04080, hsa04540, hsa04..."
3,P47869,"{GO:0060078, GO:1904862, GO:0051932, GO:000726...","{GO:0098982, GO:0043025, GO:0032590, GO:000588...","{GO:0030594, GO:0004890, GO:0005254, GO:002285...","{hsa05032, hsa04727, hsa04080, hsa04723, hsa04..."


In [36]:
target_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,O00459,"{GO:0015031, GO:0048010, GO:0042307, GO:004685...","{GO:0005634, GO:0005829, GO:0005942}","{GO:0001784, GO:0046982, GO:0030971, GO:001990...","{hsa04072, hsa04662, hsa05169, hsa04722, hsa05..."
1,O14492,"{GO:0050873, GO:0001922, GO:0007596, GO:005085...","{GO:0001725, GO:0005886, GO:0005737, GO:000588...","{GO:0005515, GO:0005068, GO:0042169, GO:004280...","{hsa04910, hsa04722}"
2,O14610,"{GO:0007602, GO:0007186}",{GO:0005834},"{GO:0003924, GO:0031681}","{hsa05163, hsa05032, hsa04726, hsa04727, hsa04..."
3,O14775,"{GO:0043547, GO:1901386, GO:0007186, GO:000645...","{GO:0098793, GO:0005737, GO:0005634, GO:190277...","{GO:0003924, GO:0031682, GO:0030159, GO:000551...","{hsa05163, hsa05032, hsa04726, hsa04727, hsa04..."
4,O14842,"{GO:0042593, GO:0032024, GO:0051928, GO:003007...","{GO:0005887, GO:0005886}","{GO:0008289, GO:0045125, GO:0004930}",{hsa04911}
...,...,...,...,...,...
195,Q9UN70,"{GO:0016339, GO:0050808, GO:0007156, GO:0007155}","{GO:0016020, GO:0005887}",{GO:0005509},{}
196,Q9UNN8,"{GO:0007596, GO:0050819}","{GO:0005886, GO:0005615, GO:0005887, GO:000557...","{GO:0038023, GO:0005515}",{hsa04610}
197,Q9UQC2,"{GO:0048015, GO:0007169, GO:0043306, GO:000741...","{GO:0005737, GO:0005886, GO:0005829}","{GO:0043325, GO:0005068, GO:0005547, GO:0005515}","{hsa04072, hsa04071, hsa04014, hsa04666, hsa04..."
198,Q9Y2G0,"{GO:0046854, GO:0072659}","{GO:0015629, GO:0005886, GO:0005829}",{GO:0005515},{}


In [325]:
def create_ref_dict(mapping, keys):
    reference_dict = dict()
    for att_type in keys:
        reference_dict[att_type] = set.union(*mapping[att_type])
    return reference_dict

In [318]:
def evaluate_values(mapping, ref_dict, threshold, keys):
    def get_intersection(values_set, ref_set):
        if len(values_set) == 0:
            return 0.0
        return (len(values_set & ref_set)/len(values_set))
    
    evaluation = list()
    for attribute in keys:
        evaluated_series = mapping[attribute].apply(get_intersection, ref_set=ref_dict[attribute])
        evaluation.append([attribute, str(len(evaluated_series[evaluated_series > threshold])/len(evaluated_series))])
    return evaluation

In [329]:
def compare_set_to_set(ref, ref_id_type, targets, targets_id_type, threshold=0.0):
    reference_mapping = get_gene_to_attributes(ref, ref_id_type)
    target_mapping = get_gene_to_attributes(targets, targets_id_type)
    ref_dict = create_ref_dict(mapping=reference_mapping, keys=reference_mapping.columns[1:])
    result = evaluate_values(mapping=target_mapping, ref_dict=ref_dict, threshold=threshold, keys=target_mapping.columns[1:])
    return result

In [330]:
compare_set_to_set(ref=seeds, ref_id_type='uniprot', targets=significance, targets_id_type='uniprot')

[['go.BP', '0.74'],
 ['go.CC', '0.91'],
 ['go.MF', '0.91'],
 ['pathway.kegg', '0.365']]

In [335]:
def compare_id_to_set(ref_id, ref_id_type, targets, targets_id_type, threshold=0.0):
    disease_id_atts = get_disease_mapping({ref_id}, ref_id_type)
    disease_id_atts['pathway.kegg'] = disease_id_atts['ctd.pathway_related_to_disease']
    target_mapping = get_gene_to_attributes(targets, targets_id_type)
    ref_dict = create_ref_dict(mapping=disease_id_atts, keys=['pathway.kegg'])
    result = evaluate_values(mapping=target_mapping, ref_dict=ref_dict, threshold=threshold, keys=target_mapping.columns['pathway.kegg'])
    return result

In [336]:
compare_id_to_set(ref_id=disease_id, ref_id_type='mondo', targets=significance, targets_id_type='uniprot')

    uniprot.Swiss-Prot                                              go.BP  \
0               O00459  {GO:0008286, GO:0034976, GO:0050852, GO:005090...   
1               O14492  {GO:0050851, GO:0008286, GO:0007399, GO:003003...   
2               O14610                           {GO:0007602, GO:0007186}   
3               O14775  {GO:0007186, GO:0007165, GO:0006457, GO:004354...   
4               O14842  {GO:0007204, GO:0007186, GO:0032691, GO:003202...   
..                 ...                                                ...   
195             Q9UN70   {GO:0007155, GO:0050808, GO:0016339, GO:0007156}   
196             Q9UNN8                           {GO:0007596, GO:0050819}   
197             Q9UQC2  {GO:0007169, GO:0038095, GO:0051897, GO:003031...   
198             Q9Y2G0                           {GO:0072659, GO:0046854}   
199             Q9Y6M4  {GO:0018105, GO:0007165, GO:0006897, GO:001605...   

                                                 go.CC  \
0                

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mondo"] = df["mondo"].str.split(':').str[1]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [337]:
disease_df

Unnamed: 0,ICD-10,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease,ctd.pathway_related_to_disease
0,E10,"{7432, 7351, 10240, 8691, 9882, 8570, 5806, 12...","{rs268, rs773661614, rs10757283, rs1333049, rs...","{hsa01212, hsa04072, hsa01200, hsa05418, hsa05..."
1,E11,"{640, 3767, 1056, 3670, 3953, 5335, 222546, 10...","{rs773661614, rs529294719, rs149703259, rs7477...","{hsa01212, hsa04072, hsa04727, hsa00591, hsa04..."
2,E13,"{84447, 3479, 23413, 107075310, 10247, 5078, 5...","{rs1560408865, rs781007453, rs35932623, rs7469...","{hsa04932, hsa05164, hsa04072, hsa04141, hsa04..."
3,E14,"{7432, 7351, 10240, 8691, 9882, 8570, 5806, 12...","{rs268, rs773661614, rs10757283, rs1333049, rs...","{hsa01212, hsa04072, hsa01200, hsa05418, hsa04..."
4,E66,"{653702, 203238, 280, 7351, 7432, 9882, 5806, ...","{rs12970134, rs9947301, rs268, rs17081231, rs7...","{hsa04932, hsa00230, hsa04020, hsa00740, hsa00..."
5,F00,"{617, 7432, 5021, 4804, 6285, 5267, 3699, 46, ...","{rs324981, rs2509843, rs10812227, rs802568, rs...",{}
6,F01,"{2335, 57096, 5021, 3303, 5743, 7296, 5468, 83...","{rs1555729510, rs1333049, rs113993969, rs77222...","{hsa05200, hsa04919, hsa_M00682, hsa04658, hsa..."
7,G20,"{2335, 116442, 7351, 6530, 3303, 5743, 5468, 5...","{rs72470545, rs9347683, rs33949390, rs39751848...","{hsa04072, hsa00591, hsa01200, hsa_M00177, hsa..."
8,G30,"{7432, 7351, 56971, 8570, 26526, 57630, 10005,...","{rs12637471, rs1314386070, rs268, rs12721109, ...","{hsa04072, hsa01200, hsa05418, hsa05160, hsa04..."
9,G43,"{3578, 23063, 83667, 773, 288, 3508, 2099, 181...","{rs761597771, rs2234693, rs10504861, rs1227336...","{hsa04020, hsa04659, hsa04520, hsa04390, hsa04..."


In [65]:
def get_distance_matrix(sets):
    dis_mat = np.zeros((len(sets), len(sets)))
    for index1 in range(0, len(sets)):
        for index2 in range(index1, len(sets)):
            if len(sets[index1]) > 0 and len(sets[index2]) > 0:
                calc_dis = len(sets[index1] & sets[index2]) / len(sets[index1] | sets[index2]) 
            else:
                calc_dis = float('nan')
            # assign to matrix
            dis_mat[index1][index2] = calc_dis
            dis_mat[index2][index1] = calc_dis
    return pd.DataFrame(dis_mat)

comp_df = get_distance_matrix(disease_df['disgenet.genes_related_to_disease'])
comp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.0,0.005072,0.03125,0.0,0.003546,0.001752,,0.00257,0.0,0.0,0.002568,0.001648,0.001648,0.002818,,0.002393,0.007576
1,0.005072,1.0,0.005508,0.063275,0.060226,0.272326,,0.302086,0.009287,0.009676,0.302364,0.290853,0.290853,0.264661,,0.323582,0.082609
2,0.03125,0.005508,1.0,0.004444,0.00361,0.002928,,0.003006,0.0,0.0,0.003004,0.003311,0.003311,0.003394,,0.003362,0.002545
3,0.0,0.063275,0.004444,1.0,0.099307,0.053198,,0.062919,0.02521,0.020161,0.062893,0.070632,0.070632,0.069414,,0.065207,0.090239
4,0.003546,0.060226,0.00361,0.099307,1.0,0.05,,0.054649,0.010239,0.02349,0.054627,0.062147,0.062147,0.060797,,0.054554,0.060956
5,0.001752,0.272326,0.002928,0.053198,0.05,1.0,,0.248476,0.006431,0.007883,0.248422,0.239601,0.239601,0.21642,,0.264055,0.057462
6,,,,,,,,,,,,,,,,,
7,0.00257,0.302086,0.003006,0.062919,0.054649,0.248476,,1.0,0.011168,0.013293,0.99957,0.298301,0.298301,0.293,,0.303201,0.071372
8,0.0,0.009287,0.0,0.02521,0.010239,0.006431,,0.011168,1.0,0.042857,0.011164,0.012128,0.012128,0.014148,,0.010552,0.009804
9,0.0,0.009676,0.0,0.020161,0.02349,0.007883,,0.013293,0.042857,1.0,0.013288,0.014294,0.014294,0.012366,,0.012925,0.014458


In [338]:
disease_clusters

Unnamed: 0,0,1,2
0,G43,567,Migraine
1,I64,470,"Stroke, not specified as haemorrhage or infarc..."
2,E10,438,Type1 diabetes mellitus
3,E11,438,Type2 diabetes mellitus
4,J45,438,Asthma
5,E13,438,Other specified diabetes mellitus
6,E66,438,Obesity
7,G30,438,Alzheimer
8,I15,438,Secondary hypertension
9,I21,438,Acute myocardial infarction


In [339]:
disease_clusters_df = get_disease_mapping(disease_set=disease_clusters[0], id_type='ICD-10')
disease_clusters_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mondo"] = df["mondo"].str.split(':').str[1]


Unnamed: 0,ICD-10,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease,ctd.pathway_related_to_disease
0,E10,"{7432, 7351, 10240, 8691, 9882, 8570, 5806, 12...","{rs268, rs773661614, rs10757283, rs1333049, rs...","{hsa01212, hsa04072, hsa01200, hsa05418, hsa05..."
1,E11,"{640, 3767, 1056, 3670, 3953, 5335, 222546, 10...","{rs773661614, rs529294719, rs149703259, rs7477...","{hsa01212, hsa04072, hsa04727, hsa00591, hsa04..."
2,E13,"{84447, 3479, 23413, 107075310, 10247, 5078, 5...","{rs1560408865, rs781007453, rs35932623, rs7469...","{hsa04932, hsa05164, hsa04072, hsa04141, hsa04..."
3,E14,"{7432, 7351, 10240, 8691, 9882, 8570, 5806, 12...","{rs268, rs773661614, rs10757283, rs1333049, rs...","{hsa01212, hsa04072, hsa01200, hsa05418, hsa04..."
4,E66,"{653702, 203238, 280, 7351, 7432, 9882, 5806, ...","{rs12970134, rs9947301, rs268, rs17081231, rs7...","{hsa04932, hsa00230, hsa04020, hsa00740, hsa00..."
5,F00,"{617, 7432, 5021, 4804, 6285, 5267, 3699, 46, ...","{rs324981, rs2509843, rs10812227, rs802568, rs...",{}
6,F01,"{2335, 57096, 5021, 3303, 5743, 7296, 5468, 83...","{rs1555729510, rs1333049, rs113993969, rs77222...","{hsa05200, hsa04919, hsa_M00682, hsa04658, hsa..."
7,G20,"{2335, 116442, 7351, 6530, 3303, 5743, 5468, 5...","{rs72470545, rs9347683, rs33949390, rs39751848...","{hsa04072, hsa00591, hsa01200, hsa_M00177, hsa..."
8,G30,"{7432, 7351, 56971, 8570, 26526, 57630, 10005,...","{rs12637471, rs1314386070, rs268, rs12721109, ...","{hsa04072, hsa01200, hsa05418, hsa05160, hsa04..."
9,G43,"{3578, 23063, 83667, 773, 288, 3508, 2099, 181...","{rs761597771, rs2234693, rs10504861, rs1227336...","{hsa04020, hsa04659, hsa04520, hsa04390, hsa04..."
