# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
# Input files for this test run; all paths are relative to the notebook's working directory.
# Seed list (loaded below as a headerless, tab-separated file).
seeds_file = "Input/0007079.txt"
# Result tables; the load cell below keeps only their 'node' column.
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
# Disease tables (loaded below as headerless, tab-separated files).
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
# Seed ids: first column of the headerless, tab-separated seeds file.
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
# Only the 'node' column of the two result tables is kept.
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
# The disease tables have no header row, so their columns are numbered from 0.
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
# `timeit` is imported in this cell and reused by the timing cells further down.
import timeit
start = timeit.default_timer()

# Nothing runs between the two timer reads, so the printed value is only the timer overhead.
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  4.0045997593551874e-05


## Mapper

In [5]:
# Translates the short id-type names accepted by the mapping helpers into the
# column / field names used in the mapping tables and remote queries.
id_type_key = {'entrez':'entrezgene','ensembl':'ensembl.gene','symbol':'symbol','uniprot':'uniprot.Swiss-Prot','mondo':'mondo'}
# Gene id fields requested from the gene client in get_gene_mapping.
gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']

In [6]:
def preprocess_results(mapping, multicol, singlecol, key, explode=False):
    """
    Merge a "multiple values" column and its "single value" counterpart into one column.

    Cells of ``multicol`` that hold a list of dicts are reduced to a ';'-joined string
    of the distinct ``key`` values (first-seen order). Cells that are still missing
    afterwards are filled from ``singlecol``, which is then dropped.

    :param mapping: Dataframe with query results (not modified; a copy is returned)
    :param multicol: Column holding lists of dicts; created as an all-NaN column if absent
    :param singlecol: Column holding a single plain value per row; optional
    :param key: Dict key whose value is extracted from every list entry
    :param explode: If True, produce one row per id and name the resulting column ``singlecol``
    :return: Dataframe
    """

    def convert_to_string(cell, key):
        # A lone dict is treated like a one-element list of records.
        if isinstance(cell, dict):
            cell = [cell]
        if isinstance(cell, (list, tuple)):
            extracted_ids = [val.get(key) for val in cell]
            # dict.fromkeys de-duplicates while keeping a reproducible order
            # (a plain set would reorder the ids between runs).
            return ';'.join(str(e) for e in dict.fromkeys(extracted_ids))
        # NaN (or any other scalar) is passed through untouched.
        return cell

    # Work on a copy so the caller's frame is never modified as a side effect.
    mapping = mapping.copy()
    if multicol in mapping:
        mapping[multicol] = mapping[multicol].apply(lambda x: convert_to_string(x, key))
    else:
        # Object dtype so that string ids can be filled in below without a dtype clash.
        mapping[multicol] = pd.Series(np.nan, index=mapping.index, dtype=object)
    if singlecol in mapping:
        # Assign the result back instead of a chained inplace fillna, which does not
        # reliably write through to the frame.
        mapping[multicol] = mapping[multicol].fillna(mapping[singlecol])
        mapping = mapping.drop(columns=[singlecol])
    if explode:
        # Split only real strings; missing values stay missing and yield a single row.
        mapping[multicol] = mapping[multicol].apply(lambda v: v.split(';') if isinstance(v, str) else v)
        mapping = mapping.explode(multicol).rename(columns={multicol: singlecol})
    return mapping


def get_prev_mapping(in_set, id_type, file, sep):
    """
    Look up a set of ids in a local mapping file.

    :param in_set: Iterable of ids to look up
    :param id_type: Short id type (translated through ``id_type_key``) or a column name of the file
    :param file: Path of the delimited mapping file; every column is read as string
    :param sep: Column separator of the file
    :return: Tuple ``(mapped_set, missing, mapping)``: the rows whose id is in ``in_set``,
             the list of requested ids that are not in the file (NaN excluded, in the
             order they were given), and the full mapping table that was searched
    """
    # ===== Get mapping from local mapping file =====
    mapping = pd.read_csv(file, sep=sep, header=0, dtype=str)
    if id_type == "ICD-10":
        # One row per listed code, plus a second copy of every row reduced to the
        # part before the first '.', so that both code levels can be matched.
        mapping = split_and_expand_column(data=mapping, split_string=",", column_name="ICD-10")
        mapping_copy = mapping.copy()
        # .str[0] also works when the column is empty (expand=True would yield no column 0).
        mapping_copy['ICD-10'] = mapping_copy['ICD-10'].str.split('.').str[0]
        mapping = pd.concat([mapping, mapping_copy], ignore_index=True)
    # ==== Map given disease set ====
    column = id_type_key.get(id_type, id_type)
    requested = list(in_set)
    mapped_set = mapping[mapping[column].isin(requested)]
    # ===== Get missing values =====
    known = set(mapping[column].dropna())
    # Keep the caller's order (reproducible between runs) and never report NaN as a
    # missing id: it cannot be looked up and would otherwise be sent to the remote service.
    missing = [value for value in dict.fromkeys(requested)
               if pd.notna(value) and value not in known]
    return mapped_set, missing, mapping
    

def get_gene_mapping(gene_set, id_type):
    """
    Map gene ids to the id types listed in ``gene_ids``.

    Ids already present in the local file 'gene_id_mapping.csv' are taken from there;
    the remaining ids are queried through the gene client and the results are
    appended to that file.

    :param gene_set: Set of gene ids
    :param id_type: Short id type of ``gene_set`` (a key of ``id_type_key``)
    :return: Dataframe
    """
    id_column = id_type_key[id_type]
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    # ===== Get mapping for missing values =====
    if len(missing) > 0:
        mg = get_client("gene")
        mapping = mg.querymany(missing, scopes=id_column, fields=','.join(gene_ids),
                     species='human', returnall=False, as_dataframe=True, df_index=False)
        # The queried id replaces the returned id column. That column is absent when
        # no hit carries it (e.g. nothing was found), so a missing column is not an error.
        mapping = mapping.drop(columns=[id_column], errors='ignore')
        mapping = mapping.rename(columns={'query': id_column})
        # ===== Split if there are multiple ensembl ids =====
        if 'ensembl' in mapping:
            mapping = preprocess_results(mapping=mapping, multicol='ensembl', singlecol='ensembl.gene', key='gene', explode=True)
        # Bookkeeping columns are only present when at least one query had a hit.
        mapping = mapping.drop(columns=['_id','_score'], errors='ignore')
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_id_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    return df

def _merge_id_strings(values):
    """Return the set of non-empty ';'-separated tokens found in the strings of ``values``."""
    return set(filter(None, ';'.join(values).split(';')))


def get_gene_to_attributes(gene_set, id_type):
    """
    Collect GO (BP, CC, MF) and KEGG pathway ids for a set of genes.

    The genes are translated to entrez ids using the local file 'gene_id_mapping.csv'
    only (no remote lookup at this step), so ids that are not in that file do not
    appear in the result. Attributes are taken from 'gene_att_mapping.csv'; entrez
    ids missing there are queried through the gene client and appended to that file.

    :param gene_set: Set of gene ids
    :param id_type: Short id type of ``gene_set`` (a key of ``id_type_key``)
    :return: Dataframe with one row per input id and one set-valued column per attribute
    """
    id_column = id_type_key[id_type]
    attribute_columns = ['go.BP','go.CC','go.MF','pathway.kegg']
    # ===== Get gene ID mappings =====
    gene_mapping, _, _ = get_prev_mapping(in_set=gene_set, id_type=id_type, file='gene_id_mapping.csv', sep=",")
    # Genes without an entrez id have NaN here; NaN must not be looked up or queried.
    entrez_ids = set(gene_mapping['entrezgene'].dropna())
    df, missing, prev_mapping = get_prev_mapping(in_set=entrez_ids, id_type='entrez', file='gene_att_mapping.csv', sep=",")
    if len(missing) > 0:
        mg = get_client("gene")
        gene_ids=['uniprot.Swiss-Prot','symbol','ensembl.gene','entrezgene']
        # NOTE(review): the queried values are entrez ids, so scopes='entrezgene' alone
        # would presumably be sufficient; the original scopes are kept unchanged.
        mapping = mg.querymany(missing, scopes=','.join(gene_ids),
                            fields='pathway.kegg.id, go.BP.id, go.CC.id, go.MF.id',
                            species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.rename(columns={'query': 'entrezgene'})
        for column in attribute_columns:
            mapping = preprocess_results(mapping=mapping, multicol=column, singlecol=column+'.id', key='id')
        # Bookkeeping columns are only present when at least one query had a hit.
        mapping = mapping.drop(columns=['_id','_score'], errors='ignore')
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('gene_att_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # Several entrez ids can belong to one input id: merge their attributes per input id.
    mapping_subset = gene_mapping[['entrezgene', id_column]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['entrezgene'], how = 'outer')
    df = df.drop(columns=['entrezgene'])
    # The merge helper is defined next to this function so the call does not depend on
    # a helper from a later notebook cell having been executed first.
    df = df.fillna('').groupby([id_column], as_index=False).agg(
        {column: _merge_id_strings for column in attribute_columns})
    return df

In [8]:
# Time the id mapping of the seed list and of the significance result nodes (both given as uniprot ids).
start = timeit.default_timer()
reference_mapping = get_gene_mapping(seeds, 'uniprot')
target_mapping = get_gene_mapping(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.01517541499924846


In [9]:
reference_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
191,125,ENSG00000196616,ADH1B,P00325
192,2555,ENSG00000151834,GABRA2,P47869
193,126,ENSG00000248144,ADH1C,P00326
194,3356,ENSG00000102468,HTR2A,P28223


In [10]:
target_mapping

Unnamed: 0,entrezgene,ensembl.gene,symbol,uniprot.Swiss-Prot
0,1394,ENSG00000120088,CRHR1,P34998
1,1394,ENSG00000276191,CRHR1,P34998
2,104909134,ENSG00000263715,LINC02210-CRHR1,P34998
3,104909134,ENSG00000278232,LINC02210-CRHR1,P34998
4,104909134,ENSG00000282456,LINC02210-CRHR1,P34998
...,...,...,...,...
212,1742,ENSG00000132535,DLG4,P78352
213,4684,ENSG00000149294,NCAM1,P13591
214,2778,ENSG00000087460,GNAS,O95467
215,2776,ENSG00000156052,GNAQ,P50148


In [11]:
# Time the attribute lookup (GO and KEGG ids) for the same two gene lists.
start = timeit.default_timer()
reference_kegg_mapping = get_gene_to_attributes(seeds, 'uniprot')
target_kegg_mapping = get_gene_to_attributes(significance, 'uniprot')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.06864726200001314


In [12]:
reference_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,P00325,"{GO:0001523, GO:0006069, GO:0042572, GO:0042573}","{GO:0005654, GO:0005886, GO:0005829}","{GO:0008270, GO:0004024, GO:0004745}","{hsa05204, hsa00982, hsa00620, hsa00071, hsa00..."
1,P00326,"{GO:0042573, GO:0006069, GO:0042572}","{GO:0005654, GO:0005886, GO:0005829}","{GO:0004022, GO:0008270, GO:0004024, GO:0004745}","{hsa05204, hsa00982, hsa00620, hsa00071, hsa00..."
2,P28223,"{GO:0007210, GO:0046718, GO:2000300, GO:004326...","{GO:0098666, GO:0099055, GO:0099056, GO:000590...","{GO:0042802, GO:0001587, GO:0005515, GO:003059...","{hsa04020, hsa04750, hsa04080, hsa04726, hsa04..."
3,P47869,"{GO:0007268, GO:0060078, GO:0006836, GO:000150...","{GO:0099060, GO:1902711, GO:0098982, GO:004520...","{GO:0005237, GO:0022851, GO:0030594, GO:190431...","{hsa04723, hsa04080, hsa04742, hsa05032, hsa05..."


In [13]:
target_kegg_mapping

Unnamed: 0,uniprot.Swiss-Prot,go.BP,go.CC,go.MF,pathway.kegg
0,O00459,"{GO:0050852, GO:0048010, GO:0008286, GO:001050...","{GO:0005634, GO:0005829, GO:0005942}","{GO:0019903, GO:0001784, GO:0046935, GO:000551...","{hsa05235, hsa04810, hsa05162, hsa05220, hsa05..."
1,O14492,"{GO:0007399, GO:0007596, GO:0030036, GO:004657...","{GO:0001725, GO:0005737, GO:0001726, GO:000588...","{GO:0042802, GO:0005515, GO:0005068, GO:004216...","{hsa04722, hsa04910}"
2,O14610,"{GO:0007186, GO:0007602}",{GO:0005834},"{GO:0003924, GO:0031681}","{hsa04151, hsa05163, hsa04713, hsa05200, hsa04..."
3,O14775,"{GO:0007186, GO:0007212, GO:0006457, GO:000716...","{GO:0005829, GO:1902773, GO:0005634, GO:000583...","{GO:0051087, GO:0031682, GO:0005515, GO:000509...","{hsa04151, hsa05163, hsa04713, hsa05200, hsa04..."
4,O14842,"{GO:0007186, GO:0007204, GO:0050796, GO:005192...","{GO:0005886, GO:0005887}","{GO:0045125, GO:0004930, GO:0008289}",{hsa04911}
...,...,...,...,...,...
195,Q9UN70,"{GO:0007155, GO:0016339, GO:0007156, GO:0050808}","{GO:0016020, GO:0005887}",{GO:0005509},{}
196,Q9UNN8,"{GO:0007596, GO:0050819}","{GO:0005813, GO:0048471, GO:0070062, GO:000561...","{GO:0038023, GO:0005515}",{hsa04610}
197,Q9UQC2,"{GO:0048015, GO:0030316, GO:0019221, GO:000716...","{GO:0005886, GO:0005829, GO:0005737}","{GO:0043325, GO:0005515, GO:0005547, GO:0005068}","{hsa04380, hsa05220, hsa04664, hsa04071, hsa04..."
198,Q9Y2G0,"{GO:0046854, GO:0072659}","{GO:0015629, GO:0005886, GO:0005829}",{GO:0005515},{}


In [14]:
# Load the full disorder id mapping; every column is read as string.
full_ids_mapping = pd.read_csv("../disorders.map", sep="\t", dtype=str)
# Parent code = text before the first '.'. For comma-separated entries such as
# 'Q82.8,Q43.8' this yields the parent of the first listed code only.
full_ids_mapping['parent ICD-10']=full_ids_mapping['ICD-10'].str.split('.',expand=True)[0]
full_ids_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,ncit,doid,meddra,medgen,ICD-10,parent ICD-10
0,0008118,164330,716180009,C1834013,2724,C537740,,,,,,
1,0010439,300829,,C1853577,,C543241,,,,,,
2,0008117,164310,763829004,C1834014,98897,C563508,,,,,G71.0,G71
3,0009448,242600,84121007,C0268654,42062,C536285,,,,,E72.0,E72
4,0008119,164400,715748006,C0752120,98755,,C129982,0050954,,,G11.8,G11
...,...,...,...,...,...,...,...,...,...,...,...,...
24115,0009507,245550,732961003,C1855551,1296,C538396,,,,,Q87.8,Q87
24116,0009508,245552,,C1855550,,C537549,,,,,,
24117,0009501,245340,766715000,C1855577,171690,C565449,,,,,G72.8,G72
24118,0009502,245348,,C1855565,79244,C565448,,,,,E74.4,E74


In [15]:
# threshold=np.inf turns off numpy's "..." summarisation, so every distinct ICD-10 value is printed.
with np.printoptions(threshold=np.inf):
    print(full_ids_mapping['ICD-10'].unique())

[nan 'G71.0' 'E72.0' 'G11.8' 'Q56.1' 'H90.3' 'G31.8' 'Q80.4' 'Q87.0'
 'Q80.2' 'Q87.8' 'F42' 'Q77.7' 'D64.4' 'E85.0' 'G12.2' 'E77.8' 'E03.1'
 'G30' 'H53.8' 'Q77.3' 'E83.3' 'Q83.3' 'G11.4' 'D81.8' 'Q04.3' 'H35.5'
 'C41.9' 'E70.3' 'D64.0' 'D18.0' 'Q82.8,Q43.8' 'H47.2' 'G93.2' 'H47.0'
 'K83.1' 'K59.8' 'N46' 'E80.0' 'D80.0' 'E27.4' 'Q99.8' 'E76.0' 'G60.8'
 'H18.5' 'D84.8' 'Q78.8' 'D81.4' 'C49.9' 'Q87.3' 'G24.5' 'Q67.4' 'G93.4'
 'E80.5' 'E72.5' 'E88.8' 'E74.3' 'M06.8' 'F72' 'H90.5' 'G11.0' 'D80.5'
 'D44.8' 'J84.81' 'G23.0' 'Q87.1' 'G51.4' 'I42.0' 'E28.3,E29.1' 'E26.8'
 'Q71.6' 'Q75.4' 'E29.1' 'E72.8' 'Q34.8' 'E27.1' 'C26.9' 'E74.0' 'E31.0'
 'E54' 'Q74.0' 'S00.T98' 'G71.1' 'E78.5' 'J39.2' 'Q45.1' 'Q10.3' 'K86.1'
 'E67.1' 'M60,G72.49,M60.9' 'G60.0' 'M61.5' 'Q78.2' 'Q82.8' 'Q11.2'
 'T78.3' 'E23.0' 'G20' 'E71.1,E71.110' 'Q41.9,Q41.8,Q41.2,Q41.1,Q41.0'
 'M93.2' 'Q78.4' 'Q78.0' 'N00.N99' 'D81.1' 'E22.0' 'Q77.8' 'C44.5' 'J96.0'
 'C56' 'Q02' 'N20,N20.2' 'G40.4' 'D58.1' 'D27' 'E75.2,E75.23' 'N97'
 'K

In [92]:
import re

# Compiled once instead of being re-parsed for every code in the loop.
_ICD10_DOTTED_PAIR = re.compile(r'[A-Z][0-9]{2}[.][A-Z][0-9]{2}')
_ICD10_CODE_PREFIX = re.compile(r"([A-Z][0-9]{1,})[.,-]?")


def _icd10_parts(cur_id):
    """
    Reduce one ICD-10 entry to the list of codes it stands for.

    * entries containing '-' (e.g. 'N70-N77') are split at the '-'
    * two codes joined by '.' (e.g. 'S00.T98') are split at the '.'
    * anything else is reduced to its letter+digits prefix (e.g. 'G71.0' -> ['G71'])
    """
    if "-" in cur_id:
        return cur_id.split("-")
    if _ICD10_DOTTED_PAIR.search(cur_id):
        return cur_id.split(".")
    return _ICD10_CODE_PREFIX.findall(cur_id)


changes = list()
# Iterate over the non-missing values directly instead of iterrows() plus a
# .loc lookup per access; rows without an ICD-10 entry are skipped as before.
for icd_value in full_ids_mapping['ICD-10'].dropna():
    # Codes derived for the current row; collected but not written back yet.
    new_ids = set()
    for cur_id in icd_value.split(","):
        if "-" in cur_id:
            # Show the range-style entries so they can be inspected by eye.
            print(cur_id)
        ids = _icd10_parts(cur_id)
        changes.append(cur_id+" ----> "+str(ids))
        new_ids.update(ids)

        


N70-N77
A00-A09
G20-G26
H01.021-H01.029
I60-I69
H60-H62
N60-N65
D80-D89
H25-H28
H80-H83
B50-B64
D55-D59
D00-D09
B65-B83
F99-F99
H02.121-129
B35-B49
E00-E07
C60-C63
I00-I02
N10-N16
M00-M02
M91-M94
F30-F39
M60-M63
K70-K77
B20-B20
J96-J99
G00-G99
H40-H42
I10-I15
E08-E13
I20-I25
D80-D89
C50-C50


In [86]:
changes

["G71.0 ----> ['G71']",
 "E72.0 ----> ['E72']",
 "G11.8 ----> ['G11']",
 "Q56.1 ----> ['Q56']",
 "H90.3 ----> ['H90']",
 "G31.8 ----> ['G31']",
 "Q80.4 ----> ['Q80']",
 "Q87.0 ----> ['Q87']",
 "Q80.2 ----> ['Q80']",
 "Q87.8 ----> ['Q87']",
 "F42 ----> ['F42']",
 "Q87.8 ----> ['Q87']",
 "G71.0 ----> ['G71']",
 "Q87.8 ----> ['Q87']",
 "Q87.8 ----> ['Q87']",
 "Q77.7 ----> ['Q77']",
 "H90.3 ----> ['H90']",
 "D64.4 ----> ['D64']",
 "E85.0 ----> ['E85']",
 "G12.2 ----> ['G12']",
 "E77.8 ----> ['E77']",
 "Q80.2 ----> ['Q80']",
 "E03.1 ----> ['E03']",
 "Q87.8 ----> ['Q87']",
 "G30 ----> ['G30']",
 "H53.8 ----> ['H53']",
 "Q77.3 ----> ['Q77']",
 "E83.3 ----> ['E83']",
 "Q83.3 ----> ['Q83']",
 "Q87.8 ----> ['Q87']",
 "G11.4 ----> ['G11']",
 "D81.8 ----> ['D81']",
 "G71.0 ----> ['G71']",
 "G11.8 ----> ['G11']",
 "E77.8 ----> ['E77']",
 "Q04.3 ----> ['Q04']",
 "H35.5 ----> ['H35']",
 "C41.9 ----> ['C41']",
 "G11.4 ----> ['G11']",
 "E70.3 ----> ['E70']",
 "D64.0 ----> ['D64']",
 "D18.0 ----> ['D18'

In [37]:
full_ids_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,ncit,doid,meddra,medgen,ICD-10,parent ICD-10
0,0008118,164330,716180009,C1834013,2724,C537740,,,,,,
1,0010439,300829,,C1853577,,C543241,,,,,,
2,0008117,164310,763829004,C1834014,98897,C563508,,,,,G71.0,G71
3,0009448,242600,84121007,C0268654,42062,C536285,,,,,E72.0,E72
4,0008119,164400,715748006,C0752120,98755,,C129982,0050954,,,G11.8,G11
...,...,...,...,...,...,...,...,...,...,...,...,...
24115,0009507,245550,732961003,C1855551,1296,C538396,,,,,Q87.8,Q87
24116,0009508,245552,,C1855550,,C537549,,,,,,
24117,0009501,245340,766715000,C1855577,171690,C565449,,,,,G72.8,G72
24118,0009502,245348,,C1855565,79244,C565448,,,,,E74.4,E74


In [16]:
full_ids_mapping.count()

mondo            24120
omim              8841
snomedct          8962
umls             16234
orpha             9363
mesh              8075
ncit              6953
doid              8944
meddra            1144
medgen               1
ICD-10            9561
parent ICD-10     9561
dtype: int64

In [17]:
def split_and_expand_column(data, split_string, column_name):
    """
    Turn a delimited string column into one row per item.

    Every row is repeated once per item of ``column_name`` (keeping its original index
    label and all other columns). Rows whose value is missing are dropped.

    :param data: Dataframe (not modified)
    :param split_string: Delimiter between the items of one cell
    :param column_name: Name of the string column to split
    :return: Dataframe with one row per item
    """
    pieces = data[column_name].str.split(split_string)
    # explode() repeats each row by position, so it also works when index labels are
    # not unique (a label-based .loc lookup would return too many rows in that case).
    expanded = data.assign(**{column_name: pieces}).explode(column_name)
    # Drop rows without a value explicitly rather than relying on stack() discarding
    # NaN, which depends on the stack implementation in use.
    return expanded[expanded[column_name].notna()]

def combine_rows(x):
    """Return the set of distinct, non-empty ';'-separated tokens across all strings in ``x``."""
    joined = ';'.join(x)
    return {token for token in joined.split(';') if token}

def get_disease_mapping(disease_set, id_type):
    """
    Collect related genes, variants and KEGG pathways for a set of disease ids.

    The disease ids are translated to MONDO ids through '../disorders.map'. Attributes
    are taken from the local file 'disease_disgenet_mapping.csv'; MONDO ids missing
    there are queried through the disease client and appended to that file.

    :param disease_set: Set of disease ids
    :param id_type: Id type of ``disease_set``; must be a column of '../disorders.map'
    :return: Dataframe with one row per input id and one set-valued column per attribute
    """
    # ==== Get Mondo IDs ====
    disease_id_set,_,_ = get_prev_mapping(in_set=disease_set, id_type=id_type, file="../disorders.map", sep="\t")
    # unique() keeps first-seen order, so the query list is reproducible between runs.
    mondo_set = ('MONDO:'+disease_id_set['mondo']).dropna().unique().tolist()
    # ===== Get mapping from previous mappings =====
    df, missing, prev_mapping = get_prev_mapping(in_set=mondo_set, id_type='mondo', file='disease_disgenet_mapping.csv', sep=",")
    # ==== Get disgenet values ====
    if len(missing) > 0:
        md = get_client("disease")
        mapping = md.getdiseases(missing,
                                 fields='disgenet.genes_related_to_disease.gene_id,disgenet.variants_related_to_disease.rsid,ctd.pathway_related_to_disease.kegg_pathway_id',
                                 species='human', returnall=False, as_dataframe=True, df_index=False)
        mapping = mapping.rename(columns={'query': 'mondo'})
        # transform dataframe to combine single and multiple results
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.genes_related_to_disease', 
                                     singlecol='disgenet.genes_related_to_disease.gene_id', key='gene_id')
        mapping = preprocess_results(mapping=mapping, multicol='disgenet.variants_related_to_disease', 
                                     singlecol='disgenet.variants_related_to_disease.rsid', key='rsid')
        mapping = preprocess_results(mapping=mapping, multicol='ctd.pathway_related_to_disease', 
                                     singlecol='ctd.pathway_related_to_disease.kegg_pathway_id', key='kegg_pathway_id')
        # These bookkeeping columns are not present in every response (e.g. when no
        # queried id has disgenet data), so a missing column is not an error.
        mapping = mapping.drop(columns=['_id','_version','disgenet._license'], errors='ignore')
        # ===== Add results from missing values =====
        pd.concat([prev_mapping,mapping]).to_csv('disease_disgenet_mapping.csv', index=False)
        df = pd.concat([df, mapping]).reset_index(drop=True)
    # ==== Map back to previous ids ====
    # Literal replacement; the prefix is not meant to be read as a regular expression.
    df["mondo"] = df["mondo"].str.replace("MONDO:", "", regex=False)
    # Several MONDO ids can belong to one input id: merge their attributes per input id.
    mapping_subset = disease_id_set[['mondo', id_type]].drop_duplicates()
    df = pd.merge(mapping_subset, df, on = ['mondo'], how = 'outer')
    df = df.drop(columns=['mondo'])
    df = df.fillna('').groupby(id_type, as_index = False).agg({'disgenet.genes_related_to_disease': combine_rows, 'disgenet.variants_related_to_disease': combine_rows, 'ctd.pathway_related_to_disease': combine_rows})
    return df

In [18]:
diseases[0]

0     E10
1     E11
2     E12
3     E13
4     E14
5     E66
6     F00
7     F01
8     F02
9     F03
10    G20
11    G30
12    G43
13    I10
14    I11
15    I12
16    I13
17    I15
18    I21
19    I22
20    I50
21    I63
22    I64
23    I70
24    J45
Name: 0, dtype: object

In [19]:
# Time the disease attribute lookup for the ICD-10 codes in the first column of `diseases`.
start = timeit.default_timer()
df = get_disease_mapping(disease_set=diseases[0], id_type='ICD-10')
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  0.12326342700180248


In [20]:
df

Unnamed: 0,ICD-10,disgenet.genes_related_to_disease,disgenet.variants_related_to_disease,ctd.pathway_related_to_disease
0,E10,"{486, 4133, 6198, 3645, 3606, 6929, 3557, 1036...","{rs9273643, rs10757274, rs12779790, rs11868035...","{hsa04658, hsa05132, hsa01521, hsa01522, hsa05..."
1,E11,"{486, 4133, 6198, 3645, 3606, 6929, 3557, 1036...","{rs9273643, rs10757274, rs12779790, rs11868035...","{hsa_M00035, hsa04658, hsa_M00118, hsa05132, h..."
2,E13,"{3952, 4567, 2121, 8894, 3342, 2081, 4512, 545...","{rs28937890, rs104893881, rs104893880, rs75246...","{hsa04151, hsa05160, hsa04960, hsa04072, hsa04..."
3,E66,"{831, 6198, 3645, 3606, 3557, 10365, 134391, 1...","{rs2796749, rs1800947, rs8192524, rs999943, rs...","{hsa04060, hsa04152, hsa04932, hsa04920, hsa04..."
4,F00,"{54487, 84867, 3606, 6571, 3645, 8406, 3557, 3...","{rs3771829, rs6627057, rs11789407, rs2279709, ...",{}
5,F01,"{7079, 1073, 2260, 1950, 3606, 101290498, 3552...","{rs113993968, rs864622782, rs113993971, rs1555...","{hsa04658, hsa01522, hsa_M00682, hsa04320, hsa..."
6,F02,{},{},{}
7,G20,"{6609, 7472, 2717, 6571, 5047, 8074, 23286, 54...","{rs33939927, rs34778348, rs63750756, rs2072374...","{hsa04658, hsa05132, hsa01521, hsa01522, hsa05..."
8,G30,"{23678, 8443, 831, 3645, 3606, 4133, 5923, 619...","{rs12978931, rs2298369, rs908832, rs2516049, r...","{hsa04658, hsa05132, hsa01521, hsa01522, hsa05..."
9,G43,"{192142, 3357, 3123, 7045, 183, 4209, 9351, 13...","{rs548294, rs104894561, rs11172113, rs2860174,...","{hsa01522, hsa04961, hsa05205, hsa05144, hsa04..."


In [21]:
# Nested dict keyed first by attribute column, then by ICD-10 code; the values are the id sets.
df2 = df.set_index('ICD-10').to_dict()
df2

{'disgenet.genes_related_to_disease': {'E10': {'7071',
   '84076',
   '3670',
   '56606',
   '8854',
   '3240',
   '2875',
   '11169',
   '2643',
   '25970',
   '407025',
   '25820',
   '9790',
   '5579',
   '51237',
   '3358',
   '590',
   '5269',
   '1968',
   '54414',
   '2354',
   '7291',
   '473',
   '23576',
   '1952',
   '10452',
   '374',
   '1048',
   '10950',
   '10087',
   '2903',
   '314',
   '4158',
   '19',
   '109',
   '5313',
   '54209',
   '10864',
   '3402',
   '6194',
   '343045',
   '407050',
   '5110',
   '4286',
   '9770',
   '54832',
   '23219',
   '717',
   '2524',
   '2312',
   '6812',
   '7128',
   '133522',
   '7750',
   '4084',
   '4049',
   '3454',
   '9370',
   '9398',
   '4513',
   '29933',
   '5799',
   '116844',
   '79447',
   '250',
   '114086',
   '1435',
   '11266',
   '9394',
   '3037',
   '3410',
   '22808',
   '6446',
   '5105',
   '23387',
   '4922',
   '84833',
   '653108',
   '3727',
   '3190',
   '1020',
   '257019',
   '11122',
   '27190',
  

In [22]:
df['disgenet.genes_related_to_disease']

0     {7071, 84076, 3670, 56606, 8854, 3240, 2875, 1...
1     {7071, 84076, 3670, 56606, 8854, 3240, 2875, 1...
2     {5459, 9451, 7054, 4477, 4512, 340061, 3172, 4...
3     {339, 3670, 56606, 3240, 2875, 375612, 10888, ...
4     {8379, 27185, 3060, 338340, 64478, 3587, 6387,...
5     {5476, 4682, 351, 2875, 2099, 12, 2534, 2308, ...
6                                                    {}
7     {2146, 254428, 23435, 126, 3851, 1030, 8398, 5...
8     {79731, 389203, 27185, 64231, 8854, 3240, 2875...
9     {7442, 1909, 3120, 3060, 2099, 11169, 7306, 79...
10    {1187, 56606, 3240, 2875, 11169, 26119, 9319, ...
11    {4306, 51573, 10753, 4879, 183, 406980, 400550...
12    {3605, 23417, 59, 2261, 6387, 183, 1277, 1579,...
13    {1187, 56606, 3240, 2875, 11169, 26119, 9319, ...
14    {339, 27185, 3670, 56606, 3240, 100126336, 287...
15    {27185, 3670, 56606, 3240, 100126336, 2875, 25...
16    {1187, 64231, 3240, 100126336, 2875, 11169, 26...
17    {1909, 7057, 4016, 241, 255738, 23435, 324

# Do the comparison now

In [33]:
 # NOTE(review): `lst` is not defined in the visible part of the notebook, and the
 # filter object created here is neither stored nor consumed — confirm whether this
 # scratch cell is still needed.
 filter(None, lst)