# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
disease_id = "0007079"
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  4.699100099969655e-05


## Network from NeDReX

In [8]:
import os
import pandas as pd

In [9]:
iid_files = {"rat": "rat_annotated_PPIs",
            "human": "human_annotated_PPIs",
            "mouse": "mouse_annotated_PPIs",
            "yeast": "yeast_annotated_PPIs",
            "fly": "fly_annotated_PPIs",
            "worm": "worm_annotated_PPIs",
            "chicken": "chicken_annotated_PPIs"}

In [10]:
os.system("mkdir tmp")
os.system("wget 'http://iid.ophid.utoronto.ca/static/download/"+iid_files['rat']+".txt.gz' -P tmp/")
os.system("gzip -d tmp/"+iid_files['rat']+".txt.gz")

0

In [11]:
with open('tmp/'+iid_files['rat']+"_uniprot.sif", 'a') as sif_file1:
    with open('tmp/'+iid_files['rat']+"_symbols.sif", 'a') as sif_file2:
        with open("tmp/"+iid_files['rat']+".txt") as fp:
            line_split = fp.readline().split("\t")
            sif_file1.write(line_split[0]+"\t"+line_split[7]+"\t"+line_split[1]+"\n")
            sif_file2.write(line_split[2]+"\t"+line_split[7]+"\t"+line_split[3]+"\n")
            for line in fp:
                line_split = line.split("\t")
                sif_file1.write(line_split[0]+"\t"+line_split[7]+"\t"+line_split[1]+"\n")
                sif_file2.write(line_split[2]+"\t"+line_split[7]+"\t"+line_split[3]+"\n")

In [12]:
os.system("mv tmp/*.sif ./")
os.system("rm -R tmp")

0

In [4]:
import numpy as np
from scipy.sparse import csr_matrix

c = csr_matrix(np.array([[1,2,0],
                         [4,5,6],
                         [7,8,9]])) # construct an example matrix
d = dict(c.todok().items()) # convert to dictionary of keys format
print(d)

{(0, 0): 1, (1, 0): 4, (2, 0): 7, (0, 1): 2, (1, 1): 5, (2, 1): 8, (1, 2): 6, (2, 2): 9}


In [15]:
id1, id2 = 0, 2
#if (id1, id2) in d:
print(d[(id1,id2)])

KeyError: (0, 2)

In [13]:
for m, n in c.todok().items():
    s, t = m
    print(s,t , n)

0 0 1
1 0 4
2 0 7
0 1 2
1 1 5
2 1 8
1 2 6
2 2 9


In [24]:
df = pd.read_csv("../test.csv", dtype=str)
df

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,doid,ICD-10
0,0012344,609790,,C1853360,,C565228,0110044,G30
1,0013673,614296,734022008,C4518338,411590,C565631,,E13
2,0011016,601208,,C1832605,,C563371,0110750,E10
3,0012335,609734,702949005,C1857854,71526,C565726,,E66
4,0012321,609636,,C1864828,,C566465,0110043,G30
...,...,...,...,...,...,...,...,...
182,0004784,,389145006,C0155877,,,9415,J45
183,0016464,,,C0342337,2298,,,E13
184,0017636,,,CN203531,306669,,,G20
185,0017279,,715345007,,2828,,0060894,G20


In [23]:
def split_and_expand_column(data, split_string, column_name):
    """
    Split column value in data by split_string and expand the dataframe
    to have a separate row for each value in split set.

    :param data: dataframe with data
    :param split_string: separator of values in cell
    :param column_name: column to split each cell of
    :return: expanded data dataframe
    """
    s = data[column_name].str.split(split_string, expand=True).stack()
    i = s.index.get_level_values(0)
    df2 = data.loc[i].copy()
    df2[column_name] = s.values
    return df2

icd_unstack = split_and_expand_column(data=df, split_string=",", column_name="ICD-10")
mapping = pd.concat([icd_unstack, df[df['ICD-10'] != '']])
mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,doid,ICD-10
0,0012344,609790,,C1853360,,C565228,0110044,G30
1,0013673,614296,734022008,C4518338,411590,C565631,,E13
2,0011016,601208,,C1832605,,C563371,0110750,E10
3,0012335,609734,702949005,C1857854,71526,C565726,,E66
4,0012321,609636,,C1864828,,C566465,0110043,G30
...,...,...,...,...,...,...,...,...
182,0004784,,389145006,C0155877,,,9415,J45
183,0016464,,,C0342337,2298,,,E13
184,0017636,,,CN203531,306669,,,G20
185,0017279,,715345007,,2828,,0060894,G20


In [25]:
mapping = df.copy()

In [83]:
def combine_rowsets(x):
    #print(x)
    return set().union(x)

In [31]:
import timeit
start = timeit.default_timer()
dichti = mapping.set_index('mondo')[['ICD-10']].T.to_dict('list')
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  0.006711977000122715


  dichti = mapping.set_index('mondo')[['ICD-10']].T.to_dict('list')


In [91]:
import timeit
start = timeit.default_timer()
dicht = df.groupby('mondo')[['ICD-10']].agg(lambda g: combine_rowsets(set(g.values))).to_dict()['ICD-10']
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  0.0063025869994817185


In [51]:
df.groupby('mondo')[['ICD-10']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fee41850790>

In [27]:
import timeit
start = timeit.default_timer()
att_id_to_id = dict()
for att_id in mapping['mondo'].unique():
    att_id_to_id[att_id] = list(mapping[mapping['mondo'] == att_id]['ICD-10'])
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  0.02314313700026105


In [79]:
dichti

{'0012344': ['G30'],
 '0013673': ['E13'],
 '0011016': ['E10'],
 '0012335': ['E66'],
 '0012321': ['G30'],
 '0011057': ['I64'],
 '0011033': ['E10'],
 '0011068': ['E10'],
 '0014768': ['F01'],
 '0001491': ['J45'],
 '0014796': ['G20'],
 '0001302': ['I11'],
 '0002679': ['I63'],
 '0001200': ['I15'],
 '0015825': ['E66'],
 '0001134': ['I10'],
 '0013078': ['E10'],
 '0014309': ['E66'],
 '0004979': ['J45'],
 '0004975': ['G30'],
 '0002277': ['I70'],
 '0002275': ['I70'],
 '0014231': ['G20'],
 '0011847': ['G43'],
 '0010482': ['G20'],
 '0011777': ['G30'],
 '0008071': ['I15'],
 '0012919': ['E10'],
 '0012920': ['E10'],
 '0012921': ['E10'],
 '0011647': ['G30'],
 '0012961': ['E10'],
 '0000980': ['I70'],
 '0011572': ['E10'],
 '0011562': ['G20'],
 '0011561': ['G30'],
 '0011517': ['I15'],
 '0011502': ['E13'],
 '0000914': ['F01'],
 '0009101': ['E13'],
 '0000700': ['G43'],
 '0009192': ['E13'],
 '0013992': ['E66'],
 '0013991': ['E66'],
 '0012630': ['G30'],
 '0012631': ['G30'],
 '0011302': ['E10'],
 '0012632': [

In [88]:
dicht

{'0000700': {'G43'},
 '0000914': {'F01'},
 '0000980': {'I70'},
 '0001134': {'I10'},
 '0001200': {'I15'},
 '0001302': {'I11'},
 '0001491': {'J45'},
 '0001815': {'G20'},
 '0001876': {'I70'},
 '0002275': {'I70'},
 '0002277': {'I70'},
 '0002679': {'I63'},
 '0004648': {'F01'},
 '0004765': {'J45'},
 '0004781': {'I21'},
 '0004784': {'J45'},
 '0004975': {'G30'},
 '0004979': {'J45'},
 '0005009': {'I50'},
 '0005010': {'I21', 'I22'},
 '0005015': {'E10', 'E11', 'E12', 'E13', 'E14'},
 '0005044': {'I10', 'I11', 'I12', 'I13', 'I15'},
 '0005068': {'I21', 'I22'},
 '0005071': {'G20', 'G30', 'G43'},
 '0005084': {'F00'},
 '0005147': {'E10'},
 '0005148': {'E11'},
 '0005180': {'G20'},
 '0005277': {'G43'},
 '0005311': {'I70'},
 '0005475': {'G43'},
 '0006044': {'I12'},
 '0006658': {'I70'},
 '0006716': {'I21', 'I22'},
 '0006727': {'I50'},
 '0006947': {'I15'},
 '0006993': {'I50'},
 '0007089': {'G30'},
 '0007432': {'F01'},
 '0007454': {'E10'},
 '0007669': {'E11'},
 '0007772': {'I15'},
 '0008071': {'I15'},
 '0008

In [92]:
print(dichti['0011057'])
print(dicht['0011057'])
print(att_id_to_id['0011057'])

['I64']
{'I63', 'I64'}
['I63', 'I64']
