# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
# Tab-separated input files, given relative to the notebook's working
# directory. They are read by the "Load data" cell.
seeds_file = 'Input/0007079.txt'
betweenness_file = 'Input/0007079_added_200_dmd_betweenness_hub_0.01.txt'
significance_file = 'Input/0007079_added_200_dmd_significance_hub_1.txt'
diseases_file = 'Input/ICD10_commROCG_raw.txt'
disease_clusters_file = 'Input/ICD10_commROCG_cluster.txt'

### Load data

In [3]:
disease_id = "0007079"

# The seed file has no header row; only its first column (label 0) is kept.
seeds = pd.read_csv(seeds_file, delimiter="\t", header=None).loc[:, 0]

# These two files have a header row; only their 'node' column is kept.
betweenness = pd.read_csv(betweenness_file, delimiter="\t").loc[:, 'node']
significance = pd.read_csv(significance_file, delimiter="\t").loc[:, 'node']

# Header-less tables, loaded in full.
diseases = pd.read_csv(diseases_file, delimiter="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, delimiter="\t", header=None)

In [4]:
import timeit

# Two consecutive readings of the default timer; nothing runs in between,
# so the printed value is just the overhead of taking the readings.
start = timeit.default_timer()

stop = timeit.default_timer()
print(f'Time:  {stop - start}')

Time:  4.699100099969655e-05


## Network from NeDReX

In [44]:
from urllib import request, parse
import json

In [45]:
# Graph-builder endpoint of the NeDReX API; the request cell below sends the
# options to it.
url = "https://api.nedrex.net/graph_builder"

In [46]:
# Options for the graph builder; serialised to JSON and sent as the request
# body in the next cell. Keyword order equals the resulting key order.
myobj = dict(
    nodes=["protein"],
    edges=["protein_interacts_with_protein"],
    iid_evidence=["exp"],
    ppi_self_loops=True,
    taxid=[9606],
    concise=True,
    include_omim=True,
    disgenet_threshold=0,
    use_omim_ids=False,
)

In [47]:
# Serialise the graph-builder options as a UTF-8 encoded JSON body.
data = json.dumps(myobj).encode('utf8')
# Supplying `data` makes urllib issue a POST. The Content-Type is set
# explicitly because urllib otherwise labels the body as
# application/x-www-form-urlencoded, although it is JSON.
req = request.Request(url, data=data, headers={'Content-Type': 'application/json'})
# `resp` stays open on purpose: the following cell reads the body from it.
resp = request.urlopen(req)

In [48]:
# Read the body exactly once, keep the decoded text for later use and release
# the connection even if reading fails.
# The recorded output is a quoted UUID, i.e. a JSON-encoded string
# (presumably the id of the requested graph build — TODO confirm).
try:
    resp_text = resp.read().decode('utf8')
finally:
    resp.close()
print(resp_text)

"e1e15418-2c09-4c9e-8ad5-ed02aed4d5a8"


## Test: mapping files, graph-tool and sparse matrices

In [3]:
# Directory prefix for the mapping CSVs. The trailing slash is required
# because file names are appended by plain string concatenation.
FILES_DIR = '../mapping_files/'

In [4]:
# Logical name -> path of the corresponding CSV; every path is FILES_DIR
# followed by the bare file name.
file_names = {
    key: FILES_DIR + csv_name
    for key, csv_name in (
        ('gene_ids', 'gene_id_mapping.csv'),
        ('gene_atts', 'gene_att_mapping.csv'),
        ('disorder_ids', 'disease_id_mapping.csv'),
        ('disorder_atts', 'disease_att_mapping.csv'),
        ('go_BP', 'gene_dist_go_BP.csv'),
        ('go_CC', 'gene_dist_go_CC.csv'),
        ('go_MF', 'gene_dist_go_MF.csv'),
        ('pathway_kegg', 'gene_dist_pathway_kegg.csv'),
        ('related_genes', 'disease_dist_rel_genes.csv'),
        ('related_variants', 'disease_dist_rel_variants.csv'),
        ('related_pathways', 'disease_dist_rel_pathways.csv'),
    )
}

In [None]:
def _load_file_distances(self, file_readers: list, sep: str, mapping_name: str):
    """Unfinished draft of a loader for the distance files.

    The draft stopped at a dangling ``header=`` assignment, so no loading
    logic exists yet. The signature is kept as drafted.

    Args:
        self: the draft was written as a method; unused in this stub.
        file_readers: presumably open file objects to read from — TODO confirm.
        sep: presumably the column separator of those files — TODO confirm.
        mapping_name: meaning not visible in the draft — TODO confirm.

    Raises:
        NotImplementedError: always, until the body is written.
    """
    # TODO: implement; the first intended step was to determine `header`.
    raise NotImplementedError("_load_file_distances is an unfinished draft")


# Draft of the call site. There is no `self` at notebook level, so the
# notebook-level `file_names` dict is used instead of `self.file_names`.
with open(file_names['go_BP']) as f1, open(file_names['go_CC']) as f2, \
        open(file_names['go_MF']) as f3, open(file_names['pathway_kegg']) as f4:
    pass  # TODO: hand f1..f4 to _load_file_distances once it is implemented
    

In [5]:
# Load the gene id mapping with every column read as text (dtype=str).
gene_ids = pd.read_csv(file_names['gene_ids'], dtype=str)
# Bare expression so that the notebook displays the frame.
gene_ids

Unnamed: 0,entrezgene,symbol,uniprot.Swiss-Prot,ensembl.gene
0,1,A1BG,P04217,ENSG00000121410
1,2,A2M,P01023,ENSG00000175899
2,9,NAT1,P18440,ENSG00000171428
3,10,NAT2,P11245,ENSG00000156006
4,12,SERPINA3,P01011,ENSG00000196136
...,...,...,...,...
16013,54331,GNG2,P59768,ENSG00000172020
16014,3215,HOXB5,P09067,ENSG00000087460
16015,375519,GJB7,Q6PEY0,ENSG00000173020
16016,22979,EFR3B,Q9Y2G0,ENSG00000186469


In [7]:
import graph_tool as gt

In [9]:
# Start from a new graph and add as many vertices as the 'entrezgene' column
# has entries. The value of the last line is what the cell displays.
g = gt.Graph()
g.add_vertex(len(gene_ids['entrezgene']))

<generator object Graph.add_vertex.<locals>.<genexpr> at 0x7fd6dcb25a50>

In [12]:
g.get_vertices()

array([    0,     1,     2, ..., 16015, 16016, 16017])

In [25]:
# Create the string-valued vertex property map and register it under the name
# "ids" before filling it, so that looking it up by name works on a fresh
# kernel (an unregistered name raises KeyError).
vertex_ids = g.new_vertex_property("string")
g.vertex_properties["ids"] = vertex_ids
for v in g.iter_vertices():
    # The vertex index doubles as the row label into gene_ids.
    # NOTE(review): assumes gene_ids has one row per vertex and a default
    # 0..n-1 index — confirm.
    vertex_ids[v] = gene_ids.entrezgene[v]

In [29]:
# NOTE(review): this prints None (see the recorded output). The "ids" map is
# a string map; presumably graph-tool exposes the `.a` array view only for
# numeric value types — confirm in the PropertyMap documentation.
print(g.vertex_properties["ids"].a)

None


In [27]:
g.list_properties()

ids            (vertex)  (type: string)


In [34]:
import os
import time
import psutil
start_time = time.time()
go_BP = pd.read_csv(file_names['go_BP'])
# rss of this process divided by 1024**2, formatted with two decimals.
memory_usage = format(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2, '.2f')
# Seconds elapsed since start_time, rendered as HH:MM:SS.
time_usage = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
print(f'[{time_usage}|{memory_usage}MB] ')

[00:00:57|3602.38MB] 


In [50]:
# Use the 'entrezgene' column as row labels. The guard makes the cell safe to
# re-run: once the column has become the index it is no longer in `columns`,
# and an unguarded second set_index would raise KeyError.
if 'entrezgene' in go_BP.columns:
    go_BP = go_BP.set_index('entrezgene')

In [32]:
import os
import time
import psutil
start_time = time.time()
# NOTE(review): this rebinds go_BP. From here on the name refers to the object
# returned by gt.load_graph, not to the table read with pd.read_csv earlier.
go_BP = gt.load_graph('../entrez-go_BP-network.graphml')
# rss of this process divided by 1024**2, formatted with two decimals.
memory_usage = format(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2, '.2f')
# Seconds elapsed since start_time, rendered as HH:MM:SS.
time_usage = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
print(f'[{time_usage}|{memory_usage}MB] ')

[00:00:14|527.83MB] 


In [57]:
# Drop the reference held by go_BP.
# NOTE(review): in top-to-bottom order this runs before the cell that reads
# `go_BP.values`, which then fails on None — confirm the intended cell order.
go_BP = None

In [55]:
# Import the submodule explicitly: this still binds the name `scipy`, and it
# guarantees that `scipy.sparse` is loaded (a bare `import scipy` does not on
# older SciPy releases).
import scipy.sparse
import os
import time
import psutil

start_time = time.time()
# Compressed Sparse Row copy of the values held in go_BP.
# NOTE(review): requires go_BP to be the table with 'entrezgene' already moved
# to the index. Other cells rebind go_BP (graph, None), so run order matters.
sparse = scipy.sparse.csr_matrix(go_BP.values)
# rss of this process divided by 1024**2, formatted with two decimals.
memory_usage = '{0:.2f}'.format(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)
# Seconds elapsed since start_time, rendered as HH:MM:SS.
time_usage = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
print('[{}|{}MB] '.format(time_usage, memory_usage))

[00:00:01|4828.75MB] 


In [52]:
sparse

<14430x14430 sparse matrix of type '<class 'numpy.float64'>'
	with 10367466 stored elements in Compressed Sparse Row format>

In [49]:
go_BP.columns

Index(['entrezgene', '1', '10', '100', '1000', '10000', '10001', '10003',
       '1001', '10010',
       ...
       '9987', '9988', '9989', '999', '9990', '9991', '9992', '9993', '9994',
       '9997'],
      dtype='object', length=14431)

In [54]:
print(sparse[3, :].toarray())

[[0.         0.         0.01428571 ... 0.02941176 0.         0.        ]]


In [56]:
sparse[3,4]

0.0212765957446808

In [59]:
# NOTE(review): fails with the TypeError recorded below, because go_BP was set
# to None in an earlier cell. Kept only as a record of that experiment.
go_BP[3,4]

TypeError: 'NoneType' object is not subscriptable

In [60]:
scipy.sparse.save_npz("yourmatrix.npz", sparse)

In [62]:
scipy.sparse.load_npz("yourmatrix.npz")

<14430x14430 sparse matrix of type '<class 'numpy.float64'>'
	with 10367466 stored elements in Compressed Sparse Row format>

In [4]:
import scipy.sparse as sp
# Probe: build a COO matrix from None as the only argument; the following cell
# inspects the result's `nnz`.
test = sp.coo_matrix(None)

In [8]:
test.nnz

0