# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
disease_id = "0007079"
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  4.699100099969655e-05


## Testing

In [65]:
import json
import requests

In [69]:
r = requests.get("http://82.148.225.92:8022/protein_interacts_with_protein/details")
max_num = r.json()['count']
max_num

2634845

In [99]:
import timeit
start = timeit.default_timer()
network = list()
for i in range(0, max_num, 1000):
    r = requests.get("http://82.148.225.92:8022/ppi_paginated?skip="+str(i)+"&limit=1000")
    for line in r.json(): 
        network.append([line['memberOne'][8:], line['memberOne'][8:], ";".join(line['assertedBy']), ";".join(line['evidenceTypes'])])
df = pd.DataFrame(network, columns = ['ID1', 'ID2', 'Sources', 'Evidence'])
df
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  5093.539029483996


In [101]:
df.to_csv("network.csv", index=False)

10000 = 00:00:14; 
100000 = 00:02:22

In [98]:
r = requests.get("http://82.148.225.92:8022/ppi_paginated?skip=0&limit=1000000")

KeyboardInterrupt: 

In [97]:
r.text

'[{"memberOne":"uniprot.P63092","memberTwo":"uniprot.Q99614","assertedBy":["iid","biogrid"],"brainTissues":[],"created":"2022-01-12T16:09:31.940000","developmentStages":[],"evidenceTypes":["exp"],"jointTissues":[],"methods":["affinity chromatography technology","anti tag coimmunoprecipitation","Affinity Capture-Western"],"subcellularLocations":["Cytoplasm"],"tissues":[],"type":"ProteinInteractsWithProtein","updated":"2022-01-12T16:30:54.080000"},{"memberOne":"uniprot.Q5JWF2","memberTwo":"uniprot.Q99614","assertedBy":["iid","biogrid"],"brainTissues":["Spinal trigeminal nucleus, left","Spinal trigeminal nucleus","Myelencephalon"],"created":"2022-01-12T16:09:31.940000","developmentStages":[],"evidenceTypes":["exp","pred"],"jointTissues":["Synovial macrophages","Chondrocytes","Growth plate cartilage","Articular cartilage"],"methods":["affinity chromatography technology","anti tag coimmunoprecipitation","Affinity Capture-Western"],"subcellularLocations":["Cytoplasm"],"tissues":["Heart","Dor

In [28]:
dis_id_map = pd.read_csv("../mapping_files/disease_id_mapping.csv", dtype="str").fillna("")
dis_id_map

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,doid,ICD-10
0,0002974,603956,363354003,,,,4362,C53
1,0000311,,,,,,,
2,0001642,,1489008,C0019919,,,13134,H00
3,0000310,,,,,,0050308,
4,0001641,,,,,,13129,
...,...,...,...,...,...,...,...,...
24115,0019900,,,,96160,,,Q93
24116,0019902,,766716004,,96168,,,Q93
24117,0019901,,,,96164,,,Q93
24118,0007928,,,,,,,


In [29]:
def cut_to_parent(x:str):
    return ",".join(set([entry.split(".")[0] for entry in x.split(",")]))

In [30]:
dis_id_map["ICD-10"] = dis_id_map["ICD-10"].apply(cut_to_parent)
dis_id_map

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,doid,ICD-10
0,0002974,603956,363354003,,,,4362,C53
1,0000311,,,,,,,
2,0001642,,1489008,C0019919,,,13134,H00
3,0000310,,,,,,0050308,
4,0001641,,,,,,13129,
...,...,...,...,...,...,...,...,...
24115,0019900,,,,96160,,,Q93
24116,0019902,,766716004,,96168,,,Q93
24117,0019901,,,,96164,,,Q93
24118,0007928,,,,,,,


In [31]:
def set_to_len(x: set):
    return len(x)
def atts_to_size(pd_map: pd.DataFrame) -> pd.DataFrame:
    att_len = pd_map.copy()
    att_len[att_len.columns[1:]] = att_len[att_len.columns[1:]].applymap(set_to_len)
    att_len['sum'] = att_len[att_len.columns[1:]].sum(axis=1)
    return att_len

In [32]:
dis_att_map = pd.read_csv("../mapping_files/disease_att_mapping.csv", dtype="str").fillna("")
dis_att_map

Unnamed: 0,mondo,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease,ctd.pathway_related_to_disease
0,0000001,rs5443;rs11320420;rs138873021;rs2014355;rs1010...,387882;51024;7903;7157;11093;10153;55291;10636...,
1,0000004,rs6161;rs121918654;rs1284060395;rs104894897;rs...,3576;4214;55131;5576;791114;7681;2516;7465;513...,hsa01100;hsa04913;hsa04925;hsa00140;hsa_M00107
2,0000005,rs773764015;rs7014851;rs121434448;rs121434451,55806,
3,0000009,rs1012488531;rs1555549041;rs759081917;rs781541...,2335;834;23218;59352;25897;7066;147495;442206;...,
4,0000022,rs6313,1326;627;276;51540;6783;54796;6886;83881;79152...,
...,...,...,...,...
24115,0400000,,,
24116,0400002,,,
24117,0400003,,,
24118,0400004,,,


In [33]:
dis_att_size = atts_to_size(dis_att_map)
dis_att_size

Unnamed: 0,mondo,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease,ctd.pathway_related_to_disease,sum
0,0000001,4389,4444,0,8833
1,0000004,117,511,46,674
2,0000005,45,5,0,50
3,0000009,358,537,0,895
4,0000022,6,113,0,119
...,...,...,...,...,...
24115,0400000,0,0,0,0
24116,0400002,0,0,0,0
24117,0400003,0,0,0,0
24118,0400004,0,0,0,0


In [34]:
def size_mapping_to_dict(pd_size_map: pd.DataFrame, id_col: str, term_col: str, threshold: int = 100):
    size_to_occ = pd.DataFrame(pd_size_map[term_col].value_counts()).sort_index().to_dict()[term_col]
    pd_size_map = pd_size_map.sort_values(by=[term_col]).reset_index(drop=True)
    new_dict = dict()
    term_sizes = pd_size_map[term_col].unique().tolist()
    for index, key in enumerate(term_sizes):
        curr_keys = [key]
        if size_to_occ[key] < threshold:
            sum_tmp, add_top, add_bottom = size_to_occ[key], index, index
            while sum_tmp < threshold:
                if add_top - 1 >= 0:
                    add_top = add_top - 1
                    sum_tmp = sum_tmp + size_to_occ[term_sizes[add_top]]
                    curr_keys.append(term_sizes[add_top])
                if add_bottom + 1 < len(term_sizes):
                    add_bottom = add_bottom + 1
                    sum_tmp = sum_tmp + size_to_occ[term_sizes[add_bottom]]
                    curr_keys.append(term_sizes[add_bottom])
        for cur_id in pd_size_map[pd_size_map[term_col] == key][id_col]:
            new_dict[cur_id] = curr_keys
    return new_dict

In [37]:
att_dict = size_mapping_to_dict(pd_size_map=dis_att_size, id_col="mondo", term_col="sum",
                                threshold=100)
att_dict

{'0400005': [0],
 '0008649': [0],
 '0008645': [0],
 '0008643': [0],
 '0008640': [0],
 '0008639': [0],
 '0008636': [0],
 '0008634': [0],
 '0008632': [0],
 '0008631': [0],
 '0008630': [0],
 '0008629': [0],
 '0008626': [0],
 '0008625': [0],
 '0008624': [0],
 '0008623': [0],
 '0008622': [0],
 '0008619': [0],
 '0008618': [0],
 '0008617': [0],
 '0008616': [0],
 '0008615': [0],
 '0008650': [0],
 '0008651': [0],
 '0008654': [0],
 '0008655': [0],
 '0008743': [0],
 '0008741': [0],
 '0008739': [0],
 '0008735': [0],
 '0008731': [0],
 '0008719': [0],
 '0008712': [0],
 '0008711': [0],
 '0008707': [0],
 '0008706': [0],
 '0008614': [0],
 '0008704': [0],
 '0008697': [0],
 '0008696': [0],
 '0008694': [0],
 '0008690': [0],
 '0008688': [0],
 '0008687': [0],
 '0008683': [0],
 '0008669': [0],
 '0008665': [0],
 '0008658': [0],
 '0008699': [0],
 '0008613': [0],
 '0008611': [0],
 '0008609': [0],
 '0008549': [0],
 '0008548': [0],
 '0008545': [0],
 '0008544': [0],
 '0008543': [0],
 '0008540': [0],
 '0008539': [0

In [38]:
def map_to_prev_id(main_id_type: str, id_type: str, id_mapping: pd.DataFrame, att_mapping: pd.DataFrame):
    """
    Map attribute mapping back to original id.

    :param main_id_type:
    :param id_type:
    :param id_mapping:
    :param att_mapping:
    :return:
    """
    columns = [main_id_type, id_type] if id_type != main_id_type else [main_id_type]
    mapping_subset = id_mapping[columns].drop_duplicates()
    hit_mapping = pd.merge(mapping_subset, att_mapping, on=[main_id_type], how='outer')
    hit_mapping = hit_mapping.drop(columns=[main_id_type]) if id_type != main_id_type else hit_mapping
    return hit_mapping

In [43]:
new_mapping = map_to_prev_id(main_id_type="mondo", id_type="ICD-10", id_mapping=dis_id_map, att_mapping=dis_att_map).explode("ICD-10")
new_mapping = new_mapping

Unnamed: 0,ICD-10,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease,ctd.pathway_related_to_disease
0,C53,rs1800795;rs772110575;rs4579555;rs1899663;rs10...,7637;3732;619553;990;6367;6839;57212;5579;5947...,hsa04020;hsa05206;hsa04015;hsa04151;hsa04550;h...
1,,,,
2,H00,,,
3,,,,
4,,,,
...,...,...,...,...
24115,Q93,,,
24116,Q93,,,
24117,Q93,,,
24118,,rs80338831;rs80338826,4627,


In [44]:
dis_att_size = atts_to_size(new_mapping)
dis_att_size

Unnamed: 0,ICD-10,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease,ctd.pathway_related_to_disease,sum
0,C53,3514,10559,116,14189
1,,0,0,0,0
2,H00,0,0,0,0
3,,0,0,0,0
4,,0,0,0,0
...,...,...,...,...,...
24115,Q93,0,0,0,0
24116,Q93,0,0,0,0
24117,Q93,0,0,0,0
24118,,21,4,0,25
