In [3]:
import sys
sys.path.append('..')

# std
import numpy as np
import random as rn
import json
import time
from collections import defaultdict
from itertools import combinations

# datasets
import STRING
import MINT
import bioGRID
import HuRI
import synthetic_PPI
import HI_II_14_src
import IM24272_src
import Lit_BM_13_src
import Lit_NB_13_src

# my lib
import PPILinkPred as pred
import helper as hr
import genData_helper as helper
import traversalHelper as tr

In [4]:
class ns:
    """Namespace of short aliases for selected ``tr.Helper`` routines.

    Nothing is wrapped here: each attribute is simply whatever the named
    ``tr.Helper`` attribute evaluates to, so argument and return conventions
    are those of ``traversalHelper``.
    """

    # conversions to and from path strings (going by the helper names only)
    br_str = tr.Helper.br_to_pathStr
    pStr_arr = tr.Helper.pathStrs_to_list
    arr_pStr = tr.Helper.list_to_pathStrs

    # conversions starting from a binary relation, i.e. a list of node pairs
    # NOTE(review): BRToNode is combined with `|` and passed to len() later in
    # this notebook, so it presumably returns a set of nodes; confirm in tr.Helper.
    BRToNode = tr.Helper.binary_relation_to_node
    toDualBR = tr.Helper.to_dual_binary_relation
    BRToRelat = tr.Helper.binary_to_relation

In [7]:
# synthetic PPI network (this cell loads only the synthetic dataset)
# NOTE: despite its name, import_funcs holds the parsers' results, not callables.
import_funcs = [synthetic_PPI.parse_synthetic(root="../")]
dss = ['synthetic']

# Per dataset, print: node count, PPI count, the mean candidate-pair count over
# the stored samples, and half the PPI count (labelled "sampled ppi").
for i, ds in enumerate(dss):
    ppi_df = import_funcs[i]
    ppi = [list(arr) for arr in np.asarray(ppi_df[['nodeA', 'nodeB']])]
    ppiNodes = ns.BRToNode(ppi)
    unionNodes = ppiNodes  # no GGI data in this cell, so the union is just the PPI nodes

    # sampled PPI lists previously stored as JSON for this dataset
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds), "r") as f:
        samplePPIs = json.load(f)

    candPPINums = []
    for samplePPI in samplePPIs:
        sampleNodes = list(ns.BRToNode(samplePPI))
        nodeNum = len(sampleNodes)
        # all unordered node pairs minus the pairs already in the sample
        # (assumes the sample has no duplicate or self pairs; TODO confirm)
        candPPINums.append(nodeNum*(nodeNum-1)/2 - len(samplePPI))
    sampledPPINum = int(len(ppi)*0.5)

    print("ds {}, num of nodes: {}, ppi: {}, mean cand ppi: {}, sampled ppi: {}".format(
            ds, len(unionNodes), len(ppi), np.mean(candPPINums), sampledPPINum
    ))

ds synthetic, num of nodes: 8272, ppi: 52922, mean cand ppi: 29816060.1, sampled ppi: 26461


In [9]:
# HI_14 / IM24272 / Lit_BM / Lit_NB source datasets (PPI only, no GGI)
# NOTE(review): this cell was labelled "yeast", apparently copied from another
# cell; nothing here establishes the organism, so confirm against the *_src parsers.
# NOTE: despite its name, import_funcs holds the parsers' results, not callables.
import_funcs = [
    HI_II_14_src.parse_HI_src(root="../"),
    IM24272_src.parse_IM24272_src(root="../"),
    Lit_BM_13_src.parse_Lit_BM_src(root="../"),
    Lit_NB_13_src.parse_Lit_NB_src(root="../"),
]
dss = ['HI_14_src', "IM24272_src", "Lit_BM_src", "Lit_NB_src"]

# Per dataset, print: node count, PPI count, the mean candidate-pair count over
# the stored samples, and half the PPI count (labelled "sampled ppi").
for i, ds in enumerate(dss):
    ppi_df = import_funcs[i]
    ppi = [list(arr) for arr in np.asarray(ppi_df[['nodeA', 'nodeB']])]
    ppiNodes = ns.BRToNode(ppi)
    unionNodes = ppiNodes  # no GGI data in this cell, so the union is just the PPI nodes

    # sampled PPI lists previously stored as JSON for this dataset
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds), "r") as f:
        samplePPIs = json.load(f)

    candPPINums = []
    for samplePPI in samplePPIs:
        sampleNodes = list(ns.BRToNode(samplePPI))
        nodeNum = len(sampleNodes)
        # all unordered node pairs minus the pairs already in the sample
        # (assumes the sample has no duplicate or self pairs; TODO confirm)
        candPPINums.append(nodeNum*(nodeNum-1)/2 - len(samplePPI))
    sampledPPINum = int(len(ppi)*0.5)

    print("ds {}, num of nodes: {}, ppi: {}, mean cand ppi: {}, sampled ppi: {}".format(
            ds, len(unionNodes), len(ppi), np.mean(candPPINums), sampledPPINum
    ))

ds HI_14_src, num of nodes: 4298, ppi: 13868, mean cand ppi: 5165263.5, sampled ppi: 6934
ds IM24272_src, num of nodes: 5457, ppi: 28780, mean cand ppi: 10939287.2, sampled ppi: 14390
ds Lit_BM_src, num of nodes: 5545, ppi: 11045, mean cand ppi: 8147585.2, sampled ppi: 5522
ds Lit_NB_src, num of nodes: 3391, ppi: 4906, mean cand ppi: 2738996.0, sampled ppi: 2453


In [5]:
# yeast
# The parsers are called with only `root`, so they read whatever input files
# they default to. Each result is unpacked below as (GGI frame, PPI frame).
import_funcs = [
    bioGRID.parse_bioGRID(root="../"),
    STRING.parse_STRING(root="../"),
    MINT.parse_MINT(root="../"),
]
dss = ['bioGRID', 'STRING', 'MINT']

for i, ds in enumerate(dss):
    ggi_df, ppi_df = import_funcs[i]

    # node-pair lists taken from the 'nodeA' / 'nodeB' columns
    ppi = [list(arr) for arr in np.asarray(ppi_df[['nodeA', 'nodeB']])]
    ggi = [list(arr) for arr in np.asarray(ggi_df[['nodeA', 'nodeB']])]

    # nodes appearing in either network
    ppiNodes = ns.BRToNode(ppi)
    ggiNodes = ns.BRToNode(ggi)
    unionNodes = ppiNodes | ggiNodes

    # sampled PPI lists previously stored as JSON for this dataset
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds), "r") as f:
        samplePPIs = json.load(f)

    # per sample: unordered node pairs minus the pairs already in the sample
    candPPINums = []
    for samplePPI in samplePPIs:
        sampleNodes = list(ns.BRToNode(samplePPI))
        nodeNum = len(sampleNodes)
        candPPINums.append(nodeNum * (nodeNum - 1) / 2 - len(samplePPI))

    # half the PPI count, truncated to an int
    sampledPPINum = int(len(ppi) * 0.5)

    print("ds {}, num of nodes: {}, ppi: {}, ggi: {}, mean cand ppi: {}, sampled ppi: {}".format(
            ds, len(unionNodes), len(ppi), len(ggi), np.mean(candPPINums), sampledPPINum
    ))

ds bioGRID, num of nodes: 7085, ppi: 113116, ggi: 441140, mean cand ppi: 20045849.4, sampled ppi: 56558
ds STRING, num of nodes: 4673, ppi: 94529, ggi: 64094, mean cand ppi: 9212026.6, sampled ppi: 47264
ds MINT, num of nodes: 4049, ppi: 16927, ggi: 248, mean cand ppi: 5980266.7, sampled ppi: 8463


In [6]:
# human
# bioGRID, STRING and MINT are parsed with explicit human input files and
# parsed-output paths; each result is indexed below as [0] = GGI frame,
# [1] = PPI frame. HuRI is parsed separately and contributes PPIs only.
import_funcs = [
    bioGRID.parse_bioGRID(
        filename='./data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
        wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
        wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
        root="../",
    ),
    STRING.parse_STRING(
        ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
        typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
        uniProtMap='./data/UniProt/uniprot-taxonomy_9606_STRING.tab',
        root='../',
        wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
        wFile_PPI='./data/parsed/STRING_homo_PPI.pkl',
    ),
    MINT.parse_MINT(
        ppiFile='./data/MINT/species human',
        uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab",
        wFile_GGI='./data/parsed/MINT_homo_GGI.pkl',
        wFile_PPI='./data/parsed/MINT_homo_PPI.pkl',
        root="../",
    ),
]

# PPI node-pair lists: one per parsed source, followed by HuRI
completePPIs_map = [
    [list(pair) for pair in np.asarray(list(parsed)[1][['nodeA', 'nodeB']])]
    for parsed in import_funcs
]
completePPIs_map.append(
    [list(pair) for pair in np.asarray(HuRI.parse_HuRI(root="../")[['nodeA', 'nodeB']])]
)

# GGI node-pair lists: one per parsed source, and an empty list for HuRI
completeGGIs_map = [
    [list(pair) for pair in np.asarray(list(parsed)[0][['nodeA', 'nodeB']])]
    for parsed in import_funcs
]
completeGGIs_map.append([])

dss = ['bioGRID_human', 'STRING_human', 'MINT_human', 'HuRI']
completePPIs = {name: pairs for name, pairs in zip(dss, completePPIs_map)}
completeGGIs = {name: pairs for name, pairs in zip(dss, completeGGIs_map)}

for i, ds in enumerate(dss):
    ppi = completePPIs[ds]
    ggi = completeGGIs[ds]

    # nodes appearing in either network
    ppiNodes = ns.BRToNode(ppi)
    ggiNodes = ns.BRToNode(ggi)
    unionNodes = ppiNodes | ggiNodes

    # sampled PPI lists previously stored as JSON for this dataset
    with open("./sampled_datasets/{}_sampledPPIs.json".format(ds), "r") as f:
        samplePPIs = json.load(f)

    # per sample: unordered node pairs minus the pairs already in the sample
    candPPINums = []
    for samplePPI in samplePPIs:
        sampleNodes = list(ns.BRToNode(samplePPI))
        nodeNum = len(sampleNodes)
        candPPINums.append(nodeNum * (nodeNum - 1) / 2 - len(samplePPI))

    # half the PPI count, truncated to an int
    sampledPPINum = int(len(ppi) * 0.5)

    print("ds {}, num of nodes: {}, ppi: {}, ggi: {}, mean cand ppi: {}, sampled ppi: {}".format(
            ds, len(unionNodes), len(ppi), len(ggi), np.mean(candPPINums), sampledPPINum
    ))

ds bioGRID_human, num of nodes: 24760, ppi: 452684, ggi: 8768, mean cand ppi: 220833040.0, sampled ppi: 226342
ds STRING_human, num of nodes: 15668, ppi: 308614, ggi: 404776, mean cand ppi: 88982499.1, sampled ppi: 154307
ds MINT_human, num of nodes: 7534, ppi: 22324, ggi: 1439, mean cand ppi: 15493875.9, sampled ppi: 11162
ds HuRI, num of nodes: 8109, ppi: 51127, ggi: 0, mean cand ppi: 21899033.2, sampled ppi: 25563
