In [2]:
import pandas as pd
import numpy as np
import pickle
import random

In [3]:
with open(r'D:\study\thesis\project\HBDM-main\datasets\ppi\ppi_index.pkl', 'rb') as f:
    value_to_index_mapping = pickle.load(f)
len(value_to_index_mapping)

18767

In [4]:
local_stringdb = 'D:/study/thesis/project/HBDM-main/nn_data/stringdb/'
# load local STRING database and names
df = pd.read_csv(local_stringdb+'9606.protein.info.v12.0.txt', sep='\t', header=0, usecols=['#string_protein_id', 'preferred_name'])
df['preferred_name'] = df['preferred_name'].str.upper()
stringId2name = df.set_index('#string_protein_id')['preferred_name'].to_dict()
name2stringId = df.set_index('preferred_name')['#string_protein_id'].to_dict()
df = pd.read_csv(local_stringdb+'9606.protein.aliases.v12.0.txt', sep='\t', header=0, usecols=['#string_protein_id', 'alias']).drop_duplicates(['alias'], keep='first')
df['alias'] = df['alias'].str.upper()
aliases2stringId = df.set_index('alias')['#string_protein_id'].to_dict()

string_score_transform = lambda x: -np.log(x/1000)

network = pd.read_csv(local_stringdb+'9606.protein.physical.links.detailed.v12.0.txt', sep=' ', header=0).convert_dtypes().replace(0, float('nan'))
network['combined_score'] = network['combined_score'].apply(string_score_transform)

def convert_stringId(alias):
    try:
        stringId = name2stringId[alias]
    except:
        #print(alias, 'can\'t be converted by name2stringId! Now trying aliases2stringId.')
        try:
            stringId = aliases2stringId[alias]
        except:
            #print(alias, 'can\'t be converted by aliases2stringId! Now return None.')
            stringId = None
    #print(alias, stringId)
    return stringId

def read_string_net_from_df(df, weight_choose='combined_score'): # default combined score
    string_net = {}
    for row in df.itertuples(index=False):
        start_node, end_node, weight = row.protein1, row.protein2, getattr(row, weight_choose)
        if start_node not in string_net:
            string_net[start_node] = {}
        string_net[start_node][end_node] = weight
        if end_node not in string_net:
            string_net[end_node] = {}
        string_net[end_node][start_node] = weight
    return string_net

In [5]:
complexs = {'RNA':['RPB1', 'RPB2', 'RPB3', 'RPB4', 'RPB5', 'RPB6', 'RPB7', 'RPB8', 'RPB9', 'RPB10', 'RPB11', 'RPB12', 'RPB13'],
    'protease': ['PSMA1', 'PSMA2', 'PSMA3', 'PSMB1', 'PSMB2', 'PSMB3'],
    'nuclear pore': ['NUP98', 'NUP93', 'NUP107', 'NUP133']
}

names = set(aliases2stringId.keys())
# humans = set(value_to_index_mapping.keys())

complexs_id = dict()

for complex_name in complexs:
    folder_path = 'D:/study/thesis/project/HBDM-main/ppi_results/test_results/'+complex_name
    # os.mkdir(folder_path)
    group_node = []
    for gene in complexs[complex_name]:
        if gene in names:
            stringid = aliases2stringId[gene]
            # stringid = int(stringid[9:])
            # if stringid in humans:
            #     node = value_to_index_mapping[stringid]
            group_node.append(stringid)
    complexs_id[complex_name] = group_node

In [6]:
top = 50
for complex_name in complexs_id:
    points = complexs_id[complex_name]
    for top in [10,20]:
        precision = []
        coverage = []
        for start_gene in points:
            test_nodes = list(set(points)-set(start_gene))
            true_pre = []
            subdf = network[network['protein1']==start_gene]
            ranked = subdf.sort_values(by='combined_score')
            predicted=ranked[:top]['protein2'].tolist()
            for i in predicted:
                if i in test_nodes:
                    true_pre.append(i)
            precision.append(len(true_pre)/len(predicted))
            coverage.append(len(set(true_pre))/len(test_nodes))
        print('top-',top,'\t',complex_name,' precision: ', sum(precision)/len(precision))
        print(complex_name,' coverage: ', sum(coverage)/len(coverage))

top- 10 	 RNA  precision:  0.9416666666666668
RNA  coverage:  0.7847222222222222
top- 20 	 RNA  precision:  0.5499999999999999
RNA  coverage:  0.9166666666666665
top- 10 	 protease  precision:  0.43333333333333335
protease  coverage:  0.7222222222222222
top- 20 	 protease  precision:  0.25
protease  coverage:  0.8333333333333334
top- 10 	 nuclear pore  precision:  0.175
nuclear pore  coverage:  0.4375
top- 20 	 nuclear pore  precision:  0.15
nuclear pore  coverage:  0.75
