In [1]:
import numpy as np
import pandas as pd
import pickle
import networkx as nx
import matplotlib.pyplot as plt
from random import sample
import random

In [2]:
# Directory holding the local copy of the STRING v12.0 files for organism 9606.
local_stringdb = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/data/string/lfs-stringdb/'

# Protein info table: STRING protein id <-> upper-cased preferred name.
df = pd.read_csv(local_stringdb+'9606.protein.info.v12.0.txt', sep='\t', header=0, usecols=['#string_protein_id', 'preferred_name'])
df['preferred_name'] = df['preferred_name'].str.upper()
stringId2name = df.set_index('#string_protein_id')['preferred_name'].to_dict()
name2stringId = df.set_index('preferred_name')['#string_protein_id'].to_dict()

# Alias table: upper-cased alias -> STRING protein id.
# Upper-case BEFORE de-duplicating: aliases that differ only in case would
# otherwise collide after normalisation, and to_dict() would then silently keep
# the LAST occurrence, contradicting keep='first'.
df = pd.read_csv(local_stringdb+'9606.protein.aliases.v12.0.txt', sep='\t', header=0, usecols=['#string_protein_id', 'alias'])
df['alias'] = df['alias'].str.upper()
df = df.drop_duplicates(['alias'], keep='first')
aliases2stringId = df.set_index('alias')['#string_protein_id'].to_dict()

# Optional (disabled): turn scores into distances before building the graph.
#string_score_transform = lambda x: -np.log(x/1000)
#graph_df['combined_score'] = graph_df['combined_score'].apply(string_score_transform)

# Physical-interaction links; zeros are turned into missing values and only the
# two endpoint columns plus the combined score are kept.
graph_df = pd.read_csv(local_stringdb+'9606.protein.physical.links.detailed.v12.0.txt', sep=' ', header=0).convert_dtypes().replace(0, float('nan'))
graph_df = graph_df[['protein1', 'protein2','combined_score']]

In [3]:
# Undirected protein-protein graph; every edge keeps its combined_score.
G = nx.from_pandas_edgelist(
    graph_df,
    source='protein1',
    target='protein2',
    edge_attr='combined_score',
    create_using=nx.Graph,
)

In [4]:
# Report whether the graph is connected and how large each component is.
print(nx.is_connected(G))
components = list(nx.connected_components(G))
for i, component in enumerate(components):
    print(f"Component {i + 1}")

    # Node-induced view of this component, used only for the size report.
    component_graph = G.subgraph(component)
    print('nodes',len(component_graph.nodes),'Edges:',len(component_graph.edges))

# Continue with the largest connected component. connected_components() does
# not promise any ordering by size, so pick the largest one explicitly instead
# of relying on it happening to come first.
subgraph = G.subgraph(max(components, key=len))

False
Component 1
nodes 18758 Edges: 738800
Component 2
nodes 2 Edges: 1
Component 3
nodes 2 Edges: 1
Component 4
nodes 2 Edges: 1
Component 5
nodes 3 Edges: 2


In [5]:
# Edge list of the retained component, still keyed by the original protein ids.
graph_df = nx.to_pandas_edgelist(subgraph, source='protein1', target='protein2')

# Give every protein a dense integer id (0..N-1), assigned in sorted-id order.
proteins = sorted(set(graph_df['protein1']) | set(graph_df['protein2']))
gene2node = {value: index for index, value in enumerate(proteins)}

file_path ='/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/data/datasets/ppi/ppi_index.pkl'
# Persist the protein-id -> node-id dictionary. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open(file_path, 'wb') as file:
    pickle.dump(gene2node, file)

# Add the integer endpoints and rebuild the graph on integer node ids.
graph_df['node1']=graph_df['protein1'].map(gene2node)
graph_df['node2']=graph_df['protein2'].map(gene2node)
G = nx.from_pandas_edgelist(graph_df, source='node1', target='node2', edge_attr='combined_score', create_using=nx.Graph)

In [6]:
# Display the edge list, which now also carries the integer node1/node2 columns.
graph_df

Unnamed: 0,protein1,protein2,combined_score,node1,node2
0,9606.ENSP00000000233,9606.ENSP00000257770,311,0,1914
1,9606.ENSP00000000233,9606.ENSP00000226004,161,0,776
2,9606.ENSP00000000233,9606.ENSP00000434442,499,0,15871
3,9606.ENSP00000000233,9606.ENSP00000262455,531,0,2413
4,9606.ENSP00000000233,9606.ENSP00000303145,499,0,5073
...,...,...,...,...,...
738795,9606.ENSP00000359100,9606.ENSP00000359096,900,10097,10095
738796,9606.ENSP00000375108,9606.ENSP00000441197,323,12389,16123
738797,9606.ENSP00000344260,9606.ENSP00000357086,225,8191,9710
738798,9606.ENSP00000479624,9606.ENSP00000477530,800,17428,17253


In [7]:
# Write the connected-component edge list (protein ids, score, node ids) to CSV
# without the DataFrame index.
graph_df.to_csv(
    '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/data/ppi_connect.csv',
    index=False,
)

In [8]:
# Sample negative (non-existing) edges: 30% as many as there are real edges.
#
# Enumerating every ordered node pair needs O(N^2) memory, and because such a
# list holds both (u, v) and (v, u) while G.edges() reports each undirected
# edge only once, subtracting `existing_edges` from it leaves reversed copies of
# real edges (and both orientations of the same pair) among the "negatives".
# Rejection sampling over canonical (small, large) pairs avoids both problems.
nodes = list(G.nodes())

# Existing edges, in the orientation reported by networkx.
existing_edges = list(G.edges())
# Orientation-independent lookup of the existing edges.
existing_pairs = {(u, v) if u <= v else (v, u) for u, v in existing_edges}

num_negatives = int(len(existing_edges)*0.3)
# Number of distinct unordered non-loop pairs that are not already edges.
num_existing_non_loop = sum(1 for u, v in existing_pairs if u != v)
num_free_pairs = len(nodes)*(len(nodes) - 1)//2 - num_existing_non_loop
if num_negatives > num_free_pairs:
    raise ValueError(
        f"cannot sample {num_negatives} non-edges: only {num_free_pairs} node pairs are unconnected"
    )

# Dedicated, seeded generator so the negative set is reproducible across runs.
negative_rng = random.Random(0)
sampled_pairs = set()
while len(sampled_pairs) < num_negatives:
    u, v = negative_rng.sample(nodes, 2)  # two distinct nodes, so no self-loops
    pair = (u, v) if u <= v else (v, u)
    if pair not in existing_pairs:
        sampled_pairs.add(pair)

# Sorted list of unique (small, large) node pairs that are not edges of G.
selected_non_exist_edges = sorted(sampled_pairs)

In [9]:
# A spanning tree of G. Edges only carry 'combined_score' (no 'weight'
# attribute), so presumably all edges count equally here and H is simply *a*
# spanning tree. Never masking its edges keeps the training graph connected.
H = nx.minimum_spanning_tree(G)

# Edges that may be masked = existing edges outside the spanning tree.
# Membership is tested with has_edge(), which ignores orientation; a plain
# tuple set difference would only work if both graphs happened to report every
# edge with the same (u, v) ordering. Tuples keep the orientation used in
# `existing_edges`, and the pool keeps that list's order.
sample_pool = [edge for edge in existing_edges if not H.has_edge(*edge)]


In [24]:
# Hold out as many non-tree edges as 30% of all existing edges.
num_masked = int(len(existing_edges)*0.3)
mask_edegs = sample(sample_pool, num_masked)

In [10]:
# Export the full graph as sparse edge lists: endpoints (i <= j) and weights.
edges = np.array([(u, v, data['combined_score']) for u, v, data in G.edges(data=True)])

# Orient every edge so the smaller node id is in i and the larger in j.
i = np.minimum(edges[:, 0], edges[:, 1])
j = np.maximum(edges[:, 0], edges[:, 1])
scores = edges[:, 2]
# Continuous weights: combined score rescaled by 1/1000.
weights = scores*0.001

root = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/data/datasets/ppi/'
np.savetxt(root+'sparse_i.txt', i, delimiter='\n')
np.savetxt(root+'sparse_j.txt', j, delimiter='\n')
np.savetxt(root+'sparse_w.txt', weights, delimiter='\n')

# Coarse integer weights (score // 100). These must be derived from the raw
# scores: applying *0.01 to the already-rescaled weights truncates every value
# to 0 and writes an all-zero file.
weights = (scores // 100).astype(int)
np.savetxt(root+'sparse_10.txt', weights, delimiter='\n')

# Group edges by score level (hundreds bucket of the combined score). Integer
# division gives the same bucket as the leading digit for three-digit scores,
# but stays correct for scores below 100 or equal to 1000.
level_edges = dict()
for u, v, data in G.edges(data=True):
    level = int(data['combined_score']) // 100
    level_edges.setdefault(level, []).append([u, v])

# One pair of endpoint files per level, again with the smaller id first.
for level in level_edges:
    edges = np.array(level_edges[level])
    sparse_i = np.minimum(edges[:, 0], edges[:, 1])
    sparse_j = np.maximum(edges[:, 0], edges[:, 1])
    np.savetxt(root+'level_'+str(level)+'_sparse_i.txt', sparse_i, delimiter='\n')
    np.savetxt(root+'level_'+str(level)+'_sparse_j.txt', sparse_j, delimiter='\n')

In [11]:
file_path = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/data/datasets/ppi/ppi_index.pkl'
# Serialize the protein-id -> node-id dictionary (gene2node) to disk.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(file_path, 'wb') as file:
    pickle.dump(gene2node, file)

In [25]:
# Training graph = all existing edges except the masked (held-out) ones.
traingraph = G.edge_subgraph(list(set(existing_edges)-set(mask_edegs)))

edges = np.array([(u, v, data['combined_score']) for u, v, data in traingraph.edges(data=True)])

# Orient every edge so the smaller node id is in i and the larger in j.
i = np.minimum(edges[:, 0], edges[:, 1])
j = np.maximum(edges[:, 0], edges[:, 1])
scores = edges[:, 2]
# Continuous weights: combined score rescaled by 1/1000.
weights = scores*0.001
root = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/data/datasets/ppi_linkpredict5/'
np.savetxt(root+'sparse_i.txt', i, delimiter='\n')
np.savetxt(root+'sparse_j.txt', j, delimiter='\n')
np.savetxt(root+'sparse_w.txt', weights, delimiter='\n')

# Coarse integer weights (score // 100). These must be derived from the raw
# scores: applying *0.01 to the already-rescaled weights truncates every value
# to 0 and writes an all-zero file.
weights = (scores // 100).astype(int)
np.savetxt(root+'sparse_10.txt', weights, delimiter='\n')

# Group training edges by score level (hundreds bucket of the combined score).
# Integer division gives the same bucket as the leading digit for three-digit
# scores, but stays correct for scores below 100 or equal to 1000.
level_edges = dict()
for u, v, data in traingraph.edges(data=True):
    level = int(data['combined_score']) // 100
    level_edges.setdefault(level, []).append([u, v])

# One pair of endpoint files per level, again with the smaller id first.
for level in level_edges:
    edges = np.array(level_edges[level])
    sparse_i = np.minimum(edges[:, 0], edges[:, 1])
    sparse_j = np.maximum(edges[:, 0], edges[:, 1])
    np.savetxt(root+'level_'+str(level)+'_sparse_i.txt', sparse_i, delimiter='\n')
    np.savetxt(root+'level_'+str(level)+'_sparse_j.txt', sparse_j, delimiter='\n')

In [26]:
# Masked (held-out) edges as an array with one (u, v) pair per row.
edges = np.array(mask_edegs)

# Smaller endpoint goes to i, larger endpoint to j.
i = np.minimum(edges[:, 0], edges[:, 1])
j = np.maximum(edges[:, 0], edges[:, 1])

# Written under the `root` directory currently in scope.
np.savetxt(root+'sparse_i_rem.txt', i, delimiter='\n')
np.savetxt(root+'sparse_j_rem.txt', j, delimiter='\n')

In [27]:
# Sampled non-existing edges as an array with one (u, v) pair per row.
edges = np.array(selected_non_exist_edges)

# Smaller endpoint goes to i, larger endpoint to j.
i = np.minimum(edges[:, 0], edges[:, 1])
j = np.maximum(edges[:, 0], edges[:, 1])

# Written under the `root` directory currently in scope.
np.savetxt(root+'non_sparse_i.txt', i, delimiter='\n')
np.savetxt(root+'non_sparse_j.txt', j, delimiter='\n')