In [None]:
from node2vec import *
import json
import numpy as np
import os 
import pandas as pd
import scipy.spatial as sp
# Pin the hash seed for reproducibility.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so assigning
# it here presumably only affects child processes spawned from this kernel, not
# the hashing of the already-running kernel itself. To make it effective, export
# the variable before launching Jupyter - confirm.
os.environ['PYTHONHASHSEED']='123'

In [None]:
# Embedding the GO DAG, step 1: run the node2vec random walks over the GO edge list.
graph = read_graph('P-edges.txt', weighted = False)
# Seed both Python's and NumPy's global RNGs before the walker is built and run.
# NOTE(review): `random` is never imported explicitly in this notebook; it
# presumably arrives through `from node2vec import *` - confirm.
random.seed(1)
np.random.seed(1)
G = Node2Vec(graph, p = 0.5, q = 2)
G.preprocess_transition_probs()
# 20 walks per node, each of length 100.
walks = G.simulate_walks(num_walks = 20, walk_length = 100)

100%|█████████████████████████████████████████████████████████████████████████| 28642/28642 [00:01<00:00, 28434.06it/s]
Walk 1/20: 100%|████████████████████████████████████████████████████████████████| 28642/28642 [00:34<00:00, 823.16it/s]
Walk 2/20: 100%|███████████████████████████████████████████████████████████████| 28642/28642 [00:27<00:00, 1046.34it/s]
Walk 3/20: 100%|████████████████████████████████████████████████████████████████| 28642/28642 [00:29<00:00, 957.07it/s]
Walk 4/20: 100%|████████████████████████████████████████████████████████████████| 28642/28642 [00:31<00:00, 915.60it/s]
Walk 5/20: 100%|████████████████████████████████████████████████████████████████| 28642/28642 [00:32<00:00, 880.14it/s]
Walk 6/20: 100%|████████████████████████████████████████████████████████████████| 28642/28642 [00:31<00:00, 896.41it/s]
Walk 7/20: 100%|████████████████████████████████████████████████████████████████| 28642/28642 [00:31<00:00, 915.70it/s]
Walk 8/20: 100%|████████████████████████

In [None]:
# Embedding the GO DAG, step 2: apply the Skip-gram model to the walks to extract the embeddings.
# NOTE(review): the positional arguments are opaque from here. Presumably they are
# (walks, embedding dimensions=128, window size=10, a further numeric setting=5,
# output file) - confirm against learn_embeddings' signature in the node2vec module.
model = learn_embeddings(walks, 128, 10, 5, 'GO_embeddings.txt')

In [None]:
# Manual sanity check of the embedding: print the nearest neighbours of an
# ageing-related GO term (GO:0008340), one GO ID per line.
neighbours = model.wv.most_similar('GO:0008340')
for neighbour in neighbours:
    # each entry is a pair whose first element is the GO ID
    print(neighbour[0])

GO:1901047
GO:0010259
GO:1990636
GO:0032501
GO:0008286
GO:0042697
GO:0038028
GO:0032898
GO:0007585
GO:0036363


In [None]:
# Re-load the embeddings from the word2vec-format text file 'GO_embeddings.txt'
# (binary=False, i.e. plain text).
# Note: this rebinds `model`; from here on it is the loaded KeyedVectors object,
# which is what gets passed to find_similarity further down.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GO_embeddings.txt', binary=False)

In [None]:
# Load the previously built annotation dictionary of STRING proteins (keys are
# protein IDs, values are the annotations of that protein) and derive the list
# of annotated proteins from its keys.
with open('prop_anno_STRING.txt') as f:
    data = f.read()

annos = json.loads(data)
proteins = list(annos)

In [None]:
# Load the full C. elegans STRING PPI network as a weighted graph, using the
# NetworkX-based read_graph helper implemented with node2vec.
PPI_full = read_graph('6239.protein.links.v11.0.txt', weighted = True)

In [None]:
# Sanity check: number of edges in the full STRING network.
PPI_full.number_of_edges()

3709383

In [None]:
# Sanity check: number of nodes (proteins) in the full STRING network.
PPI_full.number_of_nodes()

18181

In [None]:
# Take the subgraph of BP-annotated STRING proteins (the PPI EXP subnetwork) and
# write it to file as a weighted edge list for further use.
# NOTE(review): `nx` is never imported explicitly in this notebook; it presumably
# arrives through `from node2vec import *` - confirm.
PPI_annos = PPI_full.subgraph(proteins)
nx.write_weighted_edgelist(PPI_annos, "PPI_BP_edgelist_EXP.txt")

In [None]:
# Sanity check: number of edges left in the annotated subnetwork.
PPI_annos.number_of_edges()

327779

In [None]:
# Sanity check: number of nodes (annotated proteins) in the subnetwork.
PPI_annos.number_of_nodes()

2975

In [None]:
#defining function for MHD calculation for determining functional similarity
def find_similarity(annotations, vectorial_embeddings, PPI_graph, output_path='FSim_PPI_EXP.txt'):
    """Score every edge of a PPI graph with an MHD-based functional similarity.

    For each edge (u, v) the pairwise cosine similarities between the embedding
    vectors of u's terms and v's terms are computed. The score is the smaller of
    the two directed averages: the mean over u's terms of their best match in v,
    and the mean over v's terms of their best match in u.

    Parameters
    ----------
    annotations : mapping
        Maps each protein ID to an iterable of annotation terms.
    vectorial_embeddings : mapping-like
        Indexable by term, returning that term's embedding vector.
    PPI_graph : graph-like
        Object whose ``edges()`` yields pairs of protein IDs.
    output_path : str, optional
        Tab-separated file the scored edge list is written to (no header, no
        index). Defaults to ``'FSim_PPI_EXP.txt'``, the previously hard-coded name.

    Returns
    -------
    list of [str, str, str]
        One ``[protein_a, protein_b, score]`` entry per edge, in edge order, with
        the score formatted to three decimals.

    Raises
    ------
    KeyError
        If an edge endpoint is missing from ``annotations`` or a term is missing
        from ``vectorial_embeddings``.
    """
    # Replace textual term annotations with lists of embedding vectors. This reads
    # the `annotations` argument; the previous version silently used the
    # notebook-global `annos` and ignored what the caller passed in.
    term_vectors = {
        prot: [vectorial_embeddings[term] for term in terms]
        for prot, terms in annotations.items()
    }
    # Calculate functional similarities between pairs of nodes connected by an edge.
    fsim = []
    for edge in tqdm(list(PPI_graph.edges())):
        source, target = edge[0], edge[1]
        # rows = terms of `source`, columns = terms of `target`
        cos_sim = 1 - sp.distance.cdist(term_vectors[source], term_vectors[target], 'cosine')
        fhd = np.mean(np.max(cos_sim, axis=1))  # source -> target direction
        rhd = np.mean(np.max(cos_sim, axis=0))  # target -> source direction
        mhd = np.min(np.array([fhd, rhd]))
        fsim.append([source, target, "{0:.3f}".format(mhd)])
    pd.DataFrame(fsim).to_csv(output_path, index=False, sep='\t', header=False)
    return fsim

In [None]:
# Determine the MHD-based functional similarity for all interaction pairs in the
# PPI EXP subnetwork. As a side effect, find_similarity also saves the
# FSim-scored network to 'FSim_PPI_EXP.txt'.
fsim = find_similarity(annos, model, PPI_annos)

100%|████████████████████████████████████████████████████████████████████████| 327779/327779 [03:07<00:00, 1747.68it/s]


In [None]:
# Inspect the first 100 scored edges; each entry is [protein_a, protein_b, score].
print(fsim[:100])

[['6239.R11A8.4a', '6239.AC3.3', '0.521'], ['6239.R11A8.4a', '6239.B0025.1a', '0.651'], ['6239.R11A8.4a', '6239.B0035.14a.2', '0.467'], ['6239.R11A8.4a', '6239.B0041.7', '0.461'], ['6239.R11A8.4a', '6239.B0205.7', '0.703'], ['6239.R11A8.4a', '6239.B0207.4', '0.734'], ['6239.R11A8.4a', '6239.B0218.3', '0.636'], ['6239.R11A8.4a', '6239.B0222.9', '0.521'], ['6239.R11A8.4a', '6239.B0228.5a', '0.377'], ['6239.R11A8.4a', '6239.B0261.2a', '0.635'], ['6239.R11A8.4a', '6239.B0286.5', '0.433'], ['6239.R11A8.4a', '6239.B0302.1a.2', '0.436'], ['6239.R11A8.4a', '6239.B0304.1b', '0.486'], ['6239.R11A8.4a', '6239.B0334.8', '0.615'], ['6239.R11A8.4a', '6239.B0336.8', '0.582'], ['6239.R11A8.4a', '6239.B0350.2f.2', '0.565'], ['6239.R11A8.4a', '6239.B0414.2', '0.604'], ['6239.R11A8.4a', '6239.B0432.5a', '0.625'], ['6239.R11A8.4a', '6239.B0478.1a', '0.687'], ['6239.R11A8.4a', '6239.C01B7.1c.1', '0.391'], ['6239.R11A8.4a', '6239.C01C7.1a', '0.494'], ['6239.R11A8.4a', '6239.C01G6.7', '0.471'], ['6239.R11A8.