In [33]:
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from node2vec import Node2Vec
from sklearn.manifold import TSNE

In [2]:
# Read the pathway edge list and turn it into a NetworkX graph, taking the two
# gene-ID columns as the endpoints of each edge.
network = pd.read_csv('data/bio-pathways-network.csv')
graph = nx.from_pandas_edgelist(
    network,
    source='Gene ID 1',
    target='Gene ID 2',
)

In [6]:
# Build the Node2Vec helper over the gene graph with 4 workers. Construction is the
# slow step: it is where the transition probabilities are computed (see progress bar).
node2vec = Node2Vec(graph=graph, workers=4)

Computing transition probabilities: 100%|██████████| 21557/21557 [07:31<00:00, 47.75it/s] 


In [8]:
# Fit the embedding model; the learned vectors are exposed through `n2v_model.wv`.
n2v_model = node2vec.fit(
    window=10,
    min_count=1,
    batch_words=4,
)

In [10]:
# Persist the complete trained model so it can be reloaded later.
n2v_model.save('node2vec_model.pt')

# Also export only the embedding vectors, in word2vec format.
n2v_model.wv.save_word2vec_format('node2vec_embedding.pt')

In [15]:
# Sanity check: the 10 nodes whose embeddings are closest to node '2'.
n2v_model.wv.most_similar('2', topn=10)

[('348', 0.7114304304122925),
 ('1558', 0.6682052612304688),
 ('22983', 0.6370899677276611),
 ('1562', 0.6319289207458496),
 ('1990', 0.6176955103874207),
 ('124912', 0.6062756776809692),
 ('26085', 0.6044849753379822),
 ('140545', 0.600368857383728),
 ('24138', 0.5945776104927063),
 ('1510', 0.5849873423576355)]

In [21]:
# Number of embedded nodes. Count the rows of the embedding matrix rather than
# reading `wv.vocab`: that attribute was removed in gensim 4, whereas `wv.vectors`
# holds one row per vocabulary entry in both gensim 3 and gensim 4.
len(n2v_model.wv.vectors)

21557

In [3]:
import stellargraph as sg
import tensorflow as tf

In [5]:
# Convert the NetworkX `graph` into a StellarGraph object; `G` is what the
# random-walk sampler below operates on.
G = sg.StellarGraph.from_networkx(graph)

In [6]:
rw = sg.data.BiasedRandomWalk(G)

# Sample biased random walks starting from every node of G. The seed is fixed so
# that the sampled walks (and everything trained on them downstream) are the same
# after a kernel restart; change or remove it to draw a different sample.
walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=100,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unnormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unnormalised) probability, 1/q, for moving away from source node
    seed=42,  # fixed RNG seed for reproducible walks
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 215570


In [7]:
from gensim.models import Word2Vec

# Word2Vec consumes sequences of string tokens, so stringify every node ID in each walk.
str_walks = [list(map(str, walk)) for walk in walks]

# Train 128-dimensional embeddings on the walks (sg=1 selects skip-gram).
model = Word2Vec(
    str_walks,
    size=128,
    window=5,
    min_count=0,
    sg=1,
    workers=8,
    iter=1,
)

In [14]:
# Preview the full protein/disease table, using the first CSV column as the index.
pd.read_csv(
    'data/all-proteins.csv',
    index_col=0,
)

Unnamed: 0,inherited metabolic disorder,integumentary system disease,urinary system disease,nervous system disease,gastrointestinal system disease,substance-related disorder,immune system disease,musculoskeletal system disease,psoriatic arthritis,cancer,...,chromosomal disease,hypospadias,ciliopathy,developmental disorder of mental health,sleep disorder,bacterial infectious disease,respiratory system disease,polycystic ovary syndrome,reproductive system disease,orofacial cleft
3295,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5189,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5190,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5193,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
139378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Load the class table (first CSV column becomes the index -- presumably node IDs,
# given how `diseased` is looked up later) and keep its first data column as labels.
node_subject = pd.read_csv(
    'data/binary-classes.csv',
    index_col=0,
)
diseased = node_subject.iloc[:, 0]

In [32]:
# Retrieve node embeddings and corresponding subjects
node_ids = model.wv.index2word  # list of node IDs (strings, in embedding-row order)
node_embeddings = (
    model.wv.vectors
)  # numpy.ndarray of size number of nodes times embeddings dimensionality

# Node IDs were stringified for Word2Vec; convert them back to int and look them up
# with `.loc` so the selection is always by index label, never by position.
# NOTE(review): assumes `diseased` is indexed by integer node ID -- confirm.
node_targets = diseased.loc[[int(node_id) for node_id in node_ids]]

# Apply t-SNE transformation on node embeddings. `random_state` is pinned so the
# 2-D layout is the same on every run.
tsne = TSNE(n_components=2, random_state=42)
node_embeddings_2d = tsne.fit_transform(node_embeddings)

# Map each distinct label to an integer so it can be used as a colour index.
alpha = 0.7
label_map = {l: i for i, l in enumerate(np.unique(node_targets))}
node_colours = [label_map[target] for target in node_targets]

# Draw the points, coloured by label.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    c=node_colours,
    cmap="jet",
    alpha=alpha,
)
ax.set(
    title="t-SNE projection of node embeddings",
    xlabel="t-SNE dimension 1",
    ylabel="t-SNE dimension 2",
)
plt.show()

NameError: name 'TSNE' is not defined