In [1]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import random
from gensim.models import Word2Vec


In [2]:

# Load the protein-chemical links table and build the interaction graph.
df1 = pd.read_csv('83332.protein_chemical.links.detailed.v5.0.tsv', sep='\t')
# Keep only the 10,000 rows with the largest values in the 'experimental' column.
df = df1.sort_values(by='experimental', ascending=False).head(10000)

# nx.Graph is undirected, so a repeated (chemical, protein) pair collapses into
# a single edge. All edges are added in one call instead of a row-by-row
# iterrows() loop; nodes and edges are inserted in the same order as before
# (chemical first, then protein, in row order).
G = nx.Graph()
G.add_edges_from(zip(df['chemical'], df['protein']))

In [3]:
# walk_length sets the maximum number of nodes in each random walk; every walk is later passed to Word2Vec as one "sentence".

In [4]:
def generate_walks(graph, num_walks=10, walk_length=40, rng=None):
    """Generate uniform random walks starting from every node of ``graph``.

    Args:
        graph: Object exposing ``nodes()`` and ``neighbors(node)`` (for
            example a networkx graph).
        num_walks: Number of passes over the node set; each pass starts one
            walk from every node, visiting the nodes in shuffled order.
        walk_length: Maximum number of nodes in a walk. A walk always contains
            its start node and stops early when the current node has no
            neighbours.
        rng: Optional ``random.Random``-like object providing ``shuffle`` and
            ``choice``. Defaults to the global ``random`` module, which is
            what the function used before this parameter existed.

    Returns:
        A list of ``num_walks * number_of_nodes`` walks, each a list of nodes.
    """
    rng = random if rng is None else rng
    nodes = list(graph.nodes())
    # Materialise every adjacency list once instead of rebuilding it on each
    # step of each walk; the graph is not modified while walking.
    adjacency = {node: list(graph.neighbors(node)) for node in nodes}

    walks = []
    for _ in range(num_walks):
        rng.shuffle(nodes)
        for start in nodes:
            walk = [start]
            while len(walk) < walk_length:
                neighbors = adjacency[walk[-1]]
                if not neighbors:
                    # Dead end (isolated node): keep the partial walk.
                    break
                walk.append(rng.choice(neighbors))
            walks.append(walk)
    return walks

In [5]:
# Seed the global RNG before sampling so the node shuffles and neighbour
# choices made while walking are repeatable after a kernel restart.
WALK_SEED = 42
random.seed(WALK_SEED)

walks = generate_walks(G, num_walks=10, walk_length=40)
# Convert every node id to a string so each walk is a list of string tokens for gensim.
walks = [[str(node) for node in walk] for walk in walks]


In [6]:
# Train a skip-gram (sg=1) Word2Vec model, treating each random walk as one
# sentence. min_count=0 keeps every node in the vocabulary however rarely it
# appears in the walks.
model = Word2Vec(
    sentences=walks,
    vector_size=128,
    window=5,
    min_count=0,
    sg=1,
    workers=4,
    epochs=5,
)



In [7]:
# Map every graph node to its learned vector. The dictionary is keyed by the
# original node object, while the model is queried with the stringified id.
node_embeddings = dict(
    (graph_node, model.wv[str(graph_node)]) for graph_node in G.nodes()
)

In [8]:
# Sanity check: show the embedding of the first node in the graph's node list.
all_nodes = list(G.nodes())
example_node = all_nodes[0]
print(node_embeddings[example_node])

[ 0.52443624 -0.3476105   0.01923808  0.4527632   0.16446231 -0.19755913
  0.61542445 -0.27943638 -0.21119517  0.03559417  0.79908127 -0.09800877
  0.20827083 -0.1379976  -0.1702288  -0.09208585 -0.02668916 -0.04828003
  0.39084968  0.34057876  0.03246465 -0.25889388 -0.43645644  0.04061414
 -0.3772784   0.40476128 -0.19017018  0.24968255  0.3729445   0.03554049
  0.26007432 -0.20732394  0.05482044 -0.03231744  0.10476781 -0.07450484
  0.69226366 -0.08950389 -0.46971202  0.2164383   0.7513727  -0.03014943
  0.7825339  -0.06433287  0.39059412 -0.1443428  -0.3028992   0.03753158
 -0.27578613  0.17242607  0.42469478 -0.4380793   0.16107403 -0.11150428
 -0.2349694   0.17533697  0.22437207 -0.53638905 -0.01945447  0.19055508
 -0.40911645  0.18357572  0.37003237  0.22836605  1.1182557   0.19173728
  0.08634968  0.00980959 -0.13673666 -0.01474925  0.21137881 -0.18959641
  0.2613215   0.09936649  0.20712517  0.00899029 -0.15823235 -0.18468815
 -0.4680551   0.11637086  0.30118334 -0.10733818  0

In [9]:

# Helper: create edge embedding
def get_edge_embedding(u, v):
    """Return the feature vector for the candidate edge ``(u, v)``.

    The feature is the element-wise (Hadamard) product of the two node
    vectors stored in the global ``model``; it is symmetric in ``u`` and
    ``v``. Concatenation or averaging are possible alternatives.

    Args:
        u: Node id of one endpoint.
        v: Node id of the other endpoint.

    Returns:
        Array with the same length as a single node embedding.
    """
    # The walks were converted to string tokens before training, so the
    # vocabulary keys are str(node). Looking up str(u) keeps this helper
    # consistent with that; a non-string id would otherwise miss the
    # vocabulary (or, for an int, be treated as a positional index).
    return model.wv[str(u)] * model.wv[str(v)]

In [10]:

# Positive samples (existing links)
positive_edges = list(G.edges())
X_pos = [get_edge_embedding(u, v) for u, v in positive_edges]
y_pos = [1] * len(X_pos)

# Negative samples (non-existent links)
nodes = list(G.nodes())

# Node type is inferred from the first character of the id, as in the
# acceptance test below. Count how many distinct cross-type pairs have no edge
# so that a request which cannot be satisfied fails instead of looping forever.
prefix_counts = {}
for node_id in nodes:
    prefix_counts[node_id[0]] = prefix_counts.get(node_id[0], 0) + 1
# Unordered pairs whose endpoints have different prefixes: (N^2 - sum(c^2)) / 2.
cross_type_pairs = (len(nodes) ** 2 - sum(c * c for c in prefix_counts.values())) // 2
cross_type_edges = sum(1 for a, b in positive_edges if a[0] != b[0])
available_negatives = cross_type_pairs - cross_type_edges
if available_negatives < len(positive_edges):
    raise ValueError(
        f"Cannot draw {len(positive_edges)} distinct negative edges: "
        f"only {available_negatives} cross-type non-edges exist."
    )

negative_edges = []
seen_pairs = set()  # unordered pairs already accepted, so no negative repeats
while len(negative_edges) < len(positive_edges):
    u, v = random.sample(nodes, 2)
    if G.has_edge(u, v) or u[0] == v[0]:
        # Existing link, or both endpoints of the same type (keeps it bipartite).
        continue
    pair = frozenset((u, v))
    if pair in seen_pairs:
        # (u, v) and (v, u) are the same undirected candidate.
        continue
    seen_pairs.add(pair)
    negative_edges.append((u, v))

In [11]:
X_neg = [get_edge_embedding(u, v) for u, v in negative_edges]
y_neg = [0] * len(X_neg)

# Combine and classify
X = X_pos + X_neg
y = y_pos + y_neg

# A fixed random_state makes the split repeatable; stratify=y keeps the
# positive/negative ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

clf = LogisticRegression()
clf.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_pred = clf.predict_proba(X_test)[:, 1]

# NOTE(review): the embeddings were learned from random walks over the full
# graph G, and the positive examples are G's own edges. Every positive test
# edge was therefore visible while the embeddings were trained, so this AUC is
# an optimistic estimate. For an unbiased score, hold out the test edges
# before generating walks.
print(f"AUC: {roc_auc_score(y_test, y_pred):.4f}")


AUC: 0.9500
