In [1]:
# ! pip install --upgrade torch torch-geometric scikit-learn ogb
# ! python -c "import ogb; print(ogb.__version__)"

import numpy as np
import pandas as pd
import networkx as nx
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec
from ogb.linkproppred import PygLinkPropPredDataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import random

# Download and process data, stored in './dataset/ogbl_collab/'
dataset = PygLinkPropPredDataset(name="ogbl-collab", root='dataset/')
data = dataset[0]

2024-11-17 20:10:16.044928: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-17 20:10:16.084596: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-17 20:10:16.084641: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-17 20:10:16.086758: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [2]:
# Show dataset information
print("Dataset Name:", dataset.name)
print("Number of Nodes:", data.num_nodes)
print("Number of Edges:", data.edge_index.shape[1])
print("Number of Node Features:", data.x.shape[1] if data.x is not None else "None")
print("Number of Edge Features:", data.edge_attr.shape[1] if data.edge_attr is not None else "None")

Dataset Name: ogbl-collab
Number of Nodes: 235868
Number of Edges: 2358104
Number of Node Features: 128
Number of Edge Features: None


In [3]:
# Split edges into train/validation/test sets
split_edge = dataset.get_edge_split()

In [4]:
# -> NetworkX graph
edge_list = split_edge['train']['edge']
graph = nx.Graph()
graph.add_edges_from(edge_list.tolist())

# NetworkX graph -> StellarGraph
stellargraph = StellarGraph.from_networkx(graph)

In [5]:
# Generate Node2Vec walks
random_walk = BiasedRandomWalk(stellargraph)
walks = random_walk.run(
    nodes=list(stellargraph.nodes()),  # root nodes
    length=10,  # maximum length of a random walk
    n=2,  # number of random walks per root node
    p=1.0,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=1.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)

In [6]:
# Ensure all nodes appear in walks
all_nodes = set(str(node) for node in stellargraph.nodes())
walked_nodes = set(word for walk in walks for word in walk)
missing_nodes = all_nodes - walked_nodes

# Add singleton walks for missing nodes
walks.extend([[str(node)] for node in missing_nodes])

# Train Word2Vec model
word2vec_model = Word2Vec(walks, vector_size=128, window=5, min_count=0, sg=1, workers=4, epochs=10)

# Extract node embeddings
default_embedding = np.zeros(word2vec_model.vector_size)
node_embeddings = {
    str(node): word2vec_model.wv[str(node)] if str(node) in word2vec_model.wv else default_embedding
    for node in stellargraph.nodes()
}

In [7]:
# Prepare edge features
def prepare_edge_features_batch(edge_index, embeddings, batch_size=10000):
    edge_features = []
    num_edges = edge_index.shape[0]
    
    for start in range(0, num_edges, batch_size):
        end = min(start + batch_size, num_edges)
        batch = edge_index[start:end]
        features = [np.concatenate([embeddings[str(i)], embeddings[str(j)]]) for i, j in batch]
        edge_features.extend(features)
    
    return np.array(edge_features)

In [8]:
# Negative sampling
def sample_negative_edges(num_nodes, pos_edge_index, num_samples):
    neg_edges = []
    pos_edge_set = set(map(tuple, pos_edge_index.tolist()))
    while len(neg_edges) < num_samples:
        i, j = random.randint(0, num_nodes - 1), random.randint(0, num_nodes - 1)
        if i != j and (i, j) not in pos_edge_set and (j, i) not in pos_edge_set:
            neg_edges.append((i, j))
    return np.array(neg_edges)

In [9]:
# Prepare train/validation/test datasets
train_pos_edges = split_edge['train']['edge'].numpy()

valid_pos_edges = split_edge['valid']['edge'].numpy()
valid_neg_edges = split_edge['valid']['edge_neg'].numpy()

test_pos_edges = split_edge['test']['edge'].numpy()
test_neg_edges = split_edge['test']['edge_neg'].numpy()

In [10]:
# Generate negative edges
train_neg_edges = sample_negative_edges(data.num_nodes, train_pos_edges, len(train_pos_edges))

In [11]:
# Generate edge features
batch_size = 10000  # Adjust this based on available memory

train_pos_edges = train_pos_edges[:10000]
train_neg_edges = train_neg_edges[:10000]

train_pos_features = prepare_edge_features_batch(train_pos_edges, node_embeddings, batch_size)
train_neg_features = prepare_edge_features_batch(train_neg_edges, node_embeddings, batch_size)

valid_pos_features = prepare_edge_features_batch(valid_pos_edges, node_embeddings, batch_size)
valid_neg_features = prepare_edge_features_batch(valid_neg_edges, node_embeddings, batch_size)

test_pos_features = prepare_edge_features_batch(test_pos_edges, node_embeddings, batch_size)
test_neg_features = prepare_edge_features_batch(test_neg_edges, node_embeddings, batch_size)

In [12]:
# Combine positive and negative edges
train_features = np.vstack([train_pos_features, train_neg_features])
train_labels = np.hstack([np.ones(len(train_pos_features)), np.zeros(len(train_neg_features))])

valid_features = np.vstack([valid_pos_features, valid_neg_features])
valid_labels = np.hstack([np.ones(len(valid_pos_features)), np.zeros(len(valid_neg_features))])

test_features = np.vstack([test_pos_features, test_neg_features])
test_labels = np.hstack([np.ones(len(test_pos_features)), np.zeros(len(test_neg_features))])

In [13]:
# Train Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_features, train_labels)

LogisticRegression(max_iter=1000)

In [14]:
# Evaluate on validation and test sets
valid_preds = classifier.predict_proba(valid_features)[:, 1]
valid_auc = roc_auc_score(valid_labels, valid_preds)
print(f"Validation AUC: {valid_auc:.4f}")

test_preds = classifier.predict_proba(test_features)[:, 1]
test_auc = roc_auc_score(test_labels, test_preds)
print(f"Test AUC: {test_auc:.4f}")

Validation AUC: 0.4992
Test AUC: 0.4940
