In [1]:
import torch
import torch.nn as nn
import numpy as np
from ogb.lsc import MAG240MDataset
from torch_geometric.data import Data
import networkx as nx

# --- Reproducibility ---------------------------------------------------------
# Seed both RNGs before anything stochastic happens: NumPy drives the paper
# sampling below, PyTorch drives the random initialisation of the embedding
# tables. Without this, every "Restart & Run All" yields a different graph.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# initialize the dataset
ROOT = './MAG240M'
dataset = MAG240MDataset(root=ROOT)

# embedding param
# NOTE: despite the variable names these are node *counts*, used as the number
# of rows of each embedding table (valid indices are 0 .. count - 1).
max_author_idx = dataset.num_authors
max_institution_idx = dataset.num_institutions
embedding_dim = 16  # embedding dimension

# create embedding layer
# The tables have one row per author / institution of the full dataset, so the
# allocation can be large. In the code shown here they are randomly initialised
# and never trained, i.e. they act as fixed random node encodings.
author_embedding = nn.Embedding(num_embeddings=max_author_idx, embedding_dim=embedding_dim)
institution_embedding = nn.Embedding(num_embeddings=max_institution_idx, embedding_dim=embedding_dim)

# retrieve indices & feature
NUM_SAMPLED_PAPERS = 30  # number of labelled papers used as "real" seed nodes
paper_labels = dataset.all_paper_label
# Keep only papers whose label lies in the closed range [0, 152]; comparisons
# against NaN are False, so entries without a label are dropped here as well.
valid_indices = np.flatnonzero((paper_labels >= 0) & (paper_labels <= 152))
paper_indices = np.random.choice(valid_indices, size=NUM_SAMPLED_PAPERS, replace=False)
edge_index_writes = dataset.edge_index('author', 'writes', 'paper')
edge_index_affiliated_with = dataset.edge_index('author', 'institution')
# paper_indices is already an ndarray, so it can index the feature store directly.
selected_paper_features = dataset.paper_feat[paper_indices]
selected_paper_labels = paper_labels[paper_indices]

# Build one feature row per sampled paper:
#   [paper features | mean author embedding | mean institution embedding]
new_features_list = []

# The embedding tables are used purely as lookup tables here. Running the
# lookups under no_grad keeps the resulting rows free of autograd history;
# otherwise every row (and everything derived from it later, including the
# tensor that gets saved) would be a non-leaf tensor that requires grad.
with torch.no_grad():
    for i, paper_index in enumerate(paper_indices):
        # retrieve author & institution index
        # Row 0 of edge_index_writes holds authors, row 1 the papers they wrote.
        author_idx = edge_index_writes[0, edge_index_writes[1] == paper_index]
        # Row 0 of edge_index_affiliated_with holds authors, row 1 institutions.
        institution_idx = edge_index_affiliated_with[1, np.isin(edge_index_affiliated_with[0], author_idx)]

        # generate embedding
        author_embeds = author_embedding(torch.as_tensor(author_idx, dtype=torch.long))
        institution_embeds = institution_embedding(torch.as_tensor(institution_idx, dtype=torch.long))

        # aggregate embedding by taking the mean; fall back to a zero vector when
        # the paper has no author / institution (mean over 0 rows would be NaN)
        author_embed = torch.mean(author_embeds, dim=0) if len(author_idx) > 0 else torch.zeros(embedding_dim)
        institution_embed = torch.mean(institution_embeds, dim=0) if len(institution_idx) > 0 else torch.zeros(embedding_dim)

        # cast everything to float16 so the three parts can be concatenated
        paper_feature_tensor = torch.tensor(selected_paper_features[i], dtype=torch.float16)
        author_embed_tensor = author_embed.to(torch.float16)
        institution_embed_tensor = institution_embed.to(torch.float16)

        # concatenate the embeddings with the original paper feature vector
        combined_features = torch.cat([paper_feature_tensor, author_embed_tensor, institution_embed_tensor], dim=0)
        new_features_list.append(combined_features)

# transfer to tensor: one row per sampled paper, in the order of paper_indices
new_features_matrix = torch.stack(new_features_list)

# create subgraph
edge_index_cites = dataset.edge_index('paper', 'paper')

# Keep only citations whose source AND target are both among the sampled papers.
# With a few dozen papers drawn at random this set is usually empty.
both_sampled = np.isin(edge_index_cites[0], paper_indices) & np.isin(edge_index_cites[1], paper_indices)
global_cites = edge_index_cites[:, both_sampled]

# The node feature matrix has one row per sampled paper (row j <-> paper_indices[j]),
# so edge endpoints must be row positions, not dataset-wide paper IDs.
# Map each ID to its position: look it up in the sorted IDs, then undo the sort.
sort_order = np.argsort(paper_indices)
sorted_paper_ids = paper_indices[sort_order]
local_cites = sort_order[np.searchsorted(sorted_paper_ids, global_cites)]

# Store the edges as a long tensor of shape (2, num_edges) so that they can be
# concatenated with other tensor edge lists later on.
new_edge_index = torch.as_tensor(local_cites, dtype=torch.long)
subgraph_data = Data(x=new_features_matrix, edge_index=new_edge_index)

# save subgraph (disabled)
# torch.save(subgraph_data, 'MAG240Msubgraph_test.pt')

# --- Node counts and labels ---------------------------------------------------
n_original_nodes = len(new_features_list)
n_synthetic_nodes = 1999970  # 30 sampled papers + 1,999,970 synthetic = 2,000,000 nodes
n_total_nodes = n_original_nodes + n_synthetic_nodes

# np.random.randint excludes the upper bound, so 153 is needed to be able to
# draw class 152 (the real papers are filtered to labels 0..152 inclusive).
# NOTE(review): -1 is kept as a possible value, presumably meaning "unlabelled"
# - confirm that downstream training masks it out.
new_node_labels = np.random.randint(-1, 153, size=n_synthetic_nodes)

# Real labels first, synthetic labels after, matching the node ordering.
all_node_labels = np.concatenate([selected_paper_labels, new_node_labels])

# --- Edges --------------------------------------------------------------------
# Accept either a NumPy array or a tensor for the real citation edges.
original_edge_index = torch.as_tensor(subgraph_data.edge_index, dtype=torch.long)

# Watts-Strogatz model param
k = 6    # each node starts connected to its k nearest ring neighbours
p = 0.1  # rewiring probability

# Generate Watts-Strogatz graph over ALL nodes (real + synthetic)
ws_graph = nx.watts_strogatz_graph(n_total_nodes, k, p)
# Shape (2, num_edges); each undirected edge appears once, as listed by networkx.
ws_edges = torch.tensor(list(ws_graph.edges), dtype=torch.long).t().contiguous()

# Keep only edges that touch at least one synthetic node; edges between two
# real papers come exclusively from the citation subgraph.
touches_synthetic = (ws_edges[0] >= n_original_nodes) | (ws_edges[1] >= n_original_nodes)
new_edges = ws_edges[:, touches_synthetic]

# new_edges is already a torch tensor (it is a slice of ws_edges), so no
# NumPy conversion is needed - torch.from_numpy would raise a TypeError here.
new_edges_tensor = new_edges

combined_edge_index = torch.cat([original_edge_index, new_edges_tensor], dim=1)


# --- Synthetic node features ----------------------------------------------------
# Each synthetic node copies the feature row of a randomly chosen real paper and
# adds Gaussian noise. Rows are generated in vectorised chunks and written into
# a preallocated matrix instead of appending ~2M small tensors to a Python list.
NOISE_STD = 0.05       # standard deviation of the additive noise
CHUNK_ROWS = 100_000   # synthetic rows generated per step (bounds peak memory)

# detach() guarantees the saved features carry no autograd history.
real_features = torch.stack(new_features_list).detach()
n_real_rows, feature_dim = real_features.shape

synthetic_features_matrix = torch.empty(
    (n_real_rows + n_synthetic_nodes, feature_dim), dtype=real_features.dtype
)
# Real papers occupy the first rows, matching the label and edge numbering.
synthetic_features_matrix[:n_real_rows] = real_features

for chunk_start in range(0, n_synthetic_nodes, CHUNK_ROWS):
    chunk_stop = min(chunk_start + CHUNK_ROWS, n_synthetic_nodes)
    # pick a random real paper as the template for every row of this chunk
    template_rows = torch.from_numpy(np.random.randint(n_real_rows, size=chunk_stop - chunk_start))
    base_features = real_features[template_rows]
    noise = torch.randn_like(base_features) * NOISE_STD
    synthetic_features_matrix[n_real_rows + chunk_start:n_real_rows + chunk_stop] = base_features + noise

# --- Assemble and save ------------------------------------------------------------
synthetic_data = Data(x=synthetic_features_matrix, edge_index=combined_edge_index)
synthetic_data.y = torch.tensor(all_node_labels, dtype=torch.long)
torch.save(synthetic_data, 'MAG240M_synthetic_ws_graph_30core.pt')

In [3]:
# Inspect the Watts-Strogatz edges that touch at least one synthetic node.
new_edges

tensor([[      0,       0,       0,  ..., 1999996, 1999997, 1999998],
        [1999999, 1999998, 1999997,  ..., 1999999, 1999999, 1999999]])

In [10]:
# Inspect the full paper -> paper edge array loaded from the dataset.
edge_index_cites

array([[        3,         3,         3, ..., 121751664, 121751664,
        121751664],
       [  1806011,   4352730,   5950316, ..., 109890517, 118131973,
        118471854]])

In [11]:
# Debug: re-run the subgraph filter (both endpoints must be sampled papers) to see how many citation edges survive.
edge_index_cites[:, np.isin(edge_index_cites[0], paper_indices) & np.isin(edge_index_cites[1], paper_indices)]

array([], shape=(2, 0), dtype=int64)

In [12]:
# Show the paper indices that were sampled in this run.
paper_indices

array([ 71586024,  99048486,  58765590,  20224237,  98310460,   9673892,
         2181806,  90585598,  87663071,  94481025, 120304214, 108973976,
        16563845,  19895585,  40567197, 110402197,  89602365, 117563223,
        52943478,  28485890,  73530911,  24438326, 101312487,  95458655,
       114941189,  63317227,  89127793, 106174629,  69444973,  56983353])

In [13]:
# Debug: boolean mask over all citation edges, True where the first-row endpoint is one of the sampled papers.
np.isin(edge_index_cites[0], paper_indices)

array([False, False, False, ..., False, False, False])