In [2]:
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import to_undirected
from torch_geometric.data import Data
from config import Config
import numpy as np
import torch
import random

config = Config()
seed = config.seed
random.seed(config.seed)
torch.manual_seed(seed)
device = config.device

In [2]:
dataset = Planetoid('./data', 'Cora')
data = dataset[0]
print(data)

transform = T.Compose([T.NormalizeFeatures(), T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True, split_labels=True)])
train_data, val_data, test_data = transform(data)

torch.save(train_data, './data/Cora/split/train_data.pt')
torch.save(val_data, './data/Cora/split/val_data.pt')
torch.save(test_data, './data/Cora/split/test_data.pt')

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [3]:
edge_list = np.loadtxt('./data/Celegans/raw/Celegans.txt').astype(int).T
edge_index = torch.tensor(edge_list, dtype=torch.long)
edge_index = to_undirected(edge_index)
num_nodes = edge_index.max().item() + 1
x = torch.eye(num_nodes, dtype=torch.float)
data = Data(x=x, edge_index=edge_index)
torch.save(data, './data/Celegans/processed/data.pt')
transform = T.Compose([T.NormalizeFeatures(), T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True, split_labels=True)])
train_data, val_data, test_data = transform(data)
torch.save(train_data, './data/Celegans/split/train_data.pt')
torch.save(val_data, './data/Celegans/split/val_data.pt')
torch.save(test_data, './data/Celegans/split/test_data.pt')

In [8]:
edge_list = np.loadtxt('./data/USAir/raw/USAir.txt')[:, :2].astype(int).T
edge_index = torch.tensor(edge_list, dtype=torch.long)
edge_index = to_undirected(edge_index)
num_nodes = edge_index.max().item() + 1
x = torch.eye(num_nodes, dtype=torch.float)
data = Data(x=x, edge_index=edge_index)
torch.save(data, './data/USAir/processed/data.pt')
print(data)
transform = T.Compose([T.NormalizeFeatures(), T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True, split_labels=True)])
train_data, val_data, test_data = transform(data)

torch.save(train_data, './data/USAir/split/train_data.pt')
torch.save(val_data, './data/USAir/split/val_data.pt')
torch.save(test_data, './data/USAir/split/test_data.pt')

Data(x=[333, 333], edge_index=[2, 4252])


In [None]:
dataset = Planetoid('./data', 'PubMed')
data = dataset[0]
transform = T.Compose([T.NormalizeFeatures(), T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True, split_labels=True)])
train_data, val_data, test_data = transform(data)

torch.save(train_data, './data/PubMed/split/train_data.pt')
torch.save(val_data, './data/PubMed/split/val_data.pt')
torch.save(test_data, './data/PubMed/split/test_data.pt')

In [3]:
dataset = Planetoid('./data', 'CiteSeer')
data = dataset[0]
transform = T.Compose([T.NormalizeFeatures(), T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True, split_labels=True)])
train_data, val_data, test_data = transform(data)

torch.save(train_data, './data/CiteSeer/split/train_data.pt')
torch.save(val_data, './data/CiteSeer/split/val_data.pt')
torch.save(test_data, './data/CiteSeer/split/test_data.pt')

In [None]:
data = torch.load('./data/Github/processed/github_sub_data.pt')
transform = T.Compose([T.NormalizeFeatures(), T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True, split_labels=True)])
train_data, val_data, test_data = transform(data)

torch.save(train_data, './data/Github/split/train_data.pt')
torch.save(val_data, './data/Github/split/val_data.pt')
torch.save(test_data, './data/Github/split/test_data.pt')

Data(x=[4965, 512], edge_index=[2, 37094])


In [12]:
data = torch.load('./data/Github/processed/github_full_data.pt')
transform = T.Compose([T.NormalizeFeatures(), T.RandomLinkSplit(num_val=0.01, num_test=0.01, is_undirected=True, split_labels=True)])
train_data, val_data, test_data = transform(data)
print(train_data)

# --- 采样正例 ---
num_train_pos = train_data.pos_edge_label_index.size(1)
num_sampled_pos = int(num_train_pos * 0.05)

sampled_indices_pos = random.sample(range(num_train_pos), num_sampled_pos)
sampled_pos_edge_label_index = train_data.pos_edge_label_index[:, sampled_indices_pos]

num_train_neg = train_data.neg_edge_label_index.size(1)
num_sampled_neg = int(num_train_neg * 0.05)

sampled_indices_neg = random.sample(range(num_train_neg), num_sampled_neg)
sampled_neg_edge_label_index = train_data.neg_edge_label_index[:, sampled_indices_neg]

train_data = Data(
    x=train_data.x,
    edge_index=train_data.edge_index,
    pos_edge_label_index=sampled_pos_edge_label_index,
    neg_edge_label_index=sampled_neg_edge_label_index,
    y=torch.cat([torch.ones(num_sampled_pos), torch.zeros(num_sampled_neg)]).long()
)
print(train_data)

torch.save(train_data, './data/Github/split/train_data.pt')
torch.save(val_data, './data/Github/split/val_data.pt')
torch.save(test_data, './data/Github/split/test_data.pt')

Data(x=[37700, 512], edge_index=[2, 566446], pos_edge_label=[283223], pos_edge_label_index=[2, 283223], neg_edge_label=[283223], neg_edge_label_index=[2, 283223])
Data(x=[37700, 512], edge_index=[2, 566446], y=[28322], pos_edge_label_index=[2, 14161], neg_edge_label_index=[2, 14161])
