In [6]:
import argparse
import sys

import torch
from torch_geometric.nn import Node2Vec
from torch_geometric.utils import to_undirected

from ogb.nodeproppred import PygNodePropPredDataset


def save_embedding(model):
    torch.save(model.embedding.weight.data.cpu(), 'embedding.pt')


In [9]:
def main():
    if "ipykernel" in sys.argv[0]:
        class Args:
            def __init__(self):
                self.embedding_dim = 128  # Set your desired embedding dimension
                self.walk_length = 10      # Set your desired walk length
                self.context_size = 5      # Set your desired context size
                self.walks_per_node = 5    # Set your desired number of walks per node
                self.batch_size = 64        # Set your desired batch size
                self.lr = 0.01              # Set your desired learning rate
                self.epochs = 100           # Set your desired number of epochs
                self.log_steps = 10         # Set your desired logging frequency
        
        # Create an instance of Args
        args = Args()

    else:
        # Argument parsing for command-line execution
        parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
        parser.add_argument('--device', type=int, default=0)
        parser.add_argument('--log_steps', type=int, default=1)
        parser.add_argument('--use_sage', action='store_true')
        parser.add_argument('--num_layers', type=int, default=3)
        parser.add_argument('--hidden_channels', type=int, default=256)
        parser.add_argument('--dropout', type=float, default=0.5)
        parser.add_argument('--lr', type=float, default=0.01)
        parser.add_argument('--epochs', type=int, default=500)
        parser.add_argument('--runs', type=int, default=10)
        args = parser.parse_args()
    
    print(args)
    
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv')
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)


    # Create a subgraph consisting only of the specific nodes
    specific_nodes = [110223, 146929, 2940, 104544, 62326, 29759, 96890, 47025]  # Your actual specific nodes
    specific_nodes_tensor = torch.tensor(specific_nodes, dtype=torch.long)
    
    # Check if specific nodes are in the original dataset
    existing_nodes = torch.unique(data.edge_index)
    
    # Print specific nodes and check if they exist
    print("Specific nodes:", specific_nodes)
    print("Existing nodes in dataset:", existing_nodes.tolist())
    
    # Create edge index mask
    mask = torch.isin(edge_index[0], specific_nodes_tensor) & torch.isin(edge_index[1], specific_nodes_tensor)
    
    # Check if mask is empty
    print("Mask shape:", mask.shape)
    print("Number of edges in subgraph:", mask.sum().item())
    
    sub_edge_index = edge_index[:, mask]
    
    # Ensure the sub_edge_index is not empty before proceeding
    if sub_edge_index.numel() == 0:
        print("No edges found for the specified nodes.")
    else:
        # Create the subgraph and adjust node features and labels
        sub_data = data.clone()
        sub_data.edge_index = sub_edge_index
        sub_data.x = data.x[specific_nodes_tensor]  # Update the node features
        sub_data.y = data.y[specific_nodes_tensor] if data.y is not None else None  # Update the labels if they exist
    
        # Ensure the edge index is undirected
        sub_data.edge_index = to_undirected(sub_data.edge_index)
    
        # Proceed with Node2Vec initialization
        model = Node2Vec(sub_data.edge_index, args.embedding_dim, args.walk_length,
                         args.context_size, args.walks_per_node,
                         sparse=True).to(device)
    
        # Create a loader for the subgraph
        loader = model.loader(batch_size=args.batch_size, shuffle=True, num_workers=4)

    # Set up the optimizer
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    # Train the model
    model.train()
    for epoch in range(1, args.epochs + 1):
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, Loss: {loss:.4f}')

            if (i + 1) % 100 == 0:  # Save model every 100 steps.
                save_embedding(model)
        save_embedding(model)

if __name__ == "__main__":
    main()




<__main__.main.<locals>.Args object at 0x7f5c0fe91280>


NameError: name 'data' is not defined