In [1]:
import random

import networkx
import numpy
from sklearn.preprocessing import normalize
import torch
from torch_geometric.data import Data
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import add_remaining_self_loops, dense_to_sparse, from_networkx, to_dense_adj, to_networkx

from data_utils import add_sparse, get_ppr_matrix_dense, get_random_data_split, normalize_adj
from models.pushnet import PushNet
from training import train_single_run

# Demo

This notebook demonstrates all steps necessary to train our models. As an example, we train *PushNet-TPP* on the *CiteSeer* dataset.

## Pre-process dataset

Our models require `torch_geometric.data.Data` objects as input containing:
- A sparse propagation matrix (PPR-matrix or aggregated PPR-matrix) represented by
    - **edge_index**: `torch.LongTensor` of shape *(2, num_edges)* containing row and column indices of the non-zero entries
    - **edge_weight**: `torch.FloatTensor` of size *num_edges* containing the corresponding propagation weights
- A dense node feature matrix
    - **x**: `torch.FloatTensor` of shape *(num_nodes, num_dims)*
- A label vector
    - **y**: `torch.LongTensor` of size *num_nodes*
    
For each graph dataset, we perform the following pre-processing steps:
- Restrict the graph to its largest connected component
- Add all not already existing self-loops to the graph
- Normalize the graph's adjacency matrix. We use symmetric normalization
- *L1*-normalize all node feature vectors
- Compute PPR-matrices for all $\alpha$-values. We use a dense algorithm, since all graphs considered in the paper are small enough
- Aggregate all PPR-matrices. We use *sum*-aggregation since it performed best and additionally allows for fast training

In [2]:
data_root = '../data'
dataset_name = 'Citeseer'

# Load dataset
dataset = Planetoid(root=data_root,
                    name=dataset_name,
                    transform=None,
                    pre_transform=None,
                    )

# Get data
# Features, labels and the feature dtype are read from the graph object returned
# by `dataset[0]` rather than from `dataset.data`: recent torch_geometric releases
# warn when the internal storage of an in-memory dataset is accessed directly.
data = dataset[0]
edge_index = data.edge_index
x = data.x.numpy().astype(numpy.float32)
y = data.y
x_dtype = data.x.dtype  # remembered here because `data` is replaced below

# Restrict graph to largest connected component
graph = to_networkx(data,
                    to_undirected=False,
                    )
largest_cc = max(networkx.weakly_connected_components(graph), key=len)
graph = graph.subgraph(largest_cc)
# `from_networkx` relabels nodes to 0..n-1 following the iteration order of
# `graph.nodes`. Take the original node ids in that very order (instead of
# assuming it equals the sorted order) so that row i of `x` and entry i of `y`
# always belong to relabelled node i.
nodes_cc = numpy.array(list(graph.nodes), dtype=numpy.int64)
data = from_networkx(graph)
edge_index = data.edge_index
x = x[nodes_cc, :]
y = y[nodes_cc]
# Sanity check: features, labels and graph must describe the same set of nodes
assert x.shape[0] == y.shape[0] == data.num_nodes

# Add self-loops
edge_index, edge_weight = add_remaining_self_loops(edge_index=edge_index,
                                                   num_nodes=data.num_nodes,
                                                   )

# Normalize adjacency matrix
edge_index, edge_weight = normalize_adj(edge_index=edge_index,
                                        edge_weight=edge_weight,
                                        num_nodes=data.num_nodes,
                                        dtype=x_dtype,
                                        normalization='sym',
                                        )

# Normalize features
x = normalize(x,
              norm='l1',
              axis=1,
              )
x = torch.FloatTensor(x)

# Densify the normalized adjacency matrix (first and only batch entry) as a numpy array
adj = to_dense_adj(edge_index=edge_index, edge_attr=edge_weight)[0].numpy()

# Compute one PPR matrix per value in `alphas` and convert each to sparse form
alphas = [0.05, 0.1, 0.2]
epsilon = 1e-5
ppr_sparse = [
    dense_to_sparse(torch.FloatTensor(get_ppr_matrix_dense(adj=adj, alpha=alpha, epsilon=epsilon)))
    for alpha in alphas
]
edge_index = [indices for indices, _ in ppr_sparse]
edge_weight = [weights for _, weights in ppr_sparse]

# Sum-aggregate the individual PPR matrices into a single sparse matrix
edge_index, edge_weight = add_sparse(edge_index=edge_index, edge_weight=edge_weight)

# Bundle propagation matrix, features and labels into the data object
data = Data(edge_index=edge_index, edge_attr=edge_weight, x=x, y=y)

In [3]:
# Collect and report key statistics of the pre-processed graph
num_nodes = graph.number_of_nodes()
# Halve the directed edge count to report undirected edges
num_edges = graph.number_of_edges() // 2
in_features = x.size(1)
num_classes = y.unique().numel()
print(f'Number of nodes: {num_nodes}')
print(f'Number of edges: {num_edges}')
print(f'Number of features: {in_features}')
print(f'Number of classes: {num_classes}')

Number of nodes: 2120
Number of edges: 3679
Number of features: 3703
Number of classes: 6


## Train model

For simplicity, we train on a single random data split here.

- Each split is defined by 3 binary masks indicating training, validation and test nodes. Each mask is represented by a `torch.BoolTensor` of size *num_nodes*. The data object needs to have the properties **train_mask**, **val_mask** and **test_mask** set before training.
- For each split, we sample 20 nodes per class as training nodes. From the remaining nodes, we sample 500 nodes for validation. The remaining nodes are used for testing. All samples are drawn uniformly at random without replacement.

In [4]:
# Fix random seed
seed = 0
# Seed every random number generator in use: Python, NumPy and PyTorch (CPU and CUDA)
for set_seed in (random.seed, numpy.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(seed)
# Request deterministic cuDNN behaviour and switch off cuDNN benchmarking
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
# Draw a random train/validation/test split
split = get_random_data_split(data=data, num_classes=num_classes, num_train_per_class=20, num_val=500)

# Attach the three masks to the data object under the same names
for mask_name in ('train_mask', 'val_mask', 'test_mask'):
    setattr(data, mask_name, split[mask_name])

In [6]:
# Initialize model from an explicit hyper-parameter dictionary
model_kwargs = {
    'in_features': in_features,
    'num_classes': num_classes,
    'variant': 'TPP',
    'dropout': 0.5,
    'hidden_size': 32,
    'bias': True,
}
model = PushNet(**model_kwargs)

In [7]:
# Move to the first GPU if CUDA is available; otherwise fall back to the CPU so
# that the notebook also runs on machines without a GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
model = model.to(device)

In [8]:
# Train the model once and unpack the reported metrics
run_result = train_single_run(
    model=model,
    data=data,
    learning_rate=1e-2,
    patience=100,
    max_epochs=10000,
    l2_reg=1e-2,
)
train_acc, val_acc, test_acc, best_epoch, seconds_per_epoch = run_result

In [9]:
# Report the outcome of the training run (accuracies as percentages)
print(f'Best epoch after early stopping: {best_epoch}')
print(f'Average training time per epoch: {seconds_per_epoch:.4f}s')
print(f'Training accuracy: {train_acc * 100:.2f}%')
print(f'Validation accuracy: {val_acc * 100:.2f}%')
print(f'Test accuracy: {test_acc * 100:.2f}%')

Best epoch after early stopping: 242
Average training time per epoch: 0.0082s
Training accuracy: 89.17%
Validation accuracy: 75.20%
Test accuracy: 75.33%
