In [1]:
import os
import time
import logging
import yaml
import ast
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from pprgo import utils
from pprgo import ppr
from pprgo import pprgo

In [2]:
# Logging configuration for the notebook.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is assigned after `import tensorflow` has
# already run in the imports cell — confirm it still takes effect there.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# One stream handler with a timestamped "<time> (<LEVEL>): <message>" layout.
formatter = logging.Formatter(
    fmt='%(asctime)s (%(levelname)s): %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
ch = logging.StreamHandler()
ch.setFormatter(formatter)

# Replace whatever handlers the root logger already carries, so re-running
# this cell does not stack up duplicate handlers.
logger = logging.getLogger()
logger.handlers = []
logger.addHandler(ch)
logger.setLevel(logging.INFO)

In [3]:
# !wget --show-progress -O data/reddit.npz https://ndownloader.figshare.com/files/23742119

--2022-06-13 12:54:09--  https://ndownloader.figshare.com/files/23742119
Resolving ndownloader.figshare.com (ndownloader.figshare.com)... 54.217.124.219, 52.16.102.173
Connecting to ndownloader.figshare.com (ndownloader.figshare.com)|54.217.124.219|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/23742119/reddit.npz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220613/eu-west-1/s3/aws4_request&X-Amz-Date=20220613T105410Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=c920111a8316dc2bea0509daf9509924317e4f7d5f71f73d162a6fa3792d3acf [following]
--2022-06-13 12:54:09--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/23742119/reddit.npz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220613/eu-west-1/s3/aws4_request&X-Amz-Date=20220613T105410Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=c920111a8316dc2bea0509daf9509924317e4f7d

# Download dataset

# Load config

In [4]:
# Parse the demo configuration file into the `config` mapping.
with open('config_demo.yaml') as config_file:
    config = yaml.safe_load(config_file)

In [5]:
# Some values come out of the YAML parser as plain strings (e.g. None).
# Re-interpret every string value as a Python literal where possible and
# keep the original string when it does not spell a valid literal.
for key in config:
    val = config[key]
    if type(val) is not str:
        continue
    try:
        config[key] = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        # Not a literal: leave the raw string in place.
        pass

In [6]:
# --- Data -------------------------------------------------------------------
# Path to the .npz data file (alternative dataset: 'data/cora_full.npz').
data_file = config['data_file']
# Seed for splitting the dataset into train/val/test.
split_seed = config['split_seed']
# Number of training nodes divided by number of classes.
ntrain_div_classes = config['ntrain_div_classes']
# Manual override: the value read from the config is replaced by 20.
ntrain_div_classes = 20
# Attribute normalization. Not used in the paper.
attr_normalization = config['attr_normalization']

# --- Personalized PageRank ----------------------------------------------------
# PPR teleport probability (alternative value tried: 0.25).
alpha = config['alpha']
# Stopping threshold for ACL's ApproximatePR.
eps = config['eps']
# Number of PPR neighbors for each node.
topk = config['topk']
# Adjacency matrix normalization for weighting neighbors.
ppr_normalization = config['ppr_normalization']

# --- Model --------------------------------------------------------------------
# Size of the MLP's hidden layer.
hidden_size = config['hidden_size']
# Number of MLP layers.
nlayers = config['nlayers']
# Weight decay used for training the MLP.
weight_decay = config['weight_decay']
# Dropout used for training.
dropout = config['dropout']

# --- Optimization -------------------------------------------------------------
# Learning rate.
lr = config['lr']
# Maximum number of epochs (exact number if no early stopping).
max_epochs = config['max_epochs']
# Batch size for training.
batch_size = config['batch_size']
# Multiplier for validation batch size.
batch_mult_val = config['batch_mult_val']

# --- Evaluation during training -----------------------------------------------
# Accuracy is evaluated after every this number of steps.
eval_step = config['eval_step']
# Evaluate accuracy on validation set during training.
run_val = config['run_val']

# --- Early stopping -----------------------------------------------------------
# Use early stopping.
early_stop = config['early_stop']
# Patience for early stopping.
patience = config['patience']

# --- Inference ----------------------------------------------------------------
# Number of propagation steps during inference.
nprop_inference = config['nprop_inference']
# Fraction of nodes for which local predictions are computed during inference.
inf_fraction = config['inf_fraction']

# Load the data

In [7]:
# Load the adjacency matrix, attributes, labels and the train/val/test index
# sets, and time the whole step.
tic = time.time()
dataset = utils.get_data(
    f"{data_file}",
    seed=split_seed,
    ntrain_div_classes=ntrain_div_classes,
    normalize_attr=attr_normalization,
)
adj_matrix, attr_matrix, labels, train_idx, val_idx, test_idx = dataset

# Feature dimension: prefer the container's `n_columns` attribute and fall
# back to the second entry of `.shape` when that attribute is missing.
try:
    d = attr_matrix.n_columns
except AttributeError:
    d = attr_matrix.shape[1]

# Number of classes, taken as the largest label value plus one.
nc = labels.max() + 1
print(nc)

# Sizes of the three splits.
print('Training: ', len(train_idx))
print('Validation: ', len(val_idx))
print('Testing: ', len(test_idx))

print('train_idx: ', train_idx)

time_loading = time.time() - tic
print(f"Runtime: {time_loading:.2f}s")

41
Training:  820
Validation:  8200
Testing:  223945
train_idx:  [    81    613    721    724    732    836   1375   1546   1879   2481
   3111   3460   4025   4046   4195   4392   4651   5785   6191   6286
   6302   6626   7059   7095   7776   8122   8552   8593   8675   8690
   8815   8944   9975  10122  12076  12269  13339  13439  13469  13581
  13659  14496  14624  14679  14756  14846  14898  15001  15058  15261
  15387  15403  15485  15551  16065  16263  16755  17117  17159  17415
  17548  17569  17712  17746  17762  17958  17993  18485  18502  18970
  19085  19206  19322  20010  20898  21159  21363  21442  21729  21844
  22824  23679  23770  25021  25417  25451  25502  25669  25772  27198
  27529  28089  28588  29020  29079  29370  29914  30060  30994  31296
  31830  31930  32040  32155  32296  32934  33030  33135  33280  33435
  33463  33948  33982  34069  34476  34491  35119  35176  35392  35612
  36070  36488  37242  37335  37686  38055  38264  38306  38553  38892
  39252  397

# Preprocessing: Calculate PPR scores

In [8]:
# Approximate top-k PPR vectors (ACL's ApproximatePR) for the training nodes
# and, when validation is enabled, for the validation nodes.
tic = time.time()

topk_train, mean_kn = ppr.topk_ppr_matrix(
    adj_matrix, alpha, eps, train_idx, topk,
    normalization=ppr_normalization, k_window=10)

# NOTE(review): the training call above unpacks two return values (and passes
# k_window=10), whereas this validation call binds the entire return value to
# `topk_val` — confirm topk_ppr_matrix returns a bare matrix when k_window is
# omitted.
topk_val = (
    ppr.topk_ppr_matrix(adj_matrix, alpha, eps, val_idx, topk,
                        normalization=ppr_normalization)
    if run_val
    else None
)

print(topk_train.shape)
time_preprocessing = time.time() - tic
print(f"Runtime: {time_preprocessing:.2f}s")

print(mean_kn)


idx_y:  (1,)
Using S:  10
y shape:  (0,)


ValueError: cannot reshape array of size 0 into shape (0,newaxis)

# Training: Set up model and train

In [None]:
# Build the PPRGo model in a fresh graph and train it, timing the whole step.
start = time.time()

# Use the tf.compat.v1 spellings throughout: the bare `tf.reset_default_graph`
# and `tf.set_random_seed` names only exist in TensorFlow 1.x, while the
# session and initializer below already go through tf.compat.v1.
tf.compat.v1.reset_default_graph()
tf.compat.v1.set_random_seed(0)

# Features are flagged as sparse whenever the attribute container is not a
# plain NumPy array.
model = pprgo.PPRGo(d, nc, hidden_size, nlayers, lr, weight_decay, dropout,
                    sparse_features=type(attr_matrix) is not np.ndarray)

sess = tf.compat.v1.Session()
with sess.as_default():
    tf.compat.v1.global_variables_initializer().run()
    nepochs, loss_hist, acc_hist, f1_hist = pprgo.train(
            sess=sess, model=model, attr_matrix=attr_matrix,
            train_idx=train_idx, val_idx=val_idx,
            topk_train=topk_train, topk_val=topk_val,
            labels=labels,
            max_epochs=max_epochs, batch_size=batch_size, batch_mult_val=batch_mult_val,
            eval_step=eval_step, early_stop=early_stop, patience=patience)
time_training = time.time() - start
logging.info('Training done.')
print(f"Runtime: {time_training:.2f}s")











2022-06-13 11:44:27 (INFO): Epoch 6, step 20: train 3.96772
2022-06-13 11:44:28 (INFO): Epoch 13, step 40: train 3.40186
2022-06-13 11:44:29 (INFO): Epoch 19, step 60: train 3.01606
2022-06-13 11:44:30 (INFO): Epoch 26, step 80: train 2.68928
2022-06-13 11:44:30 (INFO): Epoch 33, step 100: train 2.07432
2022-06-13 11:44:31 (INFO): Epoch 39, step 120: train 1.80441
2022-06-13 11:44:32 (INFO): Epoch 46, step 140: train 1.70652
2022-06-13 11:44:32 (INFO): Epoch 53, step 160: train 1.31979
2022-06-13 11:44:33 (INFO): Epoch 59, step 180: train 1.20673
2022-06-13 11:44:34 (INFO): Epoch 66, step 200: train 1.28307
2022-06-13 11:44:34 (INFO): Epoch 73, step 220: train 1.05689
2022-06-13 11:44:35 (INFO): Epoch 79, step 240: train 0.96082
2022-06-13 11:44:36 (INFO): Epoch 86, step 260: train 1.04940
2022-06-13 11:44:36 (INFO): Epoch 93, step 280: train 0.90492
2022-06-13 11:44:37 (INFO): Epoch 99, step 300: train 0.85712
2022-06-13 11:44:38 (INFO): Epoch 106, step 320: train 0.93783
2022-06-13 

KeyboardInterrupt: 

# Inference (val and test)

In [None]:
# Predict labels with the trained model and time the call. Besides the
# predictions, `predict` returns two further values that are bound to
# `time_logits` and `time_propagation`.
tic = time.time()
predictions, time_logits, time_propagation = model.predict(
    sess=sess,
    adj_matrix=adj_matrix,
    attr_matrix=attr_matrix,
    alpha=alpha,
    nprop=nprop_inference,
    inf_fraction=inf_fraction,
    ppr_normalization=ppr_normalization,
)
time_inference = time.time() - tic
print(f"Runtime: {time_inference:.2f}s")

Runtime: 0.16s


# Collect and print results

In [None]:

# Evaluate the predictions on each split, always in train / val / test order.
split_indices = (train_idx, val_idx, test_idx)

# Accuracy as a percentage.
acc_train, acc_val, acc_test = (
    100 * accuracy_score(labels[idx], predictions[idx])
    for idx in split_indices
)

# Macro-averaged F1 score.
f1_train, f1_val, f1_test = (
    f1_score(labels[idx], predictions[idx], average='macro')
    for idx in split_indices
)

# GPU memory tracking is disabled:
#   gpu_max_bytes = tf.contrib.memory_stats.MaxBytesInUse()
#   gpu_memory = sess.run(gpu_max_bytes)
memory = utils.get_max_memory_bytes()

# Data loading time is deliberately not part of this sum.
time_total = time_preprocessing + time_training + time_inference

In [None]:
# Assemble the final report (accuracy, F1, runtimes, memory) and print it.
# NOTE(review): `memory` is divided by 1024 before the conversion by 2**30 —
# confirm the unit actually returned by utils.get_max_memory_bytes().
report = f'''
Accuracy: Train: {acc_train:.1f}%, val: {acc_val:.1f}%, test: {acc_test:.1f}%
F1 score: Train: {f1_train:.3f}, val: {f1_val:.3f}, test: {f1_test:.3f}

Runtime: Preprocessing: {time_preprocessing:.2f}s, training: {time_training:.2f}s, inference: {time_inference:.2f}s -> total: {time_total:.2f}s
Memory: Main: {(memory/1024) / 2**30:.2f}GB
'''
print(report)

# Disabled variant of the memory line that also reports GPU memory:
#Memory: Main: {memory / 2**30:.2f}GB, GPU: {gpu_memory / 2**30:.3f}GB


Accuracy: Train: 99.1%, val: 60.5%, test: 60.4%
F1 score: Train: 0.970, val: 0.497, test: 0.490

Runtime: Preprocessing: 7.10s, training: 13.43s, inference: 0.16s -> total: 20.68s
Memory: Main: 0.43GB

