In [1]:
import os
import time
import logging
import yaml
import ast
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from pprgo import utils
from pprgo import ppr
from pprgo import pprgo

In [2]:
# Set up logging: the root logger ends up with exactly one stream handler
# that prefixes every record with a timestamp and the record's level.
logger = logging.getLogger()
logger.handlers = []  # drop previously attached handlers so re-running the cell does not stack them
logger.setLevel('INFO')

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter(
    fmt='%(asctime)s (%(levelname)s): %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'))
logger.addHandler(stream_handler)

# Environment variable consulted by TensorFlow's native (C++) logging.
# NOTE(review): tensorflow is already imported in the first cell; confirm that
# setting the variable at this point still has the intended effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Download dataset

# Load config

In [3]:
# Parse the demo configuration file into the `config` mapping used below.
with open('config_demo.yaml') as config_file:
    config = yaml.safe_load(config_file)

In [4]:
# YAML leaves some values as plain strings (e.g. None). Re-interpret every
# string value that is a valid Python literal; strings that are not literals
# are kept exactly as they were read.
for key in list(config):
    raw = config[key]
    if type(raw) is not str:
        continue
    try:
        config[key] = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a Python literal: leave the original string in place.
        pass

In [5]:
# Unpack the configuration mapping into notebook-level variables, one per
# hyperparameter. Three of them (data_file, ntrain_div_classes, alpha) are
# reassigned to hard-coded literals immediately after being read, so for those
# three the value in the config file has no effect on the rest of the notebook.

# Dataset and split
data_file           = config['data_file']           # Path to the .npz data file
data_file = 'data/cora_full.npz'                    # NOTE: hard-coded override of the config value read above
split_seed          = config['split_seed']          # Seed for splitting the dataset into train/val/test
ntrain_div_classes  = config['ntrain_div_classes']  # Number of training nodes divided by number of classes
ntrain_div_classes = 20                             # NOTE: hard-coded override of the config value read above
attr_normalization  = config['attr_normalization']  # Attribute normalization. Not used in the paper

# PPR preprocessing
alpha               = config['alpha']               # PPR teleport probability
alpha = 0.25                                        # NOTE: hard-coded override of the config value read above
eps                 = config['eps']                 # Stopping threshold for ACL's ApproximatePR
topk                = config['topk']                # Number of PPR neighbors for each node
ppr_normalization   = config['ppr_normalization']   # Adjacency matrix normalization for weighting neighbors

# Model
hidden_size         = config['hidden_size']         # Size of the MLP's hidden layer
nlayers             = config['nlayers']             # Number of MLP layers
weight_decay        = config['weight_decay']        # Weight decay used for training the MLP
dropout             = config['dropout']             # Dropout used for training

# Optimisation
lr                  = config['lr']                  # Learning rate
max_epochs          = config['max_epochs']          # Maximum number of epochs (exact number if no early stopping)
batch_size          = config['batch_size']          # Batch size for training
batch_mult_val      = config['batch_mult_val']      # Multiplier for validation batch size

# Evaluation during training
eval_step           = config['eval_step']           # Accuracy is evaluated after every this number of steps
run_val             = config['run_val']             # Evaluate accuracy on validation set during training

# Early stopping
early_stop          = config['early_stop']          # Use early stopping
patience            = config['patience']            # Patience for early stopping

# Inference
nprop_inference     = config['nprop_inference']     # Number of propagation steps during inference
inf_fraction        = config['inf_fraction']        # Fraction of nodes for which local predictions are computed during inference

# Load the data

In [6]:
# Load the adjacency matrix, attributes, labels and the train/val/test index
# arrays, print a few sanity-check statistics and report the loading time.
start = time.time()
(adj_matrix, attr_matrix, labels,
 train_idx, val_idx, test_idx) = utils.get_data(
        f"{data_file}",
        seed=split_seed,
        ntrain_div_classes=ntrain_div_classes,
        normalize_attr=attr_normalization
)

# Number of attribute columns: containers exposing `n_columns` are asked
# directly; anything else is expected to provide a 2-D `shape`.
try:
    d = attr_matrix.n_columns
except AttributeError:
    d = attr_matrix.shape[1]

# Number of classes, taken as the largest label plus one
# (presumably labels are 0-based integers -- TODO confirm).
nc = labels.max() + 1
print(nc)

print('total: ', adj_matrix.shape[0])
print('Training: ', len(train_idx))
print('Validation: ', len(val_idx))
print('Testing: ', len(test_idx))

if isinstance(attr_matrix, np.ndarray):
    # Dense attributes: `ndarray.data` is a raw buffer, and indexing a
    # multi-dimensional buffer with an integer raises, so the per-row
    # diagnostics of the other branch cannot be used. Report the shape instead.
    print('attr_matrix: ', attr_matrix.shape)
else:
    # Row-wise container: `.data[i]` is treated as the sequence of entries
    # stored for row i (assumption based on how it is indexed here -- TODO
    # confirm against utils.get_data).
    print('attr_matrix: ', attr_matrix.data.shape, len(attr_matrix.data[0]))

    all_lens = [len(i) for i in attr_matrix.data]

    # Largest number of stored entries in any single row.
    print(max(all_lens))

print('train_idx: ', train_idx)

time_loading = time.time() - start
print(f"Runtime: {time_loading:.2f}s")

70
total:  18800
Training:  1400
Validation:  14000
Testing:  3400
attr_matrix:  (18800,) 48
293
train_idx:  [   12    16    18 ... 18790 18793 18798]
Runtime: 0.22s


# Preprocessing: Calculate PPR scores

In [7]:
# Preprocessing: PPR vectors (ACL's ApproximatePR) for the training nodes and,
# only when validation is enabled, for the validation nodes as well.
start = time.time()

topk_train = ppr.topk_ppr_matrix(
    adj_matrix, alpha, eps, train_idx, topk,
    normalization=ppr_normalization)

# With validation disabled no validation matrix is computed; None is used instead.
topk_val = (
    ppr.topk_ppr_matrix(
        adj_matrix, alpha, eps, val_idx, topk,
        normalization=ppr_normalization)
    if run_val else None
)

print(topk_train.shape)
time_preprocessing = time.time() - start
print(f"Runtime: {time_preprocessing:.2f}s")

kn:  97
kn:  68
kn:  51
kn:  76
kn:  68
Mean kn:  55
(1400, 18800)
Runtime: 3.23s


# Training: Set up model and train

In [8]:
# Build the PPRGo model in a fresh default graph, initialise its variables and
# train it; the wall-clock time of the whole cell is kept in `time_training`.
start = time.time()

# Use the tf.compat.v1 aliases consistently: the session and the initializer
# below already go through tf.compat.v1, and the bare top-level names
# (tf.reset_default_graph / tf.set_random_seed) do not exist in TensorFlow 2.
tf.compat.v1.reset_default_graph()
tf.compat.v1.set_random_seed(0)

# Sparse feature handling is enabled whenever the attributes are not exactly
# a numpy ndarray.
model = pprgo.PPRGo(d, nc, hidden_size, nlayers, lr, weight_decay, dropout,
                    sparse_features=type(attr_matrix) is not np.ndarray)

# The session is deliberately left open: the inference cell passes `sess`
# to model.predict.
sess = tf.compat.v1.Session()
with sess.as_default():
    tf.compat.v1.global_variables_initializer().run()
    nepochs, loss_hist, acc_hist, f1_hist = pprgo.train(
            sess=sess, model=model, attr_matrix=attr_matrix,
            train_idx=train_idx, val_idx=val_idx,
            topk_train=topk_train, topk_val=topk_val,
            labels=labels,
            max_epochs=max_epochs, batch_size=batch_size, batch_mult_val=batch_mult_val,
            eval_step=eval_step, early_stop=early_stop, patience=patience)
time_training = time.time() - start
logging.info('Training done.')
print(f"Runtime: {time_training:.2f}s")











2022-06-09 11:09:18 (INFO): Epoch 6, step 20: train 3.94965
2022-06-09 11:09:19 (INFO): Epoch 13, step 40: train 3.36159
2022-06-09 11:09:21 (INFO): Epoch 19, step 60: train 2.98854
2022-06-09 11:09:22 (INFO): Epoch 26, step 80: train 2.62987
2022-06-09 11:09:23 (INFO): Epoch 33, step 100: train 2.01063
2022-06-09 11:09:24 (INFO): Epoch 39, step 120: train 1.72112
2022-06-09 11:09:26 (INFO): Epoch 46, step 140: train 1.64454
2022-06-09 11:09:27 (INFO): Epoch 53, step 160: train 1.29669
2022-06-09 11:09:28 (INFO): Epoch 59, step 180: train 1.18593
2022-06-09 11:09:30 (INFO): Epoch 66, step 200: train 1.23938
2022-06-09 11:09:31 (INFO): Epoch 73, step 220: train 1.02499
2022-06-09 11:09:33 (INFO): Epoch 79, step 240: train 0.95707
2022-06-09 11:09:34 (INFO): Epoch 86, step 260: train 1.04187
2022-06-09 11:09:35 (INFO): Epoch 93, step 280: train 0.87281
2022-06-09 11:09:37 (INFO): Epoch 99, step 300: train 0.84049
2022-06-09 11:09:38 (INFO): Epoch 106, step 320: train 0.91341
2022-06-09 

Runtime: 42.98s


# Inference (val and test)

In [9]:
# Run the model's prediction step in the training session and time the call.
# NOTE(review): the output saved with this cell ends in
# "ValueError: too many values to unpack (expected 3)", which does not match
# the four names unpacked here -- confirm how many values model.predict
# returns and re-run the cell.
start = time.time()
predictions, logits, time_logits, time_propagation = model.predict(
    sess=sess,
    adj_matrix=adj_matrix,
    attr_matrix=attr_matrix,
    alpha=alpha,
    nprop=nprop_inference,
    inf_fraction=inf_fraction,
    ppr_normalization=ppr_normalization,
)
time_inference = time.time() - start
print(f"Runtime: {time_inference:.2f}s")

logits[0] [  3.7471876   -0.7202362   -5.5534153   -2.8628085   -1.2649837
  -4.0905247   -9.876308    -6.0071654   -6.408699    -5.5743093
  -0.63308656  -0.98601854   2.0576594  -10.358907    -8.077548
  -8.24498     -5.714405   -10.451134   -10.962844   -14.78145
  -5.26064    -11.089117    -7.9837437  -11.277013    -8.646212
  -8.932689    -8.079648   -10.156454    -5.3691773   -3.3356826
  -5.080211    -7.8928914  -11.086209    -3.057368    -1.2501965
  -7.8558326  -10.926219   -11.579767    -0.2666244   -7.204233
 -14.248711   -12.231257    -1.7392194   -5.395652    -6.5156603
  -6.3987317   -4.6175823   -4.6677675    5.3326616  -13.935593
  -7.0067945   -7.474948    -2.5876276   -5.604537    -5.6191244
  -4.430469    -5.5851073   -6.7725234   -7.49998     -8.961808
  -7.3768015   -9.209583   -13.433086    -6.322972   -12.824596
  -9.993391   -12.594318   -12.483537   -12.895504    -8.611756  ]
sum -484.7999
sum Tensor("strided_slice_3:0", shape=(70,), dtype=float32)


ValueError: too many values to unpack (expected 3)

# Collect and print results

In [None]:
def _split_scores(idx):
    """Return (accuracy in percent, macro-averaged F1) for the nodes in `idx`."""
    y_true = labels[idx]
    y_pred = predictions[idx]
    return (100 * accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred, average='macro'))


# Same metrics for each of the three splits.
acc_train, f1_train = _split_scores(train_idx)
acc_val, f1_val = _split_scores(val_idx)
acc_test, f1_test = _split_scores(test_idx)

# GPU memory tracking is disabled:
#gpu_max_bytes = tf.contrib.memory_stats.MaxBytesInUse()
#gpu_memory = sess.run(gpu_max_bytes)
memory = utils.get_max_memory_bytes()

# Loading time is not part of the total; only these three stages are summed.
time_total = time_preprocessing + time_training + time_inference

In [None]:
# Print the final summary: accuracy and macro-F1 per split, the runtime of
# each stage with their total, and the peak main-memory figure.
# NOTE(review): `memory` is divided by 1024 before the division by 2**30,
# unlike the disabled variant at the end of this cell, which divides by 2**30
# only. Confirm which unit utils.get_max_memory_bytes() returns on the
# platform in use so that the "GB" label is correct.
print(f'''
Accuracy: Train: {acc_train:.1f}%, val: {acc_val:.1f}%, test: {acc_test:.1f}%
F1 score: Train: {f1_train:.3f}, val: {f1_val:.3f}, test: {f1_test:.3f}

Runtime: Preprocessing: {time_preprocessing:.2f}s, training: {time_training:.2f}s, inference: {time_inference:.2f}s -> total: {time_total:.2f}s
Memory: Main: {(memory/1024) / 2**30:.2f}GB
''')

# Disabled variant of the memory line that also reports GPU memory
# (needs `gpu_memory`, which is not computed in this notebook):
#Memory: Main: {memory / 2**30:.2f}GB, GPU: {gpu_memory / 2**30:.3f}GB


Accuracy: Train: 99.6%, val: 61.2%, test: 61.0%
F1 score: Train: 0.993, val: 0.506, test: 0.501

Runtime: Preprocessing: 4.77s, training: 47.11s, inference: 0.34s -> total: 52.22s
Memory: Main: 0.53GB

