In [1]:
import os
import time
import logging
import yaml
import ast
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from pprgo import utils
from pprgo import ppr
from pprgo import pprgo

import igraph

In [2]:
# Logging configuration for the notebook.
# TensorFlow reads this variable to decide how verbose its native logging is.
# NOTE(review): tensorflow is imported in the first cell, before this is set --
# confirm the setting still takes effect in the TF version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Reset the root logger so re-running this cell does not stack duplicate handlers.
logger = logging.getLogger()
logger.handlers = []
logger.setLevel('INFO')

# Single stream handler with timestamped, level-tagged lines.
console_handler = logging.StreamHandler()
console_handler.setFormatter(
    logging.Formatter(
        fmt='%(asctime)s (%(levelname)s): %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
)
logger.addHandler(console_handler)

In [3]:
# !wget --show-progress -O data/reddit.npz https://ndownloader.figshare.com/files/23742119

# Download dataset

# Load config

In [4]:
# Read the demo configuration. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of depending on the locale default.
with open('config_demo.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

In [5]:
# For strings that yaml doesn't parse (e.g. None): try to interpret each
# top-level string value as a Python literal and keep the original string
# when that is not possible.
for key, val in config.items():
    if isinstance(val, str):
        try:
            config[key] = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal (TypeError covers e.g. unhashable dict keys);
            # leave the value as the plain string yaml produced.
            pass

In [6]:
# Unpack every hyperparameter from the loaded config first; the values this
# notebook pins by hand are applied together in the override section at the end.

# --- Data ---
data_file = config['data_file']  # Path to the .npz data file
split_seed = config['split_seed']  # Seed for splitting the dataset into train/val/test
ntrain_div_classes = config['ntrain_div_classes']  # Number of training nodes divided by number of classes
attr_normalization = config['attr_normalization']  # Attribute normalization. Not used in the paper

# --- PPR preprocessing ---
alpha = config['alpha']  # PPR teleport probability
eps = config['eps']  # Stopping threshold for ACL's ApproximatePR
topk = config['topk']  # Number of PPR neighbors for each node
ppr_normalization = config['ppr_normalization']  # Adjacency matrix normalization for weighting neighbors

# --- Model ---
hidden_size = config['hidden_size']  # Size of the MLP's hidden layer
nlayers = config['nlayers']  # Number of MLP layers
weight_decay = config['weight_decay']  # Weight decay used for training the MLP
dropout = config['dropout']  # Dropout used for training

# --- Optimisation ---
lr = config['lr']  # Learning rate
max_epochs = config['max_epochs']  # Maximum number of epochs (exact number if no early stopping)
batch_size = config['batch_size']  # Batch size for training
batch_mult_val = config['batch_mult_val']  # Multiplier for validation batch size

# --- Evaluation during training ---
eval_step = config['eval_step']  # Accuracy is evaluated after every this number of steps
run_val = config['run_val']  # Evaluate accuracy on validation set during training

# --- Early stopping ---
early_stop = config['early_stop']  # Use early stopping
patience = config['patience']  # Patience for early stopping

# --- Inference ---
nprop_inference = config['nprop_inference']  # Number of propagation steps during inference
inf_fraction = config['inf_fraction']  # Fraction of nodes for which local predictions are computed during inference

# --- Notebook overrides: these replace the values read from the config above ---
data_file = 'data/cora_full.npz'  # alternative kept by the author: 'data/reddit.npz'
alpha = 0.25
topk = 4

In [7]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings(action="ignore", category=Warning)

# Load the data

In [8]:
def _attr_dim(matrix, attr_name, axis):
    """Return ``matrix.<attr_name>`` when it exists, otherwise ``matrix.shape[axis]``."""
    try:
        return getattr(matrix, attr_name)
    except AttributeError:
        return matrix.shape[axis]


# Load the graph, node attributes, labels and the train/val/test split.
start = time.time()
(adj_matrix, attr_matrix, labels,
 train_idx, val_idx, test_idx) = utils.get_data(
        f"{data_file}",
        seed=split_seed,
        ntrain_div_classes=ntrain_div_classes,
        normalize_attr=attr_normalization
)

# Feature dimension and row count. The attribute matrix may expose
# n_columns / n_rows or only a .shape, so both lookups use the same fallback
# (previously only the column lookup was guarded and the print below could raise).
d = _attr_dim(attr_matrix, 'n_columns', 1)
n_attr_rows = _attr_dim(attr_matrix, 'n_rows', 0)

# Number of classes, taken as the largest label value plus one.
nc = labels.max() + 1
print(nc)

print('shape attibute matrix: ', n_attr_rows)
print('Training: ', len(train_idx))
print('Validation: ', len(val_idx))
print('Testing: ', len(test_idx))

print('train_idx: ', train_idx)

time_loading = time.time() - start
print(f"Runtime: {time_loading:.2f}s")

70
shape attibute matrix:  18800
Training:  1400
Validation:  14000
Testing:  3400
train_idx:  [   12    16    18 ... 18790 18793 18798]
Runtime: 0.22s


# Preprocessing: Calculate PPR scores

In [9]:
# Compute core numbers
# Build the directed graph from the positions of the positive adjacency entries
# instead of a dense list-of-lists, which needs O(n^2) memory.
# Assumes adj_matrix is a scipy.sparse matrix (the previous code called
# .todense() on it) -- TODO confirm.
edge_src, edge_dst = (adj_matrix > 0).nonzero()
g = igraph.Graph(
    n=adj_matrix.shape[0],
    edges=list(zip(edge_src.tolist(), edge_dst.tolist())),
    directed=True,
)
core_numbers = np.array(g.coreness())


In [10]:
# compute the ppr vectors for train/val nodes using ACL's ApproximatePR

start = time.time()

# The training call returns a pair; only the first element is used further below.
# NOTE(review): S=1 and gamma=0.3 are hard-coded here and are not part of the
# config -- confirm their meaning against ppr.topk_ppr_matrix.
topk_train, mean_kn = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, train_idx, topk, core_numbers,
                                          normalization=ppr_normalization, S=1, gamma=0.3)
if run_val:
    # NOTE(review): unlike the training call, core_numbers / S / gamma are not
    # passed here -- confirm that this is intended and that the signature allows it.
    val_result = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, val_idx, topk,
                                     normalization=ppr_normalization)
    # The training call unpacks a 2-tuple from the same function, so keep only
    # the matrix if a tuple comes back here as well.
    topk_val = val_result[0] if isinstance(val_result, tuple) else val_result
else:
    topk_val = None

time_preprocessing = time.time() - start
print(f"Runtime: {time_preprocessing:.2f}s")


# Sanity output: shape of the training PPR matrix and of one batch of adjacency rows.
print('topk_train', topk_train.shape)
adj_xd = adj_matrix[train_idx]
print(adj_xd[:batch_size].shape)

j_ranked:  [12685  5448     3   133   120   124 16513   226 16422 12683   162  8251
 16423 13331 12673  4895    35 18480    48  4685  4859 18476    68   181
 13332  2627    55    85 18496  2537    91    12]
core of j_ranked:  [ 2  2  6 10  8  8 10  4 10 10 12 10 10 10 14 12 12 14 14 12 12 14 12 14
 12 14 10 14 14 14 14 14]
js[i]:  [12685  5448   226     3   120   124 13331 16423  8251    55 16422 16513
   133 12683  4859 13332    68  4685  4895    35   162 18480 18476    91
   181 12673  2627    85 18496  2537    48    12]
idx:  [ 0  1  7  2  4  5 13 12 11 26  8  6  3  9 20 24 22 19 15 16 10 17 21 30
 23 14 25 27 28 29 18 31]
core top:  [0.00574713 0.00574713 0.01149425 0.01724138 0.02298851 0.02298851
 0.02873563 0.02873563 0.02873563 0.02873563 0.02873563 0.02873563
 0.02873563 0.02873563 0.03448276 0.03448276 0.03448276 0.03448276
 0.03448276 0.03448276 0.03448276 0.04022989 0.04022989 0.04022989
 0.04022989 0.04022989 0.04022989 0.04022989 0.04022989 0.04022989
 0.04022989 0.040229

# Training: Set up model and train

In [11]:
# Start the training timer; time_training is computed from this value after
# the training call, so it also covers building the model.
start = time.time()

# Use the tf.compat.v1 entry points, matching the Session and initializer
# calls used for training.
tf.compat.v1.reset_default_graph()
tf.compat.v1.set_random_seed(0)

# Features are treated as sparse unless the attribute matrix is exactly a numpy ndarray.
model = pprgo.PPRGo(d, nc, hidden_size, nlayers, lr, weight_decay, dropout,
                    sparse_features=type(attr_matrix) is not np.ndarray)

# Debug aid: print the static shape of a model tensor to sanity-check the graph.
print(model.batch_idx.shape)













(?,)


In [12]:
# The session is created here and stays open, since inference reuses it afterwards.
sess = tf.compat.v1.Session()

# Everything handed to pprgo.train, gathered in one place for readability.
train_kwargs = dict(
    sess=sess,
    model=model,
    attr_matrix=attr_matrix,
    train_idx=train_idx,
    val_idx=val_idx,
    topk_train=topk_train,
    topk_val=topk_val,
    labels=labels,
    max_epochs=max_epochs,
    batch_size=batch_size,
    batch_mult_val=batch_mult_val,
    eval_step=eval_step,
    early_stop=early_stop,
    patience=patience,
)

with sess.as_default():
    # Initialise all variables in the default session before training starts.
    init_op = tf.compat.v1.global_variables_initializer()
    init_op.run()
    nepochs, loss_hist, acc_hist, f1_hist = pprgo.train(**train_kwargs)

# `start` was set in the model-setup cell.
time_training = time.time() - start
logging.info('Training done.')
print(f"Runtime: {time_training:.2f}s")

2022-06-28 18:23:31 (INFO): Epoch 6, step 20: train 3.72451
2022-06-28 18:23:31 (INFO): Epoch 13, step 40: train 2.87134
2022-06-28 18:23:32 (INFO): Epoch 19, step 60: train 2.32926
2022-06-28 18:23:33 (INFO): Epoch 26, step 80: train 1.92869
2022-06-28 18:23:34 (INFO): Epoch 33, step 100: train 1.43751
2022-06-28 18:23:34 (INFO): Epoch 39, step 120: train 1.19314
2022-06-28 18:23:35 (INFO): Epoch 46, step 140: train 1.24077
2022-06-28 18:23:36 (INFO): Epoch 53, step 160: train 0.97421
2022-06-28 18:23:36 (INFO): Epoch 59, step 180: train 0.85371
2022-06-28 18:23:37 (INFO): Epoch 66, step 200: train 0.95295
2022-06-28 18:23:38 (INFO): Epoch 73, step 220: train 0.78172
2022-06-28 18:23:38 (INFO): Epoch 79, step 240: train 0.72213
2022-06-28 18:23:39 (INFO): Epoch 86, step 260: train 0.80338
2022-06-28 18:23:40 (INFO): Epoch 93, step 280: train 0.66650
2022-06-28 18:23:41 (INFO): Epoch 99, step 300: train 0.64857
2022-06-28 18:23:41 (INFO): Epoch 106, step 320: train 0.71270
2022-06-28 1

Runtime: 22.17s


# Inference (val and test)

In [13]:
# Run inference with the trained model and time the whole call.
predict_kwargs = dict(
    sess=sess,
    adj_matrix=adj_matrix,
    attr_matrix=attr_matrix,
    alpha=alpha,
    nprop=nprop_inference,
    inf_fraction=inf_fraction,
    ppr_normalization=ppr_normalization,
)

start = time.time()
predictions, time_logits, time_propagation = model.predict(**predict_kwargs)
time_inference = time.time() - start

print(f"Runtime: {time_inference:.2f}s")

Runtime: 0.14s


# Collect and print results

In [14]:

def _split_metrics(split_idx):
    """Return (accuracy in percent, macro-averaged F1) of `predictions` on the given node indices."""
    y_true = labels[split_idx]
    y_pred = predictions[split_idx]
    return 100 * accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average='macro')


# The same two metrics for each split.
acc_train, f1_train = _split_metrics(train_idx)
acc_val, f1_val = _split_metrics(val_idx)
acc_test, f1_test = _split_metrics(test_idx)

# GPU memory reporting, left disabled by the author:
#gpu_max_bytes = tf.contrib.memory_stats.MaxBytesInUse()
#gpu_memory = sess.run(gpu_max_bytes)
memory = utils.get_max_memory_bytes()

# Loading time is not included in the total.
time_total = time_preprocessing + time_training + time_inference

In [15]:
# Assemble the final summary, then print it in one go.
# NOTE(review): `memory` is divided by 1024 before converting to GB, while the
# commented-out variant below does not -- presumably a platform-specific unit
# correction; confirm against utils.get_max_memory_bytes.
summary = f'''
Accuracy: Train: {acc_train:.1f}%, val: {acc_val:.1f}%, test: {acc_test:.1f}%
F1 score: Train: {f1_train:.3f}, val: {f1_val:.3f}, test: {f1_test:.3f}

Runtime: Preprocessing: {time_preprocessing:.2f}s, training: {time_training:.2f}s, inference: {time_inference:.2f}s -> total: {time_total:.2f}s
Memory: Main: {(memory/1024) / 2**30:.2f}GB
'''
print(summary)

# Variant that also reports GPU memory (disabled by the author):
#Memory: Main: {memory / 2**30:.2f}GB, GPU: {gpu_memory / 2**30:.3f}GB


Accuracy: Train: 99.3%, val: 61.4%, test: 60.9%
F1 score: Train: 0.993, val: 0.521, test: 0.524

Runtime: Preprocessing: 1.67s, training: 22.17s, inference: 0.14s -> total: 23.99s
Memory: Main: 3.76GB

