In [1]:
import os
import time
import logging
import yaml
import ast
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from pprgo import utils
from pprgo import ppr
from pprgo import pprgo



In [2]:
# Set up logging: the root logger gets exactly one console handler that prints
# "<timestamp> (<LEVEL>): <message>", and only INFO-or-higher records pass.
logger = logging.getLogger()
logger.handlers = []  # drop whatever handlers are already attached to the root logger

formatter = logging.Formatter(
    fmt='%(asctime)s (%(levelname)s): %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
ch = logging.StreamHandler()
ch.setFormatter(formatter)

logger.addHandler(ch)
logger.setLevel(logging.INFO)

# Presumably quiets TensorFlow's native log output (level semantics per the TF docs).
# NOTE(review): tensorflow is imported in an earlier cell; this variable is usually
# expected to be set before that import to take effect -- confirm it still works here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
# !wget --show-progress -O data/reddit.npz https://ndownloader.figshare.com/files/23742119  #Reddit
# !wget --show-progress -O data/magc.npz https://figshare.com/ndownloader/files/24045741 #MAG-COARSE

# Download dataset

# Load config

In [4]:
# Read the experiment settings from the YAML file in the working directory.
# The result is used as a key -> value mapping by the cells below.
with open('config_demo.yaml') as config_file:
    config = yaml.safe_load(config_file)

In [5]:
# YAML leaves some values as plain strings (e.g. "None"). Re-interpret every
# string value as a Python literal when it parses as one; anything that does
# not parse is kept exactly as loaded.
for key in config:
    val = config[key]
    if type(val) is not str:
        continue
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. a file path) -- keep the raw string.
        continue
    config[key] = parsed

In [6]:
# ---- Dataset ----------------------------------------------------------------
# Path to the .npz data file
data_file = config['data_file']
# Notebook-local override: the configured path above is replaced by a fixed
# dataset. Uncomment one of the alternatives to switch.
data_file = 'data/cora_full.npz'
# data_file = 'data/pubmed.npz'
# data_file = 'data/reddit.npz'
# data_file = 'data/magc.npz'

# Seed for splitting the dataset into train/val/test
split_seed = config['split_seed']
# Number of training nodes divided by number of classes
ntrain_div_classes = config['ntrain_div_classes']
# Attribute normalization. Not used in the paper
attr_normalization = config['attr_normalization']

# ---- Personalized PageRank --------------------------------------------------
# PPR teleport probability
alpha = config['alpha']
# Notebook-local override of the configured teleport probability.
alpha = 0.25
# Stopping threshold for ACL's ApproximatePR
eps = config['eps']
# Number of PPR neighbors for each node
topk = config['topk']
# Notebook-local override of the configured neighbor count.
topk = 32
# Adjacency matrix normalization for weighting neighbors
ppr_normalization = config['ppr_normalization']

# ---- Model ------------------------------------------------------------------
# Size of the MLP's hidden layer
hidden_size = config['hidden_size']
# Number of MLP layers
nlayers = config['nlayers']
# Weight decay used for training the MLP
weight_decay = config['weight_decay']
# Dropout used for training
dropout = config['dropout']

# ---- Optimisation -----------------------------------------------------------
# Learning rate
lr = config['lr']
# Maximum number of epochs (exact number if no early stopping)
max_epochs = config['max_epochs']
# Batch size for training
batch_size = config['batch_size']
# Multiplier for validation batch size
batch_mult_val = config['batch_mult_val']

# ---- Evaluation during training ----------------------------------------------
# Accuracy is evaluated after every this number of steps
eval_step = config['eval_step']
# Evaluate accuracy on validation set during training
run_val = config['run_val']

# ---- Early stopping ---------------------------------------------------------
# Use early stopping
early_stop = config['early_stop']
# Patience for early stopping
patience = config['patience']

# ---- Inference --------------------------------------------------------------
# Number of propagation steps during inference
nprop_inference = config['nprop_inference']
# Fraction of nodes for which local predictions are computed during inference
inf_fraction = config['inf_fraction']

In [7]:
# import warnings
# warnings.filterwarnings("ignore")

# Load the data

In [8]:
# Load the graph, node attributes, labels and the train/val/test index splits,
# and time how long the load takes.
start = time.time()
dataset = utils.get_data(
    f"{data_file}",
    seed=split_seed,
    ntrain_div_classes=ntrain_div_classes,
    normalize_attr=attr_normalization,
)
adj_matrix, attr_matrix, labels, train_idx, val_idx, test_idx = dataset

# Feature dimension: prefer the container's `n_columns` attribute when it has
# one, otherwise fall back to the second entry of `.shape`.
if hasattr(attr_matrix, 'n_columns'):
    d = attr_matrix.n_columns
else:
    d = attr_matrix.shape[1]

# Class count taken as largest label + 1 (assumes labels are 0-based integers
# -- TODO confirm against utils.get_data).
nc = labels.max() + 1
print('Number of classes: ', nc)

# Optional diagnostics, left disabled:
# print('shape attribute matrix: ', attr_matrix.n_rows)
# print('Training: ', len(train_idx))
# print('Validation: ', len(val_idx))
# print('Testing: ', len(test_idx))
# print('train_idx: ', train_idx)

time_loading = time.time() - start
print(f"Runtime: {time_loading:.2f}s")

Number of classes:  70
Runtime: 0.23s


# Preprocessing: Calculate PPR scores

In [9]:
#Compute core numbers
# core_numbers = np.load('core-numbers-networkx.npy')

# graph = igraph.Graph.Adjacency((adj_matrix.todense()> 0).tolist())
# core_numbers = np.array(graph.coreness())
# try:
#     core_numbers = np.load('coredata/reddit-cores.npy')
#     print('Cores loaded ')
# except:
#     print('No file for core numbers')
#     core_numbers = None

# core_numbers = None
# Load precomputed per-node core numbers from disk.
# NOTE(review): the file name is hard-coded to the Cora variant and does not
# follow `data_file`; switch it by hand when the dataset changes.
core_numbers_path = 'coredata/cora-cores.npy'
core_numbers = np.load(core_numbers_path)

In [10]:
# Index array 0 .. N-1 covering every node (N = number of adjacency-matrix rows).
num_nodes = adj_matrix.shape[0]
idx_all_nodes = np.arange(num_nodes)

In [38]:
# Preprocessing: approximate top-k PPR for the validation nodes (ACL's
# ApproximatePR), timed as `time_preprocessing`.
start = time.time()

# Disabled: full-graph PPR/core matrices (these look like the calls that
# produced the cached .npz files loaded further down).
# ppr_topk_train, core_topk_train, coreRank, mean_kn = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, idx_all_nodes, None, core_numbers,
#                                  normalization=ppr_normalization)

# ppr_topk_train_top32, core_topk_train_top32, coreRank, mean_kn = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, idx_all_nodes, topk, core_numbers,
#                                  normalization=ppr_normalization)

# Validation PPR is only needed when validation is enabled.
# NOTE(review): this call passes no `core_numbers` and keeps a single return
# value, unlike the disabled calls above -- confirm it matches the current
# signature of ppr.topk_ppr_matrix.
topk_val = None
if run_val:
    topk_val = ppr.topk_ppr_matrix(
        adj_matrix, alpha, eps, val_idx, topk,
        normalization=ppr_normalization,
    )

time_preprocessing = time.time() - start
print(f"Runtime: {time_preprocessing:.2f}s")


Runtime: 4.74s


In [39]:
import scipy.sparse

# scipy.sparse.save_npz('full_ppr.npz', ppr_topk_train)
# scipy.sparse.save_npz('full_core.npz', core_topk_train)
# scipy.sparse.save_npz('top32_ppr.npz', ppr_topk_train_top32)
# scipy.sparse.save_npz('top32_core.npz', core_topk_train_top32)

In [51]:
# Load the cached sparse matrices from the working directory.
# NOTE(review): the code that writes these files is commented out in this
# notebook, so both files must already exist for this cell to run.
ppr_cache_path = 'top32_ppr.npz'
core_cache_path = 'top32_core.npz'
ppr_topk_train = scipy.sparse.load_npz(ppr_cache_path)
core_topk_train = scipy.sparse.load_npz(core_cache_path)

In [53]:
# Sanity check of the loaded matrices: overall shapes, plus how many non-zero
# entries the first row of each matrix has.
print(ppr_topk_train.shape)
first_ppr_row = ppr_topk_train[0]
print(first_ppr_row.nonzero()[0].shape)
first_core_row = core_topk_train[0]
print(first_core_row.nonzero()[0].shape)
print(core_topk_train.shape)

(18800, 18800)
(32,)
(32,)
(18800, 18800)


# Training: Set up model and train

In [16]:
# Build the PPRGo model in a fresh default graph with a fixed graph-level seed.
# `start` opens the training timer; the training cell reads it to compute
# `time_training`, so model construction is included in that figure.
start = time.time()

# Use the tf.compat.v1 entry points, matching the Session / initializer calls
# in the training cell. The bare tf.reset_default_graph / tf.set_random_seed
# names only exist under TensorFlow 1.x.
tf.compat.v1.reset_default_graph()
tf.compat.v1.set_random_seed(0)

# Extra scalar passed to PPRGo between `lr` and `weight_decay`.
# NOTE(review): presumably the initial weighting between PPR- and core-based
# scores -- confirm in pprgo.PPRGo.
gamma = 0.5
# gamma = np.array([0.8, 0.2])
# gamma = gamma.astype('f')

model = pprgo.PPRGo(
    d, nc, hidden_size, nlayers, lr, gamma, weight_decay, dropout,
    # adj_matrix, train_idx, TEMP, K, topk_train, intermediate_layer=100,
    # Anything that is not a plain ndarray is treated as sparse features.
    sparse_features=type(attr_matrix) is not np.ndarray)



# print(type(attr_matrix) is not np.ndarray)













In [17]:
# Train inside a dedicated session. The session stays open afterwards because
# the inference cell passes `sess` to model.predict.
sess = tf.compat.v1.Session()
with sess.as_default():
    # Initialise all graph variables before the first training step.
    init_op = tf.compat.v1.global_variables_initializer()
    init_op.run()

    train_outputs = pprgo.train(
        sess=sess,
        model=model,
        attr_matrix=attr_matrix,
        train_idx=train_idx,
        val_idx=val_idx,
        # Rows of the cached matrices restricted to the training nodes.
        topk_train=ppr_topk_train[train_idx],
        topk_val=topk_val,
        core_topk_train=core_topk_train[train_idx],
        labels=labels,  # adj_matrix=adj_matrix,
        max_epochs=max_epochs,
        batch_size=batch_size,
        batch_mult_val=batch_mult_val,
        eval_step=eval_step,
        early_stop=early_stop,
        patience=patience,
    )
    nepochs, loss_hist, acc_hist, f1_hist = train_outputs

# `start` was set in the model-construction cell.
time_training = time.time() - start
logging.info('Training done.')
print(f"Runtime: {time_training:.2f}s")

2022-08-10 11:06:01 (INFO): Epoch 6, step 20: train 3.96262
2022-08-10 11:06:06 (INFO): Epoch 13, step 40: train 3.37453
2022-08-10 11:06:11 (INFO): Epoch 19, step 60: train 3.13222
2022-08-10 11:06:17 (INFO): Epoch 26, step 80: train 2.85235
2022-08-10 11:06:23 (INFO): Epoch 33, step 100: train 2.15716
2022-08-10 11:06:28 (INFO): Epoch 39, step 120: train 1.94652
2022-08-10 11:06:33 (INFO): Epoch 46, step 140: train 1.91482
2022-08-10 11:06:38 (INFO): Epoch 53, step 160: train 1.42399
2022-08-10 11:06:43 (INFO): Epoch 59, step 180: train 1.31011
2022-08-10 11:06:47 (INFO): Epoch 66, step 200: train 1.40398
2022-08-10 11:06:52 (INFO): Epoch 73, step 220: train 1.08109
2022-08-10 11:06:57 (INFO): Epoch 79, step 240: train 1.00902
2022-08-10 11:07:02 (INFO): Epoch 86, step 260: train 1.09894
2022-08-10 11:07:07 (INFO): Epoch 93, step 280: train 0.90630
2022-08-10 11:07:11 (INFO): Epoch 99, step 300: train 0.84797
2022-08-10 11:07:16 (INFO): Epoch 106, step 320: train 0.93151
2022-08-10 1

Runtime: 175.23s


In [32]:
# Reload the two cached sparse matrices from disk before inference. The same
# files are also loaded by an earlier cell ahead of training; despite the
# `_train` suffix, the inference cell indexes these matrices with `test_idx`.
ppr_topk_train = scipy.sparse.load_npz('top32_ppr.npz')
core_topk_train = scipy.sparse.load_npz('top32_core.npz')

# Inference (val and test)

In [33]:
# Inference on the test nodes, timed as `time_inference`. The rows of the
# cached matrices are restricted to `test_idx` before being handed to predict.
start = time.time()
inference_outputs = model.predict(
    sess=sess,
    adj_matrix=adj_matrix,
    attr_matrix=attr_matrix,
    alpha=alpha,
    ppr_topk_test=ppr_topk_train[test_idx],
    core_topk_test=core_topk_train[test_idx],
    nprop=nprop_inference,
    inf_fraction=inf_fraction,
    ppr_normalization=ppr_normalization,
)
predictions, time_logits, time_propagation, logits = inference_outputs
time_inference = time.time() - start
print(f"Runtime: {time_inference:.2f}s")

Inference gamma:  0.112844475
Runtime: 4.89s


In [34]:
# The two counts should match: one prediction per test node.
for sequence in (predictions, test_idx):
    print(len(sequence))

3400
3400


In [None]:
# Per-node degree (row sums of the adjacency matrix, flattened to 1-D) and its
# inverse square root. Degrees are clamped at 1e-12 so isolated nodes do not
# cause a division by zero.
deg = adj_matrix.sum(1).A1
deg_clamped = np.maximum(deg, 1e-12)
deg_sqrt_inv = 1. / np.sqrt(deg_clamped)

# Shapes of the inputs used by the (disabled) propagation experiments below.
print('logits: ', logits.shape)
print('adj_matrix: ', adj_matrix.shape)

# print(adj_matrix)
# print(deg_sqrt_inv)

# adj_matrix_norm =  adj_matrix.multiply(deg_sqrt_inv[:, None])

# print('x: ', x.shape, x )

# print('x: ', x.shape)

# x = np.arange(16).reshape(4,4)
# print(x)
# x = scipy.sparse.csr_matrix(x)

# # print(x)
# print(x.shape)

# y = np.arange(4)
# print(y)
# print(y[:, None].shape)


# w = x.multiply(y[:, None])
# print('w: ', w.shape, w.todense())

# z = y[:, None] * x
# print('z: ', z.shape)


# print(logits)

# # core_matrix = adj_matrix.multiply(coreRank)
# # normalized_core_matrix = core_matrix.multiply(1/core_matrix.sum(axis=1).A1[:, None])
# # print(normalized_core_matrix.sum(axis=1))

# M = adj_matrix.multiply(deg_sqrt_inv[:, None])
# A_inner = scipy.sparse.eye(adj_matrix.shape[0]) - (1 - alpha) * M
# result = scipy.sparse.linalg.inv(A_inner)

# print('result: ', result.shape, type(result))



# right_term = scipy.sparse.eye(adj_matrix.shape[0])
# adj_power = adj_matrix.multiply(deg_sqrt_inv[:, None])


# for k in range(1, nprop_inference):
#     if k ==1:
#         right_term += (1-alpha)  * adj_power
#     else:
#         adj_power = adj_power @ adj_power
#         right_term += np.power(1-alpha, k) * adj_power

# right_term = alpha * right_term

# adj_power = adj_power @ adj_power

# left_term = np.power(1-alpha, nprop_inference) * adj_power

# new_logits = (left_term +right_term) @ (deg_sqrt_inv[:, None] * logits)







In [None]:
# print(deg_sqrt_inv[:, None].shape)
# print((adj_matrix).shape)

# deg_sqrt_inv[:, None] * 

In [None]:
# xd =  deg_sqrt_inv[:, None] *(normalized_core_matrix * deg_sqrt_inv[:, None])
# print(xd.shape)
# adj = np.array([[0,1,1],[1,1,0], [1,0,1]])
# coreRank = np.array([3,7,9])

# print('adj: ', adj)
# print('coreRank: ', coreRank)

# coreMatrix = np.multiply(adj, coreRank)

# print('coreMatrix: ', coreMatrix)

# # print('sum: ', np.sum(coreMatrix, axis=1))

# z = np.multiply(coreMatrix, 1/ np.sum(coreMatrix, axis=1)[:,None])
# print(z)

# print(np.sum(z, axis=1))


In [35]:
# Show the accuracy history returned by pprgo.train: per the recorded output,
# a dict with a 'train' list and a 'val' list (empty in the recorded run).
print(acc_hist)


{'train': [0.0859375, 0.271484375, 0.39361702127659576, 0.466796875, 0.685546875, 0.7446808510638298, 0.779296875, 0.875, 0.9069148936170213, 0.89453125, 0.94921875, 0.9787234042553191, 0.96484375, 0.982421875, 0.9867021276595744, 0.982421875, 0.986328125, 0.9946808510638298, 0.990234375, 0.990234375, 0.9946808510638298, 0.984375, 0.99609375, 0.9946808510638298, 0.99609375, 1.0, 0.9973404255319149, 0.990234375, 0.99609375, 1.0], 'val': []}


# Collect and print results

In [36]:

# Test-set metrics. `predictions` comes from model.predict called with the
# test rows only, so it is compared against labels[test_idx] directly rather
# than being indexed by node id.
acc_test = 100 * accuracy_score(labels[test_idx], predictions)
f1_test = f1_score(labels[test_idx], predictions, average='macro')

# Train/val metrics are not computed in this setup (there are no predictions
# for those nodes). They are NaN rather than 0.0 so the report cannot be read
# as a real 0% score; the format specs in the report cell render NaN as "nan".
acc_train = float('nan')
acc_val = float('nan')
f1_train = float('nan')
f1_val = float('nan')

# Original full-graph formulas, valid only when `predictions` covers all nodes:
# acc_train = 100 * accuracy_score(labels[train_idx], predictions[train_idx])
# acc_val = 100 * accuracy_score(labels[val_idx], predictions[val_idx])
# acc_test = 100 * accuracy_score(labels[test_idx], predictions[test_idx])
# f1_train = f1_score(labels[train_idx], predictions[train_idx], average='macro')
# f1_val = f1_score(labels[val_idx], predictions[val_idx], average='macro')
# f1_test = f1_score(labels[test_idx], predictions[test_idx], average='macro')

#gpu_max_bytes = tf.contrib.memory_stats.MaxBytesInUse()
#gpu_memory = sess.run(gpu_max_bytes)
# Peak main-memory usage as reported by the project helper.
memory = utils.get_max_memory_bytes()

# Data loading time (`time_loading`) is deliberately not part of the total.
time_total = time_preprocessing + time_training + time_inference

In [37]:
# Final report: accuracy, macro-F1, stage runtimes and peak memory. The lines
# are joined with a leading and a trailing empty entry so the printed block
# starts and ends with a blank line.
accuracy_line = f'Accuracy: Train: {acc_train:.1f}%, val: {acc_val:.1f}%, test: {acc_test:.1f}%'
f1_line = f'F1 score: Train: {f1_train:.3f}, val: {f1_val:.3f}, test: {f1_test:.3f}'
runtime_line = (
    f'Runtime: Preprocessing: {time_preprocessing:.2f}s, training: {time_training:.2f}s, '
    f'inference: {time_inference:.2f}s -> total: {time_total:.2f}s'
)
# NOTE(review): the extra /1024 presumably corrects the unit returned by
# utils.get_max_memory_bytes on this platform -- confirm.
memory_line = f'Memory: Main: {(memory/1024) / 2**30:.2f}GB'

print('\n'.join(['', accuracy_line, f1_line, '', runtime_line, memory_line, '']))

#Memory: Main: {memory / 2**30:.2f}GB, GPU: {gpu_memory / 2**30:.3f}GB


Accuracy: Train: 0.0%, val: 0.0%, test: 60.9%
F1 score: Train: 0.000, val: 0.000, test: 0.505

Runtime: Preprocessing: 11.64s, training: 175.23s, inference: 4.89s -> total: 191.76s
Memory: Main: 2.77GB

