In [1]:
import os
import time
import logging
import yaml
import ast
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
import math

from pprgo import utils
from pprgo import ppr
from pprgo import pprgo

import warnings
warnings.filterwarnings("ignore")

# import igraph
# import pickle
# import networkx as nx
# from networkx import from_scipy_sparse_matrix, k_truss, core_number, selfloop_edges



In [2]:
# Set up logging
logger = logging.getLogger()
logger.handlers = []
ch = logging.StreamHandler()
formatter = logging.Formatter(
        fmt='%(asctime)s (%(levelname)s): %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.setLevel('INFO')

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
# !wget --show-progress -O data/reddit.npz https://ndownloader.figshare.com/files/23742119  #Reddit
# !wget --show-progress -O data/magc.npz https://figshare.com/ndownloader/files/24045741 #MAG-COARSE

# Download dataset

# Load config

In [3]:
with open('config_demo.yaml', 'r') as c:
    config = yaml.safe_load(c)

In [4]:
# For strings that yaml doesn't parse (e.g. None)
for key, val in config.items():
    if type(val) is str:
        try:
            config[key] = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            pass

In [6]:
data_file           = config['data_file']           # Path to the .npz data file
data_file = 'data/cora_full.npz'
# data_file = 'data/pubmed.npz'
# data_file = 'data/reddit.npz'
# data_file = 'data/amazon_electronics_photo.npz'
# data_file = 'data/ms_academic_cs.npz'
# data_file = 'data/magc.npz'


split_seed          = config['split_seed']          # Seed for splitting the dataset into train/val/test
ntrain_div_classes  = config['ntrain_div_classes']  # Number of training nodes divided by number of classes
attr_normalization  = config['attr_normalization']  # Attribute normalization. Not used in the paper

alpha               = config['alpha']               # PPR teleport probability
alpha = 0.25 #If reddit 0.5, all the others 0.25 (default is now 0.5)
eps                 = config['eps']                 # Stopping threshold for ACL's ApproximatePR
topk                = config['topk']                # Number of PPR neighbors for each node
ppr_normalization   = config['ppr_normalization']   # Adjacency matrix normalization for weighting neighbors

hidden_size         = config['hidden_size']         # Size of the MLP's hidden layer
nlayers             = config['nlayers']             # Number of MLP layers
weight_decay        = config['weight_decay']        # Weight decay used for training the MLP
dropout             = config['dropout']             # Dropout used for training

lr                  = config['lr']                  # Learning rate
max_epochs          = config['max_epochs']          # Maximum number of epochs (exact number if no early stopping)
batch_size          = config['batch_size']          # Batch size for training
batch_mult_val      = config['batch_mult_val']      # Multiplier for validation batch size

eval_step           = config['eval_step']           # Accuracy is evaluated after every this number of steps
run_val             = config['run_val']             # Evaluate accuracy on validation set during training

early_stop          = config['early_stop']          # Use early stopping
patience            = config['patience']            # Patience for early stopping

nprop_inference     = config['nprop_inference']     # Number of propagation steps during inference
inf_fraction        = config['inf_fraction']        # Fraction of nodes for which local predictions are computed during inference


#new parameters
S = 1
gamma = 0.5


In [7]:
# import warnings
# warnings.filterwarnings("ignore")

In [9]:
graph = None
core_numbers = None

def pprgo_algorithm():

    print('k: ', topk, 'alpha: ', alpha)  

    #LOAD THE DATA------------------------------------
    start = time.time()
    (adj_matrix, attr_matrix, labels,
    train_idx, val_idx, test_idx) = utils.get_data(
            f"{data_file}",
            seed=split_seed,
            ntrain_div_classes=ntrain_div_classes,
            normalize_attr=attr_normalization
    )

    global core_numbers

    if core_numbers is None:
        print('Calculating core numbers')
        core_numbers = np.load('coredata/cora-cores.npy')
            
    try:
        d = attr_matrix.n_columns
    except AttributeError:
        d = attr_matrix.shape[1]
    nc = labels.max() + 1
    time_loading = time.time() - start
    # print(f"Loading data Runtime: {time_loading:.2f}s")


    #PREPROCESSING: CALCULATE PPR SCORES----------------

    # compute the ppr vectors for train/val nodes using ACL's ApproximatePR
    start = time.time()
    topk_train, core_topk_train, mean_kn = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, train_idx, topk, core_numbers, graph,
                                    normalization=ppr_normalization, S=S, gamma= gamma)

    global mean_k
    mean_k = mean_kn
    if run_val:
        topk_val = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, val_idx, topk,
                                    normalization=ppr_normalization)
    else:
        topk_val = None
    time_preprocessing = time.time() - start
    # print(f"PPR scores Runtime: {time_preprocessing:.2f}s")

    #TRAINING: SET UP MODEL AND TRAIN
    start = time.time()
    tf.reset_default_graph()
    tf.set_random_seed(0)

    model = pprgo.PPRGo(d, nc, hidden_size, nlayers, lr, gamma, weight_decay, dropout, #adj_matrix, intermediate_layer=1000,
                        sparse_features=type(attr_matrix) is not np.ndarray)

    sess = tf.compat.v1.Session()
    with sess.as_default():
        tf.compat.v1.global_variables_initializer().run()
        nepochs, loss_hist, acc_hist, f1_hist = pprgo.train(
                sess=sess, model=model, attr_matrix=attr_matrix,
                train_idx=train_idx, val_idx=val_idx,
                topk_train=topk_train, topk_val=topk_val, core_topk_train=core_topk_train,
                labels=labels, #adj_matrix=adj_matrix,
                max_epochs=max_epochs, batch_size=batch_size, batch_mult_val=batch_mult_val,
                eval_step=eval_step, early_stop=early_stop, patience=patience)
    time_training = time.time() - start
    logging.info('Training done.')
    # print(f"Training Runtime: {time_training:.2f}s")

    #INFERENCE (VAL AND TEST)
    start = time.time()
    predictions, time_logits, time_propagation = model.predict(
            sess=sess, adj_matrix=adj_matrix, attr_matrix=attr_matrix, alpha=alpha,
            nprop=nprop_inference, inf_fraction=inf_fraction,
            ppr_normalization=ppr_normalization)
    time_inference = time.time() - start
    # print(f"Inference Runtime: {time_inference:.2f}s")


    #COLLECT AND PRINT RESULTS
    acc_train = 100 * accuracy_score(labels[train_idx], predictions[train_idx])
    acc_val = 100 * accuracy_score(labels[val_idx], predictions[val_idx])
    acc_test = 100 * accuracy_score(labels[test_idx], predictions[test_idx])
    f1_train = f1_score(labels[train_idx], predictions[train_idx], average='macro')
    f1_val = f1_score(labels[val_idx], predictions[val_idx], average='macro')
    f1_test = f1_score(labels[test_idx], predictions[test_idx], average='macro')

    recall_test = 100 * recall_score(labels[test_idx], predictions[test_idx], average='macro')
    precision_test = 100 * precision_score(labels[test_idx], predictions[test_idx], average='macro', zero_division=1)
    # roc_auc_test = 100 * roc_auc_score(labels[test_idx], logits[test_idx], average='macro', multi_class='ovo')

    # gpu_max_bytes = tf.contrib.memory_stats.MaxBytesInUse()
    # gpu_memory = sess.run(gpu_max_bytes)
    memory = utils.get_max_memory_bytes()

    time_total = time_preprocessing + time_training + time_inference

    print(f'''
        Accuracy: Train: {acc_train:.1f}%, val: {acc_val:.1f}%, test: {acc_test:.1f}%
        F1 score: Train: {f1_train:.3f}, val: {f1_val:.3f}, test: {f1_test:.3f}

        Runtime: Preprocessing: {time_preprocessing:.2f}s, training: {time_training:.2f}s, inference: {time_inference:.2f}s -> total: {time_total:.2f}s
        Memory: Main: {(memory/1024) / 2**30:.2f}GB
        ''')

    return acc_test, recall_test, precision_test, f1_test, time_total, (memory / 2**30) #, (gpu_memory / 2**30)



In [10]:
import statistics


logger = logging.getLogger()
logger.disabled = True

logger = logging.getLogger("numba");
logger.setLevel(logging.ERROR)
logging.disable(logging.WARNING)


k_values = []
a_values = []
nt_values = []

iterations = 2

windows = [1,2,3,4,5,6,7,8,9,10] #or S value for knee point

# alpha_values = [0.05, 0.1, 0.15, 0.20,0.25, 0.30, 0.35]
# n_train_values = [20, 30, 40] #1400, 2100, 2800, 3500, 4200
# gamma_values = [1.0]

ACCURACY_RESULTS= []
ACC_STD_DEVIATION = []
RECALL_RESULTS = []
PRECISION_RESULTS = []
F1_RESULTS = []
MEAN_K_RESULTS = []
VARIANCE = []
TIME_RESULTS = []
MEMORY = []
GPU_MEMORY = []


elapsed_time = 0
topk = 16

# top_k_test = [12, 15, 16, 17, 18, 19, 20, 21, 22, 23]

while True:

    if topk >= 32:
        break

    topk *= 2
    # topk = 29

# for k in top_k_test:
#     topk = k
    
    # for k_w in windows:
    #     k_window = k_w

    # for g in gamma_values:

    gamma = 0.5

    all_acc = []
    all_recall = []
    all_precision = []
    all_f1 = []
    all_times = []
    main_memory_list = []
    # gpu_memory_list = []

    mean_k_values = []

    for i in range(iterations):
        split_seed = i
        acc_test_, recall_test, precision_test, f1_test, time_total_, memory_ = pprgo_algorithm()

        all_acc.append(acc_test_)
        all_recall.append(recall_test)
        all_precision.append(precision_test)
        all_f1.append(f1_test)
        all_times.append(time_total_)
        main_memory_list.append(memory_)
        # gpu_memory_list.append(gpu_memory_) 
        
        mean_k_values.append(mean_k)


    #Calculate standard deviation and variance
    std_deviation = statistics.stdev(all_acc)

    #Calculate mean 
    k_values.append(topk)
    MEAN_K_RESULTS.append(math.ceil(sum(mean_k_values)/len(mean_k_values)))

    ACC_STD_DEVIATION.append(std_deviation)
    ACCURACY_RESULTS.append(sum(all_acc)/len(all_acc))
    print('Average accuracy: ', sum(all_acc)/len(all_acc) )
    RECALL_RESULTS.append(sum(all_recall)/len(all_recall))
    PRECISION_RESULTS.append(sum(all_precision)/len(all_precision))
    F1_RESULTS.append(sum(all_f1)/len(all_f1))
    TIME_RESULTS.append(sum(all_times)/len(all_times))
    MEMORY.append(sum(main_memory_list)/len(main_memory_list))
    # GPU_MEMORY.append(sum(gpu_memory_list)/len(gpu_memory_list))




k:  32 alpha:  0.25
Calculating core numbers

        Accuracy: Train: 99.6%, val: 62.0%, test: 62.0%
        F1 score: Train: 0.992, val: 0.518, test: 0.531

        Runtime: Preprocessing: 0.10s, training: 30.06s, inference: 0.11s -> total: 30.27s
        Memory: Main: 0.49GB
        


StatisticsError: variance requires at least two data points

In [None]:
from tabulate import tabulate

# gamma_all = gamma_values * 10
# print(len(gamma_all))

table = []
#Build table
table.append(['mean k', 'Accuracy (%)', 'Std deviation (%)', 'Recall', 'Precision', 'F1', 'Running time (s)', 'Main Memory (GB)'])
for i in range(len(k_values)):
    current_row = []

    # current_row.append(k_values[i])
    current_row.append(MEAN_K_RESULTS[i])
    # current_row.append(gamma_all[i])
    # current_row.append(windows[i])
    # current_row.append(k_values[i])
    # current_row.append(a_values[i])
    # current_row.append(nt_values[i])
    # current_row.append(top_k_test[i])
    
    current_row.append(ACCURACY_RESULTS[i])
    current_row.append(ACC_STD_DEVIATION[i])
    # current_row.append(VARIANCE[i])
    current_row.append(RECALL_RESULTS[i])
    current_row.append(PRECISION_RESULTS[i])
    current_row.append(F1_RESULTS[i])
    # current_row.append(ROC_AUC_RESULTS[i])
    
    current_row.append(TIME_RESULTS[i])
    current_row.append(MEMORY[i])
    # current_row.append(GPU_MEMORY[i])
    table.append(current_row)

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))