In [13]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import time
import numpy as np
import scipy as sp
import tensorflow as tf
import scipy.sparse as sps
from sklearn import metrics
import matplotlib.pyplot as plt
from collections import defaultdict
from IPython.display import clear_output

# --- Configuration -----------------------------------------------------------
# Prefix of the per-snapshot adjacency files; adj_from_snapshot appends
# "<snapshot_id>.npy" to it.  The default is the original machine-specific
# location; set the VGAE_SNAPSHOT_PREFIX environment variable to run the
# notebook elsewhere without editing this cell.
snapshot_prefix = os.environ.get(
    "VGAE_SNAPSHOT_PREFIX",
    "C:/Users/Achil/Documents/VGAE/src/data/hive/hive-node-snapshot/hive-node-snapshot-",
)

# Graph dimensions.
num_nodes = 386
num_edges = 23459  # 61125 (alternative value left by the author; meaning not shown here)
num_features = num_nodes  # identity features: one feature column per node
num_snapshots = 10
# Nodes added per snapshot (floor division of the node count).
snapshot_nodes = int(num_nodes / num_snapshots)

# Model / optimisation hyper-parameters.
intermediate_size = 32  # width of the shared first graph-convolution layer
embedding_size = 16     # width of the mean / std output layers
epochs = 100            # training epochs per snapshot
learning_rate = 0.01
l_depth = 1             # how many previous snapshots' KL terms are mixed into the loss
test_ratio = 0.1        # fraction of edges held out per snapshot

In [14]:
def make_test_matrix(coords, values, shape):
    """Build a symmetric CSR matrix from one-directional edge coordinates.

    Each ``(r, c)`` pair in ``coords`` is inserted twice, once as ``(r, c)``
    and once as ``(c, r)``, both carrying the matching entry of ``values``.
    Duplicate positions are summed by the COO -> CSR conversion.

    Args:
        coords: ``(k, 2)`` integer array of edge endpoints.
        values: length-``k`` sequence of edge values.
        shape: shape of the resulting matrix.

    Returns:
        ``scipy.sparse`` CSR matrix of the given shape.
    """
    sources = coords[:, 0]
    targets = coords[:, 1]
    # Forward direction first, mirrored direction second.
    row_idx = np.concatenate([sources, targets])
    col_idx = np.concatenate([targets, sources])
    data = np.concatenate([values, values])
    mirrored = sps.coo_matrix((data, (row_idx, col_idx)), shape=shape)
    return mirrored.tocsr()

def split_features(features, snapshot_id):
    """Return the feature rows visible at a snapshot as a ``tf.SparseTensor``.

    The first ``(snapshot_id + 1) * snapshot_nodes`` rows of ``features`` are
    kept (all columns), converted to float32 COO and wrapped in a sparse
    tensor.

    Args:
        features: scipy sparse feature matrix supporting row slicing.
        snapshot_id: zero-based snapshot index.

    Returns:
        ``tf.SparseTensor`` holding the sliced rows.
    """
    rows = (snapshot_id + 1) * snapshot_nodes
    coo = features[:rows, :].tocoo().astype(np.float32)
    # np.mat was removed in NumPy 2.0; build the (nnz, 2) index array directly,
    # already in the int64 dtype that tf.SparseTensor stores indices in.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.SparseTensor(indices, coo.data, coo.shape)

def load_features_identity():
    """Return featureless-node features: a ``num_nodes`` x ``num_nodes``
    float32 identity matrix in CSR format."""
    return sps.identity(n=num_nodes, format='csr', dtype=np.float32)

def load_features(features_filename):
    """Load a sparse feature matrix previously stored with ``scipy.sparse.save_npz``.

    Args:
        features_filename: path of the ``.npz`` file.

    Returns:
        The sparse matrix exactly as ``scipy.sparse.load_npz`` yields it.
    """
    return sps.load_npz(features_filename)

def sparse_to_tuple(sparse_mx):
    """Decompose the upper triangle (diagonal included) of a sparse matrix.

    Args:
        sparse_mx: scipy sparse matrix.

    Returns:
        ``(coords, values, shape)`` where ``coords`` is an ``(nnz, 2)`` array
        of ``(row, col)`` pairs, ``values`` the matching stored entries and
        ``shape`` the matrix shape.
    """
    upper = sps.triu(sparse_mx)
    coords = np.column_stack((upper.row, upper.col))
    return coords, upper.data, upper.shape

def sample_coords(matrix, sample_size):
    """Draw a random sample of upper-triangle entries of ``matrix``.

    The stored upper-triangle entries are shuffled by sorting uniform random
    keys, then the first ``sample_size`` of them are returned.

    Args:
        matrix: anything ``scipy.sparse.coo_matrix`` accepts.
        sample_size: maximum number of entries to return.

    Returns:
        ``(coords, values)``: the sampled ``(row, col)`` pairs and their values.
    """
    coords, values, _ = sparse_to_tuple(sps.coo_matrix(matrix))
    # Random permutation: argsort of one uniform key per entry.
    order = np.argsort(np.random.rand(coords.shape[0]))
    shuffled_coords = coords[order]
    shuffled_values = values[order]
    return shuffled_coords[:sample_size], shuffled_values[:sample_size]

def sample_edges(adj, sample_ratio):
    """Sample positive edges and an equal number of negative (absent) edges.

    ``adj`` is treated as undirected: the sample size is
    ``sample_ratio * nnz / 2`` and both samples are taken from the upper
    triangle only.

    Args:
        adj: scipy sparse adjacency matrix (may be weighted).
        sample_ratio: fraction of the undirected edges to sample.

    Returns:
        ``(pos_edges_coords, pos_edges_values, neg_edges_coords)``.
    """
    sample_size = int(sample_ratio * adj.count_nonzero() / 2)
    pos_edges_coords, pos_edges_values = sample_coords(adj, sample_size)

    # Negative candidates are exactly the off-diagonal positions with no edge.
    # The previous ``1 - adj - I`` arithmetic is only a valid complement for a
    # binary, zero-diagonal adjacency: any edge weight other than 1, or any
    # self-loop, left a non-zero entry that was then sampled as a "negative".
    # It also relied on ``scipy.full``, a deprecated NumPy alias in the SciPy
    # root namespace.  For binary input the candidate set is unchanged.
    non_edge_mask = adj.toarray() == 0
    np.fill_diagonal(non_edge_mask, False)
    neg_adj = sps.coo_matrix(non_edge_mask.astype(np.float32))
    neg_edges_coords, _ = sample_coords(neg_adj, sample_size)
    return pos_edges_coords, pos_edges_values, neg_edges_coords
    
def calc_normalized(adj_):
    """Degree-normalise an adjacency matrix.

    With ``D`` the diagonal matrix of row sums, computes
    ``(adj_ . D^-1/2)^T . D^-1/2`` and returns it as a float32 COO matrix.

    Args:
        adj_: scipy sparse adjacency matrix.

    Returns:
        float32 ``coo`` matrix with the normalised adjacency.
    """
    degrees = np.array(adj_.sum(1))
    inv_sqrt_degree = sps.diags(np.power(degrees, -0.5).flatten())
    column_scaled = adj_.dot(inv_sqrt_degree)
    normalized = column_scaled.transpose().dot(inv_sqrt_degree)
    return normalized.tocoo().astype(np.float32)

def adj_from_snapshot(snapshot_id):
    """Load the adjacency matrix of one snapshot as a CSR matrix.

    The file read is ``snapshot_prefix + str(snapshot_id) + '.npy'``.
    """
    snapshot_path = snapshot_prefix + str(snapshot_id) + '.npy'
    return sps.csr_matrix(np.load(snapshot_path))

def load_training_snapshot(snapshot_id):
    """Load one snapshot and split its edges into train and test sets.

    A ``test_ratio`` share of the edges (plus as many negatives) is held out
    and removed from the adjacency; the remaining edges form the training
    graph, from which all positives and an equal number of negatives are
    sampled.

    Args:
        snapshot_id: zero-based snapshot index.

    Returns:
        An 8-tuple ``(adj_tensor, adj_norm_tensor, test_pos_edges,
        test_edges_values, test_neg_edges, train_pos_edges,
        train_edges_values, train_neg_edges)`` where ``adj_tensor`` is the
        dense training adjacency with self-loops (``tf.Variable``) and
        ``adj_norm_tensor`` its degree-normalised form (``tf.SparseTensor``).
    """
    adj = adj_from_snapshot(snapshot_id)
    test_pos_edges, test_edges_values, test_neg_edges = sample_edges(adj, test_ratio)
    adj_test = make_test_matrix(test_pos_edges, test_edges_values, adj.shape)

    # Remove the held-out edges so the model never sees them during training.
    adj = adj - adj_test
    adj.eliminate_zeros()
    train_pos_edges, train_edges_values, train_neg_edges = sample_edges(adj, 1)

    # The training negatives were drawn from the graph *after* the test edges
    # were removed, so they may contain held-out true edges.  Drop those so a
    # real edge is never labelled as a negative.
    held_out = set(map(tuple, test_pos_edges))
    if held_out and len(train_neg_edges):
        keep = np.array([tuple(edge) not in held_out for edge in train_neg_edges], dtype=bool)
        train_neg_edges = train_neg_edges[keep]

    adj_with_diag = adj + sps.identity(adj.shape[0], dtype=np.float32).tocsr()

    adj_tensor = tf.Variable(adj_with_diag.todense(), dtype=tf.float32)

    adj_norm = calc_normalized(adj_with_diag)
    # np.mat was removed in NumPy 2.0; build the (nnz, 2) int64 index array directly.
    indices = np.column_stack((adj_norm.row, adj_norm.col)).astype(np.int64)
    adj_norm_tensor = tf.SparseTensor(indices, adj_norm.data, adj_norm.shape)

    # The 7th element used to be test_edges_values (copy-paste slip), so callers
    # received the test values under the name train_edges_values.
    return (adj_tensor, adj_norm_tensor,
            test_pos_edges, test_edges_values, test_neg_edges,
            train_pos_edges, train_edges_values, train_neg_edges)

def auc(pe, ne, embeddings):
    """ROC-AUC of inner-product link prediction.

    Every pair is scored with ``sigmoid(<emb[a], emb[b]>)``; pairs from ``pe``
    are labelled 1 and pairs from ``ne`` are labelled 0.

    Args:
        pe: iterable of positive ``(a, b)`` node-index pairs.
        ne: iterable of negative ``(a, b)`` node-index pairs.
        embeddings: indexable collection of node embedding vectors (tensors).

    Returns:
        Area under the ROC curve.
    """
    def _link_probability(pair):
        # Sigmoid of the dot product of the two endpoint embeddings.
        left = embeddings[pair[0]]
        right = embeddings[pair[1]]
        return tf.sigmoid(tf.tensordot(left, right, 1)).numpy()

    y_pred = [_link_probability(pair) for pair in pe]
    y_true = [1] * len(y_pred)

    negative_scores = [_link_probability(pair) for pair in ne]
    y_pred.extend(negative_scores)
    y_true.extend([0] * len(negative_scores))

    fpr, tpr, _ = metrics.roc_curve(y_true=y_true, y_score=y_pred)
    return metrics.auc(fpr, tpr)

In [15]:
features = load_features_identity()

# Per-snapshot accumulators (index == snapshot id).
adj_snapshots, adj_norm_snapshots, features_snapshots = [], [], []
test_pos_edges_snapshot, test_edges_values_snapshot, test_neg_edges_snapshot = [], [], []
num_nodes_snapshot = []
train_pos_edges_snapshot, train_neg_edges_snapshot, train_edges_values_snapshot = [], [], []
# Train + test edges of each snapshot, and the edges that are new relative to
# the previous snapshot (these two lists start at snapshot 1).
pos_edges_combined, neg_edges_combined = [], []
pos_edges_incr, neg_edges_incr = [], []

for i in range(num_snapshots):
    (snapshot_adj, snapshot_adj_norm,
     test_pos_edges, test_edges_values, test_neg_edges,
     train_pos_edges, train_edges_values, train_neg_edges) = load_training_snapshot(i)
    snapshot_features = split_features(features, i)

    adj_snapshots.append(snapshot_adj)
    adj_norm_snapshots.append(snapshot_adj_norm)
    features_snapshots.append(snapshot_features)

    test_pos_edges_snapshot.append(test_pos_edges)
    test_edges_values_snapshot.append(test_edges_values)
    test_neg_edges_snapshot.append(test_neg_edges)

    train_pos_edges_snapshot.append(train_pos_edges)
    train_edges_values_snapshot.append(train_edges_values)
    train_neg_edges_snapshot.append(train_neg_edges)

    pos_edges_combined.append(np.concatenate((test_pos_edges, train_pos_edges), axis=0))
    neg_edges_combined.append(np.concatenate((test_neg_edges, train_neg_edges), axis=0))

    if i > 0:
        # Edges present in this snapshot but not in the previous one.
        current_pos = set(map(tuple, pos_edges_combined[i]))
        previous_pos = set(map(tuple, pos_edges_combined[i-1]))
        pos_inc = list(current_pos - previous_pos)

        current_neg = set(map(tuple, neg_edges_combined[i]))
        previous_neg = set(map(tuple, neg_edges_combined[i-1]))
        # Keep at most as many new negatives as there are new positives.
        neg_inc = list(current_neg - previous_neg)[:len(pos_inc)]

        pos_edges_incr.append(pos_inc)
        neg_edges_incr.append(neg_inc)

    num_nodes_snapshot.append(snapshot_adj.shape[0])

In [16]:
class FirstLayer(tf.keras.layers.Layer):
    """First graph-convolution layer: ``relu(adj_norm . (X . W0))``.

    The weight matrix is supplied by the caller rather than created here, so
    the same variable can be handed to several layers.
    """

    def __init__(self, adj_norm, shared_w0):
        super().__init__()
        self.w = shared_w0
        self.adj_norm = adj_norm

    def call(self, inputs, **kwargs):
        # inputs and adj_norm are both sparse operands of sparse_dense_matmul.
        projected = tf.sparse.sparse_dense_matmul(inputs, self.w)
        propagated = tf.sparse.sparse_dense_matmul(self.adj_norm, projected)
        return tf.nn.relu(propagated)
    
class SecondLayer(tf.keras.layers.Layer):
    """Linear graph-convolution layer: ``adj_norm . (H . W)`` with no activation.

    ``W`` has shape ``(input_dim, units)`` and is created on first use with a
    Glorot-uniform initialiser.
    """

    def __init__(self, units, adj_norm):
        super().__init__()
        self.units = units
        self.adj_norm = adj_norm
        self.training = True

    def build(self, input_shape):
        weight_shape = (input_shape[-1], self.units)
        self.w = self.add_weight(
            shape=weight_shape,
            initializer=tf.keras.initializers.glorot_uniform(),
            trainable=True,
        )

    def call(self, inputs, **kwargs):
        projected = tf.matmul(inputs, self.w)
        return tf.sparse.sparse_dense_matmul(self.adj_norm, projected)
    


class Encoder(tf.keras.Model):
    """Variational graph encoder.

    A shared first layer feeds two parallel ``SecondLayer`` heads.  The second
    head's output is exponentiated before scaling the noise, i.e. it is used
    as a log-scale.
    """

    def __init__(self, adj_norm, embedding_size, shared_w0):
        super().__init__()
        self.first_layer = FirstLayer(adj_norm, shared_w0)
        self.mean_layer = SecondLayer(embedding_size, adj_norm)
        self.std_layer = SecondLayer(embedding_size, adj_norm)

    def call(self, input_features, **kwargs):
        hidden = self.first_layer(input_features)
        means = self.mean_layer(hidden)
        stds = self.std_layer(hidden)
        # Reparameterisation: z = mean + eps * exp(std_head), eps ~ N(0, 1).
        noise = tf.random.normal(shape=means.shape)
        z = means + (noise * tf.exp(stds))
        return z, means, stds
    
class ThirdLayer(tf.keras.layers.Layer):
    """Inner-product decoder: returns ``Z . Z^T`` flattened to one dimension."""

    def __init__(self):
        super().__init__()

    def call(self, inputs, **kwargs):
        pairwise_logits = tf.matmul(inputs, inputs, transpose_b=True)
        return tf.reshape(pairwise_logits, [-1])

class Decoder(tf.keras.Model):
    """Model wrapper around the inner-product ``ThirdLayer``."""

    def __init__(self):
        super().__init__()
        self.third_layer = ThirdLayer()

    def call(self, input_features, **kwargs):
        flat_logits = self.third_layer(input_features)
        return flat_logits
    
class Autoencoder(tf.keras.Model):
    """Encoder + inner-product decoder.

    ``intermediate_size`` is accepted but not used by this class; the hidden
    width is determined by the shape of ``shared_w0``.
    """

    def __init__(self, adj_norm, intermediate_size, embedding_size, shared_w0):
        super().__init__()
        self.encoder = Encoder(adj_norm, embedding_size, shared_w0)
        self.decoder = Decoder()

    def call(self, input_features, **kwargs):
        # The decoder consumes the sampled z; means/stds are passed through for the KL term.
        z, means, stds = self.encoder(input_features)
        return self.decoder(z), means, stds

In [17]:
# One optimiser and one first-layer weight matrix are shared by all snapshots.
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
glorot_initializer = tf.keras.initializers.glorot_uniform()
shared_w0 = tf.Variable(
    initial_value=glorot_initializer(shape=(num_features, intermediate_size), dtype=tf.float32),
    trainable=True,
)

# Per-snapshot model, loss weights and flattened adjacency labels.
autoencoders, pos_weights, norms, labels = [], [], [], []

for i in range(num_snapshots):
    adj = adj_snapshots[i]
    adj_norm = adj_norm_snapshots[i]

    autoencoders.append(Autoencoder(adj_norm, intermediate_size, embedding_size, shared_w0))

    adj_sum = tf.reduce_sum(adj)
    num_entries = adj.shape[0] * adj.shape[0]
    num_zero_entries = num_entries - adj_sum
    # Ratio of zero to non-zero mass: up-weights the positive class.
    pos_weights.append(float(num_zero_entries) / adj_sum)
    # Global scale applied to the reconstruction loss.
    norms.append(num_entries / float(num_zero_entries * 2))
    labels.append(tf.reshape(adj, [-1]))

In [18]:
# Sequential training: one autoencoder per snapshot, trained in snapshot order.
# Before snapshot i is trained, the autoencoder of snapshot i-1 is evaluated on
# the edges that are new in snapshot i.
print("start training")
# Per-snapshot loss curves; each entry is the loss tensor of one epoch.
snapshot_history = defaultdict(list)
kl_loss_history = defaultdict(list)
reconstructed_loss_history = defaultdict(list)
# One AUC value per snapshot i >= 1.
auc_history = []
start_global = time.time()

# snapshot id -> KL loss; overwritten every epoch, so after a snapshot finishes
# it holds that snapshot's last-epoch value.
kl_losses = {}
for i in range(num_snapshots):
    print('step', i)
    if i > 0:
        # Evaluate the previously trained model on the current snapshot: its
        # three graph layers are re-pointed (in place) at snapshot i's
        # normalised adjacency and it is run on snapshot i's features.
        last_trained_ae = autoencoders[i-1]
        test_adj_norm = adj_norm_snapshots[i]
        test_features = features_snapshots[i]
        last_trained_ae.encoder.first_layer.adj_norm = test_adj_norm
        last_trained_ae.encoder.mean_layer.adj_norm = test_adj_norm
        last_trained_ae.encoder.std_layer.adj_norm = test_adj_norm
        # The second return value (the encoder means) serves as the embeddings.
        r, embeddings, s  = last_trained_ae(test_features)
        
        # pos_edges_incr / neg_edges_incr start at snapshot 1, hence index i-1.
        auc_score = auc(pos_edges_incr[i-1], neg_edges_incr[i-1], embeddings)
        auc_history.append(auc_score)
        print('auc score', auc_score)
    
    for epoch in range(epochs):
        start = time.time()
        with tf.GradientTape() as tape:
            autoenc = autoencoders[i]
            reconstructed, means, stds = autoenc(features_snapshots[i])
            # Class-weighted cross-entropy between the flattened logits and the
            # flattened adjacency labels, scaled by the per-snapshot norm.
            reconstruction_loss = norms[i] * tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(logits=reconstructed, labels=labels[i], pos_weight=pos_weights[i]))
            # KL-style regulariser of this snapshot; stds is exponentiated, so it
            # acts as a log-scale.  The absolute value is taken of the scaled mean.
            kl_self_loss = tf.abs((0.5 / num_nodes_snapshot[i]) * tf.reduce_mean(tf.reduce_sum(1 + 2 * stds - tf.square(means) - tf.square(tf.exp(stds)), 1)))
            kl_loss = 0
            if i == 0:
                kl_loss += kl_self_loss
            else:
                # Average the current KL term with the stored KL of each of the
                # up to l_depth preceding snapshots (walking backwards from i-1).
                for l in range(i-1, max(-1, i -1 - l_depth), -1):
                    prev_kl = kl_losses[l]
                    kl_loss += (kl_self_loss + prev_kl) / 2
            kl_losses[i] = kl_loss

            step_loss = reconstruction_loss + kl_loss
            snapshot_history[i].append(step_loss)
            kl_loss_history[i].append(kl_loss)
            reconstructed_loss_history[i].append(reconstruction_loss)
            
            # clear_output()
            if epoch % 10 == 0:
                # 'exec time' is measured here, before the gradient computation
                # and optimiser step below, so it covers the forward pass only.
                print('epoch:', epoch, 'loss', step_loss.numpy(), 'exec time', time.time() - start)
            
        # Update only the variables of the current snapshot's autoencoder.
        gradients = tape.gradient(step_loss, autoenc.trainable_variables)
        gradient_variables = zip(gradients, autoenc.trainable_variables)
        opt.apply_gradients(gradient_variables)

total = time.time() - start_global
print("elapsed: " + str(total) + " seconds")

start training
step 0
epoch: 0 loss 1.485008 exec time 0.009932994842529297
epoch: 10 loss 1.4007387 exec time 0.001994609832763672
epoch: 20 loss 0.9578406 exec time 0.001996755599975586
epoch: 30 loss 0.8286164 exec time 0.006980419158935547
epoch: 40 loss 0.7545888 exec time 0.0069828033447265625
epoch: 50 loss 0.7343683 exec time 0.0030362606048583984
epoch: 60 loss 0.677586 exec time 0.002039194107055664
epoch: 70 loss 0.72615194 exec time 0.002039194107055664
epoch: 80 loss 0.70412576 exec time 0.0029931068420410156
epoch: 90 loss 0.6665158 exec time 0.0049877166748046875
step 1
auc score 0.7881862381829835
l 0
epoch: 0 loss 1.7340312 exec time 0.010970115661621094
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
epoch: 10 loss 1.0054775 exec time 0.00494837760925293
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
epoch: 20 loss 0.6765732 exec time 0.003983497619628906
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
epoch: 30 loss 0.64786 exec time 0.004029750823974609
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0
l 0

KeyboardInterrupt: 

In [None]:
%matplotlib qt

x_axis = range(epochs)
plt.figure()
for i in range(num_snapshots-1):
    plt.plot(x_axis, snapshot_history[i], label=str(i))
plt.xlabel('Epoch')
plt.ylabel('Total Loss per autoencoder')
plt.legend(loc="upper right")
plt.title('Total loss during training for each autoencoder')


plt.figure() 
for i in range(num_snapshots-1):
    plt.plot(x_axis, reconstructed_loss_history[i], label=str(i))
plt.xlabel('Epoch')
plt.ylabel('Reconstruction Loss per autoencoder')
plt.legend(loc="upper right")
plt.title('Reconstruction loss during training')

plt.figure()
for i in range(num_snapshots-1):
    plt.plot(x_axis, kl_loss_history[i], label=str(i))
plt.xlabel('Epoch')
plt.ylabel('KL Divergence Loss')
plt.legend(loc="upper right")
plt.title('KL Divergence during training')

x_axis = range(num_snapshots-1)
plt.figure()
plt.plot(x_axis, auc_history)
plt.xlabel('Snapshot')
plt.ylabel('AUC')
plt.title("AUC during training for trainset")

y_axis = [len(e) for e in pos_edges_incr]
plt.figure()
plt.plot(x_axis, y_axis)
plt.xlabel('Snapshot')
plt.ylabel('Number of new edges')
plt.title("New edges in each snapshot")

y_axis = [len(e)/10000 for e in pos_edges_incr]
plt.figure()
plt.plot(x_axis, y_axis)
plt.plot(x_axis, auc_history)
plt.xlabel('Snapshot')
plt.ylabel('Number of new edges')
plt.title("New edges in each snapshot")

plt.show()

In [None]:
# link prediction
# last_trained_ae = autoencoders[num_snapshots-2]
# test_adj_norm = adj_norm_snapshots[num_snapshots-1]
# test_features = features_snapshots[num_snapshots-1]
# last_trained_ae.encoder.first_layer.adj_norm = test_adj_norm
# last_trained_ae.encoder.mean_layer.adj_norm = test_adj_norm
# last_trained_ae.encoder.std_layer.adj_norm = test_adj_norm
# 
# reconstructed, embeddings, stds  = last_trained_ae(test_features)

In [None]:
# last_trained_ae = autoencoders[num_snapshots-2]
# test_adj_norm = adj_norm_snapshots[num_snapshots-2]
# test_features = features_snapshots[num_snapshots-2]
# last_trained_ae.encoder.first_layer.adj_norm = test_adj_norm
# last_trained_ae.encoder.mean_layer.adj_norm = test_adj_norm
# last_trained_ae.encoder.std_layer.adj_norm = test_adj_norm
# 
# reconstructed, embeddings, stds  = last_trained_ae(test_features)

In [None]:
# def total_auc(pe, ne, embeddings):
#     y_true = []
#     y_pred = []
#     
#     for pos_edges in pe:
#         for coords in pos_edges:
#             # if coords[0] > 342 or coords[1] > 342:
#                 emb1 = embeddings[coords[0]]
#                 emb2 = embeddings[coords[1]]
#                 pred = tf.sigmoid(tf.tensordot(emb1, emb2, 1)).numpy()
#                 y_true.append(1)
#                 y_pred.append(pred)
#     
#     for neg_edges in ne:
#         for coords in neg_edges:
#             # if coords[0] > 342 or coords[1] > 342:
#                 emb1 = embeddings[coords[0]]
#                 emb2 = embeddings[coords[1]]
#                 pred = tf.sigmoid(tf.tensordot(emb1, emb2, 1)).numpy()
#                 y_true.append(0)
#                 y_pred.append(pred)
# 
#     fpr, tpr, thresholds = metrics.roc_curve(y_true=y_true, y_score=y_pred)
#     roc_auc = metrics.auc(fpr, tpr)
#     print("ROC score:")
#     print(roc_auc)
#     plt.figure()
#     lw = 2
#     plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
#     plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
#     plt.xlim([0.0, 1.0])
#     plt.ylim([0.0, 1.05])
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.title('ROC')
#     plt.legend(loc="lower right")
#     plt.show()
# 
# 
# total_auc(test_pos_edges_snapshot[:], test_neg_edges_snapshot[:], embeddings)

In [None]:
# def mse(pe, pv, ne, embeddings):
#     y_true = []
#     y_pred = []
#     
#     for i, pos_edges in enumerate(pe):
#         for j, coords in enumerate(pos_edges):
#             # if coords[0] > 342 or coords[1] > 342:
#                 emb1 = embeddings[coords[0]]
#                 emb2 = embeddings[coords[1]]
#                 pred = tf.sigmoid(tf.tensordot(emb1, emb2, 1)).numpy()
#                 y_true.append(pv[i][j])
#                 y_pred.append(pred)
#     
#     for neg_edges in ne:
#         for coords in neg_edges:
#             # if coords[0] > 342 or coords[1] > 342:
#                 emb1 = embeddings[coords[0]]
#                 emb2 = embeddings[coords[1]]
#                 pred = tf.sigmoid(tf.tensordot(emb1, emb2, 1)).numpy()
#                 y_true.append(0)
#                 y_pred.append(pred)
#     
#     y_true = np.array(y_true)
#     y_pred = np.array(y_pred)
#     mse = ((y_true - y_pred)**2).mean()
#     return mse
# 
# er = mse(test_pos_edges_snapshot[:], test_edges_values_snapshot[:], test_neg_edges_snapshot[:], embeddings)
# print(er)