In [0]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import pickle

In [0]:
path = 'edges.csv'
graph = nx.Graph()
edges = pd.read_csv(path, sep=',', header=None)
graph.add_edges_from(edges.values)
nx.info(graph)

'Name: \nType: Graph\nNumber of nodes: 10312\nNumber of edges: 333983\nAverage degree:  64.7756'

In [0]:
all_non_edges = nx.non_edges(graph)

In [0]:
# print(len(all_non_edges))

52829533


In [0]:
from similarities import *

In [0]:
from preprocessing import mask_test_edges

In [0]:
np.random.seed(0)
adj_sparse = nx.to_scipy_sparse_matrix(graph)

In [0]:
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj_sparse, test_frac=.2, val_frac=0, prevent_disconnect=False, verbose=True)

preprocessing...
generating test/val sets...
creating false test edges...
creating false val edges...
creating false train edges...
final checks for disjointness...
creating adj_train...
Done with train-test split!



In [0]:
g_train = nx.from_scipy_sparse_matrix(adj_train) # new graph object with only non-hidden edges

In [0]:
# Inspect train/test split
print("Total nodes:", adj_sparse.shape[0])
print("Total edges:", int(adj_sparse.nnz/2)) # adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
# print("Validation edges (positive):", len(val_edges))
# print("Validation edges (negative):", len(val_edges_false))
print("Test edges (positive):", len(test_edges))
print("Test edges (negative):", len(test_edges_false))

Total nodes: 10312
Total edges: 333983
Training edges (positive): 267187
Training edges (negative): 267187
Test edges (positive): 66796
Test edges (negative): 66796


In [0]:
def get_roc_score(edges_pos, edges_neg, score_matrix):
    # Store positive edge predictions, actual values
    preds_pos = []
    pos = []
    for edge in edges_pos:
        preds_pos.append(score_matrix[edge[0], edge[1]]) # predicted score
        pos.append(adj_sparse[edge[0]-1, edge[1]-1]) # actual value (1 for positive)
        
    # Store negative edge predictions, actual values
    preds_neg = []
    neg = []
    for edge in edges_neg:
        preds_neg.append(score_matrix[edge[0], edge[1]]) # predicted score
        neg.append(adj_sparse[edge[0]-1, edge[1]-1]) # actual value (0 for negative)
        
    # Calculate scores
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score

In [0]:
n = 10312
e = 333983

In [0]:
nx.info(g_train)

'Name: \nType: Graph\nNumber of nodes: 10312\nNumber of edges: 267186\nAverage degree:  51.8204'

In [0]:
# Compute Adamic-Adar indexes from g_train
aa_matrix = np.zeros((n+1,n+1))  ## make (n+1,n+1) as it is 1-indexed graph
for u, v, p in nx.adamic_adar_index(g_train): 
    aa_matrix[u][v] = p
    aa_matrix[v][u] = p # make sure it's symmetric
    
# Normalize array
aa_matrix = aa_matrix / aa_matrix.max()

In [0]:
all_non_edges = nx.non_edges(graph)  #iterator
# Calculate ROC AUC and Average Precision
aa_roc, aa_ap = get_roc_score(test_edges, all_non_edges, aa_matrix)

print('Adamic-Adar Test ROC score: ', str(aa_roc))
print('Adamic-Adar Test AP score: ', str(aa_ap))

Adamic-Adar Test ROC score:  0.9480881823221416
Adamic-Adar Test AP score:  0.08446533458695125


In [0]:
# Compute Jaccard Coefficients from g_train
jc_matrix = np.zeros((n+1,n+1))
for u, v, p in nx.jaccard_coefficient(g_train):
    jc_matrix[u][v] = p
    jc_matrix[v][u] = p # make sure it's symmetric
    
# Normalize array
jc_matrix = jc_matrix / jc_matrix.max()

In [0]:
all_non_edges = nx.non_edges(graph)  #iterator
# Calculate ROC AUC and Average Precision
jc_roc, jc_ap = get_roc_score(test_edges, all_non_edges, jc_matrix)

print('Jaccard Coefficient Test ROC score: ', str(jc_roc))
print('Jaccard Coefficient Test AP score: ', str(jc_ap))

Jaccard Coefficient Test ROC score:  0.7708421551368657
Jaccard Coefficient Test AP score:  0.0035971441279551855


In [0]:
pa_matrix = np.zeros((n+1,n+1))
for u, v, p in preferential_attachment_score(g_train): 
    pa_matrix[u][v] = p
    pa_matrix[v][u] = p # make sure it's symmetric
    
# Normalize array
pa_matrix = pa_matrix / pa_matrix.max()

In [0]:
# len(list(all_non_edges))

In [0]:
# Calculate ROC AUC and Average Precision
pa_roc, pa_ap = get_roc_score(test_edges, all_non_edges, pa_matrix)

print('Preferential Attachment Test ROC score: ', str(pa_roc))
print('Preferential Attachment Test AP score: ', str(pa_ap))

Preferential Attachment Test ROC score:  0.951258282249704
Preferential Attachment Test AP score:  0.07791663976914605


In [0]:
# Calculate, store Adamic-Index scores in array
cn_matrix = np.zeros((n+1,n+1))
for u, v, p in common_neighbors_score(g_train): 
    cn_matrix[u][v] = p
    cn_matrix[v][u] = p # make sure it's symmetric
    
# Normalize array
cn_matrix = cn_matrix / cn_matrix.max()

In [0]:
all_non_edges = nx.non_edges(graph)
# Calculate ROC AUC and Average Precision
cn_roc, cn_ap = get_roc_score(test_edges, all_non_edges, cn_matrix)

print('Common Neighbors Test ROC score: ', str(cn_roc))
print('Common Neighbors Test AP score: ', str(cn_ap))

Common Neighbors Test ROC score:  0.9435535406466976
Common Neighbors Test AP score:  0.08007766303570453


In [0]:
# Calculate, store Adamic-Index scores in array
ra_matrix = np.zeros((n+1,n+1))
for u, v, p in resource_allocation_score(g_train): 
    ra_matrix[u][v] = p
    ra_matrix[v][u] = p # make sure it's symmetric
    
# Normalize array
ra_matrix = ra_matrix / ra_matrix.max()

In [0]:
all_non_edges = nx.non_edges(graph)
# Calculate ROC AUC and Average Precision
ra_roc, ra_ap = get_roc_score(test_edges, all_non_edges, ra_matrix)

print('Resource Allocation Test ROC score: ', str(ra_roc))
print('Resource Allocation Test AP score: ', str(ra_ap))

Resource Allocation Test ROC score:  0.9538787382531045
Resource Allocation Test AP score:  0.09052325018464226


In [0]:
# Calculate, store Adamic-Index scores in array
katz_matrix = np.zeros((n+1,n+1))
for u, v, p in list_katz_scores(g_train): 
    katz_matrix[u][v] = p
    katz_matrix[v][u] = p # make sure it's symmetric
    
# Normalize array
katz_matrix = katz_matrix / katz_matrix.max()

In [0]:
all_non_edges = nx.non_edges(graph)
# Calculate ROC AUC and Average Precision
katz_roc, katz_ap = get_roc_score(test_edges, all_non_edges, katz_matrix)

print('Katz Test ROC score: ', str(katz_roc))
print('Katz Test AP score: ', str(katz_ap))

In [0]:
##katz infinite length format

In [0]:
katz_matrix2 = katz_score(g_train,beta=0.001)
katz_matrix2 = katz_matrix2 / katz_matrix2.max()

In [0]:
def get_roc_katz2(edges_pos, edges_neg, score_matrix):
    # Store positive edge predictions, actual values
    preds_pos = []
    pos = []
    for edge in edges_pos:
        preds_pos.append(score_matrix[edge[0]-1, edge[1]-1]+score_matrix[edge[1]-1, edge[0]-1]) # predicted score
        pos.append(adj_sparse[edge[0]-1, edge[1]-1]) # actual value (1 for positive)
        
    # Store negative edge predictions, actual values
    preds_neg = []
    neg = []
    for edge in edges_neg:
        preds_neg.append(score_matrix[edge[0]-1, edge[1]-1]+score_matrix[edge[1]-1, edge[0]-1]) # predicted score
        neg.append(adj_sparse[edge[0]-1, edge[1]-1]) # actual value (0 for negative)
        
    # Calculate scores
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])
#     print(labels_all)
#     print(len(preds_pos), len(preds_neg), len(edges_neg))
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score

In [0]:
# Calculate ROC AUC and Average Precision
katz_roc2, katz_ap2 = get_roc_katz2(test_edges, all_non_edges, katz_matrix2)

print('Katz2 Test ROC score: ', str(katz_roc2))
print('Katz2 Test AP score: ', str(katz_ap2))

Katz2 Test ROC score:  0.8188314501870374
Katz2 Test AP score:  0.008561477488748583


In [0]:
def rpr_matrix(graph, alpha=0.85):
    D = graph.to_directed()
    H = nx.stochastic_graph(D)
    H = nx.to_numpy_matrix(H).transpose()
    I = np.eye(H.shape[0])
    S = alpha*np.linalg.inv(I - (1-alpha)*H)
    return S

In [0]:
pr_matrix2 = rpr_matrix(g_train)
pr_matrix2 = pr_matrix2 / pr_matrix2.max()

In [0]:
# Calculate ROC AUC and Average Precision
pr_roc2, pr_ap2 = get_roc_katz2(test_edges, all_non_edges, pr_matrix2)

print('Rooted Pagerank Test ROC score: ', str(pr_roc2))
print('Rooted Pagerank Test AP score: ', str(pr_ap2))

Rooted Pagerank Test ROC score:  0.7885410197413326
Rooted Pagerank Test AP score:  0.005347762206385155
