In [0]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import random

In [0]:
def load_blog_dataset(path='edges.csv'):
    """Build an undirected graph from a headerless, comma-separated edge list.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to 'edges.csv' (resolved against
        the current working directory), which is what the function always
        read before this parameter existed.

    Returns
    -------
    networkx.Graph
        Graph holding one edge per CSV row. Node labels are whatever values
        pandas parsed from the file (presumably integer ids - confirm
        against the data file).
    """
    edge_table = pd.read_csv(path, sep=',', header=None)
    G = nx.Graph()
    # Each row of the underlying array is handed to networkx as one edge.
    G.add_edges_from(edge_table.values)
    return G

In [0]:
# Load the full edge list into an undirected networkx graph.
graph = load_blog_dataset()

In [0]:
# Summary string (node count, edge count, average degree) displayed as the cell output.
# NOTE(review): nx.info is not available in NetworkX 3.x - confirm the pinned version.
nx.info(graph)

'Name: \nType: Graph\nNumber of nodes: 10312\nNumber of edges: 333983\nAverage degree:  64.7756'

In [0]:
from similarities import *

In [0]:
# Seeds NumPy's global RNG only; Python's built-in `random` module keeps its own, separate state.
np.random.seed(0)
# Sparse adjacency matrix of the loaded graph.
# NOTE(review): adj_sparse is not referenced again in the visible cells - confirm it is still needed.
adj_sparse = nx.to_scipy_sparse_matrix(graph)

In [0]:
# Every existing edge of the graph as a list; these are the positive examples scored below.
edges = list(nx.edges(graph))

In [0]:
# Number of positive examples (displayed as the cell output).
len(edges)

333983

In [0]:
# Materialise every unconnected node pair as a list so it can be sampled from.
# The recorded run produced roughly 52.8 million pairs, so this cell is memory-hungry.
non_edges = list(nx.non_edges(graph))
len(non_edges)

52829533

In [0]:
# random.sample draws from Python's built-in RNG, which np.random.seed does not touch,
# so seed it here to make the negative sample reproducible when this cell is re-run.
random.seed(0)
# Draw as many unconnected pairs as there are real edges, giving a balanced evaluation set.
selected_new_edges = random.sample(non_edges, len(edges))

In [0]:
# Number of sampled unconnected pairs (displayed as the cell output).
len(selected_new_edges)

333983

In [0]:
# Working graph: every original edge plus the sampled pairs, all inserted as ordinary edges.
# Insertion order fixes the order of g_train.nodes(), which later cells use as matrix indices,
# so the two add_edges_from calls must stay in this order.
# NOTE(review): the sampled pairs are present in g_train as edges when the similarity scores
# are computed on it - confirm this is the intended experimental setup.
g_train = nx.Graph()
g_train.add_edges_from(graph.edges)
g_train.add_edges_from(selected_new_edges)
# Summary string displayed as the cell output.
nx.info(g_train)

'Name: \nType: Graph\nNumber of nodes: 10312\nNumber of edges: 667966\nAverage degree: 129.5512'

In [0]:
# Node labels in g_train's iteration order; a node's position in this list is used as its
# row/column index by rp_score and Katz_score.
nodes = list(g_train.nodes())

In [0]:
# Ground-truth labels in the order predictions are concatenated below:
# 1 for each real edge, then 0 for each sampled unconnected pair.
# Sizes are derived from the lists themselves so they cannot drift from the data.
labels_all = [1] * len(edges) + [0] * len(selected_new_edges)

In [0]:
# Preferential-attachment scores: real edges first, sampled pairs second,
# matching the ones-then-zeros layout of labels_all.
actual_edge_scores = [PA_score(g_train, u, v) for u, v in edges]
new_edge_scores = [PA_score(g_train, u, v) for u, v in selected_new_edges]

preds_all = actual_edge_scores + new_edge_scores
roc_score = roc_auc_score(labels_all, preds_all)
ap_score = average_precision_score(labels_all, preds_all)

In [0]:
# Report the metrics computed in the previous cell (print stringifies the values itself).
print("Preferential attachment model")
print('Test ROC score: ', roc_score)
print('Test AP score: ', ap_score)

Preferential attachment model
Test ROC score:  0.9487277655690727
Test AP score:  0.9486412574388965


In [0]:
# Adamic-Adar scores: real edges first, sampled pairs second,
# matching the ones-then-zeros layout of labels_all.
actual_edge_scores = [AA_score(g_train, u, v) for u, v in edges]
new_edge_scores = [AA_score(g_train, u, v) for u, v in selected_new_edges]

preds_all = actual_edge_scores + new_edge_scores
roc_score = roc_auc_score(labels_all, preds_all)
ap_score = average_precision_score(labels_all, preds_all)

In [0]:
# Report the metrics computed in the previous cell (print stringifies the values itself).
print("Adamic adar model")
print('Test ROC score: ', roc_score)
print('Test AP score: ', ap_score)

Adamic adar model
Test ROC score:  0.9582366119978797
Test AP score:  0.9579478837906896


In [0]:
# Common-neighbours scores: real edges first, sampled pairs second,
# matching the ones-then-zeros layout of labels_all.
actual_edge_scores = [CN_score(g_train, u, v) for u, v in edges]
new_edge_scores = [CN_score(g_train, u, v) for u, v in selected_new_edges]

preds_all = actual_edge_scores + new_edge_scores
roc_score = roc_auc_score(labels_all, preds_all)
ap_score = average_precision_score(labels_all, preds_all)

In [0]:
# Report the metrics computed in the previous cell (print stringifies the values itself).
print("Common Neighbors model")
print('Test ROC score: ', roc_score)
print('Test AP score: ', ap_score)

Common Neighbors model
Test ROC score:  0.9565612250468415
Test AP score:  0.953688523695042


In [0]:
# Resource-allocation scores: real edges first, sampled pairs second,
# matching the ones-then-zeros layout of labels_all.
actual_edge_scores = [RA_score(g_train, u, v) for u, v in edges]
new_edge_scores = [RA_score(g_train, u, v) for u, v in selected_new_edges]

preds_all = actual_edge_scores + new_edge_scores
roc_score = roc_auc_score(labels_all, preds_all)
ap_score = average_precision_score(labels_all, preds_all)

In [0]:
# Report the metrics computed in the previous cell (print stringifies the values itself).
print("Resource Allocation model")
print('Test ROC score: ', roc_score)
print('Test AP score: ', ap_score)

Resource Allocation model
Test ROC score:  0.9456238803919146
Test AP score:  0.9495381807578173


In [0]:
# Jaccard-coefficient scores: real edges first, sampled pairs second,
# matching the ones-then-zeros layout of labels_all.
actual_edge_scores = [JC_score(g_train, u, v) for u, v in edges]
new_edge_scores = [JC_score(g_train, u, v) for u, v in selected_new_edges]

preds_all = actual_edge_scores + new_edge_scores
roc_score = roc_auc_score(labels_all, preds_all)
ap_score = average_precision_score(labels_all, preds_all)

In [0]:
# Report the metrics computed in the previous cell (print stringifies the values itself).
print("Jaccard Coef model")
print('Test ROC score: ', roc_score)
print('Test AP score: ', ap_score)

Jaccard Coef model
Test ROC score:  0.874270467843672
Test AP score:  0.8753410760664072


In [0]:
# Rooted-PageRank matrix for g_train; rp_score reads it as rpr_mat[i, j].
# rpr_matrix is not defined in this notebook (presumably it comes from the star-imported
# `similarities` module - confirm its row/column ordering matches `nodes`).
rpr_mat = rpr_matrix(g_train)

In [0]:
# Dictionary form of the rooted-PageRank scores (presumably keyed by node pair - verify).
# NOTE(review): no live code in the visible cells reads rp_dict; confirm it is still needed.
rp_dict = rpr_dict(g_train)

In [0]:
# Position of every node in `nodes`, built once so each lookup is a dict hit
# instead of a linear scan of the list on every call.
_RPR_NODE_POS = {node: pos for pos, node in enumerate(nodes)}


def rp_score(u,v):
  """Rooted-PageRank similarity of the node pair (u, v).

  Both nodes are translated to their positions in ``nodes`` and the smaller of
  the two directed entries ``rpr_mat[i, j]`` and ``rpr_mat[j, i]`` is returned.
  Assumes the rows/columns of ``rpr_mat`` follow the order of ``nodes`` -
  TODO confirm against rpr_matrix.

  Variants tried earlier and left disabled in the notebook: ``mi*mi``,
  ``mi + 2*ma`` (``ma`` being the larger directed entry), and the minimum of
  the two ``rp_dict`` entries for the pair.

  Raises KeyError when a node is not in ``nodes`` (the earlier list-based
  lookup raised ValueError in that situation).
  """
  i = _RPR_NODE_POS[u]
  j = _RPR_NODE_POS[v]
  return min(rpr_mat[i, j], rpr_mat[j, i])

In [0]:
# Rooted-PageRank scores: real edges first, sampled pairs second,
# matching the ones-then-zeros layout of labels_all.
actual_edge_scores = [rp_score(u, v) for u, v in edges]
new_edge_scores = [rp_score(u, v) for u, v in selected_new_edges]

# The raw scores go straight to the metrics. ROC AUC and average precision depend only on
# the ordering of the scores, so the former division by sum(preds_all) changed nothing when
# the sum was positive, reversed the ranking when it was negative, and raised
# ZeroDivisionError when it was zero.
# NOTE(review): the previously recorded run reported ROC ~0.07, far below chance, which
# suggests an inverted ranking - verify the sign/orientation of rpr_mat values.
preds_all = actual_edge_scores + new_edge_scores
roc_score = roc_auc_score(labels_all, preds_all)
ap_score = average_precision_score(labels_all, preds_all)

In [0]:
# Report the metrics computed in the previous cell (print stringifies the values itself).
print("Rooted Pagerank model")
print('Test ROC score: ', roc_score)
print('Test AP score: ', ap_score)

Rooted Pagerank model
Test ROC score:  0.07068108283686993
Test AP score:  0.3145659049069807


In [0]:
def katz_score(graph,beta=0.004):
    """Return the Katz similarity matrix ``S = inv(I - beta*A) - I`` for ``graph``.

    Parameters
    ----------
    graph : networkx graph
        Graph whose dense adjacency matrix ``A`` is used; rows and columns of
        the result follow the node order networkx uses for that matrix.
    beta : float, optional
        Attenuation factor. It should be smaller than ``1/lambda1``, where
        ``lambda1`` is the adjacency eigenvalue of largest magnitude. That
        condition is NOT checked here (an eigenvalue-based check was written
        but left disabled in the notebook), so choosing a valid ``beta`` is
        the caller's responsibility.

    Returns
    -------
    numpy.matrix
        Square matrix of pairwise Katz scores, one row/column per node.
    """
    # to_numpy_array + asmatrix yields the same np.matrix that nx.to_numpy_matrix
    # produced, without relying on the helper that NetworkX 3.0 removed.
    A = np.asmatrix(nx.to_numpy_array(graph))
    I = np.eye(A.shape[0])
    # Dense inversion of an n x n matrix: memory and time grow quickly with node count.
    S = np.linalg.inv(I - beta * A) - I
    return S

In [0]:
# Katz matrix for g_train with beta=0.001, overriding the function's default of 0.004.
# This inverts a dense matrix with one row per node of g_train, so it is expensive.
katz_values = katz_score(g_train,beta=0.001)

In [0]:
# Position of every node in `nodes`, built once so each lookup is a dict hit
# instead of a linear scan of the list on every call.
_KATZ_NODE_POS = {node: pos for pos, node in enumerate(nodes)}


def Katz_score(u,v):
  """Katz similarity of the node pair (u, v).

  Both nodes are translated to their positions in ``nodes`` and the two
  directed entries ``katz_values[i, j]`` and ``katz_values[j, i]`` are summed.

  Raises KeyError when a node is not in ``nodes`` (the earlier list-based
  lookup raised ValueError in that situation).
  """
  i = _KATZ_NODE_POS[u]
  j = _KATZ_NODE_POS[v]
  return katz_values[i, j] + katz_values[j, i]

In [0]:
# Katz scores: real edges first, sampled pairs second,
# matching the ones-then-zeros layout of labels_all.
actual_edge_scores = [Katz_score(u, v) for u, v in edges]
new_edge_scores = [Katz_score(u, v) for u, v in selected_new_edges]

preds_all = actual_edge_scores + new_edge_scores
roc_score = roc_auc_score(labels_all, preds_all)
ap_score = average_precision_score(labels_all, preds_all)

In [0]:
# Report the metrics computed in the previous cell (print stringifies the values itself).
print("Katz model")
print('Test ROC score: ', roc_score)
print('Test AP score: ', ap_score)

Katz model
Test ROC score:  0.9573414367643536
Test AP score:  0.9566438716965094
