In [0]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

In [0]:
def load_restaurant_dataset(path='dataset_ubicomp2013_checkins.txt'):
    """Build an undirected co-occurrence graph from a two-column check-in file.

    Each non-blank line of the file must start with two whitespace-separated
    integer fields; any further fields are ignored.  The first field identifies
    a "top" node and the second a "bottom" node (presumably user id and venue
    id -- verify against the dataset description).  Two top nodes are connected
    in the returned graph when they share at least one bottom node.

    Parameters
    ----------
    path : str, optional
        Location of the check-in file.  Defaults to the file name that was
        previously hard-coded, so existing zero-argument calls are unaffected.

    Returns
    -------
    networkx.Graph
        Graph over the top nodes, labelled 0 .. k-1, with no self-loops and an
        edge weight of 1.0 on every edge.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two fields.
    ValueError
        If one of the first two fields is not an integer.
    """
    top_ids = set()
    bottom_ids = set()
    edges = []
    # The context manager closes the file even if parsing fails part-way.
    with open(path, 'r') as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # The id transform is kept from the original implementation: the
            # iteration order of the set below decides which matrix row (and
            # therefore which graph node label) each id receives.
            u = -1 * int(fields[0]) - 10
            v = int(fields[1])
            top_ids.add(u)
            bottom_ids.add(v)
            edges.append((u, v))

    # Map raw ids to consecutive row / column indices.
    top_index = {node: idx for idx, node in enumerate(top_ids)}
    bottom_index = {node: idx for idx, node in enumerate(bottom_ids)}

    # Dense bi-adjacency matrix: rows are top nodes, columns are bottom nodes.
    biadjacency = np.zeros((len(top_ids), len(bottom_ids)))
    for u, v in edges:
        biadjacency[top_index[u], bottom_index[v]] = 1

    # Entry (i, j) of the product counts the bottom nodes shared by i and j.
    shared = np.dot(biadjacency, biadjacency.T)

    # Binarise to a simple undirected adjacency matrix and drop self-loops.
    A = (shared > 0).astype(float)
    np.fill_diagonal(A, 0)

    G = nx.from_numpy_matrix(A)
    return G

In [0]:
# Build the graph from the check-in file; the file is read from the current working directory.
graph = load_restaurant_dataset()

In [6]:
# Show a short text summary of the graph (type, node count, edge count, average degree).
nx.info(graph)

'Name: \nType: Graph\nNumber of nodes: 2060\nNumber of edges: 58810\nAverage degree:  57.0971'

In [0]:
from similarities import *
from preprocessing import mask_test_edges

In [0]:
# Seed NumPy's global random generator before splitting the edges.
# NOTE(review): this only makes the split repeatable if mask_test_edges draws
# from np.random -- confirm in preprocessing.py.
np.random.seed(0)
# Adjacency matrix of the full graph as a SciPy sparse matrix.
adj_sparse = nx.to_scipy_sparse_matrix(graph)

In [9]:
# Split the edges into train / validation / test sets, each with a matching
# list of false (non-existent) edges.  20% of the edges are requested for
# testing and none for validation.
(
    adj_train,
    train_edges,
    train_edges_false,
    val_edges,
    val_edges_false,
    test_edges,
    test_edges_false,
) = mask_test_edges(
    adj_sparse,
    test_frac=.2,
    val_frac=0,
    prevent_disconnect=False,
    verbose=True,
)

preprocessing...
generating test/val sets...
creating false test edges...
creating false val edges...
creating false train edges...
final checks for disjointness...
creating adj_train...
Done with train-test split!



In [0]:
# Training graph: rebuilt from adj_train, the adjacency matrix returned by mask_test_edges.
g_train = nx.from_scipy_sparse_matrix(adj_train) # new graph object with only non-hidden edges

In [11]:
# Inspect train/test split.
# Validation counts are not reported: the split was requested with val_frac=0.
split_summary = [
    ("Total nodes:", adj_sparse.shape[0]),
    # adj is symmetric, so nnz (num non-zero) = 2*num_edges
    ("Total edges:", int(adj_sparse.nnz/2)),
    ("Training edges (positive):", len(train_edges)),
    ("Training edges (negative):", len(train_edges_false)),
    ("Test edges (positive):", len(test_edges)),
    ("Test edges (negative):", len(test_edges_false)),
]
for label, count in split_summary:
    print(label, count)

Total nodes: 2060
Total edges: 58810
Training edges (positive): 47048
Training edges (negative): 47048
Test edges (positive): 11762
Test edges (negative): 11762


In [0]:
def get_roc_score(edges_pos, edges_neg, score_matrix):
    """Score a link predictor with ROC AUC and average precision.

    Parameters
    ----------
    edges_pos : iterable of pairs
        Node-index pairs that are true edges; each is labelled 1.
    edges_neg : iterable of pairs
        Node-index pairs that are not edges; each is labelled 0.
    score_matrix : 2-D array-like
        Object supporting ``score_matrix[i, j]`` lookups that returns the
        predicted score for the pair (i, j).

    Returns
    -------
    (roc_score, ap_score) : tuple
        Results of ``roc_auc_score`` and ``average_precision_score`` on the
        combined positive and negative predictions.
    """
    # Labels come purely from which list an edge is in, so the adjacency
    # matrix is not consulted.  (The previous version filled two unused lists
    # with one lookup per edge into the global `adj_sparse`.)
    preds_pos = [score_matrix[edge[0], edge[1]] for edge in edges_pos]
    preds_neg = [score_matrix[edge[0], edge[1]] for edge in edges_neg]

    # Positives first, then negatives; labels are laid out in the same order.
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])

    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score

In [0]:
# Node and edge counts, read from the full adjacency matrix rather than
# hard-coded so they stay correct if the dataset changes.
n = adj_sparse.shape[0]
e = adj_sparse.nnz // 2  # the symmetric matrix stores every undirected edge twice

In [0]:
# Adamic-Adar scores, computed on the training graph only
aa_matrix = np.zeros((n, n))
for i, j, score in adamic_adar_score(g_train):
    # write both (i, j) and (j, i) so the matrix is symmetric
    aa_matrix[i, j] = aa_matrix[j, i] = score

# Divide by the largest score to normalize the array
aa_matrix = aa_matrix / aa_matrix.max()

In [15]:
# Evaluate the Adamic-Adar scores on the held-out test edges (ROC AUC and average precision)
aa_roc, aa_ap = get_roc_score(test_edges, test_edges_false, aa_matrix)

print('Adamic-Adar Test ROC score: ', aa_roc)
print('Adamic-Adar Test AP score: ', aa_ap)

Adamic-Adar Test ROC score:  0.9665555502098081
Adamic-Adar Test AP score:  0.9667062088924903


In [0]:
# Jaccard coefficient scores, computed on the training graph only
jc_matrix = np.zeros((n, n))
for i, j, score in jaccard_coefficient_score(g_train):
    # write both (i, j) and (j, i) so the matrix is symmetric
    jc_matrix[i, j] = jc_matrix[j, i] = score

# Divide by the largest score to normalize the array
jc_matrix = jc_matrix / jc_matrix.max()

In [17]:
# Evaluate the Jaccard coefficient scores on the held-out test edges (ROC AUC and average precision)
jc_roc, jc_ap = get_roc_score(test_edges, test_edges_false, jc_matrix)

print('Jaccard Coefficient Test ROC score: ', jc_roc)
print('Jaccard Coefficient Test AP score: ', jc_ap)

Jaccard Coefficient Test ROC score:  0.9692547439711509
Jaccard Coefficient Test AP score:  0.9725807684374088


In [0]:
# Preferential attachment scores, computed on the training graph only
pa_matrix = np.zeros((n, n))
for i, j, score in preferential_attachment_score(g_train):
    # write both (i, j) and (j, i) so the matrix is symmetric
    pa_matrix[i, j] = pa_matrix[j, i] = score

# Divide by the largest score to normalize the array
pa_matrix = pa_matrix / pa_matrix.max()

In [64]:
# Evaluate the preferential attachment scores on the held-out test edges (ROC AUC and average precision)
pa_roc, pa_ap = get_roc_score(test_edges, test_edges_false, pa_matrix)

print('Preferential Attachment Test ROC score: ', pa_roc)
print('Preferential Attachment Test AP score: ', pa_ap)

Preferential Attachment Test ROC score:  0.8415557851303591
Preferential Attachment Test AP score:  0.8347541889534089


In [0]:
# Common neighbors scores, computed on the training graph only
cn_matrix = np.zeros((n, n))
for i, j, score in common_neighbors_score(g_train):
    # write both (i, j) and (j, i) so the matrix is symmetric
    cn_matrix[i, j] = cn_matrix[j, i] = score

# Divide by the largest score to normalize the array
cn_matrix = cn_matrix / cn_matrix.max()

In [23]:
# Evaluate the common neighbors scores on the held-out test edges (ROC AUC and average precision)
cn_roc, cn_ap = get_roc_score(test_edges, test_edges_false, cn_matrix)

print('Common Neighbors Test ROC score: ', cn_roc)
print('Common Neighbors Test AP score: ', cn_ap)

Common Neighbors Test ROC score:  0.9595000837184562
Common Neighbors Test AP score:  0.9552215888598393


In [0]:
# Resource allocation scores, computed on the training graph only
ra_matrix = np.zeros((n, n))
for i, j, score in resource_allocation_score(g_train):
    # write both (i, j) and (j, i) so the matrix is symmetric
    ra_matrix[i, j] = ra_matrix[j, i] = score

# Divide by the largest score to normalize the array
ra_matrix = ra_matrix / ra_matrix.max()

In [27]:
# Evaluate the resource allocation scores on the held-out test edges (ROC AUC and average precision)
ra_roc, ra_ap = get_roc_score(test_edges, test_edges_false, ra_matrix)

print('Resource Allocation Test ROC score: ', ra_roc)
print('Resource Allocation Test AP score: ', ra_ap)

Resource Allocation Test ROC score:  0.976725203037134
Resource Allocation Test AP score:  0.978957596709797


In [0]:
# Rooted PageRank scores (triples yielded by list_rpr_scores), computed on the training graph only
pr_matrix = np.zeros((n, n))
for i, j, score in list_rpr_scores(g_train):
    # write both (i, j) and (j, i) so the matrix is symmetric
    pr_matrix[i, j] = pr_matrix[j, i] = score

# Divide by the largest score to normalize the array
pr_matrix = pr_matrix / pr_matrix.max()

In [34]:
# Evaluate the rooted PageRank scores on the held-out test edges (ROC AUC and average precision)
pr_roc, pr_ap = get_roc_score(test_edges, test_edges_false, pr_matrix)

print('Rooted Pagerank Test ROC score: ', pr_roc)
print('Rooted Pagerank Test AP score: ', pr_ap)

Rooted Pagerank Test ROC score:  0.9597723927787186
Rooted Pagerank Test AP score:  0.958078639156332


In [0]:
# Katz scores (triples yielded by list_katz_scores), computed on the training graph only
katz_matrix = np.zeros((n, n))
for i, j, score in list_katz_scores(g_train):
    # write both (i, j) and (j, i) so the matrix is symmetric
    katz_matrix[i, j] = katz_matrix[j, i] = score

# Divide by the largest score to normalize the array
katz_matrix = katz_matrix / katz_matrix.max()

In [33]:
# Evaluate the Katz scores on the held-out test edges (ROC AUC and average precision)
katz_roc, katz_ap = get_roc_score(test_edges, test_edges_false, katz_matrix)

print('Katz Test ROC score: ', katz_roc)
print('Katz Test AP score: ', katz_ap)

Katz Test ROC score:  0.955391688311403
Katz Test AP score:  0.9559698862020785


In [0]:
## Katz score: infinite path length formulation

In [0]:
def get_roc_katz2(edges_pos, edges_neg, score_matrix):  # for katz and pr matrix scores(2 sided score)
    """Score a link predictor whose score matrix may be asymmetric.

    The prediction for a pair (i, j) is
    ``score_matrix[i, j] + score_matrix[j, i]``, so both directions
    contribute.  Intended for the matrix-form Katz and rooted PageRank scores.

    Parameters
    ----------
    edges_pos : iterable of pairs
        Node-index pairs that are true edges; each is labelled 1.
    edges_neg : iterable of pairs
        Node-index pairs that are not edges; each is labelled 0.
    score_matrix : 2-D array-like
        Object supporting ``score_matrix[i, j]`` lookups.

    Returns
    -------
    (roc_score, ap_score) : tuple
        Results of ``roc_auc_score`` and ``average_precision_score`` on the
        combined positive and negative predictions.
    """
    # Labels come purely from which list an edge is in, so the adjacency
    # matrix is not consulted.  (The previous version filled two unused lists
    # with one lookup per edge into the global `adj_sparse`.)
    preds_pos = [
        score_matrix[edge[0], edge[1]] + score_matrix[edge[1], edge[0]]
        for edge in edges_pos
    ]
    preds_neg = [
        score_matrix[edge[0], edge[1]] + score_matrix[edge[1], edge[0]]
        for edge in edges_neg
    ]

    # Positives first, then negatives; labels are laid out in the same order.
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])

    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score

In [0]:
# Katz scores in matrix form, computed on the training graph with beta=0.001.
# NOTE(review): katz_score is not defined in this notebook (presumably it comes
# from the `similarities` star import); its exact return type is not visible here.
katz_matrix2 = katz_score(g_train,beta=0.001)
# Normalize by the largest entry.
katz_matrix2 = katz_matrix2 / katz_matrix2.max()

In [61]:
# Evaluate the matrix-form Katz scores on the held-out test edges,
# summing both directions of each pair (ROC AUC and average precision)
katz_roc2, katz_ap2 = get_roc_katz2(test_edges, test_edges_false, katz_matrix2)

print('Katz2 Test ROC score: ', katz_roc2)
print('Katz2 Test AP score: ', katz_ap2)

Katz2 Test ROC score:  0.9553909582506136
Katz2 Test AP score:  0.9559683511903705


In [0]:
def rpr_matrix(graph, alpha=0.85):
    """Return the rooted PageRank score matrix of ``graph`` in closed form.

    The graph is converted to a directed graph and passed through
    ``nx.stochastic_graph``; with ``H`` the transpose of the resulting
    matrix, the function returns ``alpha * inv(I - (1 - alpha) * H)``.

    Parameters
    ----------
    graph : networkx graph
        Graph to score; it is not modified (a directed copy is used).
    alpha : float, optional
        Scalar used in the formula above (default 0.85).

    Returns
    -------
    numpy.matrix
        Square matrix with one row and column per node.
    """
    directed = graph.to_directed()
    stochastic = nx.stochastic_graph(directed)
    # Transposed so that the matrix matches the H used in the formula above.
    transition = nx.to_numpy_matrix(stochastic).transpose()
    identity = np.eye(transition.shape[0])
    return alpha * np.linalg.inv(identity - (1 - alpha) * transition)

In [0]:
# Rooted PageRank scores in matrix form, computed on the training graph with the default alpha.
pr_matrix2 = rpr_matrix(g_train)
# Normalize by the largest entry.
pr_matrix2 = pr_matrix2 / pr_matrix2.max()

In [62]:
# Evaluate the matrix-form rooted PageRank scores on the held-out test edges,
# summing both directions of each pair (ROC AUC and average precision)
pr_roc2, pr_ap2 = get_roc_katz2(test_edges, test_edges_false, pr_matrix2)

print('Rooted Pagerank Test ROC score: ', pr_roc2)
print('Rooted Pagerank Test AP score: ', pr_ap2)

Rooted Pagerank Test ROC score:  0.9758252368628019
Rooted Pagerank Test AP score:  0.9741201118885143
