In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import networkx as nx
from networkx.algorithms import bipartite
# import community
from networkx.readwrite import json_graph
# import nx_altair as nxa
from networkx.algorithms.community import greedy_modularity_communities
from pyvis import network as net
# from node2vec import Node2Vec
import altair as alt
import matplotlib.pyplot as plt
import scipy.sparse as sp
import numpy as np
import itertools
import collections
from tqdm.notebook import trange, tqdm
tqdm.pandas()
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, Markdown, HTML
import sys
sys.path.append("..")
from bigraph.predict import pa_predict, jc_predict, cn_predict,aa_predict, katz_predict
from bigraph.evaluation import evaluation
from network_analysis.birankpy import BipartiteNetwork
from network_analysis.load_datasets import get_updated_shxco_data
from network_analysis.generate_network_metrics import *
from network_analysis.create_networks import *
from network_analysis.read_write_networks import * 
from network_analysis.link_prediction import * 
# Load the four dataset frames returned by the project loader
# (called with get_subscription=False).
# NOTE(review): none of members_df, books_df, borrow_events or events_df is
# referenced by the cells visible in this section — confirm the load is still
# needed further down before keeping it.
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)


In [9]:
# Read the projected members graph and tabulate its node attributes
# (one row per node, one column per attribute).
g = nx.read_gexf("./data/borrow_events_unipartite_projected_members_graph.gexf")
nodes_df = pd.DataFrame.from_dict(dict(g.nodes(data=True)), orient='index')

# Rename nodes from their `label` attribute to their `uri` attribute.
# NOTE(review): relabel_nodes matches mapping keys against node identifiers,
# so this presumes each node's id equals its `label` — confirm for this GEXF.
label_dict = dict(zip(nodes_df['label'], nodes_df['uri']))
graph = nx.relabel_nodes(g, label_dict)

# Adjacency matrix of the relabelled graph.
adj = nx.adjacency_matrix(graph)


In [4]:
# Edge list of the projected members graph; keep untouched copies of the
# endpoint columns so `source` / `target` can be rewritten later.
edgelist = pd.read_csv(
    './data/borrow_events_unipartite_projected_members_edgelist.csv'
).assign(
    original_source=lambda frame: frame['source'],
    original_target=lambda frame: frame['target'],
)

# Matching node list for the same graph.
nodelist = pd.read_csv('./data/borrow_events_unipartite_projected_members_nodelist.csv')


In [7]:
# Replace each edge's `source` with the uri of the node whose node_id matches
# `original_source`.
#
# The previous iterrows() loop had no effect: iterrows() yields a copy of each
# row, so assigning to `row['source']` never reached `edgelist`, and the value
# assigned was a filtered Series rather than a single uri.
#
# Duplicated node_ids keep their first uri (Series.map needs a unique index);
# ids missing from nodelist become NaN.
# NOTE(review): only `source` is rewritten, as in the original cell — `target`
# presumably needs the same treatment via `original_target`; confirm.
node_id_to_uri = nodelist.drop_duplicates('node_id').set_index('node_id')['uri']
edgelist['source'] = edgelist['original_source'].map(node_id_to_uri)

In [12]:
np.random.seed(0)  # make sure train-test split is consistent between notebooks

# Sparse adjacency matrix of `graph`, used as input to the train/test split.
# nx.to_scipy_sparse_matrix was deprecated in NetworkX 2.7 and removed in 3.0.
# Use it when present (identical behaviour to before); otherwise build the
# sparse array and wrap it in a csr_matrix so downstream code still receives
# a scipy sparse *matrix*.
if hasattr(nx, "to_scipy_sparse_matrix"):
    adj_sparse = nx.to_scipy_sparse_matrix(graph)
else:
    adj_sparse = sp.csr_matrix(nx.to_scipy_sparse_array(graph))


In [13]:
# Split the edges of adj_sparse into train / validation / test sets
# (test_frac=.3, val_frac=.1), each returned with a matching set of
# "false" edges, plus the adjacency matrix of the training graph.
(
    adj_train,
    train_edges, train_edges_false,
    val_edges, val_edges_false,
    test_edges, test_edges_false,
) = mask_test_edges(adj_sparse, test_frac=.3, val_frac=.1, verbose=True)


preprocessing...
generating test/val sets...
creating false test edges...
creating false val edges...
creating false train edges...
final checks for disjointness...
creating adj_train...
Done with train-test split!



In [14]:
# New graph object with only the non-hidden (training) edges; its nodes are
# the integer row/column indices of adj_train.
# nx.from_scipy_sparse_matrix was deprecated in NetworkX 2.7 and removed in
# 3.0, so fall back to from_scipy_sparse_array when it is no longer available.
if hasattr(nx, "from_scipy_sparse_matrix"):
    g_train = nx.from_scipy_sparse_matrix(adj_train)
else:
    g_train = nx.from_scipy_sparse_array(adj_train)


In [15]:
# Inspect train/test split: one (caption, count) pair per printed line.
split_counts = (
    ("Total nodes:", adj_sparse.shape[0]),
    # adj is symmetric, so nnz (num non-zero) = 2*num_edges
    ("Total edges:", int(adj_sparse.nnz/2)),
    ("Training edges (positive):", len(train_edges)),
    ("Training edges (negative):", len(train_edges_false)),
    ("Validation edges (positive):", len(val_edges)),
    ("Validation edges (negative):", len(val_edges_false)),
    ("Test edges (positive):", len(test_edges)),
    ("Test edges (negative):", len(test_edges_false)),
)
for caption, count in split_counts:
    print(caption, count)


Total nodes: 528
Total edges: 19592
Training edges (positive): 11756
Training edges (negative): 11756
Validation edges (positive): 1959
Validation edges (negative): 1959
Test edges (positive): 5877
Test edges (negative): 5877


In [16]:
def get_roc_score(edges_pos, edges_neg, score_matrix):
    """Evaluate a link-prediction score matrix on held-out edges.

    Parameters
    ----------
    edges_pos : iterable of index pairs
        Edges treated as positives (label 1). Each item is read as
        ``edge[0], edge[1]``.
    edges_neg : iterable of index pairs
        Edges treated as negatives (label 0), read the same way.
    score_matrix : 2-D indexable
        Object supporting ``score_matrix[i, j]``; the value is the predicted
        score for the pair ``(i, j)``.

    Returns
    -------
    tuple
        ``(roc_score, ap_score)`` as returned by ``roc_auc_score`` and
        ``average_precision_score``.
    """
    # Labels come from which list an edge is in, so no adjacency lookup is
    # needed. The previous version also read adj_sparse for every edge into
    # lists that were never used, which added a hidden global dependency and
    # a sparse lookup per edge.
    preds_pos = [score_matrix[edge[0], edge[1]] for edge in edges_pos]
    preds_neg = [score_matrix[edge[0], edge[1]] for edge in edges_neg]

    # Positives first, then negatives, with labels in the same order.
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score


In [18]:
# Adamic-Adar indexes computed on g_train, stored in a dense matrix with the
# same shape as adj.
aa_matrix = np.zeros(adj.shape)
# (u, v) = node indices, score = Adamic-Adar index
for u, v, score in nx.adamic_adar_index(g_train):
    aa_matrix[u, v] = aa_matrix[v, u] = score  # mirror so the matrix is symmetric

# Rescale so the largest score is 1
aa_matrix /= aa_matrix.max()


In [9]:
# Score the Adamic-Adar matrix on the held-out test edges
aa_roc, aa_ap = get_roc_score(test_edges, test_edges_false, aa_matrix)

print('Adamic-Adar Test ROC score: ', aa_roc)
print('Adamic-Adar Test AP score: ', aa_ap)


Adamic-Adar Test ROC score:  0.9307033622648676
Adamic-Adar Test AP score:  0.9291000236718376


In [14]:
# Jaccard coefficients computed on g_train, stored in a dense matrix with the
# same shape as adj.
jc_matrix = np.zeros(adj.shape)
# (u, v) = node indices, score = Jaccard coefficient
for u, v, score in nx.jaccard_coefficient(g_train):
    jc_matrix[u, v] = jc_matrix[v, u] = score  # mirror so the matrix is symmetric

# Rescale so the largest score is 1
jc_matrix /= jc_matrix.max()


In [15]:
# Score the Jaccard matrix on the held-out test edges
jc_roc, jc_ap = get_roc_score(test_edges, test_edges_false, jc_matrix)

print('Jaccard Coefficient Test ROC score: ', jc_roc)
print('Jaccard Coefficient Test AP score: ', jc_ap)


Jaccard Coefficient Test ROC score:  0.8878069764005919
Jaccard Coefficient Test AP score:  0.8842410074431496


In [16]:
# Preferential attachment scores computed on g_train, stored in a dense
# matrix with the same shape as adj.
pa_matrix = np.zeros(adj.shape)
# (u, v) = node indices, score = preferential attachment score
for u, v, score in nx.preferential_attachment(g_train):
    pa_matrix[u, v] = pa_matrix[v, u] = score  # mirror so the matrix is symmetric

# Rescale so the largest score is 1
pa_matrix /= pa_matrix.max()


In [17]:
# Score the preferential attachment matrix on the held-out test edges
pa_roc, pa_ap = get_roc_score(test_edges, test_edges_false, pa_matrix)

print('Preferential Attachment Test ROC score: ', pa_roc)
print('Preferential Attachment Test AP score: ', pa_ap)


Preferential Attachment Test ROC score:  0.9197610861872051
Preferential Attachment Test AP score:  0.9185476075334742


In [20]:
def get_roc_score_emb(edges_pos, edges_neg, embeddings):
    """Evaluate node embeddings as a link predictor on held-out edges.

    The score for a pair ``(i, j)`` is the sigmoid of the dot product of the
    two nodes' embedding rows.

    Parameters
    ----------
    edges_pos : iterable of index pairs
        Edges treated as positives (label 1). Each item is read as
        ``edge[0], edge[1]``.
    edges_neg : iterable of index pairs
        Edges treated as negatives (label 0), read the same way.
    embeddings : 2-D array
        One embedding row per node; ``embeddings @ embeddings.T`` gives the
        pairwise score matrix.

    Returns
    -------
    tuple
        ``(roc_score, ap_score)`` as returned by ``roc_auc_score`` and
        ``average_precision_score``.
    """
    score_matrix = np.dot(embeddings, embeddings.T)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Labels come from which list an edge is in, so no adjacency lookup is
    # needed. The previous version also read adj_sparse for every edge into
    # lists that were never used, which added a hidden global dependency and
    # a sparse lookup per edge.
    preds_pos = [sigmoid(score_matrix[edge[0], edge[1]]) for edge in edges_pos]
    preds_neg = [sigmoid(score_matrix[edge[0], edge[1]]) for edge in edges_neg]

    # Positives first, then negatives, with labels in the same order.
    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score

In [21]:
from sklearn.manifold import spectral_embedding

# Get spectral embeddings (16-dim)
# The embedding is computed from adj_train, so edges held out for validation
# and test are not part of its input; random_state=0 is fixed so the result
# does not vary between runs.
emb = spectral_embedding(adj_train, n_components=16, random_state=0)


In [22]:
# Score the spectral embeddings on the held-out test edges
sc_roc, sc_ap = get_roc_score_emb(test_edges, test_edges_false, emb)

print('Spectral Clustering Test ROC score: ', sc_roc)
print('Spectral Clustering Test AP score: ', sc_ap)


Spectral Clustering Test ROC score:  0.6271955446531382
Spectral Clustering Test AP score:  0.5419932256219776


In [None]:
import node2vec
from gensim.models import Word2Vec
