In [1]:
import random

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import pickle
import sklearn.metrics as metrics

### fuctions

In [2]:
def edge_features(node_emb_1, node_emb_2, operator):
    node_emb_1 = np.asfarray(node_emb_1,float)
    node_emb_2 = np.asfarray(node_emb_2, float)
    # combine two nodes' embeddings with specificed operator
    if operator == 'Average':
        edge = [((x + y) / 2.0) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Hadamard':
        edge = [(x * y) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Weighted-L1':
        edge = [abs(x - y) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Weighted-L2':
        edge = [abs(x - y)**2 for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Concat':
        edge = np.concatenate((node_emb_1, node_emb_2), axis=None) 
    else:
        print("Generate edge features: Operator not supported")
        print("Use default operator: Weighted-L1")
        edge = [abs(x - y) for x,y in zip(node_emb_1, node_emb_2)]
        
    return edge
def generate_edge_features(edge_list, node_embeddings, operator):
    edge_features_mtx = []
    
    # generate features for each edge in the list
    for node_index_1, node_index_2 in edge_list:
        node_emb_1 = node_embeddings[node_index_1-1]
        node_emb_2 = node_embeddings[node_index_2-1]
        
        edge_features_mtx.append(edge_features(node_emb_1, node_emb_2, operator))
        
    return edge_features_mtx

def generate_train_set(graph_train, num_edge_sample, node_embeddings, edge_operator,):
    edge_list = list(graph_train.edges)
    num_nodes = graph_train.number_of_nodes()
    
    train_edges = []
    train_edges_labels = [1] * num_edge_sample + [0] * num_edge_sample
    
    random.seed(0)
    
    # sample edges with label 1 (true edges)
    for edge_num in range(num_edge_sample):
        rand_index = random.randint(0, len(edge_list) - 1)
        
        #train_edges.append(tuple(edge_list[rand_index]))
        train_edges.append(edge_list[rand_index])
    non_edge_num = 0
    
    # sample edges with label 0 (non-exist edges)
    while(non_edge_num < num_edge_sample):
        rand_nodes = tuple(np.random.randint(low=0,high=num_nodes, size=2))
        
        if rand_nodes not in edge_list:
            train_edges.append(rand_nodes)
            non_edge_num += 1

    train_edges_features_mtx = generate_edge_features(train_edges, node_embeddings, edge_operator)
            
    return train_edges, train_edges_features_mtx, train_edges_labels

def generate_test_set(graph_test, node_embeddings, edge_operator):
    edge_list = graph_test.edges
    nodes_with_edge = set()
    
    for edge in edge_list:
        nodes_with_edge.add(edge[0])
        nodes_with_edge.add(edge[1])
    
    num_nodes = graph_test.number_of_nodes()
    
    test_edges = []
    test_edges_labels = []
    
    num_edge_sample = len(edge_list)
    non_edge_num = 0 
    # sample edges with label 0 (non-exist edges)
    
    while(non_edge_num < num_edge_sample):
        rand_nodes = tuple(np.random.randint(low=0,high=num_nodes, size=2))
        
        if rand_nodes not in edge_list:
            test_edges.append(rand_nodes)
            test_edges_labels.append(0)
            non_edge_num += 1
        
    for edge in edge_list:
        test_edges.append(edge)
        test_edges_labels.append(1)
    
    test_edges_features_mtx = generate_edge_features(test_edges, node_embeddings, edge_operator)
    
    return test_edges, test_edges_features_mtx, test_edges_labels

def build_clf(feature_mtx, response_vec):
   
    logistic_regression_model = LogisticRegression(random_state = 0,max_iter=5000,solver='liblinear',verbose=1,tol=1e-6)
    binary_clf = logistic_regression_model.fit(feature_mtx, response_vec)
    
    return binary_clf

def pred_links(feature_mtx, LR_clf):
    predict_edges_labels = LR_clf.predict(feature_mtx)
    
    return predict_edges_labels

def precision_recall(predict_labels, true_labels):
    true_positive  = false_positive = 0
    true_negative =  false_negative = 0
    
    for p_label, true_label in zip(predict_labels, true_labels):
        
        #print(p_label,true_label)
        if p_label == true_label and true_label == 1:
            true_positive += 1
        elif p_label == true_label and true_label == 0:
            true_negative += 1
        elif p_label != true_label and true_label == 1:
            false_negative += 1
        elif p_label != true_label and true_label == 0:
            false_positive += 1

    print("TP: ", true_positive)
    print("TN: ", true_negative)
    print("FP: ", false_positive)
    print("FN: ", false_negative)
    
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    f1 = 2 * (precision * recall) / (precision + recall)
    print("F1: {}".format(f1))
    
    cm = metrics.confusion_matrix(true_labels, predict_labels)
    print(cm)
    print(metrics.classification_report(true_labels, predict_labels))
    map = metrics.average_precision_score(true_labels, predict_labels)
    print('Mean Average Precision: {}'.format(map))
    fpr, tpr, thresholds = metrics.roc_curve(true_labels, predict_labels)
    roc_auc = metrics.auc(fpr, tpr)
    print('Area Under ROC Curve: {}'.format(roc_auc))
    
    return precision, recall

### load data

In [3]:
# load the training and testing graph
with open('./graphs/msg_equal_edge_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]
print(len(graph_train.nodes()))
print(graph_train.size())
# parameters
num_edge_sample = 400
num_snapshots = len(graphs)

1899
1106


In [4]:
emb_list = []
for i in range(num_snapshots):
    file = 'npy_node2vec/MSG/msg_equal/msg_equal_' + str(i) + '.npy'
    node_embedding = np.load(file)
    emb_list.append(node_embedding)
print(len(emb_list))

28


### independent

In [5]:
node_embeddings_training = emb_list[-3]
node_embeddings_testing = emb_list[-2]

#node_embeddings_training = np.load('./reddit/reddit_month_emb/reddit_month_37.npy')
#node_embeddings_testing =np.load('./reddit/reddit_month_emb/reddit_month_38.npy')

In [6]:
for edge_operator in [ 'Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  330
TN:  1061
FP:  29
FN:  760
F1: 0.4554865424430642
[[1061   29]
 [ 760  330]]
              precision    recall  f1-score   support

           0       0.58      0.97      0.73      1090
           1       0.92      0.30      0.46      1090

    accuracy                           0.64      2180
   macro avg       0.75      0.64      0.59      2180
weighted avg       0.75      0.64      0.59      2180

Mean Average Precision: 0.6269198333801845
Area Under ROC Curve: 0.6380733944954128
Precision:  0.9192200557103064
Recall:  0.30275229357798167
[LibLinear]Edge Operator: Hadamard
TP:  492
TN:  1053
FP:  37
FN:  598
F1: 0.6077825818406424
[[1053   37]
 [ 598  492]]
              precision    recall  f1-score   support

           0       0.64      0.97      0.77      1090
           1       0.93      0.45      0.61      1090

    accuracy                           0.71      2180
   macro avg       0.78      0.71      0.69      2180
weighted avg    

### sum

In [7]:
for i in range(len(emb_list)):
    emb_list[i] = np.asfarray(emb_list[i],float)
node_embeddings_training = np.sum(np.asarray(emb_list[0:-2]),axis=0)
node_embeddings_testing = np.sum(np.asarray(emb_list[0:-1]),axis=0)



In [8]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  874
TN:  855
FP:  235
FN:  216
F1: 0.794906775807185
[[855 235]
 [216 874]]
              precision    recall  f1-score   support

           0       0.80      0.78      0.79      1090
           1       0.79      0.80      0.79      1090

    accuracy                           0.79      2180
   macro avg       0.79      0.79      0.79      2180
weighted avg       0.79      0.79      0.79      2180

Mean Average Precision: 0.7310065270803517
Area Under ROC Curve: 0.7931192660550459
Precision:  0.78809738503156
Recall:  0.8018348623853211
[LibLinear]Edge Operator: Hadamard
TP:  552
TN:  869
FP:  221
FN:  538
F1: 0.5925925925925926
[[869 221]
 [538 552]]
              precision    recall  f1-score   support

           0       0.62      0.80      0.70      1090
           1       0.71      0.51      0.59      1090

    accuracy                           0.65      2180
   macro avg       0.67      0.65      0.64      2180
weighted avg       0.67     

### expdecay theta = 0.9, 0.5, 0.3

In [189]:
exps = [np.exp(-i * 0.3) for i in range(1,num_snapshots)]

exps

[0.7408182206817179,
 0.5488116360940264,
 0.40656965974059917,
 0.30119421191220214,
 0.22313016014842982,
 0.16529888822158656,
 0.1224564282529819,
 0.09071795328941251,
 0.06720551273974978,
 0.049787068367863944,
 0.036883167401240015,
 0.02732372244729257,
 0.02024191144580439,
 0.014995576820477703,
 0.011108996538242306,
 0.00822974704902003,
 0.006096746565515638,
 0.00451658094261267,
 0.003345965457471272,
 0.0024787521766663585,
 0.0018363047770289071,
 0.0013603680375478939,
 0.0010077854290485113,
 0.0007465858083766799,
 0.0005530843701478336,
 0.0004097349789797868,
 0.0003035391380788668,
 0.0002248673241788482,
 0.00016658581098763354,
 0.00012340980408667956,
 9.142423147817343e-05,
 6.77287364908539e-05,
 5.017468205617528e-05,
 3.7170318684126734e-05,
 2.7536449349747158e-05,
 2.039950341117196e-05,
 1.5112323819855033e-05,
 1.119548484259094e-05,
 8.29381916075737e-06,
 6.14421235332821e-06,
 4.55174446308324e-06,
 3.3720152341391845e-06,
 2.498050325866635e-06,
 

In [183]:
node_embeddings_training = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-2],exps[:-1]):
    node_embeddings_training += e * c


node_embeddings_testing = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-1],exps[:-1]):
    node_embeddings_testing += e * c 

In [184]:
for edge_operator in [ 'Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  1880
TN:  2007
FP:  678
FN:  805
F1: 0.7171466717528132
[[2007  678]
 [ 805 1880]]
              precision    recall  f1-score   support

           0       0.71      0.75      0.73      2685
           1       0.73      0.70      0.72      2685

    accuracy                           0.72      5370
   macro avg       0.72      0.72      0.72      5370
weighted avg       0.72      0.72      0.72      5370

Mean Average Precision: 0.6645081775071597
Area Under ROC Curve: 0.7238361266294228
Precision:  0.7349491790461298
Recall:  0.7001862197392924
[LibLinear]Edge Operator: Hadamard
TP:  1676
TN:  2278
FP:  407
FN:  1009
F1: 0.703020134228188
[[2278  407]
 [1009 1676]]
              precision    recall  f1-score   support

           0       0.69      0.85      0.76      2685
           1       0.80      0.62      0.70      2685

    accuracy                           0.74      5370
   macro avg       0.75      0.74      0.73      5370
weighted avg 

### weekly

In [26]:
# load the training and testing graph
with open('./graphs/msg_1_week_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]

# parameters
num_edge_sample = 400
num_snapshots = 51

In [27]:
emb_list = []
for i in range(num_snapshots):
    file = '/home/irenelin/Downloads/EECS576-Project-master/npy_node2vec/RM/rm-week/rm_week_' + str(i) + '.npy'
    node_embedding = np.load(file)
    emb_list.append(node_embedding)
print(len(emb_list))

51


### independent

In [28]:
node_embeddings_training = emb_list[-3]
node_embeddings_testing = emb_list[-2]

In [29]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  86
TN:  0
FP:  86
FN:  0
F1: 0.6666666666666666
[[ 0 86]
 [ 0 86]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.50      1.00      0.67        86

    accuracy                           0.50       172
   macro avg       0.25      0.50      0.33       172
weighted avg       0.25      0.50      0.33       172

Mean Average Precision: 0.5
Area Under ROC Curve: 0.5
Precision:  0.5
Recall:  1.0
[LibLinear]Edge Operator: Hadamard
TP:  86
TN:  0
FP:  86
FN:  0
F1: 0.6666666666666666
[[ 0 86]
 [ 0 86]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.50      1.00      0.67        86

    accuracy                           0.50       172
   macro avg       0.25      0.50      0.33       172
weighted avg       0.25      0.50      0.33       172

Mean Average Precision: 0.5
Area Under ROC Curve: 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[LibLinear]Edge Operator: Weighted-L1
TP:  84
TN:  3
FP:  83
FN:  2
F1: 0.6640316205533596
[[ 3 83]
 [ 2 84]]
              precision    recall  f1-score   support

           0       0.60      0.03      0.07        86
           1       0.50      0.98      0.66        86

    accuracy                           0.51       172
   macro avg       0.55      0.51      0.36       172
weighted avg       0.55      0.51      0.36       172

Mean Average Precision: 0.5029243837905584
Area Under ROC Curve: 0.5058139534883721
Precision:  0.5029940119760479
Recall:  0.9767441860465116
[LibLinear]Edge Operator: Weighted-L2
TP:  84
TN:  1
FP:  85
FN:  2
F1: 0.6588235294117647
[[ 1 85]
 [ 2 84]]
              precision    recall  f1-score   support

           0       0.33      0.01      0.02        86
           1       0.50      0.98      0.66        86

    accuracy                           0.49       172
   macro avg       0.42      0.49      0.34       172
weighted avg       0.42      0.49     

### sum

In [30]:
for i in range(len(emb_list)):
    emb_list[i] = np.asfarray(emb_list[i],float)
node_embeddings_training = np.sum(np.asarray(emb_list[0:-2]),axis=0)
node_embeddings_testing = np.sum(np.asarray(emb_list[0:-1]),axis=0)

In [31]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  34
TN:  75
FP:  11
FN:  52
F1: 0.5190839694656488
[[75 11]
 [52 34]]
              precision    recall  f1-score   support

           0       0.59      0.87      0.70        86
           1       0.76      0.40      0.52        86

    accuracy                           0.63       172
   macro avg       0.67      0.63      0.61       172
weighted avg       0.67      0.63      0.61       172

Mean Average Precision: 0.6010335917312661
Area Under ROC Curve: 0.6337209302325582
Precision:  0.7555555555555555
Recall:  0.3953488372093023
[LibLinear]Edge Operator: Hadamard
TP:  32
TN:  68
FP:  18
FN:  54
F1: 0.4705882352941177
[[68 18]
 [54 32]]
              precision    recall  f1-score   support

           0       0.56      0.79      0.65        86
           1       0.64      0.37      0.47        86

    accuracy                           0.58       172
   macro avg       0.60      0.58      0.56       172
weighted avg       0.60      0.58      0.

### expdecay

In [35]:
exps = [np.exp(-i * 0.9) for i in range(1,100)]
node_embeddings_training = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-2],exps[:-1]):
    node_embeddings_training += e * c 
node_embeddings_testing = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-1],exps[:-1]):
    node_embeddings_testing += e * c 

In [36]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  4
TN:  85
FP:  1
FN:  82
F1: 0.0879120879120879
[[85  1]
 [82  4]]
              precision    recall  f1-score   support

           0       0.51      0.99      0.67        86
           1       0.80      0.05      0.09        86

    accuracy                           0.52       172
   macro avg       0.65      0.52      0.38       172
weighted avg       0.65      0.52      0.38       172

Mean Average Precision: 0.5139534883720931
Area Under ROC Curve: 0.5174418604651163
Precision:  0.8
Recall:  0.046511627906976744
[LibLinear]Edge Operator: Hadamard
TP:  82
TN:  5
FP:  81
FN:  4
F1: 0.6586345381526104
[[ 5 81]
 [ 4 82]]
              precision    recall  f1-score   support

           0       0.56      0.06      0.11        86
           1       0.50      0.95      0.66        86

    accuracy                           0.51       172
   macro avg       0.53      0.51      0.38       172
weighted avg       0.53      0.51      0.38       172

Mea