In [47]:
import random

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import pickle
import sklearn.metrics as metrics

### fuctions

In [57]:
def edge_features(node_emb_1, node_emb_2, operator):
    node_emb_1 = np.asfarray(node_emb_1,float)
    node_emb_2 = np.asfarray(node_emb_2, float)
    # combine two nodes' embeddings with specificed operator
    if operator == 'Average':
        edge = [((x + y) / 2.0) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Hadamard':
        edge = [(x * y) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Weighted-L1':
        edge = [abs(x - y) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Weighted-L2':
        edge = [abs(x - y)**2 for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Concat':
        edge = np.concatenate((node_emb_1, node_emb_2), axis=None) 
    else:
        print("Generate edge features: Operator not supported")
        print("Use default operator: Weighted-L1")
        edge = [abs(x - y) for x,y in zip(node_emb_1, node_emb_2)]
        
    return edge
def generate_edge_features(edge_list, node_embeddings, operator):
    edge_features_mtx = []
    
    # generate features for each edge in the list
    for node_index_1, node_index_2 in edge_list:
        node_emb_1 = node_embeddings[node_index_1-1]
        node_emb_2 = node_embeddings[node_index_2-1]
        
        edge_features_mtx.append(edge_features(node_emb_1, node_emb_2, operator))
        
    return edge_features_mtx

def generate_train_set(graph_train, num_edge_sample, node_embeddings, edge_operator,):
    edge_list = list(graph_train.edges)
    num_nodes = graph_train.number_of_nodes()
    
    train_edges = []
    train_edges_labels = [1] * num_edge_sample + [0] * num_edge_sample
    
    random.seed(0)
    
    # sample edges with label 1 (true edges)
    for edge_num in range(num_edge_sample):
        rand_index = random.randint(0, len(edge_list) - 1)
        
        #train_edges.append(tuple(edge_list[rand_index]))
        train_edges.append(edge_list[rand_index])
    non_edge_num = 0
    
    # sample edges with label 0 (non-exist edges)
    while(non_edge_num < num_edge_sample):
        rand_nodes = tuple(np.random.randint(low=0,high=num_nodes, size=2))
        
        if rand_nodes not in edge_list:
            train_edges.append(rand_nodes)
            non_edge_num += 1

    train_edges_features_mtx = generate_edge_features(train_edges, node_embeddings, edge_operator)
            
    return train_edges, train_edges_features_mtx, train_edges_labels

def generate_test_set(graph_test, node_embeddings, edge_operator):
    edge_list = graph_test.edges
    nodes_with_edge = set()
    
    for edge in edge_list:
        nodes_with_edge.add(edge[0])
        nodes_with_edge.add(edge[1])
    
    num_nodes = graph_test.number_of_nodes()
    
    test_edges = []
    test_edges_labels = []
    
    num_edge_sample = len(edge_list)
    non_edge_num = 0 
    # sample edges with label 0 (non-exist edges)
    
    while(non_edge_num < num_edge_sample):
        rand_nodes = tuple(np.random.randint(low=0,high=num_nodes, size=2))
        
        if rand_nodes not in edge_list:
            test_edges.append(rand_nodes)
            test_edges_labels.append(0)
            non_edge_num += 1
        
    for edge in edge_list:
        test_edges.append(edge)
        test_edges_labels.append(1)
    
    test_edges_features_mtx = generate_edge_features(test_edges, node_embeddings, edge_operator)
    
    return test_edges, test_edges_features_mtx, test_edges_labels

def build_clf(feature_mtx, response_vec):
   
    logistic_regression_model = LogisticRegression(random_state = 0,max_iter=5000,solver='liblinear',verbose=1,tol=1e-6)
    binary_clf = logistic_regression_model.fit(feature_mtx, response_vec)
    
    return binary_clf

def pred_links(feature_mtx, LR_clf):
    predict_edges_labels = LR_clf.predict(feature_mtx)
    
    return predict_edges_labels

def precision_recall(predict_labels, true_labels):
    true_positive  = false_positive = 0
    true_negative =  false_negative = 0
    
    for p_label, true_label in zip(predict_labels, true_labels):
        
        #print(p_label,true_label)
        if p_label == true_label and true_label == 1:
            true_positive += 1
        elif p_label == true_label and true_label == 0:
            true_negative += 1
        elif p_label != true_label and true_label == 1:
            false_negative += 1
        elif p_label != true_label and true_label == 0:
            false_positive += 1

    print("TP: ", true_positive)
    print("TN: ", true_negative)
    print("FP: ", false_positive)
    print("FN: ", false_negative)
    
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    f1 = 2 * (precision * recall) / (precision + recall)
    print("F1: {}".format(f1))
    
    cm = metrics.confusion_matrix(true_labels, predict_labels)
    print(cm)
    print(metrics.classification_report(true_labels, predict_labels))
    map = metrics.average_precision_score(true_labels, predict_labels)
    print('Mean Average Precision: {}'.format(map))
    fpr, tpr, thresholds = metrics.roc_curve(true_labels, predict_labels)
    roc_auc = metrics.auc(fpr, tpr)
    print('Area Under ROC Curve: {}'.format(roc_auc))
    
    return precision, recall

### load data

In [40]:
# load the training and testing graph
with open('./graphs/msg_1_month_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]

# parameters
num_edge_sample = 400
edge_operator = 'Weighted-L2'
num_snapshots = 7

In [41]:
emb_list = []
for i in range(num_snapshots):
    file = './MSG/msg_month/msg_month_' + str(i) + '.npy'
    node_embedding = np.load(file)
    emb_list.append(node_embedding)
print(len(emb_list))

7


### independent

In [42]:
node_embeddings_training = emb_list[-3]
node_embeddings_testing = emb_list[-2]

In [59]:
for edge_operator in ['Hadamard','Weighted-L1','Weighted-L2', 'Concat','Average']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Hadamard
TP:  280
TN:  561
FP:  29
FN:  310
F1: 0.6229143492769745
[[561  29]
 [310 280]]
              precision    recall  f1-score   support

           0       0.64      0.95      0.77       590
           1       0.91      0.47      0.62       590

    accuracy                           0.71      1180
   macro avg       0.78      0.71      0.70      1180
weighted avg       0.78      0.71      0.70      1180

Mean Average Precision: 0.6927486149964346
Area Under ROC Curve: 0.7127118644067797
Precision:  0.9061488673139159
Recall:  0.4745762711864407
[LibLinear]Edge Operator: Weighted-L1
TP:  480
TN:  394
FP:  196
FN:  110
F1: 0.7582938388625592
[[394 196]
 [110 480]]
              precision    recall  f1-score   support

           0       0.78      0.67      0.72       590
           1       0.71      0.81      0.76       590

    accuracy                           0.74      1180
   macro avg       0.75      0.74      0.74      1180
weighted avg       0.7

ZeroDivisionError: division by zero

### sum

In [71]:
for i in range(len(emb_list)):
    emb_list[i] = np.asfarray(emb_list[i],float)
node_embeddings_training = np.sum(np.asarray(emb_list[0:-2]),axis=0)
node_embeddings_testing = np.sum(np.asarray(emb_list[0:-1]),axis=0)

In [74]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  330
TN:  528
FP:  62
FN:  260
F1: 0.6720977596741344
[[528  62]
 [260 330]]
              precision    recall  f1-score   support

           0       0.67      0.89      0.77       590
           1       0.84      0.56      0.67       590

    accuracy                           0.73      1180
   macro avg       0.76      0.73      0.72      1180
weighted avg       0.76      0.73      0.72      1180

Mean Average Precision: 0.6911968177101349
Area Under ROC Curve: 0.7271186440677966
Precision:  0.8418367346938775
Recall:  0.559322033898305
[LibLinear]Edge Operator: Hadamard
TP:  310
TN:  499
FP:  91
FN:  280
F1: 0.625630676084763
[[499  91]
 [280 310]]
              precision    recall  f1-score   support

           0       0.64      0.85      0.73       590
           1       0.77      0.53      0.63       590

    accuracy                           0.69      1180
   macro avg       0.71      0.69      0.68      1180
weighted avg       0.71      

### expdecay theta = 0.9, 0.5, 0.3

In [93]:
exps = [np.exp(-i * 0.3) for i in range(1,8)]
node_embeddings_training = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-2],exps[:-1]):
    node_embeddings_training += e * c 


In [94]:
node_embeddings_testing = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-1],exps[:-1]):
    node_embeddings_testing += e * c 

In [95]:
for edge_operator in [ 'Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  432
TN:  497
FP:  93
FN:  158
F1: 0.7748878923766817
[[497  93]
 [158 432]]
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       590
           1       0.82      0.73      0.77       590

    accuracy                           0.79      1180
   macro avg       0.79      0.79      0.79      1180
weighted avg       0.79      0.79      0.79      1180

Mean Average Precision: 0.7363970944309928
Area Under ROC Curve: 0.7872881355932203
Precision:  0.8228571428571428
Recall:  0.7322033898305085
[LibLinear]Edge Operator: Hadamard
TP:  278
TN:  507
FP:  83
FN:  312
F1: 0.5846477392218716
[[507  83]
 [312 278]]
              precision    recall  f1-score   support

           0       0.62      0.86      0.72       590
           1       0.77      0.47      0.58       590

    accuracy                           0.67      1180
   macro avg       0.69      0.67      0.65      1180
weighted avg       0.69    

### weekly

In [96]:
# load the training and testing graph
with open('./graphs/msg_1_week_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]

# parameters
num_edge_sample = 400
num_snapshots = 28

In [97]:
emb_list = []
for i in range(num_snapshots):
    file = './MSG/msg_week/msg_week_' + str(i) + '.npy'
    node_embedding = np.load(file)
    emb_list.append(node_embedding)
print(len(emb_list))

28


### independent

In [98]:
node_embeddings_training = emb_list[-3]
node_embeddings_testing = emb_list[-2]

In [100]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  20
TN:  84
FP:  2
FN:  66
F1: 0.3703703703703703
[[84  2]
 [66 20]]
              precision    recall  f1-score   support

           0       0.56      0.98      0.71        86
           1       0.91      0.23      0.37        86

    accuracy                           0.60       172
   macro avg       0.73      0.60      0.54       172
weighted avg       0.73      0.60      0.54       172

Mean Average Precision: 0.595137420718816
Area Under ROC Curve: 0.6046511627906976
Precision:  0.9090909090909091
Recall:  0.23255813953488372
[LibLinear]Edge Operator: Hadamard
TP:  22
TN:  86
FP:  0
FN:  64
F1: 0.40740740740740744
[[86  0]
 [64 22]]
              precision    recall  f1-score   support

           0       0.57      1.00      0.73        86
           1       1.00      0.26      0.41        86

    accuracy                           0.63       172
   macro avg       0.79      0.63      0.57       172
weighted avg       0.79      0.63      0.5

### sum

In [101]:
for i in range(len(emb_list)):
    emb_list[i] = np.asfarray(emb_list[i],float)
node_embeddings_training = np.sum(np.asarray(emb_list[0:-2]),axis=0)
node_embeddings_testing = np.sum(np.asarray(emb_list[0:-1]),axis=0)

In [102]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  40
TN:  78
FP:  8
FN:  46
F1: 0.5970149253731343
[[78  8]
 [46 40]]
              precision    recall  f1-score   support

           0       0.63      0.91      0.74        86
           1       0.83      0.47      0.60        86

    accuracy                           0.69       172
   macro avg       0.73      0.69      0.67       172
weighted avg       0.73      0.69      0.67       172

Mean Average Precision: 0.6550387596899225
Area Under ROC Curve: 0.686046511627907
Precision:  0.8333333333333334
Recall:  0.46511627906976744
[LibLinear]Edge Operator: Hadamard
TP:  22
TN:  83
FP:  3
FN:  64
F1: 0.39639639639639646
[[83  3]
 [64 22]]
              precision    recall  f1-score   support

           0       0.56      0.97      0.71        86
           1       0.88      0.26      0.40        86

    accuracy                           0.61       172
   macro avg       0.72      0.61      0.55       172
weighted avg       0.72      0.61      0.5

### expdecay

In [118]:
exps = [np.exp(-i * 0.3) for i in range(1,29)]
node_embeddings_training = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-2],exps[:-1]):
    node_embeddings_training += e * c 
node_embeddings_testing = np.zeros((emb_list[0]).shape) 
for c,e in zip(emb_list[0:-1],exps[:-1]):
    node_embeddings_testing += e * c 

In [119]:
for edge_operator in ['Average', 'Hadamard','Weighted-L1','Weighted-L2', 'Concat']:
    # generate the training set
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, node_embeddings_training, edge_operator)
    # generate the testing set
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, node_embeddings_testing, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Average
TP:  30
TN:  79
FP:  7
FN:  56
F1: 0.4878048780487805
[[79  7]
 [56 30]]
              precision    recall  f1-score   support

           0       0.59      0.92      0.71        86
           1       0.81      0.35      0.49        86

    accuracy                           0.63       172
   macro avg       0.70      0.63      0.60       172
weighted avg       0.70      0.63      0.60       172

Mean Average Precision: 0.6084223758642364
Area Under ROC Curve: 0.6337209302325582
Precision:  0.8108108108108109
Recall:  0.3488372093023256
[LibLinear]Edge Operator: Hadamard
TP:  22
TN:  71
FP:  15
FN:  64
F1: 0.3577235772357724
[[71 15]
 [64 22]]
              precision    recall  f1-score   support

           0       0.53      0.83      0.64        86
           1       0.59      0.26      0.36        86

    accuracy                           0.54       172
   macro avg       0.56      0.54      0.50       172
weighted avg       0.56      0.54      0.5