<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Email-Monthly-w/-Graphwave" data-toc-modified-id="Email-Monthly-w/-Graphwave-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Email Monthly w/ Graphwave</a></span><ul class="toc-item"><li><span><a href="#Using-only-T-1-embedding" data-toc-modified-id="Using-only-T-1-embedding-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Using only T-1 embedding</a></span></li><li><span><a href="#Pure-Sum" data-toc-modified-id="Pure-Sum-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Pure Sum</a></span></li><li><span><a href="#Expotential-Sum" data-toc-modified-id="Expotential-Sum-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Expotential Sum</a></span></li></ul></li><li><span><a href="#Email-Weekly" data-toc-modified-id="Email-Weekly-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Email Weekly</a></span><ul class="toc-item"><li><span><a href="#Using-only-T-1-Embedding" data-toc-modified-id="Using-only-T-1-Embedding-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Using only T-1 Embedding</a></span></li><li><span><a href="#Pure-Sum" data-toc-modified-id="Pure-Sum-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Pure Sum</a></span></li><li><span><a href="#Exponential-Sum" data-toc-modified-id="Exponential-Sum-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Exponential Sum</a></span></li></ul></li><li><span><a href="#Email-Equal-(weekly)" data-toc-modified-id="Email-Equal-(weekly)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Email Equal (weekly)</a></span><ul class="toc-item"><li><span><a href="#Using-only-T-1-Embedding" data-toc-modified-id="Using-only-T-1-Embedding-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Using only T-1 Embedding</a></span></li><li><span><a href="#Pure-Sum" data-toc-modified-id="Pure-Sum-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Pure Sum</a></span></li><li><span><a href="#Exponential-Sum" data-toc-modified-id="Exponential-Sum-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Exponential Sum</a></span></li></ul></li><li><span><a href="#Email-Equal(Monthly)" data-toc-modified-id="Email-Equal(Monthly)-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Email Equal(Monthly)</a></span><ul class="toc-item"><li><span><a href="#Using-only-T-1-Embedding" data-toc-modified-id="Using-only-T-1-Embedding-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Using only T-1 Embedding</a></span></li><li><span><a href="#Pure-Sum" data-toc-modified-id="Pure-Sum-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Pure Sum</a></span></li><li><span><a href="#Exponential-Sum" data-toc-modified-id="Exponential-Sum-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Exponential Sum</a></span></li></ul></li></ul></div>

In [3]:
%matplotlib inline
import networkx as nx 
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import graphwave
from graphwave.shapes import build_graph
from graphwave.graphwave import *
import pickle

import random
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
import seaborn as sns
sns.set_style('darkgrid')

%load_ext autoreload
%autoreload 2
np.random.seed(123)


In [4]:
def edge_features(node_emb_1, node_emb_2, operator):
    
    # combine two nodes' embeddings with specificed operator
    if operator == 'Average':
        edge = [((x + y) / 2.0) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Hadamard':
        edge = [(x * y) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Weighted-L1':
        edge = [abs(x - y) for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Weighted-L2':
        edge = [abs(x - y)**2 for x,y in zip(node_emb_1, node_emb_2)]
    elif operator == 'Concat':
        edge = np.concatenate((node_emb_1, node_emb_2), axis=None) 
    else:
        print("Generate edge features: Operator not supported")
        print("Use default operator: Weighted-L1")
        edge = [abs(x - y) for x,y in zip(node_emb_1, node_emb_2)]
        
    return edge
def generate_edge_features(edge_list, node_embeddings, operator):
    edge_features_mtx = []
    
    # generate features for each edge in the list
    for node_index_1, node_index_2 in edge_list:
        node_emb_1 = node_embeddings[node_index_1]
        node_emb_2 = node_embeddings[node_index_2]
        
        edge_features_mtx.append(edge_features(node_emb_1, node_emb_2, operator))
        
    return edge_features_mtx

def generate_train_set(graph_train, num_edge_sample, node_embeddings, edge_operator,):
    edge_list = list(graph_train.edges)
    num_nodes = graph_train.number_of_nodes()
    
    train_edges = []
    train_edges_labels = [1] * num_edge_sample + [0] * num_edge_sample
    
    random.seed(0)
    
    # sample edges with label 1 (true edges)
    for edge_num in range(num_edge_sample):
        rand_index = random.randint(0, len(edge_list) - 1)
        
        #train_edges.append(tuple(edge_list[rand_index]))
        train_edges.append(edge_list[rand_index])
    non_edge_num = 0
    
    # sample edges with label 0 (non-exist edges)
    while(non_edge_num < num_edge_sample):
        rand_nodes = tuple(np.random.randint(low=0,high=num_nodes, size=2))
        
        if rand_nodes not in edge_list:
            train_edges.append(rand_nodes)
            non_edge_num += 1

    train_edges_features_mtx = generate_edge_features(train_edges, node_embeddings, edge_operator)
            
    return train_edges, train_edges_features_mtx, train_edges_labels

def generate_test_set(graph_test, node_embeddings, edge_operator):
    edge_list = graph_test.edges
    nodes_with_edge = set()
    
    for edge in edge_list:
        nodes_with_edge.add(edge[0])
        nodes_with_edge.add(edge[1])
    
    num_nodes = graph_test.number_of_nodes()
    
    test_edges = []
    test_edges_labels = []
    
    num_edge_sample = len(edge_list)
    non_edge_num = 0 
    # sample edges with label 0 (non-exist edges)
    
    while(non_edge_num < num_edge_sample):
        rand_nodes = tuple(np.random.randint(low=0,high=num_nodes, size=2))
        
        if rand_nodes not in edge_list:
            test_edges.append(rand_nodes)
            test_edges_labels.append(0)
            non_edge_num += 1
        
    for edge in edge_list:
        test_edges.append(edge)
        test_edges_labels.append(1)
    '''
    # generate all possible edges for each node with at least one edge (assume undirected edges)
    for node_1 in nodes_with_edge:
        for node_2 in range(num_nodes):
            test_edges.append((node_1, node_2))
            
            if (node_1, node_2) in edge_list:
                test_edges_labels.append(1)
            else:
                test_edges_labels.append(0)
    '''
    test_edges_features_mtx = generate_edge_features(test_edges, node_embeddings, edge_operator)
    
    return test_edges, test_edges_features_mtx, test_edges_labels

def build_clf(feature_mtx, response_vec):
   
    logistic_regression_model = LogisticRegression(random_state = 0,max_iter=5000,solver='liblinear',verbose=1,tol=1e-6)
    binary_clf = logistic_regression_model.fit(feature_mtx, response_vec)
    
    return binary_clf

def pred_links(feature_mtx, LR_clf):
    predict_edges_labels = LR_clf.predict(feature_mtx)
    
    return predict_edges_labels

def precision_recall(predict_labels, true_labels):
    true_positive  = false_positive = 0
    true_negative =  false_negative = 0
    
    for p_label, true_label in zip(predict_labels, true_labels):
        
        #print(p_label,true_label)
        if p_label == true_label and true_label == 1:
            true_positive += 1
        elif p_label == true_label and true_label == 0:
            true_negative += 1
        elif p_label != true_label and true_label == 1:
            false_negative += 1
        elif p_label != true_label and true_label == 0:
            false_positive += 1

    print("TP: ", true_positive)
    print("TN: ", true_negative)
    print("FP: ", false_positive)
    print("FN: ", false_negative)
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    try:
        f1 = 2 * (precision * recall) / (precision + recall)
        print("F1: {}".format(f1))
    except:
        print("F1: Error")
    
    cm = metrics.confusion_matrix(true_labels, predict_labels)
    print(cm)
    print(metrics.classification_report(true_labels, predict_labels))
    map = metrics.average_precision_score(true_labels, predict_labels)
    print('Mean Average Precision: {}'.format(map))
    fpr, tpr, thresholds = metrics.roc_curve(true_labels, predict_labels)
    roc_auc = metrics.auc(fpr, tpr)
    print('Area Under ROC Curve: {}'.format(roc_auc))
    
    
    return precision, recall

## Email Monthly w/ Graphwave

In [5]:
#load the graphs 
with open('/z/pujat/576/data/email_eu/email_1_month_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]

In [6]:
chi_list = []
heat_print_list = []
taus_list = []
for e, g in enumerate(graphs[:-1]): #last embedding used for link prediction
    chi, heat_print, taus = graphwave_alg(g, np.linspace(0,200,32), taus='auto', verbose=True)
    chi_list.append(chi)
    heat_print_list.append(heat_print)
    taus_list.append(taus)
    print("Completed: {}/{}".format(e,len(graphs[:-1])))

Completed: 0/16
Completed: 1/16
Completed: 2/16
Completed: 3/16
Completed: 4/16
Completed: 5/16
Completed: 6/16
Completed: 7/16
Completed: 8/16
Completed: 9/16
Completed: 10/16
Completed: 11/16
Completed: 12/16
Completed: 13/16
Completed: 14/16
Completed: 15/16


### Using only T-1 embedding

In [7]:
num_edge_sample = 400
edge_operator = 'Average' #'Average', 'Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, chi_list[-2], edge_operator)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, chi_list[-1], edge_operator)

In [8]:
#for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
for edge_operator in ['Concat']:
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, chi_list[-2], edge_operator)
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, chi_list[-1], edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Concat
TP:  4886
TN:  5959
FP:  1067
FN:  2140
F1: 0.7529085445720011
[[5959 1067]
 [2140 4886]]
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      7026
           1       0.82      0.70      0.75      7026

    accuracy                           0.77     14052
   macro avg       0.78      0.77      0.77     14052
weighted avg       0.78      0.77      0.77     14052

Mean Average Precision: 0.7230638005107759
Area Under ROC Curve: 0.7717762596071734
Precision:  0.8207626406853687
Recall:  0.695417022487902


### Pure Sum

In [10]:
prev_embedding = np.sum(np.asarray(chi_list[0:-1]),axis=0)
cur_embedding = np.sum(np.asarray(chi_list),axis=0)

In [11]:
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)
#for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
for edge_operator in ['Concat']: 
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Concat
TP:  5250
TN:  5471
FP:  1555
FN:  1776
F1: 0.7591641963704722
[[5471 1555]
 [1776 5250]]
              precision    recall  f1-score   support

           0       0.75      0.78      0.77      7026
           1       0.77      0.75      0.76      7026

    accuracy                           0.76     14052
   macro avg       0.76      0.76      0.76     14052
weighted avg       0.76      0.76      0.76     14052

Mean Average Precision: 0.7028651635690089
Area Under ROC Curve: 0.7629518929689724
Precision:  0.7714915503306392
Recall:  0.7472245943637916


### Expotential Sum 

In [12]:
embeddings = chi_list
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
for decay in [1,0.9,0.5,0.3]:
    print("------------ BEGIN: {} ---------------".format(decay))
    exps = [math.pow(math.e , (-i * decay)) for i in range(1,len(embeddings[:-2]))]
    exps.reverse()
    temp_embedding = np.zeros((embeddings[0]).shape) 
    for c,e in zip(embeddings[0:-2],exps):
         temp_embedding += e * c 
    prev_embedding = temp_embedding + embeddings[-2]
    
    # this is done so the last embedding has weight one. 
    cur_embedding = temp_embedding + exps[-1] * embeddings[-2] + embeddings[-1]
    
    for edge_operator in ['Concat']:
        #for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
        try:
            train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
            test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

            LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

            print("Edge Operator: {}".format(edge_operator))
            predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
            precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
            print('Precision: ', precision)
            print('Recall: ', recall)
        except:
            print("Edge Operator: {} ERROR".format(edge_operator))
    print("------------ END: {} ---------------".format(decay))


------------ BEGIN: 1 ---------------
[LibLinear]Edge Operator: Concat
TP:  5219
TN:  5726
FP:  1300
FN:  1807
F1: 0.7706164636397195
[[5726 1300]
 [1807 5219]]
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      7026
           1       0.80      0.74      0.77      7026

    accuracy                           0.78     14052
   macro avg       0.78      0.78      0.78     14052
weighted avg       0.78      0.78      0.78     14052

Mean Average Precision: 0.7232767172023428
Area Under ROC Curve: 0.7788926843153999
Precision:  0.8005829114894922
Recall:  0.7428124110446912
------------ END: 1 ---------------
------------ BEGIN: 0.9 ---------------
[LibLinear]Edge Operator: Concat
TP:  5136
TN:  5780
FP:  1246
FN:  1890
F1: 0.7661097852028639
[[5780 1246]
 [1890 5136]]
              precision    recall  f1-score   support

           0       0.75      0.82      0.79      7026
           1       0.80      0.73      0.77      7026

    ac

## Email Weekly

In [35]:
#load the graphs 
with open('/z/pujat/576/data/email_eu/email_1_week_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]

In [36]:
chi_list = []
heat_print_list = []
taus_list = []
for e, g in enumerate(graphs[:-1]): #last embedding used for link prediction
    chi, heat_print, taus = graphwave_alg(g, np.linspace(0,200,32), taus='auto', verbose=True)
    chi_list.append(chi)
    heat_print_list.append(heat_print)
    taus_list.append(taus)
    print("Completed: {}/{}".format(e,len(graphs[:-1])))

Completed: 0/73
Completed: 1/73
Completed: 2/73
Completed: 3/73
Completed: 4/73
Completed: 5/73
Completed: 6/73
Completed: 7/73
Completed: 8/73
Completed: 9/73
Completed: 10/73
Completed: 11/73
Completed: 12/73
Completed: 13/73
Completed: 14/73
Completed: 15/73
Completed: 16/73
Completed: 17/73
Completed: 18/73
Completed: 19/73
Completed: 20/73
Completed: 21/73
Completed: 22/73
Completed: 23/73
Completed: 24/73
Completed: 25/73
Completed: 26/73
Completed: 27/73
Completed: 28/73
Completed: 29/73
Completed: 30/73
Completed: 31/73
Completed: 32/73
Completed: 33/73
Completed: 34/73
Completed: 35/73
Completed: 36/73
Completed: 37/73
Completed: 38/73
Completed: 39/73
Completed: 40/73
Completed: 41/73
Completed: 42/73
Completed: 43/73
Completed: 44/73
Completed: 45/73
Completed: 46/73
Completed: 47/73
Completed: 48/73
Completed: 49/73
Completed: 50/73
Completed: 51/73
Completed: 52/73
Completed: 53/73
Completed: 54/73
Completed: 55/73
Completed: 56/73
Completed: 57/73
Completed: 58/73
Complet

### Using only T-1 Embedding

In [37]:
#for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
for edge_operator in ['Concat']:
    try:
        train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, chi_list[-2], edge_operator)
        test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, chi_list[-1], edge_operator)

        LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

        print("Edge Operator: {}".format(edge_operator))
        predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
        precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
        print('Precision: ', precision)
        print('Recall: ', recall)
    except:
        print("Edge Operator: {} ERROR".format(edge_operator))

[LibLinear]Edge Operator: Concat
TP:  2078
TN:  2095
FP:  590
FN:  607
F1: 0.7763870726695311
[[2095  590]
 [ 607 2078]]
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      2685
           1       0.78      0.77      0.78      2685

    accuracy                           0.78      5370
   macro avg       0.78      0.78      0.78      5370
weighted avg       0.78      0.78      0.78      5370

Mean Average Precision: 0.7158183478093355
Area Under ROC Curve: 0.777094972067039
Precision:  0.7788605697151424
Recall:  0.7739292364990689


### Pure Sum

In [38]:
prev_embedding = np.sum(np.asarray(chi_list[0:-1]),axis=0)
cur_embedding = np.sum(np.asarray(chi_list),axis=0)

In [39]:
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)
#for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
for edge_operator in ['Concat']:
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Concat
TP:  2056
TN:  2156
FP:  529
FN:  629
F1: 0.7802656546489564
[[2156  529]
 [ 629 2056]]
              precision    recall  f1-score   support

           0       0.77      0.80      0.79      2685
           1       0.80      0.77      0.78      2685

    accuracy                           0.78      5370
   macro avg       0.78      0.78      0.78      5370
weighted avg       0.78      0.78      0.78      5370

Mean Average Precision: 0.7261659985088014
Area Under ROC Curve: 0.7843575418994414
Precision:  0.795357833655706
Recall:  0.7657355679702048


### Exponential Sum

In [40]:
embeddings = chi_list
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
for decay in [1,0.9,0.5,0.3]:
    print("------------ BEGIN: {} ---------------".format(decay))
    exps = [math.pow(math.e , (-i * decay)) for i in range(1,len(embeddings[:-2]))]
    exps.reverse()
    temp_embedding = np.zeros((embeddings[0]).shape) 
    for c,e in zip(embeddings[0:-2],exps):
         temp_embedding += e * c 
    prev_embedding = temp_embedding + embeddings[-2]
    
    # this is done so the last embedding has weight one. 
    cur_embedding = temp_embedding + exps[-1] * embeddings[-2] + embeddings[-1]
    
    #for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
    for edge_operator in ['Concat']:
        try:
            train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
            test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

            LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

            print("Edge Operator: {}".format(edge_operator))
            predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
            precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
            print('Precision: ', precision)
            print('Recall: ', recall)
        except:
            print("Edge Operator: {} ERROR".format(edge_operator))
    print("------------ END: {} ---------------".format(decay))


------------ BEGIN: 1 ---------------
[LibLinear]Edge Operator: Concat
TP:  2025
TN:  2153
FP:  532
FN:  660
F1: 0.7726058756199924
[[2153  532]
 [ 660 2025]]
              precision    recall  f1-score   support

           0       0.77      0.80      0.78      2685
           1       0.79      0.75      0.77      2685

    accuracy                           0.78      5370
   macro avg       0.78      0.78      0.78      5370
weighted avg       0.78      0.78      0.78      5370

Mean Average Precision: 0.7201809907297964
Area Under ROC Curve: 0.7780260707635009
Precision:  0.791943684004693
Recall:  0.7541899441340782
------------ END: 1 ---------------
------------ BEGIN: 0.9 ---------------
[LibLinear]Edge Operator: Concat
TP:  2084
TN:  2123
FP:  562
FN:  601
F1: 0.781842055899456
[[2123  562]
 [ 601 2084]]
              precision    recall  f1-score   support

           0       0.78      0.79      0.78      2685
           1       0.79      0.78      0.78      2685

    accuracy

## Email Equal (weekly)

In [22]:
#load the graphs 
with open('/z/pujat/576/data/email_eu/email_equal_weekly_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]

In [23]:
chi_list = []
heat_print_list = []
taus_list = []
for e, g in enumerate(graphs[:-1]): #last embedding used for link prediction
    chi, heat_print, taus = graphwave_alg(g, np.linspace(0,200,50), taus='auto', verbose=True)
    chi_list.append(chi)
    heat_print_list.append(heat_print)
    taus_list.append(taus)
    print("Completed: {}/{}".format(e,len(graphs[:-1])))

Completed: 0/73
Completed: 1/73
Completed: 2/73
Completed: 3/73
Completed: 4/73
Completed: 5/73
Completed: 6/73
Completed: 7/73
Completed: 8/73
Completed: 9/73
Completed: 10/73
Completed: 11/73
Completed: 12/73
Completed: 13/73
Completed: 14/73
Completed: 15/73
Completed: 16/73
Completed: 17/73
Completed: 18/73
Completed: 19/73
Completed: 20/73
Completed: 21/73
Completed: 22/73
Completed: 23/73
Completed: 24/73
Completed: 25/73
Completed: 26/73
Completed: 27/73
Completed: 28/73
Completed: 29/73
Completed: 30/73
Completed: 31/73
Completed: 32/73
Completed: 33/73
Completed: 34/73
Completed: 35/73
Completed: 36/73
Completed: 37/73
Completed: 38/73
Completed: 39/73
Completed: 40/73
Completed: 41/73
Completed: 42/73
Completed: 43/73
Completed: 44/73
Completed: 45/73
Completed: 46/73
Completed: 47/73
Completed: 48/73
Completed: 49/73
Completed: 50/73
Completed: 51/73
Completed: 52/73
Completed: 53/73
Completed: 54/73
Completed: 55/73
Completed: 56/73
Completed: 57/73
Completed: 58/73
Complet

### Using only T-1 Embedding

In [24]:
num_edge_sample = 400
edge_operator = 'Average' #'Average', 'Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, chi_list[-2], edge_operator)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, chi_list[-1], edge_operator)

In [25]:
for edge_operator in ['Concat']:
    #for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, chi_list[-2], edge_operator)
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, chi_list[-1], edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Concat
TP:  1678
TN:  1563
FP:  582
FN:  467
F1: 0.761861520998865
[[1563  582]
 [ 467 1678]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75      2145
           1       0.74      0.78      0.76      2145

    accuracy                           0.76      4290
   macro avg       0.76      0.76      0.76      4290
weighted avg       0.76      0.76      0.76      4290

Mean Average Precision: 0.6896866555273635
Area Under ROC Curve: 0.7554778554778554
Precision:  0.7424778761061946
Recall:  0.7822843822843822


### Pure Sum

In [28]:
prev_embedding = np.sum(np.asarray(chi_list[0:-1]),axis=0)
cur_embedding = np.sum(np.asarray(chi_list),axis=0)

In [26]:
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)
#for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
for edge_operator in ['Concat']:
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Concat
TP:  1686
TN:  1664
FP:  481
FN:  459
F1: 0.7820037105751392
[[1664  481]
 [ 459 1686]]
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      2145
           1       0.78      0.79      0.78      2145

    accuracy                           0.78      4290
   macro avg       0.78      0.78      0.78      4290
weighted avg       0.78      0.78      0.78      4290

Mean Average Precision: 0.7185387293832148
Area Under ROC Curve: 0.780885780885781
Precision:  0.7780341485925242
Recall:  0.786013986013986


### Exponential Sum

In [27]:
embeddings = chi_list
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
for decay in [1,0.9,0.5,0.3]:
    print("------------ BEGIN: {} ---------------".format(decay))
    exps = [math.pow(math.e , (-i * decay)) for i in range(1,len(embeddings[:-2]))]
    exps.reverse()
    temp_embedding = np.zeros((embeddings[0]).shape) 
    for c,e in zip(embeddings[0:-2],exps):
         temp_embedding += e * c 
    prev_embedding = temp_embedding + embeddings[-2]
    
    # this is done so the last embedding has weight one. 
    cur_embedding = temp_embedding + exps[-1] * embeddings[-2] + embeddings[-1]
    
    #for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
    for edge_operator in ['Concat']:
        try:
            train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
            test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

            LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

            print("Edge Operator: {}".format(edge_operator))
            predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
            precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
            print('Precision: ', precision)
            print('Recall: ', recall)
        except:
            print("Edge Operator: {} ERROR".format(edge_operator))
    print("------------ END: {} ---------------".format(decay))


------------ BEGIN: 1 ---------------
[LibLinear]Edge Operator: Concat
TP:  1647
TN:  1683
FP:  462
FN:  498
F1: 0.7743300423131171
[[1683  462]
 [ 498 1647]]
              precision    recall  f1-score   support

           0       0.77      0.78      0.78      2145
           1       0.78      0.77      0.77      2145

    accuracy                           0.78      4290
   macro avg       0.78      0.78      0.78      4290
weighted avg       0.78      0.78      0.78      4290

Mean Average Precision: 0.7157138736086104
Area Under ROC Curve: 0.7762237762237763
Precision:  0.7809388335704125
Recall:  0.7678321678321678
------------ END: 1 ---------------
------------ BEGIN: 0.9 ---------------
[LibLinear]Edge Operator: Concat
TP:  1680
TN:  1652
FP:  493
FN:  465
F1: 0.778138026864289
[[1652  493]
 [ 465 1680]]
              precision    recall  f1-score   support

           0       0.78      0.77      0.78      2145
           1       0.77      0.78      0.78      2145

    accurac

## Email Equal(Monthly)

In [28]:
#load the graphs 
with open('/z/pujat/576/data/email_eu/email_equal_monthly_dir.pkl', 'rb') as file:
    graphs = pickle.load(file)
graph_train = graphs[-2]
graph_test = graphs[-1]

In [29]:
chi_list = []
heat_print_list = []
taus_list = []
for e, g in enumerate(graphs[:-1]): #last embedding used for link prediction
    chi, heat_print, taus = graphwave_alg(g, np.linspace(0,200,50), taus='auto', verbose=True)
    chi_list.append(chi)
    heat_print_list.append(heat_print)
    taus_list.append(taus)
    print("Completed: {}/{}".format(e,len(graphs[:-1])))

Completed: 0/16
Completed: 1/16
Completed: 2/16
Completed: 3/16
Completed: 4/16
Completed: 5/16
Completed: 6/16
Completed: 7/16
Completed: 8/16
Completed: 9/16
Completed: 10/16
Completed: 11/16
Completed: 12/16
Completed: 13/16
Completed: 14/16
Completed: 15/16


### Using only T-1 Embedding

In [30]:
num_edge_sample = 400
edge_operator = 'Average' #'Average', 'Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, chi_list[-2], edge_operator)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, chi_list[-1], edge_operator)

In [31]:
#for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
for edge_operator in ['Concat']:
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, chi_list[-2], edge_operator)
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, chi_list[-1], edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Concat
TP:  4421
TN:  3790
FP:  1539
FN:  908
F1: 0.7832403224377713
[[3790 1539]
 [ 908 4421]]
              precision    recall  f1-score   support

           0       0.81      0.71      0.76      5329
           1       0.74      0.83      0.78      5329

    accuracy                           0.77     10658
   macro avg       0.77      0.77      0.77     10658
weighted avg       0.77      0.77      0.77     10658

Mean Average Precision: 0.7005822578999801
Area Under ROC Curve: 0.7704072058547571
Precision:  0.7417785234899329
Recall:  0.829611559392006


### Pure Sum

In [32]:
prev_embedding = np.sum(np.asarray(chi_list[0:-1]),axis=0)
cur_embedding = np.sum(np.asarray(chi_list),axis=0)

In [33]:
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)
#for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
for edge_operator in ['Concat']:
    train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
    test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

    LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

    print("Edge Operator: {}".format(edge_operator))
    predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
    precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
    print('Precision: ', precision)
    print('Recall: ', recall)

[LibLinear]Edge Operator: Concat
TP:  4029
TN:  4142
FP:  1187
FN:  1300
F1: 0.7641536273115221
[[4142 1187]
 [1300 4029]]
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      5329
           1       0.77      0.76      0.76      5329

    accuracy                           0.77     10658
   macro avg       0.77      0.77      0.77     10658
weighted avg       0.77      0.77      0.77     10658

Mean Average Precision: 0.7059719318533733
Area Under ROC Curve: 0.766654156502158
Precision:  0.772430981595092
Recall:  0.7560517920810659


### Exponential Sum

In [34]:
embeddings = chi_list
num_edge_sample = 400
#Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
for decay in [1,0.9,0.5,0.3]:
    print("------------ BEGIN: {} ---------------".format(decay))
    exps = [math.pow(math.e , (-i * decay)) for i in range(1,len(embeddings[:-2]))]
    exps.reverse()
    temp_embedding = np.zeros((embeddings[0]).shape) 
    for c,e in zip(embeddings[0:-2],exps):
         temp_embedding += e * c 
    prev_embedding = temp_embedding + embeddings[-2]
    
    # this is done so the last embedding has weight one. 
    cur_embedding = temp_embedding + exps[-1] * embeddings[-2] + embeddings[-1]
    
    #for edge_operator in ['Average','Hadamard','Weighted-L1','Weighted-L2']:
    for edge_operator in ['Concat']:
        try:
            train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(graph_train, num_edge_sample, prev_embedding, edge_operator)
            test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(graph_test, cur_embedding, edge_operator)

            LR_clf = build_clf(train_edges_features_mtx, train_edges_labels)

            print("Edge Operator: {}".format(edge_operator))
            predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)
            precision, recall = precision_recall(list(predict_edges_labels), list(test_edges_labels))
            print('Precision: ', precision)
            print('Recall: ', recall)
        except:
            print("Edge Operator: {} ERROR".format(edge_operator))
    print("------------ END: {} ---------------".format(decay))


------------ BEGIN: 1 ---------------
[LibLinear]Edge Operator: Concat
TP:  4286
TN:  3932
FP:  1397
FN:  1043
F1: 0.7784235379585907
[[3932 1397]
 [1043 4286]]
              precision    recall  f1-score   support

           0       0.79      0.74      0.76      5329
           1       0.75      0.80      0.78      5329

    accuracy                           0.77     10658
   macro avg       0.77      0.77      0.77     10658
weighted avg       0.77      0.77      0.77     10658

Mean Average Precision: 0.7044308039698056
Area Under ROC Curve: 0.7710639894914618
Precision:  0.7541791307408059
Recall:  0.8042784762619628
------------ END: 1 ---------------
------------ BEGIN: 0.9 ---------------
[LibLinear]Edge Operator: Concat
TP:  4226
TN:  4016
FP:  1313
FN:  1103
F1: 0.777695988222304
[[4016 1313]
 [1103 4226]]
              precision    recall  f1-score   support

           0       0.78      0.75      0.77      5329
           1       0.76      0.79      0.78      5329

    acc