In [None]:
# imports
import random

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Helper functions
edge_features(): generate edge features based on the two nodes' embeddings and the combine operator

generate_edge_features(): generate edge features for a list of edges

generate_train_set(): generate train set: {edges | edges' features matrix | edges labels}

generate_test_set(): generate test set: {edges | edges' features matrix | edges labels}

build_clf(): build a Logistic Regression classifier

pred_links(): predict links using LR classifier

precision_recall(): compute precision/recall of predictions

In [None]:
def edge_features(node_emb_1, node_emb_2, operator):
    """Combine two node embeddings element-wise into a single edge feature vector.

    Args:
        node_emb_1: iterable of numbers, embedding of the first node.
        node_emb_2: iterable of numbers, embedding of the second node.
        operator: one of 'Average', 'Hadamard', 'Weighted-L1', 'Weighted-L2'.
            Any other value prints a warning and falls back to 'Weighted-L1'.

    Returns:
        A list with one combined value per zipped pair of components
        (so its length is that of the shorter embedding).
    """
    component_pairs = list(zip(node_emb_1, node_emb_2))

    if operator == 'Average':
        return [(a + b) / 2.0 for a, b in component_pairs]

    if operator == 'Hadamard':
        return [a * b for a, b in component_pairs]

    if operator == 'Weighted-L1':
        return [abs(a - b) for a, b in component_pairs]

    if operator == 'Weighted-L2':
        return [abs(a - b)**2 for a, b in component_pairs]

    # unknown operator: warn and use the Weighted-L1 combination instead
    print("Generate edge features: Operator not supported")
    print("Use default operator: Weighted-L1")
    return [abs(a - b) for a, b in component_pairs]

In [None]:
def generate_edge_features(edge_list, node_embeddings, operator):
    """Build the feature matrix for a list of edges.

    Args:
        edge_list: iterable of (node_index_1, node_index_2) pairs.
        node_embeddings: container indexable by node index, giving each
            node's embedding vector.
        operator: combination operator name forwarded to edge_features().

    Returns:
        A list with one feature vector per edge, in the order of edge_list.
    """
    return [
        edge_features(node_embeddings[src], node_embeddings[dst], operator)
        for src, dst in edge_list
    ]

In [None]:
def generate_train_set(graph_train, num_edge_sample, node_embeddings, edge_operator):
    """Build a balanced training set of existing and non-existing edges.

    Samples ``num_edge_sample`` edges of ``graph_train`` (with replacement,
    label 1) followed by ``num_edge_sample`` node pairs that are not edges of
    ``graph_train`` (label 0), and computes a feature vector for each.

    Args:
        graph_train: graph object exposing ``.edges`` and
            ``.number_of_nodes()`` (presumably a networkx graph).
        num_edge_sample: number of samples drawn per class; the training set
            holds ``2 * num_edge_sample`` rows.
        node_embeddings: container indexable by node index.
        edge_operator: operator name forwarded to generate_edge_features().

    Returns:
        (train_edges, train_edges_features_mtx, train_edges_labels), all
        aligned row by row: positives first, then negatives.

    Raises:
        ValueError: if positive samples are requested but the graph has no
            edges, or fewer than two nodes are available for negatives.
    """
    edge_view = graph_train.edges
    # Materialise the edges so they can be indexed by position; an edge view
    # supports iteration and membership tests but not necessarily indexing.
    edge_list = list(edge_view)
    num_nodes = graph_train.number_of_nodes()

    if num_edge_sample > 0 and not edge_list:
        raise ValueError("Cannot sample true edges: graph_train has no edges")

    # Negative samples must be labelled 0; labelling both halves 1 would
    # leave the classifier with a single class to learn from.
    train_edges_labels = [1] * num_edge_sample + [0] * num_edge_sample

    # fixed seed so the sampled training set is reproducible
    random.seed(0)

    # sample edges with label 1 (true edges)
    train_edges = []
    for _ in range(num_edge_sample):
        rand_index = random.randint(0, len(edge_list) - 1)
        train_edges.append(tuple(edge_list[rand_index]))

    # sample edges with label 0 (non-existing edges)
    # NOTE(review): assumes nodes are labelled 0 .. num_nodes - 1 so they can
    # index node_embeddings directly -- confirm against the graph loader.
    # NOTE(review): this loop does not terminate if every node pair is an edge.
    non_edge_num = 0
    while non_edge_num < num_edge_sample:
        rand_nodes = tuple(random.sample(range(num_nodes), 2))

        if rand_nodes not in edge_view:
            train_edges.append(rand_nodes)
            non_edge_num += 1

    train_edges_features_mtx = generate_edge_features(train_edges, node_embeddings, edge_operator)

    return train_edges, train_edges_features_mtx, train_edges_labels

In [None]:
def generate_test_set(graph_test, node_embeddings, edge_operator):
    """Build the test set of candidate edges from ``graph_test``.

    For every node that has at least one edge in ``graph_test``, pairs it with
    every node index in ``range(number_of_nodes)`` and labels the pair 1 if it
    is an edge of ``graph_test`` and 0 otherwise.

    Args:
        graph_test: graph object exposing ``.edges`` and
            ``.number_of_nodes()`` (presumably a networkx graph).
        node_embeddings: container indexable by node index.
        edge_operator: operator name forwarded to generate_edge_features().

    Returns:
        (test_edges, test_edges_features_mtx, test_edges_labels), aligned
        row by row.
    """
    # Use the graph passed in, not a notebook-level global.
    edge_list = graph_test.edges

    nodes_with_edge = set()
    for edge in edge_list:
        nodes_with_edge.add(edge[0])
        nodes_with_edge.add(edge[1])

    num_nodes = graph_test.number_of_nodes()

    test_edges = []
    test_edges_labels = []

    # generate all possible edges for each node with at least one edge (assume undirected edges)
    # NOTE(review): assumes nodes are labelled 0 .. num_nodes - 1 -- confirm.
    for node_1 in nodes_with_edge:
        for node_2 in range(num_nodes):
            # tuple() accepts a single iterable, so build the pair literally
            candidate = (node_1, node_2)
            test_edges.append(candidate)
            test_edges_labels.append(1 if candidate in edge_list else 0)

    test_edges_features_mtx = generate_edge_features(test_edges, node_embeddings, edge_operator)

    return test_edges, test_edges_features_mtx, test_edges_labels

In [None]:
def build_clf(feature_mtx, response_vec):
    """Fit a standardise-then-classify model on the training features.

    The scaler and the logistic regression are bundled into one pipeline so
    that the standardisation learned from the training data is re-applied
    whenever ``.predict`` is called. Fitting the scaler separately and
    returning only the regression would make predictions run on unscaled
    features.

    Args:
        feature_mtx: 2-D array-like of training feature vectors.
        response_vec: array-like of class labels, one per row of feature_mtx.

    Returns:
        A fitted scikit-learn pipeline (StandardScaler -> LogisticRegression)
        exposing ``.predict``.
    """
    binary_clf = make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=0),
    )
    binary_clf.fit(feature_mtx, response_vec)

    return binary_clf

In [None]:
def pred_links(feature_mtx, LR_clf):
    """Predict a label for every edge feature vector.

    Args:
        feature_mtx: feature vectors, one row per candidate edge.
        LR_clf: fitted classifier object exposing ``.predict``.

    Returns:
        Whatever ``LR_clf.predict(feature_mtx)`` returns.
    """
    return LR_clf.predict(feature_mtx)

In [None]:
def precision_recall(predict_labels, true_labels):
    """Compute precision and recall of binary predictions (positive class = 1).

    Args:
        predict_labels: iterable of predicted labels (0 or 1).
        true_labels: iterable of ground-truth labels (0 or 1), aligned with
            predict_labels.

    Returns:
        (precision, recall) as floats. A metric whose denominator is zero
        (no predicted positives / no actual positives) is reported as 0.0.
    """
    true_positive = 0
    false_positive = 0
    false_negative = 0

    for p_label, true_label in zip(predict_labels, true_labels):
        if p_label == true_label and true_label == 1:
            true_positive += 1
        elif p_label != true_label and true_label == 1:
            false_negative += 1
        elif p_label != true_label and true_label == 0:
            false_positive += 1

    # The metrics are computed once, after every pair has been counted.
    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative

    precision = true_positive / predicted_positive if predicted_positive else 0.0
    recall = true_positive / actual_positive if actual_positive else 0.0

    return precision, recall

## Main function
1. Load/set inputs
2. Generate train set
3. Generate test set
4. Build Logistic Regression Model
5. Predict links with model
6. Report Precision/Recall

In [None]:
# load/set inputs

# node_embeddings
# num_edge_sample: number of true edges to sample for the training set
#                  Total training samples is 2 * num_edge_sample (half edges, half non-edges)
# edge_operator = 'Average' or 'Hadamard' or 'Weighted-L1' or 'Weighted-L2'
# graph_train: the graph to be used to generate edges and edge-labels in training set
# graph_test: the graph to be used to generate edges and edge-labels in test set

In [None]:
# generate train set
# (the call is wrapped in parentheses so the statement may span several lines)
train_edges, train_edges_features_mtx, train_edges_labels = generate_train_set(
    graph_train, num_edge_sample, node_embeddings, edge_operator
)

In [None]:
# generate test set
# (the call is wrapped in parentheses so the statement may span several lines)
test_edges, test_edges_features_mtx, test_edges_labels = generate_test_set(
    graph_test, node_embeddings, edge_operator
)

In [None]:
# fit the logistic regression classifier on the training edges
LR_clf = build_clf(
    feature_mtx=train_edges_features_mtx,
    response_vec=train_edges_labels,
)

In [None]:
# predict links for every candidate edge in the test set
predict_edges_labels = pred_links(test_edges_features_mtx, LR_clf)

In [None]:
# compare predictions against the test labels and print both metrics
precision, recall = precision_recall(
    predict_labels=predict_edges_labels,
    true_labels=test_edges_labels,
)
print('Precision: ', precision)
print('Recall: ', recall)