# IMPORT FUNCFIONS

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import itertools
import pickle
import os
import random
from sklearn.model_selection import train_test_split

from sklearn.metrics import average_precision_score
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.linear_model import LogisticRegression

# READ DATA

In [2]:
def get_edges_dict(traindata):
    if os.path.isfile("edgesdic.pkl"):
        f = open("edgesdic.pkl","rb")
        edges = pickle.load(f)
        f.close()
        return edges
    else:
        edges = dict()
        for edge in range(len(traindata)): 
            edges[(traindata.loc[[edge]]['id_1'].values[0],traindata.loc[[edge]]['id_2'].values[0])] = 1 
            edges[(traindata.loc[[edge]]['id_2'].values[0],traindata.loc[[edge]]['id_1'].values[0])] = 1  
        f = open("edgesdic.pkl","wb")
        pickle.dump(edges,f)
        f.close()
        return edges

def get_negative_edges(traindata,g):
    edges = get_edges_dict(traindata)
    # for missing edges.
    negative_edges = set([])
    maxNodenum = max(max(traindata['id_1']),max(traindata['id_2']))
    #產生與原圖同edge數目的negative edges
    while (len(negative_edges)<len(traindata)):
        node1=random.randint(0, maxNodenum) 
        node2=random.randint(0, maxNodenum) 
        tmp = edges.get((node1,node2),0) #edge不在graph裡標為0
        if tmp == 0 and node1!=node2 and g.has_node(node1) and g.has_node(node2): # if edge不在graph裡才要做處理
            negative_edges.add((node1,node2))
        else:
            continue
    return negative_edges

# MODELS

In [3]:
def n2v_embedding(train_G):  #https://github.com/eliorc/node2vec
    node2vec  = Node2Vec(train_G, dimensions=12, walk_length=10, num_walks=20, workers=4, p=0.25, q=0.25)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    return edges_embs

In [4]:
def n2v_combine_embedding(data, embeddings):
    i=0
    X = []
    for edge in data:
        X.append(np.concatenate((data[i], embeddings[(str(int(edge[0])), str(int(edge[1])))])))
        # print(embeddings[str(int(data[0]))])
        i+=1
    return X

In [5]:
all_ap = []
all_auc = []

filename = 'celegans'

for i in range(10):
    ori_df = pd.read_csv('data/'+filename+'.txt', header=None, sep=' ')
    ori_df.columns = ['id_1', 'id_2']

    ori_G = nx.from_pandas_edgelist(ori_df, 'id_1', 'id_2')
    #print(nx.info(ori_G))
    
    negative_edges = get_negative_edges(ori_df, ori_G)
    #print(len(negative_edges))

    train_df, test_df = train_test_split(ori_df, test_size=0.1)
    
    nodes_number = len(ori_G.nodes)
    graph_np = np.zeros((nodes_number, nodes_number))
    for i in range(train_df.shape[0]):
        graph_np[train_df.iloc[i, 0], train_df.iloc[i, 1]] = 1
        graph_np[train_df.iloc[i, 1], train_df.iloc[i, 0]] = 1

    train_G = nx.from_numpy_matrix(graph_np)
    #print(nx.info(train_G))

    edges_embs = n2v_embedding(train_G)
    
    df_neg = pd.DataFrame(list(negative_edges), columns=['id_1', 'id_2'])
    
    train_df = train_df.values
    test_pos_df = test_df.values
    df_neg = df_neg.values
    
    train_df = n2v_combine_embedding(train_df, edges_embs)
    test_pos_df = n2v_combine_embedding(test_pos_df, edges_embs)
    df_neg = n2v_combine_embedding(df_neg, edges_embs)
    
    X_train_neg, X_test_neg = train_test_split(df_neg, test_size=0.1)
    
    y_train_pos = np.ones(len(train_df))
    y_train_neg = np.zeros(len(X_train_neg))
    y_test_pos = np.ones(len(test_pos_df))
    y_test_neg = np.zeros(len(X_test_neg))
    
    X_train = np.concatenate((train_df, X_train_neg))
    y_train = np.concatenate((y_train_pos, y_train_neg))
    X_test = np.concatenate((test_pos_df, X_test_neg))
    y_test = np.concatenate((y_test_pos, y_test_neg))
    
    clf2 = RandomForestClassifier(n_estimators=400)
    clf2.fit(X_train, y_train)
    
    predict_Y = clf2.predict(X_test)
    
    ap = average_precision_score(y_test, predict_Y)
    print("AP： ", ap)
    fpr, tpr, _ = metrics.roc_curve(y_test, predict_Y, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    print("AUC SCORE: ",auc)
    
    all_ap.append(ap)
    all_auc.append(auc)

print("####################### SUM UP ############################")
print("AP MEAN : ", np.array(all_ap).mean())
print("AP STD : ", np.array(all_ap).std())
print("AUC MEAN : ", np.array(all_auc).mean())
print("AUC STD : ", np.array(all_auc).std())

Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.6875944935147609
AUC SCORE:  0.7436440677966102


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7301230729920791
AUC SCORE:  0.777542372881356


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7159997280884619
AUC SCORE:  0.7605932203389831


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7361134418526598
AUC SCORE:  0.7838983050847457


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7055383146335641
AUC SCORE:  0.760593220338983


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7187004258161476
AUC SCORE:  0.7690677966101696


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7187004258161476
AUC SCORE:  0.7690677966101696


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7181764292614385
AUC SCORE:  0.7648305084745762


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.7459088252483927
AUC SCORE:  0.7796610169491526


Computing transition probabilities:   0%|          | 0/297 [00:00<?, ?it/s]

AP：  0.6838380796696206
AUC SCORE:  0.7351694915254239
####################### SUM UP ############################
AP MEAN :  0.7160693236893272
AP STD :  0.018623994539861057
AUC MEAN :  0.764406779661017
AUC STD :  0.014641654697602506
