In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import itertools
import pickle
import os
import random
from sklearn.model_selection import train_test_split

from sklearn.metrics import average_precision_score
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.linear_model import LogisticRegression

In [2]:
def n2v_embedding(train_G):  #https://github.com/eliorc/node2vec
    node2vec  = Node2Vec(train_G, dimensions=12, walk_length=80, num_walks=20, workers=4, p=0.25, q=0.25)
    model = node2vec.fit(window=20, min_count=1, batch_words=16)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    return edges_embs

In [3]:
def n2v_combine_embedding(data, embeddings):
    i=0
    X = []
    for edge in data:
        X.append(np.concatenate((data[i], embeddings[(str(int(edge[0])), str(int(edge[1])))])))
        # print(embeddings[str(int(data[0]))])
        i+=1
    return X

In [4]:
def get_edges_dict(traindata):
    if os.path.isfile("edgesdic.pkl"):
        f = open("edgesdic.pkl","rb")
        edges = pickle.load(f)
        f.close()
        return edges
    else:
        edges = dict()
        for edge in range(len(traindata)): 
            edges[(traindata.loc[[edge]]['id_1'].values[0],traindata.loc[[edge]]['id_2'].values[0])] = 1 
            edges[(traindata.loc[[edge]]['id_2'].values[0],traindata.loc[[edge]]['id_1'].values[0])] = 1  
        f = open("edgesdic.pkl","wb")
        pickle.dump(edges,f)
        f.close()
        return edges

def get_negative_edges(traindata,g):
    edges = get_edges_dict(traindata)
    # for missing edges.
    negative_edges = set([])
    maxNodenum = max(max(traindata['id_1']),max(traindata['id_2']))
    #產生與原圖同edge數目的negative edges
    while (len(negative_edges)<len(traindata)*2):
        node1=random.randint(0, maxNodenum) 
        node2=random.randint(0, maxNodenum) 
        tmp = edges.get((node1,node2),0) #edge不在graph裡標為0
        if tmp == 0 and node1!=node2 and g.has_node(node1) and g.has_node(node2): # if edge不在graph裡才要做處理
            negative_edges.add((node1,node2))
        else:
            continue
    return negative_edges

In [5]:
ori_df = pd.read_csv('data/karate_edges.txt', header=None, sep=' ')
ori_df.columns = ['id_1', 'id_2']

ori_G = nx.from_pandas_edgelist(ori_df, 'id_1', 'id_2')

edges_embs = n2v_embedding(ori_G)   # 生成边的表示向量

Computing transition probabilities:   0%|          | 0/34 [00:00<?, ?it/s]

In [6]:
train_df = ori_df.values

train_df = n2v_combine_embedding(train_df, edges_embs)

y_train_pos = np.ones(len(train_df))

negative_edges = get_negative_edges(ori_df, ori_G)

df_neg = pd.DataFrame(list(negative_edges), columns=['id_1', 'id_2'])
df_neg = df_neg.values

df_neg = n2v_combine_embedding(df_neg, edges_embs)

X_train_neg = df_neg

y_train_neg = np.zeros(len(X_train_neg))

y_train = np.concatenate((y_train_pos, y_train_neg))

X_train = np.concatenate((train_df, X_train_neg))

In [7]:
clf2 = RandomForestClassifier(n_estimators=400)

clf2.fit(X_train, y_train)

RandomForestClassifier(n_estimators=400)

In [8]:
all_edges = []
for i in range(34):
    for j in range(34):
        all_edges.append([i, j])
        
all_edges = pd.DataFrame(all_edges, columns=['id1', 'id2'])

all_df = all_edges.values
all_df = n2v_combine_embedding(all_df, edges_embs)

all_df = np.array([list(all_df[i]) for i in range(len(all_df))])
predict_Y = clf2.predict(all_df)
adj_rec = predict_Y.reshape(34, 34)

In [9]:
import copy
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

for i in range(34):
    for j in range(34):
        if i>=j :
            adj_rec[i, j] = 0

adj_rec = adj_rec + adj_rec.T

In [10]:
predict_edges = adj_rec.reshape(adj_rec.shape[0]*adj_rec.shape[0])

adj_true = nx.adjacency_matrix(ori_G).todense()
adj_true = np.array(adj_true)
true_edges = adj_true.reshape(34*34)

print("edges： ",predict_edges.sum())
print("ACC： ",accuracy_score(true_edges, predict_edges))
print("AP： ",precision_score(true_edges, predict_edges, average='macro'))
print("RECALL： ",recall_score(true_edges, predict_edges, average='macro'))
print("F1 SCORE： ",f1_score(true_edges, predict_edges, average='macro'))

edges：  176.0
ACC：  0.8235294117647058
AP：  0.6348794063079777
RECALL：  0.6491282051282051
F1 SCORE：  0.6412559328221978


In [11]:
predict_graph = predict_edges.reshape(34, 34)
pd.DataFrame(predict_graph).to_csv('NODE2VEC_recon_karate.txt', header=None, index=False, sep=' ')