In [1]:
# Notebook setup: imports, global warning filters and RNG seeding.
from __future__ import division
from __future__ import print_function
import os, sys
import warnings
# Silence these three warning categories for the whole session.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
#sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
# For replicating the experiments
SEED = 42
import argparse
import time
import random
import numpy as np
import scipy.sparse as sp
import torch

# Seed the NumPy and PyTorch global RNGs. Python's `random` module is imported
# above but is NOT seeded here.
np.random.seed(SEED)
torch.manual_seed(SEED)
from torch import optim
import torch.nn.functional as F
from model import LinTrans, LogReg
from optimizer import loss_function
# NOTE(review): wildcard import. Names used later in this notebook that are not
# defined or imported explicitly (mask_test_edges, preprocess_graph, nx)
# presumably come from here -- verify against utils.py.
from utils import *
from sklearn.cluster import SpectralClustering, KMeans
from clustering_metric import clustering_metrics
from tqdm import tqdm
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, roc_auc_score
import pandas as pd

In [2]:
# Experiment configuration. parse_known_args() is used so that extra arguments
# injected by the notebook kernel are ignored instead of aborting the parse.
parser = argparse.ArgumentParser()
parser.add_argument('--gnnlayers', type=int, default=1, help="Number of gnn layers")
parser.add_argument('--linlayers', type=int, default=1, help="Number of hidden layers")
parser.add_argument('--epochs', type=int, default=400, help='Number of epochs to train.')
# nargs='+' keeps args.dims a list when given on the command line
# (e.g. `--dims 500 250`); without it a CLI value would be a bare int and
# `[feat_dim] + args.dims` in gae_for would raise TypeError.
parser.add_argument('--dims', type=int, nargs='+', default=[500], help='Number of units in hidden layer 1.')
parser.add_argument('--lr', type=float, default=0.001, help='Initial learning rate.')
parser.add_argument('--upth_st', type=float, default=0.0011, help='Upper Threshold start.')
parser.add_argument('--lowth_st', type=float, default=0.1, help='Lower Threshold start.')
parser.add_argument('--upth_ed', type=float, default=0.001, help='Upper Threshold end.')
parser.add_argument('--lowth_ed', type=float, default=0.5, help='Lower Threshold end.')
parser.add_argument('--upd', type=int, default=10, help='Update epoch.')
parser.add_argument('--bs', type=int, default=100, help='Batchsize.')
parser.add_argument('--dataset', type=str, default='polbooks', help='type of dataset.')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='Disables CUDA training.')
args, _ = parser.parse_known_args()
# CUDA is unconditionally disabled here, regardless of the --no-cuda flag.
args.cuda = False

In [3]:
def _k_smallest_indices(values, k):
    """Return the indices of the `k` smallest entries of 1-D `values` (unordered)."""
    if k >= len(values):
        # np.argpartition requires kth < len(values); selecting everything
        # needs no partitioning at all.
        return np.arange(len(values))
    return np.argpartition(values, k)[:k]


def update_similarity(z, upper_threshold, lower_treshold, pos_num, neg_num):
    """Select positive and negative node pairs from embedding similarities.

    The pairwise inner products ``z @ z.T`` are flattened, so each returned
    index ``i`` encodes the pair ``(i // n, i % n)`` with ``n = len(z)``.

    Args:
        z: 2-D array of node embeddings, one row per node.
        upper_threshold: fraction of all pairs taken as positives (the pairs
            with the largest similarity).
        lower_treshold: ``1 - lower_treshold`` is the fraction of all pairs
            taken as negatives (the pairs with the smallest similarity).
        pos_num: ignored; kept for backward compatibility with callers.
        neg_num: ignored; kept for backward compatibility with callers.

    Returns:
        ``(pos_inds, neg_inds)``: two 1-D integer arrays of flat pair indices.
        The order inside each array is unspecified.
    """
    similarity = np.matmul(z, np.transpose(z)).reshape([-1,])
    total = len(similarity)

    # Clamp the requested counts to [0, total]. A count equal to `total`
    # (e.g. upper_threshold == 1.0 or lower_treshold == 0.0) used to make
    # np.argpartition raise "kth out of bounds".
    n_pos = min(max(int(round(upper_threshold * total)), 0), total)
    n_neg = min(max(int(round((1 - lower_treshold) * total)), 0), total)

    pos_inds = _k_smallest_indices(-similarity, n_pos)
    neg_inds = _k_smallest_indices(similarity, n_neg)

    return np.array(pos_inds), np.array(neg_inds)

def update_threshold(upper_threshold, lower_treshold, up_eta, low_eta):
    """Advance both thresholds by one step.

    Returns the pair ``(upper_threshold + up_eta, lower_treshold + low_eta)``.
    """
    return upper_threshold + up_eta, lower_treshold + low_eta

def load_network_data(adj_name, nodes_numbers):
    """Load an undirected graph from a space-separated edge list.

    Reads ``data/<adj_name>.txt`` (no header; the first two columns are the
    integer node ids of each edge), drops self-loops and builds a symmetric
    0/1 adjacency matrix.

    Args:
        adj_name: dataset name, used to build the file path.
        nodes_numbers: number of nodes; node ids must lie in
            ``[0, nodes_numbers)``.

    Returns:
        ``(adj, features)`` where ``adj`` is a ``scipy.sparse.csr_matrix`` of
        shape ``(nodes_numbers, nodes_numbers)`` and ``features`` is the
        identity matrix of the same size (one-hot node features).
    """
    raw_edges = pd.read_csv("data/"+adj_name+".txt",header=None,sep=' ')
    edges = raw_edges[raw_edges[0]!=raw_edges[1]]
    src = edges.iloc[:, 0].to_numpy()
    dst = edges.iloc[:, 1].to_numpy()

    # Fill both triangles in one vectorized assignment each instead of a
    # Python loop over rows.
    graph_np = np.zeros((nodes_numbers, nodes_numbers))
    graph_np[src, dst] = 1
    graph_np[dst, src] = 1

    # Build the sparse matrix directly. The previous NetworkX round-trip
    # produced the same matrix but relied on an `nx` name this notebook never
    # imports explicitly and on nx.from_numpy_matrix, which NetworkX 3.0 removed.
    adj = sp.csr_matrix(graph_np)
    features = np.eye(nodes_numbers)
    return adj, features

def get_scores(emb, adj_orig, edges_pos, edges_neg):
    """Score link prediction on held-out positive and negative edges.

    Each candidate edge ``(i, j)`` is scored as
    ``sigmoid(emb[i] . emb[j])``; positives are labelled 1, negatives 0.

    Args:
        emb: 2-D array of node embeddings, one row per node.
        adj_orig: original adjacency matrix. Not used in the computation;
            kept so existing callers keep working.
        edges_pos: iterable of ``(i, j)`` pairs that are true edges.
        edges_neg: iterable of ``(i, j)`` pairs that are non-edges.

    Returns:
        ``(roc_score, ap_score)`` as returned by ``roc_auc_score`` and
        ``average_precision_score``.
    """
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Predict on test set of edges
    adj_rec = np.dot(emb, emb.T)

    def edge_probabilities(edges):
        return [sigmoid(adj_rec[e[0], e[1]]) for e in edges]

    preds = edge_probabilities(edges_pos)
    preds_neg = edge_probabilities(edges_neg)

    preds_all = np.hstack([preds, preds_neg])
    # One zero label per *negative* prediction. Sizing this with len(preds)
    # mislabelled or mis-sized the vector whenever the two edge sets differed
    # in length.
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])

    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)

    return roc_score, ap_score

In [4]:
def gae_for(args):
    """Train the linear encoder on one dataset and score link prediction.

    Pipeline: load the graph, hold out test edges, smooth the (one-hot)
    features with the normalized training adjacency, then train ``LinTrans``
    on positive/negative node pairs that are re-selected from the current
    embeddings every ``args.upd`` epochs.

    Args:
        args: namespace providing ``dataset``, ``dims``, ``linlayers``,
            ``gnnlayers``, ``lr``, ``epochs``, ``upd``, ``bs``, ``upth_st``,
            ``upth_ed``, ``lowth_st`` and ``lowth_ed``.

    Returns:
        ``(auc_score, ap_score)`` on the held-out test edges.

    Raises:
        ValueError: if ``args.dataset`` is not one of the known datasets.
    """
    print("Using {} dataset".format(args.dataset))

    # dataset name -> (number of nodes in the network, number of clusters).
    # The cluster count is kept for reference only; link prediction does not
    # use it.
    dataset_specs = {
        'cora': (2708, 7),
        'citeseer': (3327, 6),
        'wiki': (2405, 17),
        'celegans': (297, 10),
        'email': (986, 10),
        'polbooks': (105, 10),
        'texas': (183, 10),
        'wisconsin': (215, 10),
    }
    if args.dataset not in dataset_specs:
        # Previously an unknown name fell through the if/elif chain and
        # surfaced later as an UnboundLocalError.
        raise ValueError("Unknown dataset {!r}; expected one of {}".format(
            args.dataset, sorted(dataset_specs)))
    nodes_number, _n_clusters = dataset_specs[args.dataset]

    adj, features = load_network_data(args.dataset, nodes_number)

    n_nodes, feat_dim = features.shape
    # Accept a bare int as well as a list so this function does not depend on
    # how the argument parser declared --dims.
    hidden_dims = args.dims if isinstance(args.dims, (list, tuple)) else [args.dims]
    dims = [feat_dim] + list(hidden_dims)

    layers = args.linlayers

    # Store original adjacency matrix (without diagonal entries) for later
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    adj_orig = adj

    # This mask_test_edges variant returns no validation split, so there is no
    # validation-based model selection below.
    adj_train, _train_edges, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    adj_norm_s = preprocess_graph(adj, args.gnnlayers, norm='sym', renorm=True)
    sm_fea_s = sp.csr_matrix(features).toarray()

    print('Laplacian Smoothing...')
    for a in adj_norm_s:
        sm_fea_s = a.dot(sm_fea_s)

    model = LinTrans(layers, dims)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    sm_fea_s = torch.FloatTensor(sm_fea_s)
    inx = sm_fea_s

    # Passed to update_similarity for interface compatibility.
    pos_num = len(adj.indices)
    neg_num = n_nodes*n_nodes-pos_num

    # Per-update increments that move each threshold linearly from its start
    # value to its end value over epochs/upd updates.
    up_eta = (args.upth_ed - args.upth_st) / (args.epochs/args.upd)
    low_eta = (args.lowth_ed - args.lowth_st) / (args.epochs/args.upd)

    pos_inds, neg_inds = update_similarity(normalize(sm_fea_s.numpy()), args.upth_st, args.lowth_st, pos_num, neg_num)
    upth, lowth = update_threshold(args.upth_st, args.lowth_st, up_eta, low_eta)

    bs = min(args.bs, len(pos_inds))

    pos_inds_cuda = torch.LongTensor(pos_inds)
    hidden_emb = None
    print('Start Training...')
    for epoch in range(args.epochs):
        st, ed = 0, bs
        model.train()
        length = len(pos_inds)

        # `bs > 0` guards against an endless loop: with no positive pairs
        # selected, bs is 0 and `ed` would never advance past `length`.
        while bs > 0 and ed <= length:
            batch_size = ed - st
            # Pair each positive in the batch with one randomly drawn negative.
            sampled_neg = torch.LongTensor(np.random.choice(neg_inds, size=batch_size))
            sampled_inds = torch.cat((pos_inds_cuda[st:ed], sampled_neg), 0)
            optimizer.zero_grad()
            # Decode flat pair indices back into (row node, column node).
            xind = sampled_inds // n_nodes
            yind = sampled_inds % n_nodes
            x = torch.index_select(inx, 0, xind)
            y = torch.index_select(inx, 0, yind)
            zx = model(x)
            zy = model(y)
            batch_label = torch.cat((torch.ones(batch_size), torch.zeros(batch_size)))
            batch_pred = model.dcs(zx, zy)
            loss = loss_function(adj_preds=batch_pred, adj_labels=batch_label, n_nodes=batch_size)

            loss.backward()
            optimizer.step()

            st = ed
            # Fold a final partial batch into the last full-sized step so that
            # every positive pair is visited exactly once per epoch.
            if ed < length and ed + bs >= length:
                ed += length - ed
            else:
                ed += bs

        if (epoch + 1) % args.upd == 0:
            # Refresh the embeddings, then re-select the training pairs with
            # the updated thresholds.
            model.eval()
            with torch.no_grad():
                hidden_emb = model(inx).detach().cpu().numpy()
            upth, lowth = update_threshold(upth, lowth, up_eta, low_eta)
            pos_inds, neg_inds = update_similarity(hidden_emb, upth, lowth, pos_num, neg_num)
            bs = min(args.bs, len(pos_inds))
            pos_inds_cuda = torch.LongTensor(pos_inds)

    if hidden_emb is None:
        # No refresh epoch was reached (epochs < upd): embed with the model as
        # trained so far instead of failing on an undefined variable.
        model.eval()
        with torch.no_grad():
            hidden_emb = model(inx).detach().cpu().numpy()

    print("Optimization Finished!")
    auc_score, ap_score = get_scores(hidden_emb, adj_orig, test_edges, test_edges_false)
    print('Test AP score: ',ap_score)
    print('Test AUC score: ',auc_score)
    return auc_score, ap_score

In [5]:
if __name__ == '__main__':
    # Repeat the whole experiment ten times and keep each run's test scores
    # so their mean and spread can be reported afterwards.
    all_auc, all_ap = [], []
    for _run in range(10):
        auc, ap = gae_for(args)
        all_auc += [auc]
        all_ap += [ap]

Using polbooks dataset
Laplacian Smoothing...
Start Training...
Optimization Finished!
Test AP score:  0.87041669233205
Test AUC score:  0.8873966942148761
Using polbooks dataset
Laplacian Smoothing...
Start Training...
Optimization Finished!
Test AP score:  0.8245656277839583
Test AUC score:  0.8465909090909092
Using polbooks dataset
Laplacian Smoothing...
Start Training...
Optimization Finished!
Test AP score:  0.9221858957396772
Test AUC score:  0.9034090909090909
Using polbooks dataset
Laplacian Smoothing...
Start Training...
Optimization Finished!
Test AP score:  0.8679877995776042
Test AUC score:  0.8657024793388429
Using polbooks dataset
Laplacian Smoothing...
Start Training...
Optimization Finished!
Test AP score:  0.9058008525407801
Test AUC score:  0.8884297520661157
Using polbooks dataset
Laplacian Smoothing...
Start Training...
Optimization Finished!
Test AP score:  0.8534911177346045
Test AUC score:  0.8698347107438016
Using polbooks dataset
Laplacian Smoothing...
Start Tr

In [6]:
# Summarize the per-run test scores collected by the repeated-run cell.
ap_scores = np.array(all_ap)
print("AP MEAN : ", ap_scores.mean())
print("AP STD : ", ap_scores.std())

auc_scores = np.array(all_auc)
print("AUC MEAN : ", auc_scores.mean())
print("AUC STD : ", auc_scores.std())

AP MEAN :  0.859304051991514
AP STD :  0.032207006545178454
AUC MEAN :  0.8675103305785123
AUC STD :  0.022794381106312316
