In [1]:
import argparse, time, math
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.data import register_data_args
from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset


def gcn_msg(edge):
    """Build the message sent along each edge.

    Reads the source-node fields ``'h'`` and ``'norm'`` from ``edge.src`` and
    returns their product under the mailbox key ``'m'``.
    """
    source = edge.src
    scaled = source['h'] * source['norm']
    return dict(m=scaled)


def gcn_reduce(node):
    """Aggregate the incoming messages of each node.

    Sums the mailbox entry ``'m'`` over dimension 1, multiplies the result by
    the node field ``'norm'`` and returns it as the new ``'h'``.
    """
    summed = node.mailbox['m'].sum(dim=1)
    return dict(h=summed * node.data['norm'])


class NodeApplyModule(nn.Module):
    """Node-wise update applied after message aggregation.

    Adds an optional learnable bias of size ``out_feats`` to the node field
    ``'h'`` and then applies an optional activation callable.
    """

    def __init__(self, out_feats, activation=None, bias=True):
        super().__init__()
        self.activation = activation
        # Uninitialised storage; values are filled in by reset_parameters().
        self.bias = nn.Parameter(torch.Tensor(out_feats)) if bias else None
        self.reset_parameters()

    def reset_parameters(self):
        """Draw the bias uniformly from [-1/sqrt(out_feats), 1/sqrt(out_feats))."""
        if self.bias is None:
            return
        bound = 1. / math.sqrt(self.bias.size(0))
        self.bias.data.uniform_(-bound, bound)

    def forward(self, nodes):
        """Return ``{'h': activation(h + bias)}`` with both steps optional."""
        out = nodes.data['h']
        if self.bias is not None:
            out = out + self.bias
        if self.activation:
            out = self.activation(out)
        return {'h': out}


class GCNLayer(nn.Module):
    """One graph-convolution layer bound to a fixed graph ``g``.

    The forward pass applies optional dropout, multiplies the input by the
    layer weight, stores the result in ``g.ndata['h']``, runs
    ``update_all`` with ``gcn_msg`` / ``gcn_reduce`` / the node-apply module,
    and pops the updated ``'h'`` back off the graph.
    """

    def __init__(self, g, in_feats, out_feats, activation, dropout, bias=True):
        super().__init__()
        self.g = g
        # Uninitialised storage; values are filled in by reset_parameters().
        self.weight = nn.Parameter(torch.Tensor(in_feats, out_feats))
        # A falsy dropout rate disables dropout; 0. keeps the attribute falsy.
        self.dropout = nn.Dropout(p=dropout) if dropout else 0.
        self.node_update = NodeApplyModule(out_feats, activation, bias)
        self.reset_parameters()

    def reset_parameters(self):
        """Draw the weight uniformly from [-1/sqrt(out_feats), 1/sqrt(out_feats))."""
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)

    def forward(self, h):
        """Propagate node features ``h`` through the layer and return the new features."""
        if self.dropout:
            h = self.dropout(h)
        graph = self.g
        graph.ndata['h'] = torch.mm(h, self.weight)
        graph.update_all(gcn_msg, gcn_reduce, self.node_update)
        # Pop so the temporary field does not linger on the shared graph.
        return graph.ndata.pop('h')

class GCN(nn.Module):
    """Stack of ``GCNLayer`` modules sharing one graph.

    The stack is: one input layer (``in_feats -> n_hidden``),
    ``n_layers - 1`` hidden layers (``n_hidden -> n_hidden``) and one output
    layer (``n_hidden -> n_classes``) that has no activation. Every layer
    receives the same ``dropout`` value.
    """

    def __init__(self, g, in_feats, n_hidden, n_classes, n_layers, activation, dropout):
        super().__init__()
        # (input size, output size, activation) for each layer, in order.
        layer_specs = [(in_feats, n_hidden, activation)]
        layer_specs.extend((n_hidden, n_hidden, activation) for _ in range(n_layers - 1))
        layer_specs.append((n_hidden, n_classes, None))

        self.layers = nn.ModuleList()
        for size_in, size_out, act in layer_specs:
            self.layers.append(GCNLayer(g, size_in, size_out, act, dropout))

    def forward(self, features):
        """Feed ``features`` through every layer in order and return the final output."""
        hidden = features
        for gcn_layer in self.layers:
            hidden = gcn_layer(hidden)
        return hidden

Using backend: pytorch


In [2]:
def evaluate(model, features, labels, mask):
    """Score ``model`` on the nodes selected by ``mask``.

    Puts the model in eval mode, runs a forward pass without gradient
    tracking, and passes the masked labels, the arg-max predictions and the
    softmax probability of class index 1 to ``metrics``.

    Args:
        model: callable module mapping ``features`` to per-node logits.
        features: node feature tensor fed to ``model``.
        labels: integer class labels, indexable by ``mask``.
        mask: boolean mask selecting the nodes to evaluate.

    Returns:
        Whatever ``metrics`` returns for the selected nodes.
    """
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        # Pass dim explicitly: the implicit-dim form of softmax is deprecated.
        # For these 2-D logits the implicit choice was already dim=1.
        probas = F.softmax(logits, dim=1)

        return metrics(labels.cpu().detach(), indices.cpu().detach(), probas.cpu().detach()[:, 1])

In [3]:
def metrics(y_true, y_pred, y_prob):
    """Compute and print binary-classification metrics.

    Args:
        y_true: tensor of ground-truth labels (expected to be 0/1).
        y_pred: tensor of predicted labels (expected to be 0/1).
        y_prob: tensor of scores for class 1, used for ROC-AUC and AUPR.

    Returns:
        ``((y_true, y_pred, y_prob), (accuracy, precision, recall, f1,
        roc_auc, aupr, pos_acc, neg_acc))`` where the first tuple holds the
        inputs converted to numpy arrays. Ratios whose denominator is zero
        are reported as NaN.
    """
    y_true, y_pred, y_prob = y_true.numpy(), y_pred.numpy(), y_prob.numpy()

    # Pin the label set so the matrix is always 2x2. Without it, a batch
    # containing a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def _ratio(numerator, denominator):
        # Explicit NaN for undefined ratios instead of relying on numpy's
        # divide-by-zero behaviour (and its suppressed warnings).
        if denominator == 0:
            return float('nan')
        return float(numerator) / float(denominator)

    pos_acc = _ratio(tp, y_true.sum())
    neg_acc = _ratio(tn, len(y_pred) - y_pred.sum())  # [y_true=0 & y_pred=0] / y_pred=0
    accuracy = _ratio(tp + tn, tn + fp + fn + tp)

    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * precision * recall, precision + recall)

    roc_auc = roc_auc_score(y_true, y_prob)
    prec, reca, _ = precision_recall_curve(y_true, y_prob)
    aupr = auc(reca, prec)

    # Count each label vector once rather than once per printed field.
    pred_counts = Counter(y_pred)
    true_counts = Counter(y_true)

    print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
    print('y_pred: 0 = {} | 1 = {}'.format(pred_counts[0], pred_counts[1]))
    print('y_true: 0 = {} | 1 = {}'.format(true_counts[0], true_counts[1]))
    print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}|pos_acc={:.4f}|neg_acc={:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc))
    return (y_true, y_pred, y_prob), (accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc)

In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sp
from copy import deepcopy
import warnings 
import os
# from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
import json
warnings.filterwarnings("ignore") 

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.model_selection import KFold

In [5]:
def main(args, g, features, labels, train_idx):
    """Train a GCN on ``g`` and evaluate it on train and test nodes every epoch.

    Nodes listed in ``train_idx`` form the training set; every other node of
    the graph forms the test set. The masks, features and labels are also
    stored on the graph passed in (``g.ndata``).

    Args:
        args: namespace providing ``gpu``, ``n_hidden``, ``n_layers``,
            ``dropout``, ``lr``, ``weight_decay`` and ``n_epochs``.
        g: DGL graph whose nodes line up with the rows of ``features``.
        features: float tensor of node features (2-D; ``shape[1]`` is the
            input size).
        labels: long tensor of node labels (two classes are assumed).
        train_idx: indices of the training nodes.

    Returns:
        ``(ys_train, metrics_train, ys_test, metrics_test)`` from the last
        epoch's calls to ``evaluate``; all four are ``None`` when
        ``args.n_epochs`` is 0.
    """
    # A negative gpu id selects the CPU.
    if args.gpu >= 0:
        device = torch.device('cuda:%d' % args.gpu)
    else:
        device = torch.device('cpu')

    num_nodes = g.number_of_nodes()
    train_mask = np.zeros(num_nodes, dtype='int64')
    train_mask[train_idx] = 1
    test_mask = 1 - train_mask
    print(Counter(train_mask), Counter(test_mask))
    train_mask = torch.BoolTensor(train_mask)
    test_mask = torch.BoolTensor(test_mask)

    g.ndata['feat'] = features
    g.ndata['label'] = labels
    g.ndata['train_mask'] = train_mask
    g.ndata['test_mask'] = test_mask

    g = g.to(device)

    in_feats = features.shape[1]
    n_classes = 2
    n_edges = g.number_of_edges()

    features, labels = features.to(device), labels.to(device)

    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
          train_mask.int().sum().item(),
          test_mask.int().sum().item()))

    # normalization: deg^-0.5, with isolated nodes (inf) mapped to 0
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    # Use the selected device rather than .cuda(), which targets the default
    # CUDA device and mismatches the graph when args.gpu is not that device.
    norm = norm.to(device)
    g.ndata['norm'] = norm.unsqueeze(1)

    # create GCN model
    model = GCN(g,
                in_feats,
                args.n_hidden,
                n_classes,
                args.n_layers,
                F.relu,
                args.dropout)

    # Same reasoning as for `norm`: keep the model on the graph's device.
    model.to(device)
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Defined up front so the return below is valid even with zero epochs.
    ys_train = metrics_train = ys_test = metrics_test = None

    dur = []
    for epoch in range(args.n_epochs):
        model.train()

        t0 = time.time()
        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        dur.append(time.time() - t0)

        print('=====Epoch {} | Time(s) {:.4f} | Loss {:.4f} | ETputs(KTEPS) {:.2f}'.format(epoch, np.mean(dur), loss.item(), n_edges / np.mean(dur) / 1000))
        ys_train, metrics_train = evaluate(model, features, labels, train_mask)
        ys_test, metrics_test = evaluate(model, features, labels, test_mask)
    return ys_train, metrics_train, ys_test, metrics_test

In [6]:
def run(args, task, isbalance, balance, n_neigh,
        data_dir='/home/chujunyi/4_GNN/GAEMDA-miRNA-disease/0_data/',
        index_dir='/home/chujunyi/4_GNN/GraphSAINT/miRNA_disease_data/'):
    """Train and evaluate a GCN on every cross-validation fold of one setting.

    For each fold, the fold's k-NN adjacency matrix is loaded, turned into a
    DGL graph with self-loops, and handed to ``main`` together with the
    fold's training indices. The fold's test indices are not used here:
    ``main`` treats every non-training node as a test node.

    Args:
        args: hyper-parameter namespace forwarded to ``main``.
        task: task tag inserted into the file names (e.g. ``'Tp'``).
        isbalance: selects ``node_feature_label.csv`` when truthy, otherwise
            ``node_feature_label__nobalance.csv``.
        balance: suffix inserted into the index and graph file names.
        n_neigh: neighbour count inserted into the k-NN graph file name.
        data_dir: directory holding the feature CSV and the k-NN graph files.
        index_dir: directory holding the train/test index ``.npz`` file.

    Returns:
        ``(node_feature_label, train_index_all, test_index_all,
        knn_neighbors_graph, g, ys_train, metrics_train, ys_test,
        metrics_test)``. The last six come from the final fold only and are
        ``None`` when the index file holds no folds.
    """
    if isbalance:
        feature_file = 'node_feature_label.csv'
    else:
        feature_file = 'node_feature_label__nobalance.csv'
    node_feature_label = pd.read_csv(os.path.join(data_dir, feature_file), index_col=0)

    index_file = 'task_' + task + balance + '__testlabel0_knn_edge_train_test_index_all.npz'
    # NOTE(security): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code. Only load index files from a trusted source.
    # The context manager closes the archive; the arrays are read before exit.
    with np.load(os.path.join(index_dir, index_file), allow_pickle=True) as train_test_id_idx:
        train_index_all = train_test_id_idx['train_index_all']
        test_index_all = train_test_id_idx['test_index_all']

    num_nodes = node_feature_label.shape[0]
    # Columns from position 3 onward are used as the feature matrix.
    features = torch.FloatTensor(np.array(node_feature_label.iloc[:, 3:]))
    labels = torch.LongTensor(np.array(node_feature_label['label']))

    # Defined up front so the return below is valid even with zero folds.
    knn_neighbors_graph = g = None
    ys_train = metrics_train = ys_test = metrics_test = None

    for fold, (train_idx, _test_idx) in enumerate(zip(train_index_all, test_index_all)):

        print('=====Fold {}============================================='.format(fold))

        knn_graph_file = 'task_' + task + balance + '__testlabel0_knn' + str(n_neigh) + 'neighbors_edge__fold' + str(fold) + '.npz'
        knn_neighbors_graph = sp.load_npz(os.path.join(data_dir, knn_graph_file))

        # One nonzero() call yields both endpoint arrays.
        edge_src, edge_dst = knn_neighbors_graph.nonzero()

        g = dgl.DGLGraph()
        g.add_nodes(num_nodes)
        g.add_edges(edge_src, edge_dst)
        g = dgl.add_self_loop(g)
        print(g)

        ys_train, metrics_train, ys_test, metrics_test = main(args, g, features, labels, train_idx)

    return node_feature_label, train_index_all, test_index_all, knn_neighbors_graph, g, ys_train, metrics_train, ys_test, metrics_test

# RUN

In [None]:
# Balanced data: sweep every task and every k-NN neighbourhood size.
if __name__ == '__main__':

    # Hyper-parameters come from argparse defaults; parse_args(args=[]) keeps
    # the notebook kernel's own command line out of the parser.
    parser = argparse.ArgumentParser(description='GCN')
    register_data_args(parser)
    parser.add_argument("--dropout", default=0.3, type=float, help="dropout probability")
    parser.add_argument("--gpu", default=0, type=int, help="gpu")
    parser.add_argument("--lr", default=1e-4, type=float, help="learning rate")
    parser.add_argument("--n-epochs", default=1000, type=int, help="number of training epochs")
    parser.add_argument("--n-hidden", default=128, type=int, help="number of hidden gcn units")
    parser.add_argument("--n-layers", default=3, type=int, help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", default=5e-4, type=float, help="Weight for L2 loss")
    args = parser.parse_args(args=[])
    print(args)

    for isbalance in [True]:
        # File-name suffix that distinguishes the two dataset variants.
        balance = '' if isbalance else '__nobalance'

        for task in ['Tp', 'Tm', 'Td']:
            for n_neigh in [1, 3, 5, 7, 10, 15]:
                print('************** isbalance = {} | task = {} | n_neigh = {}'.format(isbalance, task, n_neigh))

                # Results are rebound on every iteration; only the last
                # configuration's values remain after the sweep.
                (node_feature_label, train_index_all, test_index_all,
                 knn_neighbors_graph, g,
                 ys_train, metrics_train,
                 ys_test, metrics_test) = run(args, task, isbalance, balance, n_neigh)

Namespace(dataset=None, dropout=0.3, gpu=0, lr=0.0001, n_epochs=1000, n_hidden=128, n_layers=3, weight_decay=0.0005)
************** isbalance = True | task = Tp | n_neigh = 1
Graph(num_nodes=10860, num_edges=21720,
      ndata_schemes={}
      edata_schemes={})
Counter({1: 8688, 0: 2172}) Counter({0: 8688, 1: 2172})
----Data statistics------'
      #Edges 21720
      #Classes 2
      #Train samples 8688
      #Test samples 2172
=====Epoch 0 | Time(s) 2.4537 | Loss 0.9426 | ETputs(KTEPS) 8.85
tn = 0, fp = 4336, fn = 0, tp = 4352
y_pred: 0 = 0 | 1 = 8688
y_true: 0 = 4336 | 1 = 4352
acc=0.5009|precision=0.5009|recall=1.0000|f1=0.6675|auc=0.7024|aupr=0.7259|pos_acc=1.0000|neg_acc=nan
tn = 0, fp = 1094, fn = 0, tp = 1078
y_pred: 0 = 0 | 1 = 2172
y_true: 0 = 1094 | 1 = 1078
acc=0.4963|precision=0.4963|recall=1.0000|f1=0.6634|auc=0.7020|aupr=0.7144|pos_acc=1.0000|neg_acc=nan
=====Epoch 1 | Time(s) 1.2639 | Loss 0.8766 | ETputs(KTEPS) 17.18
tn = 0, fp = 4336, fn = 0, tp = 4352
y_pred: 0 = 0 | 

In [None]:
# Unbalanced data: sweep every task and every k-NN neighbourhood size.
if __name__ == '__main__':

    # Hyper-parameters come from argparse defaults; parse_args(args=[]) keeps
    # the notebook kernel's own command line out of the parser.
    parser = argparse.ArgumentParser(description='GCN')
    register_data_args(parser)
    parser.add_argument("--dropout", default=0.3, type=float, help="dropout probability")
    parser.add_argument("--gpu", default=0, type=int, help="gpu")
    parser.add_argument("--lr", default=1e-4, type=float, help="learning rate")
    parser.add_argument("--n-epochs", default=1000, type=int, help="number of training epochs")
    parser.add_argument("--n-hidden", default=128, type=int, help="number of hidden gcn units")
    parser.add_argument("--n-layers", default=3, type=int, help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", default=5e-4, type=float, help="Weight for L2 loss")
    args = parser.parse_args(args=[])
    print(args)

    for isbalance in [False]:
        # File-name suffix that distinguishes the two dataset variants.
        balance = '' if isbalance else '__nobalance'

        for task in ['Tp', 'Tm', 'Td']:
            for n_neigh in [1, 3, 5, 7, 10, 15]:
                print('************** isbalance = {} | task = {} | n_neigh = {}'.format(isbalance, task, n_neigh))

                # Results are rebound on every iteration; only the last
                # configuration's values remain after the sweep.
                (node_feature_label, train_index_all, test_index_all,
                 knn_neighbors_graph, g,
                 ys_train, metrics_train,
                 ys_test, metrics_test) = run(args, task, isbalance, balance, n_neigh)