In [1]:
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
from scipy import interp
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

import time
import random
import numpy as np
import pandas as pd
import math
import mxnet as mx
from mxnet import ndarray as nd, gluon, autograd
from mxnet.gluon import loss as gloss
import dgl
from sklearn.model_selection import KFold
from sklearn import metrics

from utils import build_graph, sample, load_data
from model import GNNMDA, GraphEncoder, BilinearDecoder

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from collections import Counter

In [2]:
def sample(directory, random_seed):
    """Build a class-balanced table of miRNA-disease pairs.

    Reads ``<directory>/all_mirna_disease_pairs.csv`` (read without a header
    row; the three columns are named ``miRNA``, ``disease`` and ``label``),
    keeps every pair whose label is 1 and adds an equally sized random draw
    from the pairs whose label is 0.

    Note: this definition shadows the ``sample`` imported from ``utils`` in
    the import cell of this notebook.

    Args:
        directory: Folder that contains ``all_mirna_disease_pairs.csv``.
        random_seed: Seed forwarded to ``DataFrame.sample`` so the negative
            draw is reproducible.

    Returns:
        tuple: ``(sample_df, sample_df.values)`` where ``sample_df`` lists the
        label-1 pairs first, then the sampled label-0 pairs, on a fresh
        ``0..n-1`` index.

    Raises:
        ValueError: Raised by pandas when there are fewer label-0 pairs than
            label-1 pairs (sampling is done without replacement).
    """
    all_associations = pd.read_csv(directory + '/all_mirna_disease_pairs.csv',
                                   names=['miRNA', 'disease', 'label'])
    known_associations = all_associations.loc[all_associations['label'] == 1]
    unknown_associations = all_associations.loc[all_associations['label'] == 0]

    # One negative per positive.
    random_negative = unknown_associations.sample(n=known_associations.shape[0],
                                                  random_state=random_seed,
                                                  axis=0)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent, and ignore_index=True replaces the in-place reset_index.
    sample_df = pd.concat([known_associations, random_negative], ignore_index=True)

    return sample_df, sample_df.values

In [3]:
def generate_task_Tm_Td_train_test_idx(item, ids, dtp, n_folds=5):
    """Split pair rows into folds so that held-out entities never appear in training.

    ``ids`` is cut into ``n_folds`` consecutive slices of ``len(ids) // n_folds``
    entries; the last slice also takes the remainder. For each fold, the rows of
    ``dtp`` whose ``item`` column is in the slice become the test rows and all
    rows whose ``item`` value is in the remaining ids become the training rows.

    The row-index lists are shuffled with the module-level ``random`` state,
    so seed ``random`` beforehand for reproducible ordering.

    Args:
        item: Name of the ``dtp`` column to split on (the caller passes
            ``'miRNA'`` or ``'disease'``).
        ids: Sequence of unique entity ids, already in the desired order.
        dtp: DataFrame of pairs containing the ``item`` column.
        n_folds: Number of folds to produce. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        tuple: ``(train_index_all, test_index_all)``, two lists with one list
        of ``dtp`` index labels per fold.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
        AssertionError: If some row of ``dtp`` belongs to neither the train
            nor the test ids of a fold (i.e. its ``item`` value is not in ``ids``).
    """
    if n_folds < 1:
        raise ValueError('n_folds must be at least 1, got {}'.format(n_folds))

    test_num = len(ids) // n_folds
    all_ids = set(ids)  # loop-invariant, so build it once

    train_index_all, test_index_all = [], []

    for fold in range(n_folds):
        print('-------Fold ', fold)
        start = fold * test_num
        if fold != n_folds - 1:
            test_ids = ids[start:start + test_num]
        else:
            # The last fold absorbs the ids left over by the integer division.
            test_ids = ids[start:]

        # test_ids is a subset of ids, so a plain difference is what is meant.
        train_ids = list(all_ids - set(test_ids))
        print('# {}: Train = {} | Test = {}'.format(item, len(train_ids), len(test_ids)))

        test_idx = dtp[dtp[item].isin(test_ids)].index.tolist()
        train_idx = dtp[dtp[item].isin(train_ids)].index.tolist()
        # Shuffle order (test first, then train) is kept so the random stream
        # is consumed exactly as before.
        random.shuffle(test_idx)
        random.shuffle(train_idx)
        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        assert len(train_idx) + len(test_idx) == len(dtp)

        train_index_all.append(train_idx)
        test_index_all.append(test_idx)

    return train_index_all, test_index_all

In [4]:
def Train(directory, epochs, aggregator, embedding_size, layers, dropout, slope, lr, wd, random_seed, ctx, isbalance, task):
    """Train and evaluate a GNNMDA model with 5-fold cross-validation.

    The graph is built once with ``build_graph``; for every fold the sampled
    pairs are flagged as train or test on the graph edges, a fresh model is
    initialised and trained on the training-edge subgraph, and metrics are
    printed through ``performances`` after every epoch and once per fold.

    Args:
        directory: Data folder handed to ``build_graph``/``load_data`` and
            containing ``all_mirna_disease_pairs.csv``.
        epochs: Number of epochs per fold; each epoch performs 10 optimiser steps.
        aggregator: Forwarded to ``GraphEncoder``.
        embedding_size: Forwarded to ``GraphEncoder`` and, as ``feature_size``,
            to ``BilinearDecoder``.
        layers: Forwarded to ``GraphEncoder`` as ``n_layers``.
        dropout: Forwarded to ``GraphEncoder``.
        slope: Forwarded to ``GraphEncoder``.
        lr: Learning rate of the Adam trainer.
        wd: Weight decay of the Adam trainer.
        random_seed: Seeds ``random``, ``numpy`` and ``mxnet`` and the
            sampling / KFold split.
        ctx: MXNet context the tensors and parameters are placed on.
        isbalance: If true, use the balanced pairs from ``sample``; otherwise
            use every row of the pairs CSV.
        task: ``'Tp'`` splits the pairs with ``KFold``; ``'Tm'`` / ``'Td'``
            hold out whole miRNAs / diseases per fold.

    Returns:
        tuple: ``(ys_train, performances_train, ys_test, performances_test)``
        as returned by ``performances``. Only the values of the LAST fold are
        returned (training values come from its last epoch); earlier folds are
        only printed.

    Raises:
        ValueError: If ``task`` is not one of ``'Tp'``, ``'Tm'``, ``'Td'``.
        NameError: If ``epochs`` is 0, because the per-epoch training results
            are then never assigned.
    """
    # Fail early with a clear message instead of an UnboundLocalError on
    # `item` further down.
    if task not in ('Tp', 'Tm', 'Td'):
        raise ValueError("task must be one of 'Tp', 'Tm' or 'Td', got {!r}".format(task))

    dgl.load_backend('mxnet')
    random.seed(random_seed)
    np.random.seed(random_seed)
    mx.random.seed(random_seed)

    g, disease_ids_invmap, mirna_ids_invmap = build_graph(directory, random_seed, ctx, isbalance)
    print(g)
    if isbalance:
        dtp, samples = sample(directory, random_seed)
    else:
        dtp = pd.read_csv(directory + '/all_mirna_disease_pairs.csv', names=['miRNA', 'disease', 'label'])
        samples = dtp.values
    print(samples.shape)
    ID, IM = load_data(directory)

    print('## vertices:', g.number_of_nodes())
    print('## edges:', g.number_of_edges())
    print('## disease nodes:', nd.sum(g.ndata['type'] == 1).asnumpy())
    print('## mirna nodes:', nd.sum(g.ndata['type'] == 0).asnumpy())

    samples_df = pd.DataFrame(samples, columns=['miRNA', 'disease', 'label'])
    sample_disease_vertices = [disease_ids_invmap[id_] for id_ in samples[:, 1]]
    # miRNA vertex ids are offset by ID.shape[0] -- presumably the number of
    # disease nodes; confirm against build_graph.
    sample_mirna_vertices = [mirna_ids_invmap[id_] + ID.shape[0] for id_ in samples[:, 0]]

    if task == 'Tp':
        # Pair-level split: folds are drawn over the rows of `samples`.
        kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
        train_index = []
        test_index = []
        for train_idx, test_idx in kf.split(samples[:, 2]):
            train_index.append(train_idx)
            test_index.append(test_idx)
    else:
        # Entity-level split: whole miRNAs ('Tm') or diseases ('Td') are held out.
        mirna_ids = list(set(dtp['miRNA']))
        disease_ids = list(set(dtp['disease']))
        random.shuffle(mirna_ids)
        random.shuffle(disease_ids)
        print('# miRNA = {} | Disease = {}'.format(len(mirna_ids), len(disease_ids)))

        mirna_test_num = int(len(mirna_ids) / 5)
        disease_test_num = int(len(disease_ids) / 5)
        print('# Test: miRNA = {} | Disease = {}'.format(mirna_test_num, disease_test_num))

        if task == 'Td':
            item = 'disease'
            ids = disease_ids
        elif task == 'Tm':
            item = 'miRNA'
            ids = mirna_ids

        train_index, test_index = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

    #####################################################################################
    for i in range(len(train_index)):
        print('------------------------------------------------------------------------------------------------------')
        print('Training for Fold ', i + 1)

        samples_df['train'] = 0
        samples_df['test'] = 0

        # Assign through a single .iloc call on the frame. The previous
        # chained form (samples_df['train'].iloc[...] = 1) writes to a
        # temporary under pandas Copy-on-Write and would leave the flags at 0.
        samples_df.iloc[train_index[i], samples_df.columns.get_loc('train')] = 1
        samples_df.iloc[test_index[i], samples_df.columns.get_loc('test')] = 1

        train_tensor = nd.from_numpy(samples_df['train'].values.astype('int32')).copyto(ctx)
        test_tensor = nd.from_numpy(samples_df['test'].values.astype('int32')).copyto(ctx)

        edge_data = {'train': train_tensor,
                     'test': test_tensor}

        # Flag both directions of every sampled pair.
        g.edges[sample_disease_vertices, sample_mirna_vertices].data.update(edge_data)
        g.edges[sample_mirna_vertices, sample_disease_vertices].data.update(edge_data)

        train_eid = g.filter_edges(lambda edges: edges.data['train']).astype('int64')
        g_train = g.edge_subgraph(train_eid, preserve_nodes=True)
        g_train.copy_from_parent()

        # get the training set
        rating_train = g_train.edata['rating']
        src_train, dst_train = g_train.all_edges()
        # get the testing edge set
        test_eid = g.filter_edges(lambda edges: edges.data['test']).astype('int64')
        src_test, dst_test = g.find_edges(test_eid)
        rating_test = g.edges[test_eid].data['rating']
        src_train = src_train.copyto(ctx)
        src_test = src_test.copyto(ctx)
        dst_train = dst_train.copyto(ctx)
        dst_test = dst_test.copyto(ctx)
        print('## Training edges:', len(train_eid))
        print('## Testing edges:', len(test_eid))

        # Train the model (a fresh model and optimiser per fold)
        model = GNNMDA(GraphEncoder(embedding_size=embedding_size, n_layers=layers, G=g_train, aggregator=aggregator,
                                    dropout=dropout, slope=slope, ctx=ctx),
                       BilinearDecoder(feature_size=embedding_size))

        model.collect_params().initialize(init=mx.init.Xavier(magnitude=math.sqrt(2.0)), ctx=ctx)
        # from_sigmoid=True: the loss treats the model scores as probabilities.
        cross_entropy = gloss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
        trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd})

        for epoch in range(epochs):
            start = time.time()
            # 10 full-batch optimiser steps per reported epoch.
            for _ in range(10):
                with mx.autograd.record():
                    score_train = model(g_train, src_train, dst_train)
                    loss_train = cross_entropy(score_train, rating_train).mean()
                    loss_train.backward()
                trainer.step(1)

            # Threshold the scores at 0.5 to obtain hard predictions.
            results_train = [0 if j < 0.5 else 1 for j in np.squeeze(score_train.asnumpy())]

            # NOTE(review): the encoder is evaluated on the full graph `g`,
            # which still contains the test edges -- confirm this is intended.
            h_val = model.encoder(g)
            score_val = model.decoder(h_val[src_test], h_val[dst_test])
            loss_val = cross_entropy(score_val, rating_test).mean()
            results_val = [0 if j < 0.5 else 1 for j in np.squeeze(score_val.asnumpy())]

            end = time.time()

            print('Epoch:', epoch + 1,
                  'Train Loss: %.4f' % loss_train.asscalar(),
                  'Val Loss: %.4f' % loss_val.asscalar(),
                  'Time: %.2f' % (end - start))

            print('***************Train: ')
            ys_train, performances_train = performances(rating_train.asnumpy(),
                                            results_train,
                                            score_train.asnumpy())

            print('***************Test: ')
            ys_val, performances_val = performances(rating_test.asnumpy(),
                                            results_val,
                                            score_val.asnumpy())

        # Final evaluation of the fold on its test edges.
        h_test = model.encoder(g)
        score_test = model.decoder(h_test[src_test], h_test[dst_test])
        results_test = [0 if j < 0.5 else 1 for j in np.squeeze(score_test.asnumpy())]

        print('***************Fold:', i + 1)
        ys_test, performances_test = performances(rating_test.asnumpy(),
                                            results_test,
                                            score_test.asnumpy())

    print('## Training Finished !')
    print('----------------------------------------------------------------------------------------------------------')

    return ys_train, performances_train, ys_test, performances_test

In [5]:
def performances(y_true, y_pred, y_prob):
    """Print and return binary-classification metrics for one prediction set.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Hard predictions (0 or 1).
        y_prob: Scores used for the ROC and precision-recall curves.

    Returns:
        tuple: ``((y_true, y_pred, y_prob),
        (accuracy, precision, recall, f1, roc_auc, aupr))``. Precision, recall
        and f1 fall back to 0 when their denominator is 0.
    """
    matrix = confusion_matrix(y_true, y_pred, labels = [0, 1])
    tn, fp, fn, tp = matrix.ravel().tolist()

    accuracy = (tp+tn)/(tn+fp+fn+tp)

    # Recall: share of the actual positives that were found.
    actual_positive = tp+fn
    if actual_positive == 0:
        print('tp + fn = 0')
        recall = 0
    else:
        recall = tp / actual_positive

    # Precision: share of the predicted positives that are correct.
    predicted_positive = tp+fp
    if predicted_positive == 0:
        print('tp + fp = 0')
        precision = 0
    else:
        precision = tp / predicted_positive

    f1 = 2*precision*recall / (precision+recall) if precision + recall != 0 else 0

    roc_auc = roc_auc_score(y_true, y_prob)
    curve_precision, curve_recall, _ = precision_recall_curve(y_true, y_prob)
    aupr = auc(curve_recall, curve_precision)

    pred_counts = Counter(y_pred)
    true_counts = Counter(y_true)
    print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
    print('y_pred: 0 = {} | 1 = {}'.format(pred_counts[0], pred_counts[1]))
    print('y_true: 0 = {} | 1 = {}'.format(true_counts[0], true_counts[1]))
    print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr))

    scores = (accuracy, precision, recall, f1, roc_auc, aupr)
    return (y_true, y_pred, y_prob), scores

In [6]:
def run(isbalance, task):
    """Call ``Train`` with the notebook's fixed hyper-parameters.

    Args:
        isbalance: Forwarded to ``Train`` (balanced vs. full pair set).
        task: Forwarded to ``Train`` (``'Tp'``, ``'Tm'`` or ``'Td'``).

    Returns:
        tuple: The four values returned by ``Train``:
        ``(ys_train, performances_train, ys_test, performances_test)``.
    """
    hyperparams = {
        'directory': 'data',
        'epochs': 100,
        'aggregator': 'GraphSAGE',
        'embedding_size': 256,
        'layers': 2,
        'dropout': 0.7,
        'slope': 0.2,  # LeakyReLU
        'lr': 0.001,
        'wd': 1e-3,
        'random_seed': 1234,
        'ctx': mx.cpu(),
    }
    train_ys, train_perf, test_ys, test_perf = Train(isbalance=isbalance, task=task, **hyperparams)
    return train_ys, train_perf, test_ys, test_perf

# Run

In [7]:
# Balanced pair set (isbalance=True), pair-level KFold split (task='Tp').
# The four result names are reused by every run cell, so only the most
# recently executed configuration is kept in memory.
ys_train, performances_train, ys_test, performances_test = run(isbalance = True, task = 'Tp')

Building graph ...
Adding disease features ...
Adding miRNA features ...
Adding edges ...
Successfully build graph !!
DGLGraph(num_nodes=878, num_edges=21720,
         ndata_schemes={'type': Scheme(shape=(), dtype=<class 'numpy.float32'>), 'd_features': Scheme(shape=(383,), dtype=<class 'numpy.float32'>), 'm_features': Scheme(shape=(495,), dtype=<class 'numpy.float32'>)}
         edata_schemes={'inv': Scheme(shape=(), dtype=<class 'numpy.int32'>), 'rating': Scheme(shape=(), dtype=<class 'numpy.float32'>)})
(10860, 3)
## vertices: 878
## edges: 21720
## disease nodes: [383.]
## mirna nodes: [495.]
------------------------------------------------------------------------------------------------------
Training for Fold  1
## Training edges: 17376
## Testing edges: 4344
Epoch: 1 Train Loss: 0.5413 Val Loss: 0.5911 Time: 17.50
***************Train: 
tn = 6833, fp = 1839, fn = 2117, tp = 6587
y_pred: 0 = 8950 | 1 = 8426
y_true: 0 = 8672 | 1 = 8704
acc=0.7723|precision=0.7817|recall=0.7568|f1=

In [8]:
# Full (unbalanced) pair set (isbalance=False), pair-level KFold split (task='Tp').
# Overwrites the result names assigned by the previous run cell.
ys_train, performances_train, ys_test, performances_test = run(isbalance = False, task = 'Tp')

Building graph ...
Adding disease features ...
Adding miRNA features ...
Adding edges ...
Successfully build graph !!
DGLGraph(num_nodes=878, num_edges=379170,
         ndata_schemes={'type': Scheme(shape=(), dtype=<class 'numpy.float32'>), 'd_features': Scheme(shape=(383,), dtype=<class 'numpy.float32'>), 'm_features': Scheme(shape=(495,), dtype=<class 'numpy.float32'>)}
         edata_schemes={'inv': Scheme(shape=(), dtype=<class 'numpy.int32'>), 'rating': Scheme(shape=(), dtype=<class 'numpy.float32'>)})
(189585, 3)
## vertices: 878
## edges: 379170
## disease nodes: [383.]
## mirna nodes: [495.]
------------------------------------------------------------------------------------------------------
Training for Fold  1
## Training edges: 303336
## Testing edges: 75834
Epoch: 1 Train Loss: 0.1322 Val Loss: 0.1210 Time: 38.16
***************Train: 
tn = 294171, fp = 415, fn = 8735, tp = 15
y_pred: 0 = 302906 | 1 = 430
y_true: 0 = 294586 | 1 = 8750
acc=0.9698|precision=0.0349|recall=0.0

In [9]:
# Balanced pair set (isbalance=True), folds hold out whole miRNAs (task='Tm').
# Overwrites the result names assigned by the previous run cell.
ys_train, performances_train, ys_test, performances_test = run(isbalance = True, task = 'Tm')

Building graph ...
Adding disease features ...
Adding miRNA features ...
Adding edges ...
Successfully build graph !!
DGLGraph(num_nodes=878, num_edges=21720,
         ndata_schemes={'type': Scheme(shape=(), dtype=<class 'numpy.float32'>), 'd_features': Scheme(shape=(383,), dtype=<class 'numpy.float32'>), 'm_features': Scheme(shape=(495,), dtype=<class 'numpy.float32'>)}
         edata_schemes={'inv': Scheme(shape=(), dtype=<class 'numpy.int32'>), 'rating': Scheme(shape=(), dtype=<class 'numpy.float32'>)})
(10860, 3)
## vertices: 878
## edges: 21720
## disease nodes: [383.]
## mirna nodes: [495.]
# miRNA = 495 | Disease = 383
# Test: miRNA = 99 | Disease = 76
-------Fold  0
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 8791 | Test = 2069
-------Fold  1
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 8928 | Test = 1932
-------Fold  2
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 8838 | Test = 2022
-------Fold  3
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 8535 | Test = 2

In [10]:
# Full (unbalanced) pair set (isbalance=False), folds hold out whole miRNAs (task='Tm').
# Overwrites the result names assigned by the previous run cell.
ys_train, performances_train, ys_test, performances_test = run(isbalance = False, task = 'Tm')

Building graph ...
Adding disease features ...
Adding miRNA features ...
Adding edges ...
Successfully build graph !!
DGLGraph(num_nodes=878, num_edges=379170,
         ndata_schemes={'type': Scheme(shape=(), dtype=<class 'numpy.float32'>), 'd_features': Scheme(shape=(383,), dtype=<class 'numpy.float32'>), 'm_features': Scheme(shape=(495,), dtype=<class 'numpy.float32'>)}
         edata_schemes={'inv': Scheme(shape=(), dtype=<class 'numpy.int32'>), 'rating': Scheme(shape=(), dtype=<class 'numpy.float32'>)})
(189585, 3)
## vertices: 878
## edges: 379170
## disease nodes: [383.]
## mirna nodes: [495.]
# miRNA = 495 | Disease = 383
# Test: miRNA = 99 | Disease = 76
-------Fold  0
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 151668 | Test = 37917
-------Fold  1
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 151668 | Test = 37917
-------Fold  2
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 151668 | Test = 37917
-------Fold  3
# miRNA: Train = 396 | Test = 99
# Pairs: Train = 151

In [11]:
# Balanced pair set (isbalance=True), folds hold out whole diseases (task='Td').
# Overwrites the result names assigned by the previous run cell.
ys_train, performances_train, ys_test, performances_test = run(isbalance = True, task = 'Td')

Building graph ...
Adding disease features ...
Adding miRNA features ...
Adding edges ...
Successfully build graph !!
DGLGraph(num_nodes=878, num_edges=21720,
         ndata_schemes={'type': Scheme(shape=(), dtype=<class 'numpy.float32'>), 'd_features': Scheme(shape=(383,), dtype=<class 'numpy.float32'>), 'm_features': Scheme(shape=(495,), dtype=<class 'numpy.float32'>)}
         edata_schemes={'inv': Scheme(shape=(), dtype=<class 'numpy.int32'>), 'rating': Scheme(shape=(), dtype=<class 'numpy.float32'>)})
(10860, 3)
## vertices: 878
## edges: 21720
## disease nodes: [383.]
## mirna nodes: [495.]
# miRNA = 495 | Disease = 383
# Test: miRNA = 99 | Disease = 76
-------Fold  0
# disease: Train = 307 | Test = 76
# Pairs: Train = 8661 | Test = 2199
-------Fold  1
# disease: Train = 307 | Test = 76
# Pairs: Train = 8902 | Test = 1958
-------Fold  2
# disease: Train = 307 | Test = 76
# Pairs: Train = 8512 | Test = 2348
-------Fold  3
# disease: Train = 307 | Test = 76
# Pairs: Train = 8540 | 

In [12]:
# Full (unbalanced) pair set (isbalance=False), folds hold out whole diseases (task='Td').
# Overwrites the result names assigned by the previous run cell.
ys_train, performances_train, ys_test, performances_test = run(isbalance = False, task = 'Td')

Building graph ...
Adding disease features ...
Adding miRNA features ...
Adding edges ...
Successfully build graph !!
DGLGraph(num_nodes=878, num_edges=379170,
         ndata_schemes={'type': Scheme(shape=(), dtype=<class 'numpy.float32'>), 'd_features': Scheme(shape=(383,), dtype=<class 'numpy.float32'>), 'm_features': Scheme(shape=(495,), dtype=<class 'numpy.float32'>)}
         edata_schemes={'inv': Scheme(shape=(), dtype=<class 'numpy.int32'>), 'rating': Scheme(shape=(), dtype=<class 'numpy.float32'>)})
(189585, 3)
## vertices: 878
## edges: 379170
## disease nodes: [383.]
## mirna nodes: [495.]
# miRNA = 495 | Disease = 383
# Test: miRNA = 99 | Disease = 76
-------Fold  0
# disease: Train = 307 | Test = 76
# Pairs: Train = 151965 | Test = 37620
-------Fold  1
# disease: Train = 307 | Test = 76
# Pairs: Train = 151965 | Test = 37620
-------Fold  2
# disease: Train = 307 | Test = 76
# Pairs: Train = 151965 | Test = 37620
-------Fold  3
# disease: Train = 307 | Test = 76
# Pairs: Tra