In [12]:
# https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
# https://scikit-learn.org/stable/modules/tree.html#classification
# https://scikit-learn.org/stable/modules/sgd.html#classification
# https://scikit-learn.org/stable/modules/svm.html#classification
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier

In [11]:
import math
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn import preprocessing

import numpy as np
import pandas as pd

import time
import random
import matplotlib.pyplot as plt
from scipy import interp
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from collections import Counter
from tqdm import tqdm

In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

In [3]:
def load_data(directory):
    """Load the similarity matrices and build the integrated ID / IM tables.

    Reads ``D_SSM1.txt``, ``D_SSM2.txt``, ``D_GSM.txt``, ``M_FSM.txt`` and
    ``M_GSM.txt`` from ``directory`` with ``np.loadtxt``.

    * ``D_SSM`` is the element-wise mean of ``D_SSM1`` and ``D_SSM2``.
    * ``ID`` takes the ``D_SSM`` entry wherever it is non-zero and the
      ``D_GSM`` entry otherwise.
    * ``IM`` takes the ``M_FSM`` entry wherever it is non-zero and the
      ``M_GSM`` entry otherwise.

    Each pair of matrices combined this way must have the same shape.

    Parameters
    ----------
    directory : str
        Folder containing the five text files.

    Returns
    -------
    (ID, IM) : tuple of pandas.DataFrame
        Each frame has a leading ``id`` column holding the 1-based row number,
        followed by one integer-labelled column per matrix column.
    """
    D_SSM1 = np.loadtxt(directory + '/D_SSM1.txt')
    D_SSM2 = np.loadtxt(directory + '/D_SSM2.txt')
    D_GSM = np.loadtxt(directory + '/D_GSM.txt')
    M_FSM = np.loadtxt(directory + '/M_FSM.txt')
    M_GSM = np.loadtxt(directory + '/M_GSM.txt')
    D_SSM = (D_SSM1 + D_SSM2) / 2

    # Vectorised fallback: keep the primary similarity where it is non-zero,
    # otherwise use the corresponding entry of the *_GSM matrix.
    ID = np.where(D_SSM == 0, D_GSM, D_SSM)
    IM = np.where(M_FSM == 0, M_GSM, M_FSM)

    def _with_one_based_id(matrix):
        # Turn the positional row index into an explicit 1-based 'id' column.
        frame = pd.DataFrame(matrix).reset_index().rename(columns={'index': 'id'})
        frame['id'] = frame['id'] + 1
        return frame

    return _with_one_based_id(ID), _with_one_based_id(IM)

def sample(directory, random_seed):
    """Build a balanced set of miRNA-disease pairs.

    Reads ``all_mirna_disease_pairs.csv`` (no header; columns are named
    ``miRNA``, ``disease``, ``label``) from ``directory``, keeps every row with
    ``label == 1`` and draws the same number of rows with ``label == 0`` at
    random, without replacement.

    Parameters
    ----------
    directory : str
        Folder containing ``all_mirna_disease_pairs.csv``.
    random_seed : int
        Seed passed to ``DataFrame.sample`` so the negative draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        Positive rows followed by the sampled negative rows, with a fresh
        0..n-1 index.
    """
    all_associations = pd.read_csv(directory + '/all_mirna_disease_pairs.csv', names=['miRNA', 'disease', 'label'])
    known_associations = all_associations.loc[all_associations['label'] == 1]
    unknown_associations = all_associations.loc[all_associations['label'] == 0]
    random_negative = unknown_associations.sample(n=known_associations.shape[0], random_state=random_seed, axis=0)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the two frames and behaves the same on older versions.
    sample_df = pd.concat([known_associations, random_negative]).reset_index(drop=True)

    return sample_df

In [4]:
def performances(y_true, y_pred, y_prob):
    """Compute, print and return binary-classification metrics.

    Parameters
    ----------
    y_true : sequence of {0, 1}
        Ground-truth labels.
    y_pred : sequence of {0, 1}
        Hard predictions.
    y_prob : sequence of float
        Scores for the positive class, passed to ``roc_auc_score`` and
        ``precision_recall_curve``.

    Returns
    -------
    tuple
        ``((y_true, y_pred, y_prob),
        (accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc))``.

    Notes
    -----
    ``pos_acc`` is ``tp`` divided by the number of true positives in
    ``y_true``; ``neg_acc`` is ``tn`` divided by the number of *predicted*
    negatives. Any ratio whose denominator is zero is reported as ``0.0``
    instead of raising ``ZeroDivisionError`` (e.g. a fold in which the model
    predicts no positives).
    """

    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels = [0, 1]).ravel().tolist()

    pos_acc = _ratio(tp, sum(y_true))
    neg_acc = _ratio(tn, len(y_pred) - sum(y_pred))  # [y_true=0 & y_pred=0] / y_pred=0
    accuracy = _ratio(tp + tn, tn + fp + fn + tp)

    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * precision * recall, precision + recall)

    roc_auc = roc_auc_score(y_true, y_prob)
    prec, reca, _ = precision_recall_curve(y_true, y_prob)
    aupr = auc(reca, prec)

    # Count each label once instead of rebuilding the Counter per lookup.
    pred_counts = Counter(y_pred)
    true_counts = Counter(y_true)

    print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
    print('y_pred: 0 = {} | 1 = {}'.format(pred_counts[0], pred_counts[1]))
    print('y_true: 0 = {} | 1 = {}'.format(true_counts[0], true_counts[1]))
    print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}|pos_acc={:.4f}|neg_acc={:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc))
    return (y_true, y_pred, y_prob), (accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc)

In [5]:
def obtain_data(directory, isbalance, random_seed=1234):
    """Load similarity features and assemble the labelled sample table.

    Parameters
    ----------
    directory : str
        Folder holding the similarity matrices and
        ``all_mirna_disease_pairs.csv``.
    isbalance : bool
        If true, use the balanced subset produced by ``sample``; otherwise use
        every pair in the CSV.
    random_seed : int, optional
        Seed for the balanced negative sampling and for shuffling the miRNA /
        disease id lists, so the id order is the same on every run.

    Returns
    -------
    tuple
        ``(ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num,
        disease_test_num, samples)`` where

        * ``samples`` has the columns ``miRNA``, ``disease``, ``label``
          followed by the merged ID and IM feature columns;
        * ``dtp`` is the ``miRNA`` / ``disease`` / ``label`` part of
          ``samples``, so row ``i`` of ``dtp`` always describes row ``i`` of
          ``samples``;
        * ``mirna_ids`` / ``disease_ids`` are the shuffled unique ids;
        * ``*_test_num`` is one fifth of the respective id count, rounded down.
    """

    ID, IM = load_data(directory)

    if isbalance:
        pairs = sample(directory, random_seed = random_seed)
    else:
        pairs = pd.read_csv(directory + '/all_mirna_disease_pairs.csv', names=['miRNA', 'disease', 'label'])

    # Sort before shuffling and shuffle with a private, seeded generator so
    # the id order (and every fold derived from it) is reproducible.
    rng = random.Random(random_seed)
    mirna_ids = sorted(set(pairs['miRNA']))
    disease_ids = sorted(set(pairs['disease']))
    rng.shuffle(mirna_ids)
    rng.shuffle(disease_ids)
    print('# miRNA = {} | Disease = {}'.format(len(mirna_ids), len(disease_ids)))

    mirna_test_num = int(len(mirna_ids) / 5)
    disease_test_num = int(len(disease_ids) / 5)
    print('# Test: miRNA = {} | Disease = {}'.format(mirna_test_num, disease_test_num))

    samples = pd.merge(pd.merge(pairs, ID, left_on = 'disease', right_on = 'id'), IM, left_on = 'miRNA', right_on = 'id')
    samples = samples.drop(labels = ['id_x', 'id_y'], axis = 1)

    # An inner merge drops pairs whose id has no feature row, and its row
    # order is not guaranteed to match `pairs` on every pandas version
    # (NOTE(review): confirm for the version in use). Deriving dtp from the
    # merged table keeps dtp.index usable as positions into `samples`.
    dtp = samples[['miRNA', 'disease', 'label']].copy()

    return ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, samples

In [6]:
def generate_task_Tp_train_test_idx(samples):
    """Create the 5-fold pair-level (task Tp) train/test splits.

    Parameters
    ----------
    samples : pandas.DataFrame
        Sample table containing ``miRNA`` and ``disease`` columns; the columns
        from position 3 onward are handed to ``KFold.split``.

    Returns
    -------
    tuple of lists
        ``(train_index_all, test_index_all, train_id_all, test_id_all)`` with
        one entry per fold. The ``*_index_all`` entries are the positional row
        indices produced by ``KFold``; the ``*_id_all`` entries are arrays of
        the ``(miRNA, disease)`` values of those rows.
    """
    kf = KFold(n_splits = 5, shuffle = True, random_state = 1234)

    # Take the pair ids from the table being split rather than from a
    # notebook-level `dtp` global, so the function only depends on its argument.
    pair_ids = samples[['miRNA', 'disease']]

    train_index_all, test_index_all = [], []
    train_id_all, test_id_all = [], []
    # train_idx and test_idx are positional indices.
    for fold, (train_idx, test_idx) in enumerate(tqdm(kf.split(samples.iloc[:, 3:]))):
        print('-------Fold ', fold)
        train_index_all.append(train_idx)
        test_index_all.append(test_idx)

        train_id_all.append(np.array(pair_ids.iloc[train_idx]))
        test_id_all.append(np.array(pair_ids.iloc[test_idx]))

        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
    return train_index_all, test_index_all, train_id_all, test_id_all

In [7]:
def generate_task_Tm_Td_train_test_idx(item, ids, dtp, n_splits=5):
    """Create entity-level (task Tm / Td) train/test splits.

    ``ids`` is cut into ``n_splits`` consecutive chunks of
    ``len(ids) // n_splits`` ids each; the last chunk also takes any
    remainder. For every fold, the rows of ``dtp`` whose ``item`` value lies
    in that fold's chunk form the test set and all rows matching the remaining
    ids form the training set.

    Parameters
    ----------
    item : str
        Column of ``dtp`` to split on (the notebook uses ``'miRNA'`` or
        ``'disease'``).
    ids : list
        Unique ids of that column, in the order they should be chunked.
    dtp : pandas.DataFrame
        Pair table; every value of ``dtp[item]`` is expected to appear in
        ``ids`` (enforced by an assertion).
    n_splits : int, optional
        Number of folds. Defaults to 5.

    Returns
    -------
    tuple of lists
        ``(train_index_all, test_index_all, train_id_all, test_id_all)`` with
        one entry per fold. The index lists hold shuffled ``dtp.index`` labels;
        the id lists hold the train / test ids of the fold.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError('n_splits must be >= 1, got {}'.format(n_splits))

    test_num = len(ids) // n_splits
    id_pool = set(ids)

    train_index_all, test_index_all = [], []
    train_id_all, test_id_all = [], []

    for fold in range(n_splits):
        print('-------Fold ', fold)
        start = fold * test_num
        # The final fold absorbs the ids left over by the integer division.
        stop = len(ids) if fold == n_splits - 1 else start + test_num
        test_ids = ids[start:stop]

        train_ids = list(id_pool - set(test_ids))
        print('# {}: Train = {} | Test = {}'.format(item, len(train_ids), len(test_ids)))

        test_idx = dtp[dtp[item].isin(test_ids)].index.tolist()
        train_idx = dtp[dtp[item].isin(train_ids)].index.tolist()
        random.shuffle(test_idx)
        random.shuffle(train_idx)
        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        # Every row must land in exactly one of the two sets.
        assert len(train_idx) + len(test_idx) == len(dtp)

        train_index_all.append(train_idx)
        test_index_all.append(test_idx)

        train_id_all.append(train_ids)
        test_id_all.append(test_ids)

    return train_index_all, test_index_all, train_id_all, test_id_all

In [16]:
def run_clf(train_index_all, test_index_all, samples, classfier):
    """Train and evaluate one classifier on every supplied fold.

    For each fold a ``MinMaxScaler`` is fitted on the training rows only and
    applied to all rows, the selected classifier is fitted on the training
    rows, and ``performances`` is printed for both the train and test rows.

    Parameters
    ----------
    train_index_all, test_index_all : list
        Per-fold positional row indices into ``samples``.
    samples : pandas.DataFrame
        Table with a ``label`` column; the columns from position 3 onward are
        used as features.
    classfier : str
        One of ``'ERT'``, ``'GNB'``, ``'DT'``, ``'SGD'``, ``'SVM'``.

    Returns
    -------
    tuple
        ``(ys_train, metrics_train, ys_test, metrics_test)`` as returned by
        ``performances`` for the LAST fold only (earlier folds are printed but
        not kept). All four are ``None`` when no folds are supplied.

    Raises
    ------
    ValueError
        If ``classfier`` is not one of the supported names.
    """
    builders = {
        'ERT': lambda: ExtraTreesClassifier(random_state = 19961231),
        'GNB': lambda: GaussianNB(),
        'DT': lambda: DecisionTreeClassifier(random_state = 19961231),
        'SGD': lambda: SGDClassifier(loss="hinge", penalty="l2", max_iter=5),
        'SVM': lambda: svm.SVC(),
    }
    if classfier not in builders:
        raise ValueError('Unknown classifier {!r}; expected one of {}'.format(classfier, sorted(builders)))

    def _positive_scores(model, data):
        # Hinge-loss SGD and a default SVC expose no predict_proba; fall back
        # to the signed decision_function margin, which the ranking metrics
        # (ROC AUC / PR curve) accept just as well.
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(data)[:, 1]
        return model.decision_function(data)

    # Loop-invariant views of the feature matrix and the labels.
    features = samples.iloc[:, 3:]
    labels = samples['label']

    ys_train = metrics_train = ys_test = metrics_test = None
    for fold, (train_idx, test_idx) in enumerate(zip(train_index_all, test_index_all)):
        print('-----------------------Fold = ', str(fold))

        # Fit the scaler on the training rows only to avoid test-set leakage.
        scaler = preprocessing.MinMaxScaler().fit(features.iloc[train_idx, :])
        X = scaler.transform(features)

        # Index labels positionally so they always line up with the rows of X.
        x_train, y_train = X[train_idx], labels.iloc[train_idx]
        x_test, y_test = X[test_idx], labels.iloc[test_idx]

        clf = builders[classfier]()
        clf.fit(x_train, y_train)

        y_train_score = _positive_scores(clf, x_train)
        y_test_score = _positive_scores(clf, x_test)

        y_train_pred = clf.predict(x_train)
        y_test_pred = clf.predict(x_test)

        print('Train:')
        ys_train, metrics_train = performances(y_train, y_train_pred, y_train_score)
        print('Test:')
        ys_test, metrics_test = performances(y_test, y_test_pred, y_test_score)

    return ys_train, metrics_train, ys_test, metrics_test

In [9]:
# Extra-trees experiment: balanced and unbalanced data, tasks Tp / Tm / Td.
directory = 'data'
for isbalance in [True, False]:
    
    ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp', 'Tm', 'Td']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            # Pair-level split.
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tm':
            # Split by miRNA id.
            item = 'miRNA'
            ids = mirna_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        elif task == 'Td':
            # Split by disease id.
            item = 'disease'
            ids = disease_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        # The notebook defines run_clf; the former name run_rf is not defined
        # anywhere and raises NameError on a fresh kernel.
        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'ERT')

# miRNA = 495 | Disease = 383
# Test: miRNA = 99 | Disease = 76


0it [00:00, ?it/s]

-------Fold  0
# Pairs: Train = 8688 | Test = 2172
-------Fold  1


5it [00:00, 77.57it/s]


# Pairs: Train = 8688 | Test = 2172
-------Fold  2
# Pairs: Train = 8688 | Test = 2172
-------Fold  3
# Pairs: Train = 8688 | Test = 2172
-------Fold  4
# Pairs: Train = 8688 | Test = 2172
-----------------------Fold =  0
Train:
tn = 4330, fp = 0, fn = 0, tp = 4358
y_pred: 0 = 4330 | 1 = 4358
y_true: 0 = 4330 | 1 = 4358
acc=1.0000|precision=1.0000|recall=1.0000|f1=1.0000|auc=1.0000|aupr=1.0000|pos_acc=1.0000|neg_acc=1.0000
Test:
tn = 944, fp = 156, fn = 113, tp = 959
y_pred: 0 = 1057 | 1 = 1115
y_true: 0 = 1100 | 1 = 1072
acc=0.8762|precision=0.8601|recall=0.8946|f1=0.8770|auc=0.9452|aupr=0.9416|pos_acc=0.8946|neg_acc=0.8931
-----------------------Fold =  1
Train:
tn = 4336, fp = 0, fn = 0, tp = 4352
y_pred: 0 = 4336 | 1 = 4352
y_true: 0 = 4336 | 1 = 4352
acc=1.0000|precision=1.0000|recall=1.0000|f1=1.0000|auc=1.0000|aupr=1.0000|pos_acc=1.0000|neg_acc=1.0000
Test:
tn = 925, fp = 169, fn = 150, tp = 928
y_pred: 0 = 1075 | 1 = 1097
y_true: 0 = 1094 | 1 = 1078
acc=0.8531|precision=0.8459|

KeyboardInterrupt: 

In [None]:
# Gaussian naive Bayes experiment: balanced and unbalanced data, tasks Tp / Tm / Td.
directory = 'data'
for isbalance in [True, False]:
    
    ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp', 'Tm', 'Td']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            # Pair-level split.
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tm':
            # Split by miRNA id.
            item = 'miRNA'
            ids = mirna_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        elif task == 'Td':
            # Split by disease id.
            item = 'disease'
            ids = disease_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        # The notebook defines run_clf; the former name run_rf is not defined
        # anywhere and raises NameError on a fresh kernel.
        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'GNB')

In [None]:
# Decision-tree experiment: balanced and unbalanced data, tasks Tp / Tm / Td.
directory = 'data'
for isbalance in [True, False]:
    
    ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp', 'Tm', 'Td']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            # Pair-level split.
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tm':
            # Split by miRNA id.
            item = 'miRNA'
            ids = mirna_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        elif task == 'Td':
            # Split by disease id.
            item = 'disease'
            ids = disease_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        # The notebook defines run_clf; the former name run_rf is not defined
        # anywhere and raises NameError on a fresh kernel.
        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'DT')

In [None]:
# Linear SGD (hinge loss) experiment: balanced and unbalanced data, tasks Tp / Tm / Td.
directory = 'data'
for isbalance in [True, False]:
    
    ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp', 'Tm', 'Td']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            # Pair-level split.
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tm':
            # Split by miRNA id.
            item = 'miRNA'
            ids = mirna_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        elif task == 'Td':
            # Split by disease id.
            item = 'disease'
            ids = disease_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        # The notebook defines run_clf; the former name run_rf is not defined
        # anywhere and raises NameError on a fresh kernel.
        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'SGD')

In [None]:
# Support-vector-machine experiment: balanced and unbalanced data, tasks Tp / Tm / Td.
directory = 'data'
for isbalance in [True, False]:
    
    ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp', 'Tm', 'Td']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            # Pair-level split.
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tm':
            # Split by miRNA id.
            item = 'miRNA'
            ids = mirna_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        elif task == 'Td':
            # Split by disease id.
            item = 'disease'
            ids = disease_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        # The notebook defines run_clf; the former name run_rf is not defined
        # anywhere and raises NameError on a fresh kernel.
        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'SVM')