In [28]:
#!/usr/bin/env python

import pandas as pd
import networkx as nx
from networkx.algorithms.bipartite.matrix import biadjacency_matrix
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
import random
from sklearn import metrics
import time
import matplotlib.pyplot as plt
from scipy.linalg import inv
from joblib import Parallel, delayed
from sklearn.utils import parallel_backend
from itertools import product
from sklearn.metrics import roc_auc_score
from scipy.spatial.distance import cdist
random.seed(1949) # for dataset split
np.random.seed(1949) # for matrix initialization

This notebook studies the prediction of adverse drug reactions (ADRs) using kernel regression (KR). Once the functions in Section 1 have been defined, the nested CV, the CV, and the independent-test-set evaluation in Section 2 can each be run separately.

## 1. Define the functions used for KR

In [29]:
def option(str):
    """Select the kernel-regression variant used by later calls.

    The value is stored in the module-level global ``methodOption``.
    NOTE: the parameter name shadows the builtin ``str``; it is kept because
    it is part of the function's signature.
    """
    globals()["methodOption"] = str

### Function of Kernel Regression

In [30]:
def KernelRegression(matrix,feature_matrix1,feature_matrix2,idx_train,idx_test,l1,l2,s):
    """Predict the columns ``idx_test`` of ``matrix`` from the columns ``idx_train``.

    Two Gaussian kernels ``exp(-||a - b||^2 / s**2)`` are built, one per feature
    matrix, and the weights are obtained from a regularised linear system whose
    form depends on the global ``methodOption``:

    * ``"KR1"``     - first kernel only,   ``W = inv(K1 K1 + l1 I) K1 Y``
    * ``"KR2"``     - second kernel only,  ``W = inv(K2 K2 + l1 I) K2 Y``
    * ``"KR1&KR2"`` - equal-weight average ``K = 0.5 K1 + 0.5 K2`` with ``l1``
    * ``"MKR"``     - both kernels stacked, ``l1`` regularises the weights of
      the first kernel and ``l2`` those of the second

    Parameters
    ----------
    matrix : 2-D array; its columns are indexed by ``idx_train`` / ``idx_test``.
    feature_matrix1, feature_matrix2 : 2-D arrays whose rows are indexed by the
        same ``idx_train`` / ``idx_test``.
    idx_train, idx_test : integer index arrays.
    l1, l2 : regularisation strengths (``l2`` is only used by ``"MKR"``).
    s : kernel width (sigma).

    Returns
    -------
    A float copy of ``matrix`` in which the ``idx_test`` columns are replaced by
    the predicted scores; all other columns are unchanged.

    Raises
    ------
    ValueError
        If ``methodOption`` is not one of the four supported variants.
    """
    supported = ("KR1", "KR2", "KR1&KR2", "MKR")
    # Fail early with a clear message instead of an UnboundLocalError on
    # ``y_new`` after all the kernels have already been computed.
    if methodOption not in supported:
        raise ValueError(
            "Unknown methodOption %r; expected one of %s" % (methodOption, ", ".join(supported))
        )

    sigma = s
    lmd1 = l1
    lmd2 = l2

    def gaussian_kernel(rows_a, rows_b):
        # Squared Euclidean distance fed through a Gaussian of width sigma.
        return np.exp(-cdist(rows_a, rows_b)**2/sigma**2)

    def ridge_weights(gram, reg_diag, rhs):
        # W = inv(gram + diag(reg_diag)) . rhs
        return inv(gram + np.diag(reg_diag)).dot(rhs)

    # cdist works in double precision, so convert once up front.
    X1 = np.asarray(feature_matrix1[idx_train, :], dtype=float)
    X_new1 = np.asarray(feature_matrix1[idx_test, :], dtype=float)
    X2 = np.asarray(feature_matrix2[idx_train, :], dtype=float)
    X_new2 = np.asarray(feature_matrix2[idx_test, :], dtype=float)

    Y = np.asarray(matrix[:, idx_train]).T          # one row per training column
    matrix_new = (matrix.copy()).astype(float)

    kernel1 = gaussian_kernel(X_new1, X1)           # test-vs-train
    kernel2 = gaussian_kernel(X_new2, X2)
    K1 = gaussian_kernel(X1, X1)                    # train-vs-train
    K2 = gaussian_kernel(X2, X2)

    n = len(idx_train) # number of training columns

    if methodOption == "KR1":
        W = ridge_weights(K1.dot(K1), np.ones(n)*lmd1, K1.dot(Y))
        y_new = kernel1.dot(W)
    elif methodOption == "KR2":
        W = ridge_weights(K2.dot(K2), np.ones(n)*lmd1, K2.dot(Y))
        y_new = kernel2.dot(W)
    elif methodOption == "KR1&KR2":
        c1 = 0.5
        c2 = 0.5
        K = c1*K1 + c2*K2
        W = ridge_weights(K.dot(K), np.ones(n)*lmd1, K.dot(Y))
        y_new = (c1*kernel1 + c2*kernel2).dot(W)
    else:  # "MKR"
        K = np.vstack([K1, K2])                     # (2n, n)
        KT = np.hstack([K1, K2])                    # (n, 2n)
        KY = np.vstack([K1.dot(Y), K2.dot(Y)])
        reg = np.hstack([np.ones(n)*lmd1, np.ones(n)*lmd2])
        W = ridge_weights(K.dot(KT), reg, KY)
        W1 = W[0:n, :]
        W2 = W[n:2*n, :]
        y_new = kernel1.dot(W1) + kernel2.dot(W2)

    matrix_new[:, idx_test] = y_new.T
    return matrix_new


### Function for generating features for common drugs

In [31]:
def FeaturePreprocess(df_all, drug_nodes):
    """Return a feature array whose rows follow the order of ``drug_nodes``.

    Drugs found in ``df_all.index`` keep their feature row; drugs that are not
    in the index receive an all-zero row.  The result is a 2-D numpy array with
    one row per entry of ``drug_nodes`` and one column per column of ``df_all``.
    """
    # Rows of the feature table that belong to requested drugs.
    known = df_all.loc[np.intersect1d(df_all.index, drug_nodes)]

    # Requested drugs without features get a zero-filled placeholder row.
    missing = np.setdiff1d(drug_nodes, known.index.tolist())
    placeholder = pd.DataFrame(
        np.zeros((len(missing), known.shape[1])),
        index=missing,
        columns=known.columns,
    )

    combined = pd.concat([known, placeholder], axis=0)
    return np.array(combined.loc[drug_nodes])

### Function for cross validation and nested cross validation

In [35]:
def fold(IDX1,IDX2,feature_matrix1,feature_matrix2,matrix,l1,l2,s):
    """Fit on the unmasked drugs, score the target drugs, and report AUC-PR / AUC-ROC.

    Parameters
    ----------
    IDX1 : column indices of the target drugs (the ones that are evaluated).
    IDX2 : column indices of the masked drugs (excluded from training).
    feature_matrix1, feature_matrix2 : per-drug feature arrays, forwarded to
        ``KernelRegression``.
    matrix : 2-D association matrix with one column per drug.
    l1, l2, s : hyperparameters forwarded to ``KernelRegression``.

    Returns
    -------
    (pr_auc_all, roc_auc_all) computed over all entries of the target columns.
    """
    target_idx = IDX1
    mask_idx = IDX2

    print('First few target index:', target_idx[0:10])
    print('First few mask index:', mask_idx[0:10])

    # Training drugs are all columns that are not masked.  No explicit zeroing
    # of the masked columns is needed: KernelRegression only reads the
    # idx_train columns of `matrix`.
    n_drugs = matrix.shape[1]
    existing_drug_idx = np.setdiff1d(np.arange(n_drugs), mask_idx)

    print(methodOption + ' starts:')
    score = KernelRegression(matrix=matrix,\
        feature_matrix1=feature_matrix1,feature_matrix2=feature_matrix2,idx_train=existing_drug_idx,idx_test=target_idx,l1=l1,l2=l2,s=s)
    print(methodOption + ' ends:')

    y_true = matrix[:, target_idx].ravel()
    y_score = score[:, target_idx].ravel()

    # Vectorised sum instead of the builtin, which iterates element by element.
    print("proportion of ground truth:", y_true.sum()/y_true.size)

    print('---evaluation---')

    prec, recall, threshold = precision_recall_curve(y_true, y_score)
    pr_auc_all = auc(recall, prec)
    roc_auc_all = roc_auc_score(y_true, y_score)

    print("-----")

    print("AUC-PR all:", pr_auc_all)

    print("-----")

    print("AUC-ROC all:", roc_auc_all)

    return pr_auc_all, roc_auc_all

In [36]:
def innerfold(IDX1,IDX2,feature_matrix1,feature_matrix2,matrix,l1,l2,s):
    """Tuning-time variant of a fold evaluation: only AUC-PR is computed.

    Parameters
    ----------
    IDX1 : column indices of the target drugs (the ones that are evaluated).
    IDX2 : column indices of the masked drugs (excluded from training).
    feature_matrix1, feature_matrix2 : per-drug feature arrays, forwarded to
        ``KernelRegression``.
    matrix : 2-D association matrix with one column per drug.
    l1, l2, s : hyperparameters forwarded to ``KernelRegression``.

    Returns
    -------
    (pr_auc_all, roc_auc_all) where ``roc_auc_all`` is always 0 because the ROC
    curve is not computed in this function.
    """
    target_idx = IDX1
    mask_idx = IDX2

    print('First few target index:', target_idx[0:10])
    print('First few mask index:', mask_idx[0:10])

    # Training drugs are all columns that are not masked.  No explicit zeroing
    # of the masked columns is needed: KernelRegression only reads the
    # idx_train columns of `matrix`.
    n_drugs = matrix.shape[1]
    existing_drug_idx = np.setdiff1d(np.arange(n_drugs), mask_idx)

    print(methodOption + ' starts:')
    score = KernelRegression(matrix=matrix,\
        feature_matrix1=feature_matrix1,feature_matrix2=feature_matrix2,idx_train=existing_drug_idx, idx_test=target_idx,l1=l1,l2=l2,s=s)
    print(methodOption + ' ends:')

    y_true = matrix[:, target_idx].ravel()
    y_score = score[:, target_idx].ravel()

    # Vectorised sum instead of the builtin, which iterates element by element.
    print("proportion of ground truth:", y_true.sum()/y_true.size)

    print('---evaluation---')

    roc_auc_all = 0  # placeholder: ROC is deliberately not evaluated here

    prec, recall, threshold = precision_recall_curve(y_true, y_score)
    pr_auc_all = auc(recall, prec)

    print("-----")

    print("AUC-PR all:", pr_auc_all)

    print("-----")

    return pr_auc_all, roc_auc_all

In [37]:
def plotfold(IDX1,IDX2,feature_matrix1,feature_matrix2,matrix,l1,l2,s):
    """Evaluate one split and return the raw PR and ROC curve points.

    Parameters
    ----------
    IDX1 : column indices of the target drugs (the ones that are evaluated).
    IDX2 : column indices of the masked drugs (excluded from training).
    feature_matrix1, feature_matrix2 : per-drug feature arrays, forwarded to
        ``KernelRegression``.
    matrix : 2-D association matrix with one column per drug.
    l1, l2, s : hyperparameters forwarded to ``KernelRegression``.

    Returns
    -------
    (Out1, Out2) : two DataFrames.  ``Out1`` has the rows precision, recall and
    PR thresholds; ``Out2`` has the rows FPR, TPR and ROC thresholds.  Rows of
    different length are padded by pandas.
    """
    target_idx = IDX1
    mask_idx = IDX2

    print('First few target index:', target_idx[0:10])
    print('First few mask index:', mask_idx[0:10])

    # Training drugs are all columns that are not masked.  No explicit zeroing
    # of the masked columns is needed: KernelRegression only reads the
    # idx_train columns of `matrix`.
    n_drugs = matrix.shape[1]
    existing_drug_idx = np.setdiff1d(np.arange(n_drugs), mask_idx)

    print(methodOption + ' starts:')
    score = KernelRegression(matrix=matrix,\
        feature_matrix1=feature_matrix1,feature_matrix2=feature_matrix2,idx_train=existing_drug_idx,idx_test=target_idx,l1=l1,l2=l2,s=s)
    print(methodOption + ' ends:')

    y_true = matrix[:, target_idx].ravel()
    y_score = score[:, target_idx].ravel()

    # Vectorised sum instead of the builtin, which iterates element by element.
    print("proportion of ground truth:", y_true.sum()/y_true.size)

    print('---evaluation---')

    prec, recall, prthreshold = precision_recall_curve(y_true, y_score)
    pr_auc_all = auc(recall, prec)

    fpr, tpr, rocthreshold = metrics.roc_curve(y_true, y_score)
    roc_auc_all = auc(fpr, tpr)

    print("-----")

    print("AUC-PR all:", pr_auc_all)

    print("-----")

    print("AUC-ROC all:", roc_auc_all)

    print("-----")

    Out1 = pd.DataFrame([prec, recall, prthreshold])
    Out2 = pd.DataFrame([fpr, tpr, rocthreshold])
    return Out1, Out2

### Function for assigning arguments of CV and nested CV, as well as finding the best hyperparameters

In [38]:
def setvar_tune(size):
    """(Re)allocate the global score buffers used during hyperparameter tuning.

    ``size`` is the number of hyperparameter combinations; both
    ``ALL_AUCPR_all`` and ``ALL_AUROC_all`` become zero arrays of that length.
    """
    global ALL_AUCPR_all, ALL_AUROC_all

    ALL_AUCPR_all, ALL_AUROC_all = np.zeros(size), np.zeros(size)

In [39]:
def setvar_cv(FOLDS):
    """(Re)allocate the global per-fold score buffers for cross validation.

    Both ``AUC_roc_all`` and ``AUC_pr_all`` become zero arrays with one slot
    per fold.
    """
    global AUC_roc_all, AUC_pr_all

    AUC_roc_all, AUC_pr_all = np.zeros(FOLDS), np.zeros(FOLDS)

In [40]:
def asgvar_tune(idx, results):
    """Store one tuning result in the global tuning buffers.

    idx: position of the hyperparameter combination in the buffers.
    results: indexable pair; element 0 goes to ``ALL_AUCPR_all`` and element 1
    to ``ALL_AUROC_all``.
    """
    pr_score = results[0]
    ALL_AUCPR_all[idx] = pr_score
    roc_score = results[1]
    ALL_AUROC_all[idx] = roc_score


In [41]:
def asgvar_cv(f, results):
    """Store the scores of fold ``f`` in the global cross-validation buffers.

    f: fold index.
    results: indexable pair; element 0 goes to ``AUC_pr_all`` and element 1 to
    ``AUC_roc_all``.
    """
    pr_score = results[0]
    AUC_pr_all[f] = pr_score
    roc_score = results[1]
    AUC_roc_all[f] = roc_score

In [43]:
def tuning_results(tuneVar):
    """Pick the hyperparameter entry with the highest recorded AUC-PR.

    ``tuneVar`` is indexed with the argmax of the global ``ALL_AUCPR_all``.
    Prints and returns ``(best entry, its AUC-PR)``.
    """
    best = np.argmax(ALL_AUCPR_all)
    Var, Value = tuneVar[best], ALL_AUCPR_all[best]

    print("best hyperpar: ", Var)
    print("AUPRC: ", Value)

    return Var, Value

In [44]:
def setvar_besttune(innerfolds):
    """(Re)allocate the global per-fold "best tuning result" buffers.

    ``besttunevalue`` becomes a zero array (best metric value per fold) and
    ``besttunevar`` a plain list of zeros (best hyperparameters per fold; a
    list so that its slots can later hold arbitrary objects).
    """
    global besttunevalue, besttunevar
    besttunevalue = np.zeros(innerfolds)
    besttunevar = np.zeros(innerfolds).tolist()

In [45]:
def asg_besttune(f, value, var):
    """Record the best metric ``value`` and its hyperparameters ``var`` for fold ``f``."""
    besttunevalue[f], besttunevar[f] = value, var

In [46]:
def besttune():
    """Return ``(value, var)`` for the fold with the highest entry in ``besttunevalue``."""
    winner = np.argmax(besttunevalue)
    return besttunevalue[winner], besttunevar[winner]

In [47]:
def cv_results():
    """Print mean and standard deviation of the per-fold scores and return them.

    Returns a 2-row array: row 0 is ``AUC_pr_all``, row 1 is ``AUC_roc_all``.
    """
    for label, scores in (("Mean AUC_pr_all", AUC_pr_all), ("Mean AUC_roc_all", AUC_roc_all)):
        print(label, scores.mean()," ", "Standard Deviation:", scores.std())
    print("-----------")

    return np.array([AUC_pr_all, AUC_roc_all])

### Function for parallel computation

In [48]:
def tuning_loop(innermatrix, idx_train_inner, idx_test_inner, feature_matrix_inner1, feature_matrix_inner2, hyperparList, i):
    """Evaluate the i-th hyperparameter combination on one tuning split.

    ``hyperparList[i]`` is unpacked as ``(l1, l2, s)``.  The drugs in
    ``idx_test_inner`` are used both as mask and as evaluation target, and the
    scores returned by ``innerfold`` are stored at position ``i`` of the global
    tuning buffers via ``asgvar_tune``.

    ``idx_train_inner`` is accepted for interface symmetry but is not used here:
    ``innerfold`` derives the training drugs as everything that is not masked.
    """
    l1,l2,s = hyperparList[i]
    idx_target_inner = idx_test_inner
    print('target size:', len(idx_target_inner))
    results = innerfold(idx_target_inner,idx_test_inner,feature_matrix1=feature_matrix_inner1,feature_matrix2=feature_matrix_inner2,matrix=innermatrix,l1=l1,l2=l2,s=s)
    asgvar_tune(i, results=results)
    # The second value is l2, so label it "lmd2" (it was mislabelled "lmd1").
    print("------ lmd1: ", l1, "lmd2: ", l2, "sigma: ", s, "------")

### Main function

In [49]:
def main(method_option,normalization=True,Validation=False,sets="intersect", l1=0.5, l2=0, s=0.5):
    """Load the data, split it, and run one experiment mode.

    Parameters
    ----------
    method_option : "KR1", "KR2", "KR1&KR2" or "MKR"; stored globally through
        ``option`` and read by ``KernelRegression``.
    normalization : accepted for backward compatibility; not used.
    Validation : experiment mode.
        * "nested_cv"  - 5 outer folds with 4 inner tuning folds on the
          development drugs; returns the array produced by ``cv_results``
          (row 0: AUC-PR per outer fold, row 1: AUC-ROC per outer fold).
        * "cv"         - 5-fold grid search on the development drugs; returns
          the ``(l1, l2, s)`` tuple of the fold whose best grid point reached
          the highest AUC-PR.
        * "Validation" - train on the development drugs and evaluate the
          held-out 25% with the given ``l1, l2, s``; prints the metrics and
          returns None.
        * "plot"       - same split as "Validation"; returns the PR and ROC
          curve DataFrames from ``plotfold``.
        Any other value (including the default ``False``) loads the data,
        evaluates nothing and returns None.
    sets : "intersect" keeps drugs with a non-zero row in both feature tables,
        "union" keeps drugs with a non-zero row in at least one of them (only
        the development matrices are filtered in that case).  Any other value
        leaves the matrices unfiltered.
    l1, l2, s : hyperparameters used by the "Validation" and "plot" modes.
    """
    random.seed(1949) # for dataset split
    np.random.seed(1949) # for matrix initialization
    option(method_option)

    df = pd.read_csv("data/side-effect-and-drug_name_upper.tsv",sep = "\t")

    drug_name = df["drugbank_name"]
    side_effect = df["side_effect_name"]

    edgelist1 = zip(side_effect, drug_name)
    ## making the bipartite graph ##
    B = nx.DiGraph()
    B.add_nodes_from(side_effect,bipartite = 0)
    B.add_nodes_from(drug_name,bipartite = 1)
    B.add_edges_from(edgelist1)
    drug_nodes = sorted(n for n, d in B.nodes(data=True) if d['bipartite']==1)
    side_effect_nodes = sorted(n for n, d in B.nodes(data=True) if d['bipartite']==0)

    ### Bi-adjacency matrix: rows = side effects, columns = drugs ###
    matrix_all = biadjacency_matrix(B, row_order = side_effect_nodes, column_order = drug_nodes)
    # .toarray() works for both sparse matrices and sparse arrays; the `.A`
    # shortcut is not available on sparse arrays.
    matrix_all = matrix_all.toarray()
    n_all = matrix_all.shape[1] # number of drugs

    ### Hold out 25% of the drugs as independent test ("validation") set ###
    validate_sz = int(0.25 * n_all)
    IDX_all = list(range(n_all))
    random.shuffle(IDX_all)
    IDX_validate = sorted(IDX_all[0:validate_sz])
    print("first few validation set idx:")
    print(IDX_validate[0:10])
    IDX_validate_diff = np.setdiff1d(IDX_all, IDX_validate)
    matrix = matrix_all[:, IDX_validate_diff].copy()

    df1 = pd.read_csv("data/intersection_DGIdb_mat.tsv",sep = "\t")
    df2 = pd.read_csv("data/intersection_Fingerprint_mat.tsv",sep = "\t")
    featureMat1_all = FeaturePreprocess(df1, drug_nodes=drug_nodes)
    featureMat2_all = FeaturePreprocess(df2, drug_nodes=drug_nodes)
    featureMat1 = featureMat1_all[IDX_validate_diff, :].copy()
    featureMat2 = featureMat2_all[IDX_validate_diff, :].copy()

    # A drug "has" a feature type when its feature row does not sum to zero.
    has_feat1 = featureMat1.sum(1) != 0
    has_feat2 = featureMat2.sum(1) != 0
    if sets == "union":
        non_zero_idx_union = np.flatnonzero(has_feat1 | has_feat2)
        matrix = matrix[:, non_zero_idx_union].copy()
        featureMat1 = featureMat1[non_zero_idx_union, :].copy()
        featureMat2 = featureMat2[non_zero_idx_union, :].copy()
    elif sets == "intersect":
        non_zero_idx_intersect = np.flatnonzero(has_feat1 & has_feat2)
        non_zero_idx_intersect_all = np.flatnonzero(
            (featureMat1_all.sum(1) != 0) & (featureMat2_all.sum(1) != 0))

        matrix_all = matrix_all[:, non_zero_idx_intersect_all].copy()
        featureMat1_all = featureMat1_all[non_zero_idx_intersect_all, :].copy()
        featureMat2_all = featureMat2_all[non_zero_idx_intersect_all, :].copy()

        matrix = matrix[:, non_zero_idx_intersect].copy()
        featureMat1 = featureMat1[non_zero_idx_intersect, :].copy()
        featureMat2 = featureMat2[non_zero_idx_intersect, :].copy()

        # Re-express the held-out / development split as positions inside the
        # filtered "*_all" matrices: a kept drug is held out iff its original
        # column index was drawn into IDX_validate.
        held_out = np.isin(non_zero_idx_intersect_all, IDX_validate)
        IDX_validate = np.flatnonzero(held_out)
        IDX_validate_diff = np.flatnonzero(~held_out)

    n = matrix.shape[1] # number of development drugs

    random.seed(1949) # for dataset split
    np.random.seed(1949) # for matrix initialization

    FOLDS = 5
    innerFOLDS = 4
    #### per-fold score buffers ####
    setvar_cv(FOLDS)

    # NOTE: the order of the two shuffles below fixes the fold assignment.
    sz = n
    IDX = list(range(sz))
    fsz = int(sz/FOLDS)
    random.shuffle(IDX)
    IDX = np.array(IDX)

    innersz = sz - fsz
    innerIDX = list(range(innersz))
    random.shuffle(innerIDX)
    innerIDX = np.array(innerIDX)
    innerfsz = int(innersz / innerFOLDS)

    # Hyperparameter grid (identical for every fold, so it is built once).
    lmd1_grid = (10**np.arange(-2, 4, 1, dtype=float)).tolist()
    if method_option == "MKR":
        lmd2_grid = (10**np.arange(-2, 4, 1, dtype=float)).tolist()
    else:
        lmd2_grid = (np.arange(0, 1, 1, dtype=float)).tolist() # single value 0.0: l2 is unused
    sigma_grid = (10**np.arange(-1, 2, 1, dtype=float)).tolist()
    hyperparList = list(product(lmd1_grid, lmd2_grid, sigma_grid))

    if Validation == "nested_cv":
        for f in range(FOLDS):
            offset = 0 + fsz*f
            idx_test = IDX[offset:offset + fsz]

            idx_train = IDX[np.setdiff1d(np.arange(len(IDX)), np.arange(offset,offset + fsz))]
            print("Fold:",f)

            innermatrix = matrix[:, idx_train]

            innerfeatureMat1 = featureMat1[idx_train, :].copy()
            innerfeatureMat2 = featureMat2[idx_train, :].copy()

            setvar_besttune(innerFOLDS)

            for innerf in range(innerFOLDS):
                inneroffset = 0 + innerf*innerfsz
                idx_test_inner = innerIDX[inneroffset:inneroffset + innerfsz]
                idx_train_inner = innerIDX[np.setdiff1d(np.arange(len(idx_train)), np.arange(inneroffset,inneroffset + innerfsz))]

                print("Inner Fold:", innerf)

                setvar_tune(len(hyperparList))

                # Threads share the global score buffers filled by tuning_loop.
                with parallel_backend('threading'):
                    Parallel(n_jobs=50)(delayed(tuning_loop)(innermatrix = innermatrix, idx_train_inner = idx_train_inner, \
                            idx_test_inner = idx_test_inner, feature_matrix_inner1 = innerfeatureMat1, feature_matrix_inner2 = innerfeatureMat2, \
                                hyperparList = hyperparList, i = i) \
                                    for i in range(len(hyperparList)))

                hyperpars, evalValue = tuning_results(tuneVar=hyperparList)

                asg_besttune(innerf, value=evalValue, var=hyperpars)

            _, bestHyperPars = besttune()

            print("--- tuning end ---")
            l1, l2, s = bestHyperPars
            idx_target = idx_test
            print('target size:', len(idx_target))

            print("------ lmd1: ", l1, "lmd2: ", l2, "sigma: ", s, "------")

            results = fold(idx_target,idx_test,featureMat1,featureMat2,matrix,l1=l1,l2=l2,s=s)
            asgvar_cv(f=f, results=results)

        out = cv_results()
        return out

    elif Validation == "cv":

        setvar_besttune(FOLDS)

        for f in range(FOLDS):
            offset = 0 + fsz*f
            idx_test = IDX[offset:offset + fsz]
            idx_train = IDX[np.setdiff1d(np.arange(len(IDX)), np.arange(offset,offset + fsz))]

            print("Fold:", f)

            setvar_tune(len(hyperparList))

            # Threads share the global score buffers filled by tuning_loop.
            with parallel_backend('threading'):
                Parallel(n_jobs=50)(delayed(tuning_loop)(innermatrix = matrix, idx_train_inner = idx_train, \
                        idx_test_inner = idx_test, feature_matrix_inner1 = featureMat1, feature_matrix_inner2 = featureMat2, \
                            hyperparList = hyperparList, i = i) \
                                for i in range(len(hyperparList)))

            hyperpars, evalValue = tuning_results(tuneVar=hyperparList)
            asg_besttune(f, value=evalValue, var=hyperpars)

        print("--- tuning end ---")
        _, bestHyperPars = besttune()
        # Hand the selected hyperparameters back to the caller instead of
        # discarding them.
        return bestHyperPars

    elif Validation == "Validation":

        # independent test set: held-out drugs are both masked and evaluated
        idx_test = IDX_validate
        idx_target = idx_test
        print('target size:', len(idx_target))
        print("------ lmd1: ", l1, "lmd2: ", l2, "sigma: ", s, "------")
        fold(idx_target,idx_test,featureMat1_all,featureMat2_all,matrix_all,l1=l1,l2=l2,s=s)
        return

    elif Validation == "plot":

        # independent test set: same split as "Validation", curves returned
        idx_test = IDX_validate
        idx_target = idx_test
        print('target size:', len(idx_target))
        print("------ lmd1: ", l1, "lmd2: ", l2, "sigma: ", s, "------")
        pr, roc = plotfold(idx_target,idx_test,featureMat1_all,featureMat2_all,matrix_all,l1=l1,l2=l2,s=s)
        return pr, roc

## 2. Nested CV and CV of KR

### 2.1. Nested CV

Running the nested CV for feature DGI

In [46]:
# Nested CV with the first kernel only (KR1).
results_KR1 = main(
    method_option="KR1",
    Validation="nested_cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
Inner Fold: 0
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index:target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
 [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few 

Running the nested CV for feature Chem

In [47]:
# Nested CV with the second kernel only (KR2).
results_KR2 = main(
    method_option="KR2",
    Validation="nested_cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
Inner Fold: 0
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few 

Running the nested CV for integrated features, with multiple kernels

In [48]:
# Nested CV with both kernels stacked (MKR).
results_MKR = main(
    method_option="MKR",
    Validation="nested_cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
Inner Fold: 0
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index:target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
 [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few 

Running the nested CV for integrated features, where the two kernels are merged by a linear combination.

In [49]:
# Nested CV with the averaged kernel (KR1&KR2).
results_KR1KR2 = main(
    method_option="KR1&KR2",
    Validation="nested_cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
Inner Fold: 0
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few target index: [327  39 166 367  17 387 158  45 293  27]
First few mask index: [327  39 166 367  17 387 158  45 293  27]
target size: 115
First few 

### 2.2. CV to tune hyperparameters for independent test set

Running CV for DGI. The best hyperparameters are $\lambda=0.1, \sigma=10$.

In [50]:
# 5-fold CV grid search for KR1.
main(
    method_option="KR1",
    Validation="cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index:target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
 [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: 

Running CV for Chem. The best hyperparameters are $\lambda=1, \sigma=10$.

In [51]:
# 5-fold CV grid search for KR2.
main(
    method_option="KR2",
    Validation="cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]

First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: 

Running CV for DGI and Chem (multiple kernels). The best hyperparameters are $\lambda_{1}=0.1, \lambda_{2}=1, \sigma=10$.

In [52]:
# 5-fold CV grid search for MKR.
main(
    method_option="MKR",
    Validation="cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size:target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: 

Running CV for DGI and Chem (linear combination). The best hyperparameters are $\lambda=0.1, \sigma=10$.

In [53]:
# 5-fold CV grid search for the averaged kernel (KR1&KR2).
main(
    method_option="KR1&KR2",
    Validation="cv",
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
Fold: 0
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: [384 293 396 263  31 466 141 431  32 337]
First few mask index: [384 293 396 263  31 466 141 431  32 337]
target size: 115
First few target index: 

### 2.3. Independent test set

In [24]:
# Independent test set, KR1, with fixed hyperparameters.
main(
    method_option="KR1",
    Validation="Validation",
    l1=0.1,
    s=10,
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  0.1 lmd1:  0 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
KR1 starts:
KR1 ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.4112425624842458
-----
AUC-ROC all: 0.8887751461224314
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0


In [25]:
# Independent test set, KR2, with fixed hyperparameters.
main(
    method_option="KR2",
    Validation="Validation",
    l1=1.0,
    s=10,
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  1.0 lmd1:  0 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
KR2 starts:
KR2 ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.38059942289179904
-----
AUC-ROC all: 0.8812979242561445
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0


In [26]:
# Independent test set, MKR, with fixed hyperparameters.
main(
    method_option="MKR",
    Validation="Validation",
    l1=0.1,
    l2=1,
    s=10,
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  0.1 lmd1:  1 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
MKR starts:
MKR ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.4138175262320037
-----
AUC-ROC all: 0.8713086888710723
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0


In [27]:
# Independent test set, averaged kernel (KR1&KR2), with fixed hyperparameters.
main(
    method_option="KR1&KR2",
    Validation="Validation",
    l1=0.1,
    s=10,
)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  0.1 lmd1:  0 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
KR1&KR2 starts:
KR1&KR2 ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.4124813115885141
-----
AUC-ROC all: 0.8752913642985228
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0


In [50]:
# PR / ROC curve points for KR1 on the independent test set.
KR1_pr, KR1_roc = main(
    method_option="KR1",
    Validation="plot",
    l1=0.1,
    s=10,
)
# Transpose so that each row of the returned frame becomes a CSV column.
KR1_pr.T.to_csv("Figs/KR1_pr.csv", index=False)
KR1_roc.T.to_csv("Figs/KR1_roc.csv", index=False)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  0.1 lmd1:  0 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
KR1 starts:
KR1 ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.4112425624855005
-----
AUC-ROC all: 0.888775146158224
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0


In [51]:
# PR / ROC curve points for KR2 on the independent test set.
KR2_pr, KR2_roc = main(
    method_option="KR2",
    Validation="plot",
    l1=1,
    s=10,
)
# Transpose so that each row of the returned frame becomes a CSV column.
KR2_pr.T.to_csv("Figs/KR2_pr.csv", index=False)
KR2_roc.T.to_csv("Figs/KR2_roc.csv", index=False)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  1 lmd1:  0 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
KR2 starts:
KR2 ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.38059942289179893
-----
AUC-ROC all: 0.8812979242561445
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0


In [52]:
# PR / ROC curve points for MKR on the independent test set.
MKR_pr, MKR_roc = main(
    method_option="MKR",
    Validation="plot",
    l1=0.1,
    l2=1,
    s=10,
)
# Transpose so that each row of the returned frame becomes a CSV column.
MKR_pr.T.to_csv("Figs/MKR_pr.csv", index=False)
MKR_roc.T.to_csv("Figs/MKR_roc.csv", index=False)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  0.1 lmd1:  1 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
MKR starts:
MKR ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.4138175262320037
-----
AUC-ROC all: 0.8713086888710723
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0


In [53]:
# PR / ROC curve points for the averaged kernel (KR1&KR2) on the independent test set.
KR1KR2_pr, KR1KR2_roc = main(
    method_option="KR1&KR2",
    Validation="plot",
    l1=0.1,
    s=10,
)
# Transpose so that each row of the returned frame becomes a CSV column.
KR1KR2_pr.T.to_csv("Figs/KR1KR2_pr.csv", index=False)
KR1KR2_roc.T.to_csv("Figs/KR1KR2_roc.csv", index=False)

first few validation set idx:
[2, 7, 12, 13, 17, 20, 26, 39, 45, 47]
target size: 201
------ lmd1:  0.1 lmd1:  0 sigma:  10 ------
First few target index: [ 4 11 14 27 32 34 35 37 41 47]
First few mask index: [ 4 11 14 27 32 34 35 37 41 47]
KR1&KR2 starts:
KR1&KR2 ends:
proportion of ground truth: 0.02256888445786783
---evaluation---
-----
AUC-PR all: 0.4124813116023129
-----
AUC-ROC all: 0.8752913643343151
-----
AUC-PR per drug: 0
-----
AUC-ROC per drug: 0
-----
AUC-ROC top N: 0
-----
AUC-PR top N: 0
-----
F1: 0
-----
Fmax 0
-----
Smin 0
