In [None]:
#!/usr/bin/env python
import pandas as pd
import networkx as nx
from networkx.algorithms.bipartite.matrix import biadjacency_matrix
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
import random
from sklearn import metrics
import time
from joblib import Parallel, delayed
from sklearn.utils import parallel_backend
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
import numpy.linalg as LA
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import Ridge

# Seed Python's `random` module; main() uses random.shuffle to build the data splits.
random.seed(1949) # for dataset split
# Seed NumPy's legacy global RNG. NOTE(review): no direct np.random draw is visible
# in this notebook -- presumably kept for code that relies on the global RNG; verify.
np.random.seed(1949) # for matrix initialization

This notebook studies adverse drug reaction (ADR) prediction using LNSM, LNSM-CMI and LNSM-SMI with Regularized Linear Neighbourhood (RLN) similarity. Once the functions in Section 1 have been run, the nested CV, CV and independent test-set experiments in Section 2 can each be run separately.

## 1. Define the functions used for LNSM-RLN family

In [None]:
def option(str):
    """Select the prediction method used by the fold functions.

    Stores ``str`` in the module-level variable ``methodOption``, which
    ``fold``, ``innerfold`` and ``plotfold`` read to choose between
    "LNSM_CMI", "LNSM_SMI", "LNSM_WMK1" and "LNSM_WMK2".
    """
    # The parameter name shadows the built-in ``str``; it is kept so that
    # keyword callers (``option(str=...)``) keep working.
    global methodOption
    methodOption = str

### Function for calculating regularized linear neighbourhood similarity

In [None]:
def RLN(feature_matrix, idx_train, idx_test, n_neighbors=200):
    """Compute regularized linear neighbourhood (RLN) similarity weights.

    Every sample is reconstructed from its nearest training samples with a
    ridge regression (``Ridge(alpha=1)``); the fitted coefficients are stored
    as that sample's similarity weights towards those neighbours.

    Parameters
    ----------
    feature_matrix : ndarray
        Feature matrix indexed as ``feature_matrix[rows, :]`` (one row per sample).
    idx_train : sequence of int
        Rows used as training samples (neighbour candidates).
    idx_test : sequence of int
        Rows of the samples to predict.
    n_neighbors : int, default 200
        Number of neighbours used per reconstruction. It is clipped to the
        number of training samples so that small training sets do not make
        the neighbour query fail.

    Returns
    -------
    W : ndarray, shape (len(idx_train), len(idx_train))
        Row ``i`` holds the weights of training sample ``i`` over its neighbours.
    W_new : ndarray, shape (len(idx_test), len(idx_train))
        Row ``i`` holds the weights of test sample ``i`` over its training neighbours.
    """
    X = feature_matrix[idx_train, :]
    X_new = feature_matrix[idx_test, :]

    n_train = len(idx_train)
    n_test = len(idx_test)
    k = min(n_neighbors, n_train)

    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(X)
    W = np.zeros([n_train, n_train])
    W_new = np.zeros([n_test, n_train])
    clf = Ridge(alpha=1)

    # NOTE(review): the training matrix itself is passed as the query, so each
    # sample is normally returned as its own nearest neighbour and W gets a
    # diagonal entry -- confirm this is intended.
    N = neigh.kneighbors(X, k, return_distance=False)
    for i in range(n_train):
        clf.fit(X[N[i], :].T, X[i, :])
        # Fix: write only into the columns of sample i's own neighbours.
        # Indexing with the whole matrix N broadcast the coefficients over the
        # neighbours of *every* sample.
        W[i, N[i]] = clf.coef_

    N_new = neigh.kneighbors(X_new, k, return_distance=False)
    for i in range(n_test):
        clf.fit(X[N_new[i], :].T, X_new[i, :])
        W_new[i, N_new[i]] = clf.coef_

    return W, W_new

### LNSM

In [None]:
def LNSM(matrix, feature_matrix, alpha, idx_train, idx_test):
    """Predict the columns ``idx_test`` of ``matrix`` with LNSM on one feature matrix.

    The training label matrix ``Y_0`` (training samples as rows) is smoothed by
    iterating ``Y <- alpha * W @ Y + (1 - alpha) * Y_0`` with the RLN
    similarity ``W`` until the cost
    ``alpha * tr(Y^T (I - W) Y) + (1 - alpha) * ||Y - Y_0||_F^2``
    changes by less than 1e-4 relative to its previous value (at most 1000
    iterations). Test columns are then predicted as ``W_new @ Y``.

    Parameters
    ----------
    matrix : ndarray
        Association matrix whose columns are indexed by ``idx_train``/``idx_test``.
    feature_matrix : ndarray
        Feature matrix passed to ``RLN`` (rows indexed by the same indices).
    alpha : float
        Trade-off between neighbourhood smoothing and fidelity to ``Y_0``.
    idx_train, idx_test : sequence of int
        Column indices of the training and the predicted samples.

    Returns
    -------
    ndarray
        Float copy of ``matrix`` whose ``idx_test`` columns hold the predicted
        scores; all other columns are unchanged.
    """
    Y_0 = matrix[:, idx_train].T.astype(float)

    W, W_new = RLN(feature_matrix=feature_matrix, idx_train=idx_train, idx_test=idx_test)

    def _cost(Y_cur):
        # tr(Y^T (I - W) Y) == sum(Y * (Y - W @ Y)). The earlier code used the
        # element-wise expression ``1 - W`` (all-ones matrix minus W) here.
        smoothness = np.sum(Y_cur * (Y_cur - np.dot(W, Y_cur)))
        return alpha * smoothness + (1 - alpha) * LA.norm(Y_cur - Y_0) ** 2

    max_iter = 1000
    Y = Y_0.copy()
    cost_prev = _cost(Y)

    for _ in range(max_iter):
        Y_next = alpha * np.dot(W, Y) + (1 - alpha) * Y_0
        cost_next = _cost(Y_next)
        if not np.isfinite(cost_next):
            # alpha * W is not guaranteed to be a contraction.
            print("LNSM diverged; keeping the last finite iterate")
            break
        # Fix: compare against the *previous* cost before overwriting it, and
        # use the absolute change. Previously cost_t1 was set to cost_t2 first,
        # so the test was always true and the loop stopped after one step.
        converged = abs(cost_next - cost_prev) <= abs(cost_prev) / 10000
        Y = Y_next
        cost_prev = cost_next
        if converged:
            print("LNSM converged")
            break
    else:
        print("maximum iteration reached")

    Y_new = np.dot(W_new, Y)
    matrix_new = matrix.copy().astype(float)
    matrix_new[:, idx_test] = Y_new.T
    return matrix_new

### LNSM-SMI

In [None]:
def LNSM_SMI(matrix, feature_matrix1, feature_matrix2, alpha, idx_train, idx_test):
    """Predict the columns ``idx_test`` by integrating two RLN similarities (SMI).

    The two test-to-train similarity matrices are mixed with weights derived
    from the smoothness cost ``tr(Y_0^T (I - W_k) Y_0)`` of each feature on the
    training labels, and the mixture is applied directly to ``Y_0``.

    Parameters
    ----------
    matrix : ndarray
        Association matrix whose columns are indexed by ``idx_train``/``idx_test``.
    feature_matrix1, feature_matrix2 : ndarray
        Feature matrices passed to ``RLN``.
    alpha : float
        Not used by this method; accepted so that all LNSM variants share one
        call signature.
    idx_train, idx_test : sequence of int
        Column indices of the training and the predicted samples.

    Returns
    -------
    ndarray
        Float copy of ``matrix`` whose ``idx_test`` columns hold the predicted scores.
    """
    Y_0 = matrix[:, idx_train].T.astype(float)

    W1, W1_new = RLN(feature_matrix=feature_matrix1, idx_train=idx_train, idx_test=idx_test)
    W2, W2_new = RLN(feature_matrix=feature_matrix2, idx_train=idx_train, idx_test=idx_test)

    # tr(Y^T (I - W) Y) == sum(Y * (Y - W @ Y)). Only the differences to the
    # maximum enter the weights, so replacing the former element-wise ``1 - W``
    # by ``I - W`` leaves the weights mathematically unchanged.
    c1 = np.sum(Y_0 * (Y_0 - np.dot(W1, Y_0)))
    c2 = np.sum(Y_0 * (Y_0 - np.dot(W2, Y_0)))
    cmax = max(c1, c2)
    gap1 = cmax - c1
    gap2 = cmax - c2
    total_gap = gap1 + gap2
    if total_gap > 0:
        theta1 = gap1 / total_gap
        theta2 = gap2 / total_gap
    else:
        # Equal costs used to give 0/0 = NaN weights; fall back to a plain average.
        theta1 = theta2 = 0.5

    Y_new = np.dot(theta1 * W1_new + theta2 * W2_new, Y_0)
    matrix_new = matrix.copy().astype(float)
    matrix_new[:, idx_test] = Y_new.T
    return matrix_new

### LNSM-CMI

In [None]:
def LNSM_CMI(matrix, feature_matrix1, feature_matrix2, alpha, idx_train, idx_test):
    """Predict the columns ``idx_test`` by integrating two LNSM models (CMI).

    One LNSM label propagation is run per feature matrix. The two predictions
    are averaged with weights derived from the final cost of each model:
    ``theta_k`` is proportional to ``cost_max - cost_k``.

    Parameters
    ----------
    matrix : ndarray
        Association matrix whose columns are indexed by ``idx_train``/``idx_test``.
    feature_matrix1, feature_matrix2 : ndarray
        Feature matrices passed to ``RLN``.
    alpha : float
        Trade-off between neighbourhood smoothing and fidelity to the training labels.
    idx_train, idx_test : sequence of int
        Column indices of the training and the predicted samples.

    Returns
    -------
    ndarray
        Float copy of ``matrix`` whose ``idx_test`` columns hold the predicted scores.
    """
    Y_0 = matrix[:, idx_train].T.astype(float)
    W1, W1_new = RLN(feature_matrix=feature_matrix1, idx_train=idx_train, idx_test=idx_test)
    W2, W2_new = RLN(feature_matrix=feature_matrix2, idx_train=idx_train, idx_test=idx_test)

    max_iter = 1000

    def _cost(W, Y_cur):
        # tr(Y^T (I - W) Y) == sum(Y * (Y - W @ Y)). The earlier code used the
        # element-wise expression ``1 - W`` (all-ones matrix minus W) here.
        smoothness = np.sum(Y_cur * (Y_cur - np.dot(W, Y_cur)))
        return alpha * smoothness + (1 - alpha) * LA.norm(Y_cur - Y_0) ** 2

    def _propagate(W):
        """Iterate Y <- alpha*W@Y + (1-alpha)*Y_0; return the final Y and its cost."""
        Y = Y_0.copy()
        cost_prev = _cost(W, Y)
        for _ in range(max_iter):
            Y_next = alpha * np.dot(W, Y) + (1 - alpha) * Y_0
            cost_next = _cost(W, Y_next)
            if not np.isfinite(cost_next):
                # alpha * W is not guaranteed to be a contraction.
                print("LNSM diverged; keeping the last finite iterate")
                break
            # Fix: compare against the previous cost *before* overwriting it and
            # use the absolute change; the old check was always true, so the
            # loop stopped after a single step.
            converged = abs(cost_next - cost_prev) <= abs(cost_prev) / 10000
            Y = Y_next
            cost_prev = cost_next
            if converged:
                print("LNSM converged")
                break
        else:
            print("maximum iteration reached")
        return Y, cost_prev

    Y1, cost1 = _propagate(W1)
    Y2, cost2 = _propagate(W2)

    Y1_new = np.dot(W1_new, Y1)
    Y2_new = np.dot(W2_new, Y2)

    costmax = max(cost1, cost2)
    gap1 = costmax - cost1
    gap2 = costmax - cost2
    total_gap = gap1 + gap2
    if total_gap > 0:
        theta1 = gap1 / total_gap
        theta2 = gap2 / total_gap
    else:
        # Equal costs used to give 0/0 = NaN weights; fall back to a plain average.
        theta1 = theta2 = 0.5

    Y_new = theta1 * Y1_new + theta2 * Y2_new
    matrix_new = matrix.copy().astype(float)
    matrix_new[:, idx_test] = Y_new.T
    return matrix_new

In [None]:
def FeaturePreprocess(df_all, drug_nodes):
    """Align a feature table to ``drug_nodes`` and return it as an array.

    Rows of ``df_all`` whose index label appears in ``drug_nodes`` are kept;
    every label of ``drug_nodes`` missing from the table gets an all-zero row.
    The result has one row per entry of ``drug_nodes``, in that order.
    """
    shared_labels = np.intersect1d(df_all.index, drug_nodes)
    present = df_all.loc[shared_labels]

    # Labels without any feature row are padded with zeros.
    absent_labels = np.setdiff1d(drug_nodes, present.index.tolist())
    padding = pd.DataFrame(
        np.zeros((len(absent_labels), present.shape[1])),
        index=absent_labels,
        columns=present.columns,
    )

    aligned = pd.concat([present, padding], axis=0).loc[drug_nodes]
    return np.array(aligned)

### Functions for cross validation and nested cross validation

In [None]:
def fold(IDX1,IDX2,feature_matrix1,feature_matrix2,alpha,matrix):
    """Fit the selected LNSM variant on one split and report AUPRC and AUROC.

    Parameters
    ----------
    IDX1 : sequence of int
        Target column indices; these columns are predicted and evaluated.
    IDX2 : sequence of int
        Held-out column indices; every other column is used for training.
    feature_matrix1, feature_matrix2 : ndarray
        Feature matrices; "LNSM_WMK1" uses the first, "LNSM_WMK2" the second,
        "LNSM_CMI" and "LNSM_SMI" use both.
    alpha : float
        LNSM trade-off parameter.
    matrix : ndarray
        Association matrix (columns are the samples being split).

    Returns
    -------
    (pr_auc_all, roc_auc_all) : tuple of float

    Raises
    ------
    ValueError
        If the global ``methodOption`` is not one of the four supported names.
    """
    target_idx = IDX1
    mask_idx = IDX2

    # The predictors only read the training columns of ``matrix`` and write
    # into a copy, so no masked copy of the matrix is required.
    n_drugs = matrix.shape[1]
    existing_drug_idx = np.setdiff1d(np.arange(n_drugs), mask_idx)

    print('LNSM starts:')
    if methodOption == "LNSM_CMI":
        score = LNSM_CMI(matrix=matrix, feature_matrix1=feature_matrix1, feature_matrix2=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_SMI":
        score = LNSM_SMI(matrix=matrix, feature_matrix1=feature_matrix1, feature_matrix2=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_WMK1":
        score = LNSM(matrix=matrix, feature_matrix=feature_matrix1, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_WMK2":
        score = LNSM(matrix=matrix, feature_matrix=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError that did not mention the cause.
        raise ValueError("unknown methodOption: %r" % (methodOption,))

    truth = matrix[:, target_idx].ravel()
    predicted = score[:, target_idx].ravel()

    print("proportion of ground truth:", truth.sum() / truth.size)

    print('---evaluation---')

    prec, recall, threshold = precision_recall_curve(truth, predicted)
    pr_auc_all = auc(recall, prec)
    roc_auc_all = roc_auc_score(truth, predicted)

    print("-----")

    print("AUC-PR all:", pr_auc_all)

    print("-----")

    print("AUC-ROC all:", roc_auc_all)

    print("-----")
    return pr_auc_all, roc_auc_all

In [None]:
def innerfold(IDX1,IDX2,feature_matrix1,feature_matrix2,alpha,matrix):
    """Silent variant of ``fold`` used during hyperparameter tuning.

    Fits the LNSM variant selected by the global ``methodOption`` and
    evaluates the area under the precision-recall curve on the target columns.

    Parameters
    ----------
    IDX1 : sequence of int
        Target column indices; these columns are predicted and evaluated.
    IDX2 : sequence of int
        Held-out column indices; every other column is used for training.
    feature_matrix1, feature_matrix2 : ndarray
        Feature matrices handed to the selected method.
    alpha : float
        LNSM trade-off parameter.
    matrix : ndarray
        Association matrix (columns are the samples being split).

    Returns
    -------
    (pr_auc_all, roc_auc_all) : tuple
        ``roc_auc_all`` is always 0: the ROC AUC is not computed here, only
        the AUPRC is.

    Raises
    ------
    ValueError
        If the global ``methodOption`` is not one of the four supported names.
    """
    target_idx = IDX1
    mask_idx = IDX2

    # The mean-score baseline and the masked matrix copy that used to be built
    # here were never read and have been dropped.
    n_drugs = matrix.shape[1]
    existing_drug_idx = np.setdiff1d(np.arange(n_drugs), mask_idx)

    if methodOption == "LNSM_CMI":
        score = LNSM_CMI(matrix=matrix, feature_matrix1=feature_matrix1, feature_matrix2=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_SMI":
        score = LNSM_SMI(matrix=matrix, feature_matrix1=feature_matrix1, feature_matrix2=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_WMK1":
        score = LNSM(matrix=matrix, feature_matrix=feature_matrix1, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_WMK2":
        score = LNSM(matrix=matrix, feature_matrix=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError that did not mention the cause.
        raise ValueError("unknown methodOption: %r" % (methodOption,))

    roc_auc_all = 0  # not evaluated during tuning

    prec, recall, threshold = precision_recall_curve(matrix[:, target_idx].ravel(), score[:, target_idx].ravel())
    pr_auc_all = auc(recall, prec)

    return pr_auc_all, roc_auc_all

In [None]:
def plotfold(IDX1,IDX2,feature_matrix1,feature_matrix2,alpha,matrix):
    """Fit the selected LNSM variant and return the PR and ROC curve points.

    Parameters
    ----------
    IDX1 : sequence of int
        Target column indices; these columns are predicted and evaluated.
    IDX2 : sequence of int
        Held-out column indices; every other column is used for training.
    feature_matrix1, feature_matrix2 : ndarray
        Feature matrices handed to the selected method.
    alpha : float
        LNSM trade-off parameter.
    matrix : ndarray
        Association matrix (columns are the samples being split).

    Returns
    -------
    Out1 : pandas.DataFrame
        Rows are precision, recall and PR thresholds (in that order).
    Out2 : pandas.DataFrame
        Rows are false-positive rate, true-positive rate and ROC thresholds.

    Raises
    ------
    ValueError
        If the global ``methodOption`` is not one of the four supported names.
    """
    target_idx = IDX1
    mask_idx = IDX2

    # The predictors only read the training columns of ``matrix`` and write
    # into a copy, so no masked copy of the matrix is required.
    n_drugs = matrix.shape[1]
    existing_drug_idx = np.setdiff1d(np.arange(n_drugs), mask_idx)

    print('LNSM starts:')
    if methodOption == "LNSM_CMI":
        score = LNSM_CMI(matrix=matrix, feature_matrix1=feature_matrix1, feature_matrix2=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_SMI":
        score = LNSM_SMI(matrix=matrix, feature_matrix1=feature_matrix1, feature_matrix2=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_WMK1":
        score = LNSM(matrix=matrix, feature_matrix=feature_matrix1, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    elif methodOption == "LNSM_WMK2":
        score = LNSM(matrix=matrix, feature_matrix=feature_matrix2, alpha=alpha, idx_train=existing_drug_idx, idx_test=target_idx)
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError that did not mention the cause.
        raise ValueError("unknown methodOption: %r" % (methodOption,))

    truth = matrix[:, target_idx].ravel()
    predicted = score[:, target_idx].ravel()

    print("proportion of ground truth:", truth.sum() / truth.size)

    print('---evaluation---')

    prec, recall, prthreshold = precision_recall_curve(truth, predicted)
    pr_auc_all = auc(recall, prec)

    fpr, tpr, rocthreshold = metrics.roc_curve(truth, predicted)
    roc_auc_all = auc(fpr, tpr)

    print("-----")

    print("AUC-PR all:", pr_auc_all)

    print("-----")

    print("AUC-ROC all:", roc_auc_all)

    print("-----")

    # The PR threshold row is padded with NaN where it is shorter than the
    # precision/recall rows.
    Out1 = pd.DataFrame([prec, recall, prthreshold])
    Out2 = pd.DataFrame([fpr, tpr, rocthreshold])
    return Out1, Out2

### Function for assigning arguments of CV and nested CV, as well as finding the best hyperparameters

In [None]:
def setvar_tune(size):
    """Reset the per-hyperparameter score buffers used while tuning.

    Creates the module-level arrays ``ALL_AUCPR_all`` and ``ALL_AUROC_all``,
    each filled with ``size`` zeros (one slot per candidate hyperparameter).
    """
    global ALL_AUCPR_all, ALL_AUROC_all

    ALL_AUCPR_all = np.zeros(size)
    ALL_AUROC_all = np.zeros(size)

In [None]:
def setvar_cv(FOLDS):
    """Reset the per-fold score buffers of the outer cross-validation.

    Creates the module-level arrays ``AUC_roc_all`` and ``AUC_pr_all``, each
    filled with ``FOLDS`` zeros.
    """
    global AUC_roc_all, AUC_pr_all

    AUC_roc_all = np.zeros(FOLDS)
    AUC_pr_all = np.zeros(FOLDS)

In [None]:
def asgvar_tune(idx, results):
    """Record the scores of one hyperparameter candidate.

    ``idx`` is the position of the candidate in the hyperparameter list;
    ``results`` is indexable with the AUPRC at position 0 and the AUROC at
    position 1. The values are written into the tuning buffers created by
    ``setvar_tune``.
    """
    ALL_AUCPR_all[idx] = results[0]  # area under the precision-recall curve
    ALL_AUROC_all[idx] = results[1]  # area under the ROC curve

In [None]:
def asgvar_cv(f, results):
    """Record the scores of one outer cross-validation fold.

    ``f`` is the fold number; ``results`` is indexable with the AUPRC at
    position 0 and the AUROC at position 1. The values are written into the
    buffers created by ``setvar_cv``.
    """
    AUC_pr_all[f] = results[0]   # area under the precision-recall curve
    AUC_roc_all[f] = results[1]  # area under the ROC curve

In [None]:
def tuning_results(tuneVar):
    """Pick the hyperparameter candidate with the highest recorded AUPRC.

    ``tuneVar`` is the candidate list, in the same order as the tuning buffer
    ``ALL_AUCPR_all``. Prints and returns ``(best candidate, its AUPRC)``.
    """
    best_pos = np.argmax(ALL_AUCPR_all)
    best_setting, best_score = tuneVar[best_pos], ALL_AUCPR_all[best_pos]

    print("best hyperpar: ", best_setting)
    print("AUPRC: ", best_score)

    return best_setting, best_score

In [None]:
def setvar_besttune(innerfolds):
    """Reset the buffers holding the best tuning result of each fold.

    Creates the module-level ``besttunevalue`` (array of best metric values)
    and ``besttunevar`` (list of the corresponding hyperparameters), each with
    ``innerfolds`` zero entries.
    """
    global besttunevalue, besttunevar
    besttunevalue = np.zeros(innerfolds)  # best metric value per fold
    besttunevar = np.zeros(innerfolds).tolist()  # plain list of the best hyperparameters

In [None]:
def asg_besttune(f, value, var):
    """Store the best metric ``value`` and hyperparameter ``var`` found in fold ``f``."""
    besttunevalue[f] = value  # metric achieved by the best candidate
    besttunevar[f] = var      # the candidate itself

In [None]:
def besttune():
    """Return ``(value, var)`` of the fold with the highest stored metric value."""
    best_fold = np.argmax(besttunevalue)
    return besttunevalue[best_fold], besttunevar[best_fold]

In [None]:
def cv_results():
    """Print mean and standard deviation of the per-fold scores and return them.

    Returns a 2-row array: row 0 is ``AUC_pr_all``, row 1 is ``AUC_roc_all``.
    """
    separator = "-----------"
    print(separator)

    for label, scores in (("Mean AUC_pr_all", AUC_pr_all), ("Mean AUC_roc_all", AUC_roc_all)):
        print(label, scores.mean(), " ", "Standard Deviation:", scores.std())

    print(separator)
    return np.array([AUC_pr_all, AUC_roc_all])

### Function for parallel computation

In [None]:
def tuning_loop(innermatrix, idx_train_inner, idx_test_inner, feature_matrix_inner1, feature_matrix_inner2, hyperparList, i):
    """Evaluate hyperparameter candidate ``i`` on one split and record its scores.

    Runs ``innerfold`` with ``alpha = hyperparList[i]``, using ``idx_test_inner``
    both as the target and as the held-out indices, and stores the result in
    the tuning buffers via ``asgvar_tune``. ``idx_train_inner`` is accepted but
    not used: ``innerfold`` derives the training columns itself.
    """
    scores = innerfold(
        idx_test_inner,
        idx_test_inner,
        feature_matrix1=feature_matrix_inner1,
        feature_matrix2=feature_matrix_inner2,
        alpha=hyperparList[i],
        matrix=innermatrix,
    )
    asgvar_tune(i, results=scores)

### Main function

In [None]:
def main(method_option,alpha=0.8,Validation="nested_cv",sets="intersect",a=0.8):
    """Load the data and run one experiment with the chosen LNSM variant.

    Parameters
    ----------
    method_option : str
        "LNSM_CMI", "LNSM_SMI", "LNSM_WMK1" (DGIdb features only) or
        "LNSM_WMK2" (fingerprint features only).
    alpha : float
        Not used; kept for backward compatibility of the call signature.
    Validation : str
        "nested_cv"  - 5-fold CV with a 4-fold inner tuning loop on the
                       development drugs; returns the per-fold AUPRC and
                       AUROC arrays.
        "cv"         - 5-fold tuning of alpha on the development drugs;
                       returns ``(best AUPRC, best alpha)``.
        "Validation" - trains on the development drugs and evaluates the
                       25% validation split with ``a``; returns
                       ``(AUPRC, AUROC)``.
        "plot"       - like "Validation" but returns the PR and ROC curve
                       tables from ``plotfold``.
    sets : str
        "union" keeps drugs with at least one non-empty feature row,
        "intersect" keeps drugs with both; any other value keeps all drugs.
    a : float
        alpha used by the "Validation" and "plot" modes.

    Raises
    ------
    ValueError
        If ``Validation`` is not one of the four modes above.
    """
    option(method_option)

    random.seed(1949) # for dataset split
    np.random.seed(1949) # for matrix initialization
    df = pd.read_csv("data/side-effect-and-drug_name_upper.tsv",sep = "\t")
    drug_name = df["drugbank_name"]
    side_effect = df["side_effect_name"]

    ## Build the bipartite side-effect/drug graph ##
    B = nx.DiGraph()
    B.add_nodes_from(side_effect,bipartite = 0)
    B.add_nodes_from(drug_name,bipartite = 1)
    B.add_edges_from(zip(side_effect, drug_name))
    drug_nodes = sorted(n for n, d in B.nodes(data=True) if d['bipartite']==1)
    side_effect_nodes = sorted(n for n, d in B.nodes(data=True) if d['bipartite']==0)

    ### Bi-adjacency matrix: rows are side effects, columns are drugs ###
    matrix_all = biadjacency_matrix(B, row_order = side_effect_nodes, column_order = drug_nodes)
    # ``.toarray()`` instead of ``.A``: the ``.A`` shortcut is no longer
    # available on sparse results in recent SciPy releases.
    matrix_all = matrix_all.toarray()
    m_all,n_all = matrix_all.shape # number of side effect # number of drug

    ### Hold out 25% of the drugs as validation set; the rest is the development set ###
    validate_sz = int(0.25 * n_all)
    IDX_all = list(range(n_all))
    random.shuffle(IDX_all)
    IDX_validate = sorted(IDX_all[0:validate_sz])
    print("first few validation set idx:")
    print(IDX_validate[0:10])
    IDX_validate_diff = np.setdiff1d(IDX_all, IDX_validate)
    matrix = matrix_all[:, IDX_validate_diff].copy()

    df1 = pd.read_csv("data/intersection_DGIdb_mat.tsv",sep = "\t")
    df2 = pd.read_csv("data/intersection_Fingerprint_mat.tsv",sep = "\t")
    featureMat1_all = FeaturePreprocess(df1, drug_nodes=drug_nodes)
    featureMat2_all = FeaturePreprocess(df2, drug_nodes=drug_nodes)
    featureMat1 = featureMat1_all[IDX_validate_diff, :].copy()
    featureMat2 = featureMat2_all[IDX_validate_diff, :].copy()

    # Development drugs with a non-empty row in at least one / in both feature matrices.
    non_zero_idx_union = np.hstack(np.where(~((featureMat1.sum(1) == 0) & (featureMat2.sum(1) == 0))))
    non_zero_idx_intersect = np.hstack(np.where(~(featureMat1.sum(1) == 0) & ~(featureMat2.sum(1) == 0)))
    if sets == "union":
        matrix = matrix[:, non_zero_idx_union].copy()
        featureMat1 = featureMat1[non_zero_idx_union, :].copy()
        featureMat2 = featureMat2[non_zero_idx_union, :].copy()
    elif sets == "intersect":
        non_zero_idx_intersect_all = np.hstack(np.where(~(featureMat1_all.sum(1) == 0) & ~(featureMat2_all.sum(1) == 0)))

        matrix_all = matrix_all[:, non_zero_idx_intersect_all].copy()
        featureMat1_all = featureMat1_all[non_zero_idx_intersect_all, :].copy()
        featureMat2_all = featureMat2_all[non_zero_idx_intersect_all, :].copy()

        matrix = matrix[:, non_zero_idx_intersect].copy()
        featureMat1 = featureMat1[non_zero_idx_intersect, :].copy()
        featureMat2 = featureMat2[non_zero_idx_intersect, :].copy()

        # Restrict both index sets to the retained drugs (still positions in the full drug list) ...
        IDX_validate = np.setdiff1d(non_zero_idx_intersect_all, IDX_validate_diff)
        IDX_validate_diff = np.setdiff1d(non_zero_idx_intersect_all, IDX_validate)

        drug_nodes_intersect_all = np.array(drug_nodes)[non_zero_idx_intersect_all]
        drug_nodes_intersect_validate_diff = np.array(drug_nodes)[IDX_validate_diff]
        drug_nodes_intersect_validate = np.array(drug_nodes)[IDX_validate]

        # ... and translate them into positions within the filtered ``matrix_all``.
        IDX_validate = np.array([x for x in range(len(drug_nodes_intersect_all)) if drug_nodes_intersect_all[x] in drug_nodes_intersect_validate])
        IDX_validate_diff = np.array([x for x in range(len(drug_nodes_intersect_all)) if drug_nodes_intersect_all[x] in drug_nodes_intersect_validate_diff])

    m,n = matrix.shape # number of side effect # number of drug

    # Re-seed so that the fold assignment does not depend on the code above.
    random.seed(1949) # for dataset split
    np.random.seed(1949) # for matrix initialization

    FOLDS = 5
    innerFOLDS = 4
    setvar_cv(FOLDS)

    sz = n
    IDX = list(range(sz))
    fsz = int(sz/FOLDS)
    random.shuffle(IDX)
    IDX = np.array(IDX)

    innersz = sz - fsz
    innerIDX = list(range(innersz))
    random.shuffle(innerIDX)
    innerIDX = np.array(innerIDX)
    innerfsz = int(innersz / innerFOLDS)

    # Candidate values of the LNSM trade-off parameter.
    hyperparList = np.arange(0.1, 1, 0.05).tolist()

    if Validation == "nested_cv":
        print("---------- nested cv start ----------")
        for f in range(FOLDS):
            offset = 0 + f*fsz
            idx_test = IDX[offset:offset + fsz]

            idx_train = IDX[np.setdiff1d(np.arange(len(IDX)), np.arange(offset,offset + fsz))]
            print("Fold:",f)
            innermatrix = matrix[:, idx_train]
            innerfeatureMat1 = featureMat1[idx_train, :].copy()
            innerfeatureMat2 = featureMat2[idx_train, :].copy()

            setvar_besttune(innerFOLDS)

            for innerf in range(innerFOLDS):
                # Fix: advance the inner offset with the inner fold number.
                # It used to stay at 0, so every inner fold evaluated the
                # same split.
                inneroffset = innerf * innerfsz
                idx_test_inner = innerIDX[inneroffset:inneroffset + innerfsz]
                idx_train_inner = innerIDX[np.setdiff1d(np.arange(len(idx_train)), np.arange(inneroffset,inneroffset + innerfsz))]

                print("Inner Fold:", innerf)

                setvar_tune(len(hyperparList))

                with parallel_backend('threading'):
                    Parallel(n_jobs=20)(delayed(tuning_loop)(innermatrix = innermatrix, idx_train_inner = idx_train_inner,
                        idx_test_inner = idx_test_inner, feature_matrix_inner1= innerfeatureMat1,
                        feature_matrix_inner2=innerfeatureMat2, hyperparList = hyperparList, i = i)
                        for i in range(len(hyperparList)))

                hyperpars, evalValue = tuning_results(tuneVar=hyperparList)

                asg_besttune(innerf, value=evalValue, var=hyperpars)

            _, bestHyperPars = besttune()

            print("--- tuning end ---")

            idx_target = idx_test
            print('target size:', len(idx_target))

            print("------ lambda: ", bestHyperPars, "------")

            results = fold(idx_target,idx_test,featureMat1,featureMat2,alpha=bestHyperPars,matrix=matrix)
            asgvar_cv(f=f, results=results)

        # cv_results() returns a 2-row array: per-fold AUPRC, then per-fold AUROC.
        pr_per_fold, roc_per_fold = cv_results()
        return pr_per_fold, roc_per_fold
    elif Validation == "cv":
        print("---------- cv start ----------")
        setvar_besttune(FOLDS)
        for f in range(FOLDS):
            offset = 0 + f*fsz
            idx_test = IDX[offset:offset + fsz]
            idx_train = IDX[np.setdiff1d(np.arange(len(IDX)), np.arange(offset,offset + fsz))]

            print("Fold:",f)

            setvar_tune(len(hyperparList))

            with parallel_backend('threading'):
                Parallel(n_jobs=20)(delayed(tuning_loop)(innermatrix = matrix, idx_train_inner = idx_train, idx_test_inner = idx_test,
                    feature_matrix_inner1= featureMat1, feature_matrix_inner2= featureMat2, hyperparList = hyperparList, i = i)
                    for i in range(len(hyperparList)))

            hyperpars, evalValue = tuning_results(tuneVar=hyperparList)

            asg_besttune(f, value=evalValue, var=hyperpars)

        print("--- tuning end ---")
        bestValue, bestHyperPars = besttune()
        # The selected value used to be computed and then dropped; return it
        # so that it can be passed on as ``a`` to the "Validation"/"plot" modes.
        return bestValue, bestHyperPars

    elif Validation == "Validation":
        idx_test = IDX_validate
        idx_target = idx_test
        print('target size:', len(idx_target))
        print("------ a: ", a, "------")

        results = fold(idx_target,idx_test,feature_matrix1=featureMat1_all, feature_matrix2=featureMat2_all,alpha=a,matrix=matrix_all)
        # Return the scores instead of discarding them.
        return results
    elif Validation == "plot":
        idx_test = IDX_validate
        idx_target = idx_test
        print('target size:', len(idx_target))
        print("------ a: ", a, "------")

        pr, roc = plotfold(idx_target,idx_test,feature_matrix1=featureMat1_all,feature_matrix2=featureMat2_all,alpha=a,matrix=matrix_all)

        return pr, roc
    else:
        raise ValueError("unknown Validation mode: %r" % (Validation,))

## 2. Nested CV and CV for LNSM-RLN family

### 2.1. Nested CV

Running the nested CV for DGI and Chem combined by CMI

In [None]:
# Nested CV, method "LNSM_CMI" (uses both feature matrices).
main(method_option="LNSM_CMI", Validation="nested_cv")

Running the nested CV for DGI and Chem combined by SMI

In [None]:
# Nested CV, method "LNSM_SMI" (uses both feature matrices).
main(method_option="LNSM_SMI", Validation="nested_cv")

Running the nested CV for feature DGI

In [None]:
# Nested CV, method "LNSM_WMK1" (first feature matrix only).
main(method_option="LNSM_WMK1", Validation="nested_cv")

Running the nested CV for feature Chem

In [None]:
# Nested CV, method "LNSM_WMK2" (second feature matrix only).
main(method_option="LNSM_WMK2", Validation="nested_cv")

### 2.2. CV to tune hyperparameters for independent test set

Running CV for DGI and Chem combined by CMI. The best hyperparameter is $\alpha=0.15$.

In [None]:
# Tune alpha by 5-fold CV, method "LNSM_CMI".
main(method_option="LNSM_CMI", Validation="cv")

Running CV for DGI and Chem combined by SMI. The best hyperparameter is $\alpha=0.1$.

In [None]:
# Tune alpha by 5-fold CV, method "LNSM_SMI".
main(method_option="LNSM_SMI", Validation="cv")

Running CV for DGI. The best hyperparameter is $\alpha=0.1$.

In [None]:
# Tune alpha by 5-fold CV, method "LNSM_WMK1".
main(method_option="LNSM_WMK1", Validation="cv")

Running CV for Chem. The best hyperparameter is $\alpha=0.15$.

In [None]:
# Tune alpha by 5-fold CV, method "LNSM_WMK2".
main(method_option="LNSM_WMK2", Validation="cv")

### 2.3. Independent test set

DGI and Chem combined by CMI

In [None]:
# Evaluate "LNSM_CMI" on the validation split with a=0.15.
main(method_option="LNSM_CMI", Validation="Validation", a=0.15)

DGI and Chem combined by SMI

In [None]:
# Evaluate "LNSM_SMI" on the validation split with a=0.1.
main(method_option="LNSM_SMI", Validation="Validation", a=0.1)

DGI

In [None]:
# Evaluate "LNSM_WMK1" on the validation split with a=0.1.
main(method_option="LNSM_WMK1", Validation="Validation", a=0.1)

Chem

In [None]:
# Evaluate "LNSM_WMK2" on the validation split with a=0.15.
main(method_option="LNSM_WMK2", Validation="Validation", a=0.15)

### * Save data for PR and ROC

In [None]:
# PR and ROC curve tables for "LNSM_CMI" (a=0.15); transposed so that each
# CSV row is one curve point.
LNSM_CMI_RLN_pr, LNSM_CMI_RLN_roc = main(
    method_option="LNSM_CMI", Validation="plot", a=0.15
)
LNSM_CMI_RLN_pr.T.to_csv("Figs/LNSM_CMI_RLN_pr.csv", index=False)
LNSM_CMI_RLN_roc.T.to_csv("Figs/LNSM_CMI_RLN_roc.csv", index=False)

In [None]:
# PR and ROC curve tables for "LNSM_SMI" (a=0.1); transposed so that each
# CSV row is one curve point.
LNSM_SMI_RLN_pr, LNSM_SMI_RLN_roc = main(
    method_option="LNSM_SMI", Validation="plot", a=0.1
)
LNSM_SMI_RLN_pr.T.to_csv("Figs/LNSM_SMI_RLN_pr.csv", index=False)
LNSM_SMI_RLN_roc.T.to_csv("Figs/LNSM_SMI_RLN_roc.csv", index=False)

In [None]:
# PR and ROC curve tables for "LNSM_WMK1" (a=0.1); transposed so that each
# CSV row is one curve point.
LNSM_WMK1_RLN_pr, LNSM_WMK1_RLN_roc = main(
    method_option="LNSM_WMK1", Validation="plot", a=0.1
)
LNSM_WMK1_RLN_pr.T.to_csv("Figs/LNSM_WMK1_RLN_pr.csv", index=False)
LNSM_WMK1_RLN_roc.T.to_csv("Figs/LNSM_WMK1_RLN_roc.csv", index=False)

In [None]:
# PR and ROC curve tables for "LNSM_WMK2" (a=0.15); transposed so that each
# CSV row is one curve point.
LNSM_WMK2_RLN_pr, LNSM_WMK2_RLN_roc = main(
    method_option="LNSM_WMK2", Validation="plot", a=0.15
)
LNSM_WMK2_RLN_pr.T.to_csv("Figs/LNSM_WMK2_RLN_pr.csv", index=False)
LNSM_WMK2_RLN_roc.T.to_csv("Figs/LNSM_WMK2_RLN_roc.csv", index=False)