# Evaluation Results of Adverse Drug Reactions (ADRs) in SIDER

Import necessary modules: 

In [1]:

import numpy as np
import pandas as pd
from ADRprofilePrediction import Pairs2Mat, evaluation
from Models import loadHyperpar
import json

In [2]:
# Show which scikit-learn version is installed in this kernel.
import sklearn
print(sklearn.__version__)

0.24.2


## Load data

Load the feature data into a dictionary. The drug-target, drug-enzyme, drug-chemical structure fingerprint, drug-gene interaction, drug-transporter, drug-pathway and drug-indication features are included.

In [3]:
# Map each feature name to its feature table.
# Every feature except "Chem" is built by Pairs2Mat from a two-column pair file
# (presumably one drug/attribute pair per row -- confirm in ADRprofilePrediction);
# "Chem" is read directly as a tab-separated table indexed by its first column.
features_dict = {}
features_dict["target"] = Pairs2Mat(path="data/drug_target.tsv", colname1="0", colname2="1")
features_dict["enzyme"] = Pairs2Mat(path="data/drug_enzyme.tsv", colname1="0", colname2="1")
features_dict["Chem"] = pd.read_csv("data/drug_chemsfp.tsv", sep="\t", header=0, index_col=0)
features_dict["DGI"] = Pairs2Mat(
    path="data/interactions.tsv",
    colname1="drug_claim_name",
    colname2="gene_name",
)
features_dict["transporter"] = Pairs2Mat(path="data/drug_transporter.tsv", colname1="0", colname2="1")
features_dict["pathway"] = Pairs2Mat(path="data/drug_pathway.tsv", colname1="0", colname2="1")
features_dict["indication"] = Pairs2Mat(path="data/drug_indication.tsv", colname1="1_x", colname2="6")


Load ADR data from SIDER and OFFSIDES. The variable `SEs` is a dict that stores the ADR data. The variable `filter` controls which ADRs are kept according to their frequency (the column sum of the ADR matrix). When `filter` is "all", only ADRs with an extremely low frequency (less than 5) are removed; when `filter` is "rare", only ADRs with a frequency of less than 50 are used.


In [4]:
# `filter` selects which ADR columns are kept, based on each column's sum:
#   "all"  -> keep columns whose sum is at least 5
#   "rare" -> keep columns whose sum is below 50
# NOTE: the name shadows the Python builtin `filter`; it is kept because later
# cells interpolate it into the output file names.
filter = "all"

if filter == "all":
    def keep_columns(column_sums):
        """Return a boolean mask of the columns whose sum is at least 5."""
        return column_sums >= 5
elif filter == "rare":
    def keep_columns(column_sums):
        """Return a boolean mask of the columns whose sum is below 50."""
        return column_sums < 50
else:
    # Fail here instead of leaving SEs empty and hitting a KeyError on SEs[...] later.
    raise ValueError(f'Unknown filter {filter!r}; expected "all" or "rare".')

# Both datasets are loaded the same way whichever filter is selected.
SIDER = Pairs2Mat(path="data/drug_se.tsv", colname1="1_x", colname2="5")
OFFSIDERS = Pairs2Mat(
    path="data/OFFSIDES.csv",
    colname1="drug_concept_name",
    colname2="condition_concept_name",
    sep=",",
)

# SEs maps the dataset name to its column-filtered ADR matrix.
SEs = {}
for dataset_name, adr_matrix in (("SIDER", SIDER), ("OFFSIDES", OFFSIDERS)):
    column_sums = np.sum(adr_matrix, axis=0)
    SEs[dataset_name] = adr_matrix.loc[:, keep_columns(column_sums)]


## Set variables

The variables below include all the options for the code.

- features_names: This variable is the list of all the features, including the target feature, the enzyme feature, the chemical structure fingerprint (Chem), the drug-gene interaction (DGI), the transporter feature, the pathway feature and the indication feature.
- SE_names: ADR data from SIDER.
- methods: This option is machine learning methods used for prediction. Only the top three methods were compared, SKR, VKR and KRR.
- metrice_names: Metric we used to evaluate the performance of methods: AUPR, AUROC, AUPR per drug, AUROC per drug, AUPR+AUROC and AUPR+AUROC per drug.
- SEs_name: The ADR dataset used in this file.
- metrice: The tuning metric; we used AUPR as the tuning metric in Nested CV and CV.

In [7]:
# Feature sets to evaluate; each name is a key of features_dict.
features_names = [
    "target",
    "enzyme",
    "Chem",
    "DGI",
    "transporter",
    "pathway",
    "indication",
]

# ADR dataset evaluated in this notebook.
# Datasets loaded into SEs: "SIDER", "OFFSIDES".
SEs_name = "SIDER"

# Prediction methods compared in this notebook.
# Full list of method names, for reference:
#   "SKR", "KR", "KRR", "Naive", "LNSM_RLN", "LNSM_jaccard",
#   "VKR", "SVM", "OCCA", "SCCA", "RF", "BRF"
methods = [
    "SKR",
    "KRR",
    "VKR",
    "Naive",
    "LNSM_RLN",
    "LNSM_jaccard",
]

# Metric used for hyperparameter tuning.
# Possible tuning metrics: "AUROC", "AUPR", "AUROCperdrug", "AUPRperdrug".
metrice = "AUPR"

# Metrics reported in the result tables, in display order.
metrice_names = [
    "AUPR+AUROC",
    "AUPR+AUROCperdrug",
    "AUROC",
    "AUPR",
    "AUROCperdrug",
    "AUPRperdrug",
]

Set the variables for the hyperparameters. We summarize three types of hyperparameters (SVM, RF and BRF are not competitive and are time-consuming, so they were tuned and trained in a separate file -- SVM_RF.ipynb):
 - A: These hyperparameters are tuned over powers of ten, $10^{-2}, 10^{-1}, \dots, 10^{2}$ ($\sigma_X$ and $\sigma_Y$ do not change during tuning, so they are fixed at $10$ and $100$, respectively).
 - B: These hyperparameters lie in $[0,1]$ and are tuned over the grid $0.1, 0.2, \dots, 0.9$.
 - C: These hyperparameters are tuned over $5, 10, 15$.

In [8]:
# Type A grid: powers of ten from 10^-2 to 10^2.
A = 10 ** np.arange(-2, 3, 1, dtype=float)
# Single-value grids holding 10 and 100 for the two SKR entries that are not tuned.
A10 = 10 ** np.arange(1, 2, 1, dtype=float)
A100 = 10 ** np.arange(2, 3, 1, dtype=float)

# Type B grid: 0.1, 0.2, ..., 0.9 (the stop value 1 is excluded by arange).
B = np.arange(0.1, 1, 0.1, dtype=float)

# Type C grid: 5, 10, 15.
C = np.arange(5, 20, 5, dtype=int)

# Per-method list of candidate grids, one grid per hyperparameter, in the order
# they are unpacked into loadHyperpar.
# SVM ([A, A, A]), RF ([C]) and BRF ([C]) are intentionally left out here.
all_hyperparlist = dict(
    SKR=[A, B, A10, A100],
    KRR=[A, A],
    VKR=[A, A, C],
    Naive=[],
    LNSM_RLN=[B, A],
    LNSM_jaccard=[B],
)

Set dictionaries to store the tuned hyperparameters and the results of CV and Nested CV.

In [10]:
# Containers filled by the evaluation loop, keyed first by validation scheme
# ("nested_cv" or "cv"); each scheme starts with its own empty dict.
hyperparsOut = {"nested_cv": {}, "cv": {}}
results = {"nested_cv": {}, "cv": {}}

## Nested CV and CV

Load tuned hyperparameters. If fully rerunning the tuning step of Nested CV and CV is required, please skip loading variable hyperpars and remove the option `hyperparfixed` of the function `evaluation()`.

In [55]:
# Read the previously tuned hyperparameters.
# The file content is JSON even though the file name ends in ".xml".
hyperpars_path = f'results/hyperpars_{SEs_name}.xml'
with open(hyperpars_path, 'r') as hyperpars_file:
    hyperpars = json.load(hyperpars_file)

In [17]:
# For every method, evaluate each feature set under nested CV and then under
# plain CV, passing the previously loaded hyperparameters via `hyperparfixed`.
# The first value returned by evaluation() is stored in `results` and the
# second in `hyperparsOut`, both keyed by validation scheme, method and feature.
for method in methods:
    for validation in ("nested_cv", "cv"):
        hyperparsOut[validation][method] = {}
        results[validation][method] = {}
        # The loop variable is deliberately not called `str`: that would rebind
        # the builtin for the rest of the kernel session.
        for feature_name in features_names:
            print(f"using feature {feature_name}")
            # Load the hyperparameter combination
            hyperparList = loadHyperpar(*all_hyperparlist[method], method_option=method)
            scores, tuned_hyperpars = evaluation(
                Y=SEs[SEs_name],
                X=features_dict[feature_name],
                method_option=method,
                tuning_metrice=metrice,
                hyperparList=hyperparList,
                hyperparfixed=hyperpars[validation][method][feature_name],
                Validation=validation,
                n_jobs=1,
            )
            results[validation][method][feature_name] = scores
            hyperparsOut[validation][method][feature_name] = tuned_hyperpars

using feature target
The SKR requires hyperparameter lambda, c, sigma_X, sigma_Y
---------- nested cv start ----------
Fold: 0
number of hyperpars combination:  45
first few training idx:  [ 57  75 258 281 294 362 474 479 680 698]
first few testing idx:  [ 70 151 209 236 411 438 439 625 657 704]
--- tuning end ---
target size: 142
------ best hyper pars:  (0.01, 0.4, 10, 100) ------
SKR starts:
SKR ends:
-----------
AUPRperdrug: 0.46165763532831294
AUROCperdrug: 0.8978972570529985
AUPR+AUROCperdrug: 1.3595548923813114
AUPR: 0.41537133982867747
AUROC: 0.8687649857050729
AUPR+AUROC: 1.2841363255337503
-----------
Fold: 1
number of hyperpars combination:  45
first few training idx:  [ 70 151 209 236 411 438 439 625 657 704]
first few testing idx:  [ 57  75 258 281 294 362 474 479 680 698]
--- tuning end ---
target size: 142
------ best hyper pars:  (0.01, 0.4, 10, 100) ------
SKR starts:
SKR ends:
-----------
AUPRperdrug: 0.46629645108222045
AUROCperdrug: 0.9021824148263183
AUPR+AUROCperd



VKR ends:
-----------
AUPRperdrug: 0.4414199494233323
AUROCperdrug: 0.8888652896768148
AUPR+AUROCperdrug: 1.3302852391001472
AUPR: 0.3951850082752013
AUROC: 0.8752621672235312
AUPR+AUROC: 1.2704471754987325
-----------
Fold: 3
number of hyperpars combination:  75
first few training idx:  [ 70 151 209 236 411 438 439 625 657 704]
first few testing idx:  [ 14  21  36  93 153 340 352 403 418 538]
--- tuning end ---
target size: 142
------ best hyper pars:  (10, 0.01, 15) ------
VKR starts:
VKR ends:
-----------
AUPRperdrug: 0.47833690246522237
AUROCperdrug: 0.8972055841573693
AUPR+AUROCperdrug: 1.3755424866225916
AUPR: 0.4275471452853586
AUROC: 0.8737962850631014
AUPR+AUROC: 1.30134343034846
-----------
Fold: 4
number of hyperpars combination:  75
first few training idx:  [ 70 151 209 236 411 438 439 625 657 704]
first few testing idx:  [ 15  50 158 198 208 278 541 564 579 686]
--- tuning end ---
target size: 142
------ best hyper pars:  (10, 0.01, 15) ------
VKR starts:
VKR ends:
-------



VKR ends:
-----------
AUPRperdrug: 0.4503620748771144
AUROCperdrug: 0.8859509594708835
AUPR+AUROCperdrug: 1.3363130343479979
AUPR: 0.41586098158259405
AUROC: 0.8756739195173937
AUPR+AUROC: 1.2915349010999877
-----------
Fold: 4
number of hyperpars combination:  75
first few training idx:  [123 155 179 212 231 458 481 485 510 523]
first few testing idx:  [ 49  83 163 209 214 253 278 387 466 479]
--- tuning end ---
target size: 108
------ best hyper pars:  (10, 1, 15) ------
VKR starts:
VKR ends:
-----------
AUPRperdrug: 0.43902551850450117
AUROCperdrug: 0.8828594219542886
AUPR+AUROCperdrug: 1.3218849404587898
AUPR: 0.38343629758930986
AUROC: 0.8539107291954282
AUPR+AUROC: 1.2373470267847382
-----------
Mean AUPRperdrug: 0.4438169187289612, std: 0.012550276496160731
Mean AUROCperdrug: 0.8851815177814834, std: 0.0020786082582573556
Mean AUPR+AUROCperdrug: 1.3289984365104446, std: 0.014206997525150606
Mean AUPR: 0.39821476956867646, std: 0.024372974363863285
Mean AUROC: 0.8678675386677372,



VKR ends:
-----------
AUPRperdrug: 0.4516524710091313
AUROCperdrug: 0.8967192125001915
AUPR+AUROCperdrug: 1.3483716835093227
AUPR: 0.4032362624419107
AUROC: 0.874010357419414
AUPR+AUROC: 1.2772466198613248
-----------
Fold: 2
number of hyperpars combination:  75
first few training idx:  [ 94 124 142 193 241 339 379 438 651 689]
first few testing idx:  [  5 106 111 220 227 306 395 463 520 538]
--- tuning end ---
target size: 150
------ best hyper pars:  (10, 0.1, 15) ------
VKR starts:
VKR ends:
-----------
AUPRperdrug: 0.45234098226399716
AUROCperdrug: 0.8972715794191863
AUPR+AUROCperdrug: 1.3496125616831836
AUPR: 0.4361310366318718
AUROC: 0.8880710401596048
AUPR+AUROC: 1.3242020767914766
-----------
Fold: 3
number of hyperpars combination:  75
first few training idx:  [ 94 124 142 193 241 339 379 438 651 689]
first few testing idx:  [ 42 120 130 249 307 417 426 437 654 706]
--- tuning end ---
target size: 150
------ best hyper pars:  (10, 0.01, 15) ------
VKR starts:
VKR ends:
-------

## Save Output

Store the tuned hyperparameters for reproducing results.

In [24]:
# with open(f'results/hyperpars_{SEs_name}.xml', 'w') as xml_file:
#    json.dump(hyperparsOut, xml_file)

Store the results of Nested CV and CV.

In [25]:
# Write the collected results as JSON; the file name keeps the ".xml" suffix
# that the loading cell expects.
results_path = f'results/results_{SEs_name}_{filter}.xml'
with open(results_path, 'w') as results_file:
    json.dump(results, results_file)

## Reorganize the Results and Calculate the P-value of Method Comparison

Load the results of Nested CV and CV.

In [19]:
# Replace `results` with the content of the saved results file
# (JSON stored under a ".xml" file name).
results_path = f'results/results_{SEs_name}_{filter}.xml'
with open(results_path, 'r') as results_file:
    results = json.load(results_file)

Organize the results of Nested CV into a table.

In [20]:
# Build a long-format table with one row per (method, feature, metric, score).
# `scores` must be list-like here (presumably one value per outer fold); the
# scalar labels are broadcast over its length.
# The per-combination frames are collected first and concatenated once, rather
# than re-copying a growing DataFrame on every iteration.
score_frames = []
for method_name, per_feature in results["nested_cv"].items():
    for feature_name, per_metric in per_feature.items():
        for metric_name, scores in per_metric.items():
            score_frames.append(pd.DataFrame({
                'method': method_name,
                'feature': feature_name,
                'metric': metric_name,
                "score": scores
            }))
df = pd.concat(score_frames, ignore_index=True)

# Ordered categoricals fix the row and column order of the pivoted table.
# custom_order is reused by the CV table cell below.
custom_order = ["pathway","Chem", "DGI",  "indication", "target", "transporter", "enzyme"]
df['feature'] = pd.Categorical(df['feature'], categories=custom_order, ordered=True)
df['method'] = pd.Categorical(df['method'], categories=methods, ordered=True)
df['metric'] = pd.Categorical(df['metric'], categories=metrice_names, ordered=True)

# Mean and standard deviation of the scores for every (feature, method) row and metric column.
df2 = pd.pivot_table(df, values=['score'], index=["feature", "method"], aggfunc={'score': ["mean","std"]}, columns=["metric"])
df3 = df2.sort_index(axis=1, level='metric').sort_index(level='feature')
df3.to_excel(f'results/nested_cv_results_{SEs_name}_{filter}.xlsx')
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,score,score,score,score,score,score,score,score,score,score,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Unnamed: 0_level_2,metric,AUPR+AUROC,AUPR+AUROC,AUPR+AUROCperdrug,AUPR+AUROCperdrug,AUROC,AUROC,AUPR,AUPR,AUROCperdrug,AUROCperdrug,AUPRperdrug,AUPRperdrug
feature,method,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
pathway,SKR,1.334705,0.025306,1.404288,0.013062,0.87733,0.007133,0.457375,0.018828,0.90262,0.003697,0.501668,0.009512
pathway,KRR,1.314502,0.026516,1.392919,0.015386,0.863253,0.007199,0.451248,0.019692,0.892696,0.00452,0.500223,0.010999
pathway,VKR,1.319026,0.02429,1.375027,0.013385,0.879535,0.005165,0.439491,0.019223,0.89821,0.0032,0.476818,0.010276
pathway,Naive,1.242813,0.016862,1.31398,0.009992,0.859248,0.006301,0.383564,0.011161,0.880223,0.003061,0.433758,0.00778
pathway,LNSM_RLN,1.195971,0.056031,1.274742,0.031226,0.806585,0.025313,0.389386,0.031209,0.826507,0.014926,0.448235,0.017623
pathway,LNSM_jaccard,0.826184,0.109095,0.979166,0.077062,0.632716,0.062644,0.193468,0.048195,0.644224,0.049228,0.334942,0.0298
Chem,SKR,1.275559,0.030553,1.356358,0.023997,0.867037,0.009402,0.408522,0.022789,0.893836,0.004003,0.462522,0.020765
Chem,KRR,1.259418,0.030815,1.344612,0.021969,0.854482,0.01033,0.404936,0.022589,0.884176,0.003746,0.460436,0.019664
Chem,VKR,1.273031,0.0304,1.34857,0.025293,0.871534,0.00866,0.401497,0.022613,0.895063,0.00436,0.453507,0.021192
Chem,Naive,1.240348,0.025632,1.314588,0.025718,0.864022,0.008082,0.376326,0.019921,0.884678,0.00566,0.42991,0.020848


Organize the results of CV into a table.

In [21]:
# Build a long-format table with one row per (method, feature, metric) CV score.
# Each combination becomes a one-row frame (hence the explicit single-label
# index); the frames are collected and concatenated once instead of re-copying
# a growing DataFrame on every iteration.
score_frames = []
for method_name, per_feature in results["cv"].items():
    for feature_name, per_metric in per_feature.items():
        for metric_name, scores in per_metric.items():
            score_frames.append(pd.DataFrame({
                'method': method_name,
                'feature': feature_name,
                'metric': metric_name,
                "score": scores
            },index=["1"]))
df = pd.concat(score_frames, ignore_index=True)

# Ordered categoricals fix the row and column order of the pivoted table.
# custom_order is defined in the nested-CV table cell above.
df['feature'] = pd.Categorical(df['feature'], categories=custom_order, ordered=True)
df['method'] = pd.Categorical(df['method'], categories=methods, ordered=True)
df['metric'] = pd.Categorical(df['metric'], categories=metrice_names, ordered=True)

# One row per (feature, method), one column per metric.
df2 = pd.pivot_table(df, values=['score'], index=["feature", "method"], columns="metric")
df2.to_excel(f'results/cv_results_{SEs_name}_{filter}.xlsx')
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,score,score,score,score,score,score
Unnamed: 0_level_1,metric,AUPR+AUROC,AUPR+AUROCperdrug,AUROC,AUPR,AUROCperdrug,AUPRperdrug
feature,method,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
pathway,SKR,1.258435,1.345329,0.859145,0.39929,0.886235,0.459094
pathway,KRR,1.26042,1.345745,0.859982,0.400438,0.886422,0.459323
pathway,VKR,1.257367,1.343312,0.858144,0.399223,0.884432,0.458879
pathway,Naive,1.258257,1.343673,0.858629,0.399628,0.885073,0.4586
pathway,LNSM_RLN,1.233189,1.295248,0.828848,0.404342,0.833196,0.462052
pathway,LNSM_jaccard,0.406648,0.526215,0.332758,0.07389,0.376094,0.150122
Chem,SKR,1.26503,1.353039,0.870026,0.395004,0.896757,0.456282
Chem,KRR,1.253668,1.339958,0.864452,0.389216,0.890766,0.449191
Chem,VKR,1.263963,1.344892,0.865968,0.397995,0.890499,0.454393
Chem,Naive,1.244534,1.331759,0.85997,0.384565,0.886647,0.445113


Calculate the P-value for method comparison based on the result of nested CV.

In [22]:
# Build a wide table with one row per (method, feature, metric): three label
# columns followed by one column per nested-CV score (integer labels 0, 1, ...).
# Rows are collected first and concatenated once, rather than re-copying a
# growing DataFrame on every iteration.
score_rows = []
for method_name, per_feature in results["nested_cv"].items():
    for feature_name, per_metric in per_feature.items():
        for metric_name, scores in per_metric.items():
            labels = pd.DataFrame({
                'method': method_name,
                'feature': feature_name,
                'metric': metric_name,
            }, index=["1"])
            # Transposing turns the score list into a single row sharing the index label "1".
            score_row = pd.DataFrame(scores, columns=["1"]).T
            score_rows.append(pd.concat([labels, score_row], axis=1))
df = pd.concat(score_rows, ignore_index=True)

# For every metric and feature, compare the methods pairwise and save the p-values.
for metric_name in metrice_names:
    for feature_name in features_names:
        selected = df[(df["metric"] == metric_name) & (df["feature"] == feature_name)]
        # Keep the method label and every score column; selecting by name avoids
        # hard-coding the number of folds through column positions.
        scores_by_method = selected.drop(columns=["feature", "metric"]).set_index("method")
        # After transposing, each column is one method and each row one score.
        # NOTE(review): DataFrame.ptests is not part of pandas itself; presumably it is
        # registered by pingouin -- confirm that package is imported (directly or via a
        # project module) before this cell runs.
        pvalues = scores_by_method.T.ptests(paired=True, stars=False)
        pvalues.to_excel(f'results/pvalue_{SEs_name}_{filter}_{feature_name}_{metric_name}.xlsx')