In [2]:

from itertools import product,combinations
import numpy as np
import pandas as pd
from ADRprofilePrediction import Pairs2Mat,evaluation3
from APPFC import completion
from Models import loadHyperpar
import seaborn as sns 
import matplotlib.pylab as plt
from matplotlib.ticker import MultipleLocator
import matplotlib.patheffects as path_effects
import json
import pingouin as pg
from functools import reduce

In [3]:

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to their built-in Python equivalents.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps`` when the
    payload contains NumPy scalars or arrays.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Handles NumPy booleans, integers, floats and arrays; anything else is
        delegated to ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        # np.bool_ is neither a Python bool nor an np.integer, so it needs its
        # own branch; without it json.dumps fails on NumPy boolean scalars.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [4]:
# Print the installed scikit-learn version so the run environment is recorded
# alongside the results.
import sklearn
print(sklearn.__version__)

0.24.2


In [5]:
# Key into the `SEs` dict selecting which side-effect matrix is used as Y.
SEs_name = "SIDER"
# Tuning metric passed to `completion(...)`. NOTE(review): the evaluation loop
# later rebinds `metrice` to "AUPR", so re-running the completion cell after
# that loop would silently use a different metric.
metrice = "mse"

In [6]:
# Drug feature matrices keyed by feature name. The insertion order matters:
# it determines the order of the feature pairs generated later with
# `combinations(...)`. Every entry is built from a pair table via `Pairs2Mat`,
# except "Chem", which is read directly as a tab-separated table.
features_dict = {
    "target": Pairs2Mat(
        path="data/drug_target.tsv",
        colname1="0",
        colname2="1",
    ),
    "enzyme": Pairs2Mat(
        path="data/drug_enzyme.tsv",
        colname1="0",
        colname2="1",
    ),
    "Chem": pd.read_csv(
        "data/drug_chemsfp.tsv",
        sep="\t",
        header=0,
        index_col=0,
    ),
    "DGI": Pairs2Mat(
        path="data/interactions.tsv",
        colname1="drug_claim_name",
        colname2="gene_name",
    ),
    "transporter": Pairs2Mat(
        path="data/drug_transporter.tsv",
        colname1="0",
        colname2="1",
    ),
    "pathway": Pairs2Mat(
        path="data/drug_pathway.tsv",
        colname1="0",
        colname2="1",
    ),
    "indication": Pairs2Mat(
        path="data/drug_indication.tsv",
        colname1="1_x",
        colname2="6",
    ),
}


In [7]:
# Which side-effect columns to keep:
#   "all"  -> columns whose column sum is at least 5
#   "rare" -> columns whose column sum is below 50
# NOTE(review): `filter` shadows the built-in of the same name; the name is
# kept because other cells may read it.
filter = "all"
SEs = {}

# Both modes load the same two tables and differ only in the column predicate,
# so the predicate is selected once and the loading code is written once.
_column_filters = {
    "all": lambda sums: sums >= 5,
    "rare": lambda sums: sums < 50,
}
if filter not in _column_filters:
    # Previously an unknown value left `SEs` empty without any message, which
    # only surfaced later as a KeyError on SEs[SEs_name].
    raise ValueError(
        f"Unknown filter {filter!r}; expected one of {sorted(_column_filters)}"
    )
_keep_columns = _column_filters[filter]

SIDER = Pairs2Mat(path="data/drug_se.tsv", colname1="1_x", colname2="5")
column_sums = np.sum(SIDER, axis=0)
SEs["SIDER"] = SIDER.loc[:, _keep_columns(column_sums)]

OFFSIDES = Pairs2Mat(
    path="data/OFFSIDES.csv",
    colname1="drug_concept_name",
    colname2="condition_concept_name",
    sep=",",
)
column_sums = np.sum(OFFSIDES, axis=0)
SEs["OFFSIDES"] = OFFSIDES.loc[:, _keep_columns(column_sums)]


In [8]:
# Sanity check: dimensions of the filtered SIDER matrix.
SEs["SIDER"].shape

(1344, 2556)

In [9]:
# Sanity check: dimensions of the filtered OFFSIDES matrix.
SEs["OFFSIDES"].shape

(2730, 12750)

In [10]:

# Row labels present in every feature matrix, obtained by folding
# Index.intersection over all matrices, seeded with the "target" index.
df_dict = features_dict
intersection_indices = reduce(
    lambda shared, frame: shared.intersection(frame.index),
    df_dict.values(),
    df_dict['target'].index,
).tolist()
print("Intersection indices:", intersection_indices)
print("Size of intersection set", len(intersection_indices))

Intersection indices: ['ACETAMINOPHEN', 'ACETAZOLAMIDE', 'ALFENTANIL', 'ALLOPURINOL', 'AMANTADINE', 'AMIODARONE', 'AMITRIPTYLINE', 'AMLODIPINE', 'AMOXICILLIN', 'AMPRENAVIR', 'ANASTROZOLE', 'APIXABAN', 'APOMORPHINE', 'ARIPIPRAZOLE', 'ATAZANAVIR', 'ATENOLOL', 'AZATHIOPRINE', 'AZELASTINE', 'AZITHROMYCIN', 'BENAZEPRIL', 'BENZOCAINE', 'BEPRIDIL', 'BETAMETHASONE', 'BEZAFIBRATE', 'BICALUTAMIDE', 'BISOPROLOL', 'BOSENTAN', 'BOSUTINIB', 'BROMOCRIPTINE', 'BUMETANIDE', 'BUPRENORPHINE', 'BUSPIRONE', 'CABERGOLINE', 'CANAGLIFLOZIN', 'CANDESARTAN', 'CARBAMAZEPINE', 'CARBOPLATIN', 'CARVEDILOL', 'CEFACLOR', 'CEFAZOLIN', 'CEFTRIAXONE', 'CELECOXIB', 'CERIVASTATIN', 'CHLORAMBUCIL', 'CHLORPROMAZINE', 'CHLORPROPAMIDE', 'CIDOFOVIR', 'CIMETIDINE', 'CINOXACIN', 'CIPROFLOXACIN', 'CISPLATIN', 'CITALOPRAM', 'CLADRIBINE', 'CLARITHROMYCIN', 'CLOBAZAM', 'CLOMIPRAMINE', 'CLONIDINE', 'CLOPIDOGREL', 'CLOTRIMAZOLE', 'CODEINE', 'COLCHICINE', 'CRIZOTINIB', 'CYPROHEPTADINE', 'DAPAGLIFLOZIN', 'DARUNAVIR', 'DASATINIB', 'DESIP

In [11]:
# Feature names, in the same order as the keys of `features_dict`.
features_names = ["target", "enzyme", "Chem", "DGI", "transporter", "pathway", "indication"]
# SEs_names = ["SIDER", "OFFSIDES"]
# Alternative method lists kept for reference; only the uncommented one is active.
# methods = ["SKR", "KR", "KRR", "Naive", "LNSM_RLN", "LNSM_jaccard", "VKR"]
methods = ["SKR", "KRR", "VKR", "Naive", "LNSM_RLN", "LNSM_jaccard"]
# methods = ["SKR", "KR", "KRR", "Naive", "LNSM_RLN", "LNSM_jaccard", "VKR", "SVM", "OCCA", "SCCA", "RF", "BRF"]
# Metric names available for hyperparameter tuning.
tuning_metrices=["AUROC", "AUPR", "AUROCperdrug", "AUPRperdrug"]
# Metric labels; the reporting code uses this list as the category order of
# the "metric" column.
metrice_names = ["AUPR+AUROC", "AUPR+AUROCperdrug", "AUROC", "AUPR", "AUROCperdrug", "AUPRperdrug"]

In [12]:
# Candidate hyperparameter grids.
# A: powers of ten from 1e-2 to 1e2 (5 values).
A = 10**np.arange(-2, 3, 1, dtype=float)
# C: 5, 55, 105, 155.
C = np.arange(5, 205, 50, dtype=int)
# C2: 0.1 to 0.9 in steps of 0.1 (9 values).
C2 = np.arange(0.1, 1, 0.1, dtype=float)
# Per-method list of grids, unpacked positionally into `loadHyperpar`.
# For MKRR the run log reports 225 combinations (5 * 5 * 9), i.e. the
# product of the three grids.
all_hyperparlist = {
    "MKRR":[A,A,C2],
    "TNMF":[C],
    "TWNMF":[C]
}

In [13]:
# Empty containers keyed by validation scheme ("nested_cv" and "cv").
# Each literal creates its own inner dicts, so nothing is shared between them.
# NOTE(review): the "f" prefix presumably stands for a feature-level run; confirm.
fhyperpars = {"nested_cv": {}, "cv": {}}
fhyperparsOut = {"nested_cv": {}, "cv": {}}
fresults = {"nested_cv": {}, "cv": {}}

In [14]:
# Run `completion` with the TNMF method on all feature matrices, using the
# selected side-effect matrix as Y. The result, `features_new`, is later
# indexed by feature name to obtain the "APPFC" inputs.
# NOTE(review): `metrice` is "mse" here only when cells run top to bottom; the
# evaluation loop further down rebinds it to "AUPR".
method = "TNMF"
validation = "completion"
hyperparList = loadHyperpar(*all_hyperparlist[method],method_option=method)
# hyperparfixed=(105,) presumably pins the TNMF hyperparameter k to 105 (one of
# the values in grid C) instead of tuning it; confirm against `completion`.
features_new = completion(Y=SEs[SEs_name], X=features_dict,method_option=method,tuning_metrice=metrice,hyperparList=hyperparList,hyperparfixed=(105,),Validation=validation,n_jobs=1)

The TNMF requires hyperparameter k
common drugs chosen to be the test set:  set()
TNMF starts:
TNMF ends:


In [15]:
# Containers filled by the evaluation loop, keyed first by validation scheme
# ("nested_cv" / "cv"). Each literal creates independent inner dicts.
hyperpars = {"nested_cv": {}, "cv": {}}
hyperparsOut = {"nested_cv": {}, "cv": {}}
results = {"nested_cv": {}, "cv": {}}

In [16]:
# Every unordered pair of feature names, in `features_dict` key order.
# To evaluate a single pair instead, replace the list with e.g.
# [("DGI", "Chem")] or [("target", "transporter")].
feature_keys = list(features_dict)
selected_features = list(combinations(feature_keys, 2))


In [17]:
# Number of feature pairs that the evaluation loop will process.
len(selected_features)

21

In [19]:
# Category orders used when tabulating scores. Both are loop-invariant, so they
# are defined once here rather than on every iteration.
metrice_names = ["AUPR+AUROC", "AUPR+AUROCperdrug", "AUROC", "AUPR", "AUROCperdrug", "AUPRperdrug"]
APPFC = ["APPFC", "noAPPFC"]


def _long_score_frame(scores_by_run, run_order, metric_order, scalar_scores=False):
    """Flatten ``{run label: {metric: score(s)}}`` into a long method/metric/score frame.

    Parameters
    ----------
    scores_by_run : dict
        Maps a run label to a dict of metric name -> score(s).
    run_order, metric_order : list
        Category orders for the resulting "method" and "metric" columns.
    scalar_scores : bool
        Set to True when each score is a single value. pandas requires an
        explicit index to build a frame from scalars only, so a one-row index
        is supplied, as the original cv code did with ``index=["1"]``.

    Returns
    -------
    pandas.DataFrame
        Columns "method", "metric" (ordered categoricals) and "score".
    """
    frame_kwargs = {"index": ["1"]} if scalar_scores else {}
    pieces = [
        pd.DataFrame({'method': run, 'metric': metric, "score": scores}, **frame_kwargs)
        for run, per_metric in scores_by_run.items()
        for metric, scores in per_metric.items()
    ]
    # One concat over all pieces instead of growing a frame inside the loop.
    long_frame = pd.concat(pieces, ignore_index=True)
    long_frame['method'] = pd.Categorical(long_frame['method'], categories=run_order, ordered=True)
    long_frame['metric'] = pd.Categorical(long_frame['metric'], categories=metric_order, ordered=True)
    return long_frame


# For every feature pair: evaluate MKRR with and without the APPFC-completed
# features under nested CV and plain CV, then write one Excel table per scheme.
# NOTE: `results` / `hyperparsOut` are reset for the method on every iteration,
# so after the loop they hold only the last pair; the per-pair tables are what
# is persisted under results/.
for i in range(len(selected_features)):
    print(selected_features[i])
    features_dict_subset = {key: features_dict[key] for key in selected_features[i]}
    features_new_subset = {key: features_new[key] for key in selected_features[i]}
    method = "MKRR"
    # NOTE(review): this rebinds the notebook-level `metrice` (set to "mse" in
    # the configuration cell); kept because later cells may rely on it.
    metrice = "AUPR"

    for validation in ("nested_cv", "cv"):
        hyperparsOut[validation][method] = {"noAPPFC": {}, "APPFC": {}}
        results[validation][method] = {"noAPPFC": {}, "APPFC": {}}
        # X is always the original feature subset; only X2 switches between the
        # original and the completed features.
        for run_label, banner, X2_features in (
            ("noAPPFC", "without APPFC", features_dict_subset),
            ("APPFC", "with APPFC", features_new_subset),
        ):
            print(banner)
            hyperparList = loadHyperpar(*all_hyperparlist[method], method_option=method)
            # Fix: the tuned hyperparameters are stored under the same label as
            # the scores. The noAPPFC run used to write them to the "APPFC" key,
            # where the following run overwrote them.
            (
                results[validation][method][run_label],
                hyperparsOut[validation][method][run_label],
            ) = evaluation3(
                Y=SEs[SEs_name],
                X=features_dict_subset,
                X2=X2_features,
                method_option=method,
                tuning_metrice=metrice,
                hyperparList=hyperparList,
                Validation=validation,
                n_jobs=10,
            )

    print('integration results:')
    # Nested CV: one score per outer fold, summarised as mean and std.
    df = _long_score_frame(results["nested_cv"]["MKRR"], APPFC, metrice_names)
    df2 = pd.pivot_table(df, values=['score'], index=["method"], aggfunc={'score': ["mean","std"]}, columns=["metric"])
    # Fix: the row index is named "method"; the previous level name 'feature'
    # does not exist in this table.
    df3 = df2.sort_index(axis=1, level='metric').sort_index(level='method')
    df3.to_excel(f'results/nested_cv_results_integration_{SEs_name}_{method}_{selected_features[i][0]}_{selected_features[i][1]}.xlsx')
    print(df3)

    # Plain CV: a single score per metric.
    df = _long_score_frame(results["cv"]["MKRR"], APPFC, metrice_names, scalar_scores=True)
    df2 = pd.pivot_table(df, values=['score'], index=["method"], columns="metric")
    df2.to_excel(f'results/cv_results_integration_{SEs_name}_{method}_{selected_features[i][0]}_{selected_features[i][1]}.xlsx')
    print(df2)

('target', 'enzyme')
without APPFC
The MKRR requires hyperparameter lambda, sigma_X
---------- nested cv start ----------
Fold: 0
number of hyperpars combination:  225
first few training idx:  [20 23 28 29 31 36 39 42 44 45]
first few testing idx:  [ 100  192  252  315  530  560  983 1170 1189 1266]
Inner Fold: 0
Inner Fold: 1
Inner Fold: 2
Inner Fold: 3
best hyperpar: (10.0, 10.0, 0.1)
AUPR: 0.39717999716760705
AUPR for each fold: [0.37487083 0.37538756 0.42640776 0.41205384]
--- tuning end ---
target size: 104
------ best hyper pars:  (10.0, 10.0, 0.1) ------
MKRR starts:
MKRR ends:
-----------
AUPRperdrug: 0.42749730937266806
AUROCperdrug: 0.8808568572995166
AUPR+AUROCperdrug: 1.3083541666721845
AUPR: 0.38242306944912907
AUROC: 0.8590692080245685
AUPR+AUROC: 1.2414922774736976
-----------
Fold: 1
number of hyperpars combination:  225
first few training idx:  [20 29 36 39 42 44 49 51 54 56]
first few testing idx:  [  31  128  312  333  446  565  871  885  951 1125]
Inner Fold: 0
Inne

In [20]:
# Inspect the scores left in memory. The loop resets the MKRR entry on every
# iteration, so this shows only the last feature pair that was evaluated.
results

{'nested_cv': {'MKRR': {'noAPPFC': {'AUPRperdrug': [0.5428753090389344,
     0.4920437409721433,
     0.4995945093254166,
     0.49883586330311347,
     0.5064590168798764],
    'AUROCperdrug': [0.8975726627231507,
     0.8939465134647826,
     0.8854679944515749,
     0.8895306235224317,
     0.9026516492583595],
    'AUPR+AUROCperdrug': [1.4404479717620853,
     1.3859902544369258,
     1.3850625037769915,
     1.3883664868255452,
     1.4091106661382358],
    'AUPR': [0.5338116801730645,
     0.4495542188564918,
     0.47351506069045396,
     0.5007316414056077,
     0.5033437672085205],
    'AUROC': [0.8794920911466683,
     0.8604534568923722,
     0.8612164981066543,
     0.8735661487566783,
     0.8791966596184553],
    'AUPR+AUROC': [1.4133037713197327,
     1.3100076757488641,
     1.3347315587971083,
     1.374297790162286,
     1.3825404268269756]},
   'APPFC': {'AUPRperdrug': [0.5485872081370009,
     0.49687340761178117,
     0.497491172842328,
     0.5018269085338194,
   