In [5]:
import pandas as pd
import math
from math import sqrt
import sklearn.preprocessing as sk
import seaborn as sns
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import random
from random import randint
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr, spearmanr, kendalltau
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [6]:
def RF(X_train, Y_train, X_test, n_estimators, depth, mtry, cv, seed):
    Y_pred_0 = np.zeros([X_test[0].shape[0], 1])
    Y_pred_1 = np.zeros([X_test[1].shape[0], 1])
    RF_pipe= Pipeline([('scaler', StandardScaler()),('RnF', RandomForestRegressor())])
    model = GridSearchCV(RF_pipe, 
                         param_grid={"RnF__n_estimators": n_estimators, "RnF__max_depth": depth, "RnF__max_features": mtry, "RnF__n_jobs": [25]}, 
                         scoring='neg_mean_squared_error', cv=KFold(n_splits=cv, shuffle=True, random_state=seed))
    y_train = Y_train
    x_train = X_train
    model.fit(x_train, y_train)
    Y_pred_0 = model.predict(X_test[0])
    Y_pred_1 = model.predict(X_test[1])
    return Y_pred_0, Y_pred_1

In [3]:
n_estimators = [100, 500, 1000]
depth = [10, 50]
mtry = [1/4]

folds = 10
seeds = 42
drugs = ["Bortezomib", "Entinostat", "Sirolimus","Docetaxel","Gemcitabine", "Crizotinib", 
         "Lapatinib","Vorinostat","Erlotinib","Paclitaxel","Pictilisib"]

CTRP_exprs = pd.read_csv("Data_All/CTRP.exprsALL.tsv", sep = "\t", index_col=0)
GDSC_exprs = pd.read_csv("Data_All/GDSCv2.exprsALL.tsv", sep = "\t", index_col=0)
gCSI_exprs = pd.read_csv("Data_All/gCSI.exprsALL.tsv", sep = "\t", index_col=0)

CTRP_aac = pd.read_csv("Data_All/CTRP.aacALL.tsv", sep = "\t", index_col=0)
GDSC_aac = pd.read_csv("Data_All/GDSCv2.aacALL.tsv", sep = "\t", index_col=0)
gCSI_aac = pd.read_csv("Data_All/gCSI.aacALL.tsv", sep = "\t", index_col=0)

CTRP_ic50 = pd.read_csv("Data_All/CTRP.logIC50.tsv", sep = "\t", index_col=0)
GDSC_ic50 = pd.read_csv("Data_All/GDSC.logIC50.tsv", sep = "\t", index_col=0)
gCSI_ic50 = pd.read_csv("Data_All/gCSI.logIC50.tsv", sep = "\t", index_col=0)

CTRP_info = pd.read_csv("Data_All/CTRP.infoALL.tsv", sep = "\t", index_col=0)
idx_other_ctrp = CTRP_info.index[CTRP_info["Tumor"] == 1]
GDSC_info = pd.read_csv("Data_All/GDSCv2.infoALL.tsv", sep = "\t", index_col=0)
idx_other_gdsc = GDSC_info.index[GDSC_info["Tumor"] == 1]
gCSI_info = pd.read_csv("Data_All/gCSI.infoALL.tsv", sep = "\t", index_col=0)
idx_other_gcsi = gCSI_info.index[gCSI_info["Tumor"] == 1]

In [4]:
# Cross-domain AAC

for drug in drugs:
    CTRP_aac_drug = CTRP_aac.loc[drug].dropna()
    GDSC_aac_drug = GDSC_aac.loc[drug].dropna()
    gCSI_aac_drug = gCSI_aac.loc[drug].dropna()

    idx_ctrp = CTRP_exprs.columns.intersection(CTRP_aac_drug.index)
    idx_ctrp = [x for x in idx_ctrp if x not in idx_other_ctrp]    
    idx_gdsc = GDSC_exprs.columns.intersection(GDSC_aac_drug.index)
    idx_gdsc = [x for x in idx_gdsc if x not in idx_other_gdsc]    
    idx_gcsi = gCSI_exprs.columns.intersection(gCSI_aac_drug.index)
    idx_gcsi = [x for x in idx_gcsi if x not in idx_other_gcsi]  

    CTRP_exprs_drug = pd.DataFrame.transpose(CTRP_exprs.loc[:,idx_ctrp])
    CTRP_aac_drug = CTRP_aac_drug.loc[idx_ctrp]
    GDSC_exprs_drug = pd.DataFrame.transpose(GDSC_exprs.loc[:,idx_gdsc])
    GDSC_aac_drug = GDSC_aac_drug.loc[idx_gdsc]
    gCSI_exprs_drug = pd.DataFrame.transpose(gCSI_exprs.loc[:,idx_gcsi])
    gCSI_aac_drug = gCSI_aac_drug.loc[idx_gcsi]

    X_train_N = CTRP_exprs_drug.values
    y_train = CTRP_aac_drug.values

    pred_gdsc, pred_gcsi = RF(X_train_N, y_train, [GDSC_exprs_drug.values, gCSI_exprs_drug.values],
                   n_estimators, depth, mtry, folds, seeds)
    print(drug)
    print("GDSC Pearson correlation and p-value:", pearsonr(pred_gdsc, GDSC_aac_drug.values))
    print("GDSC RMSE:", sqrt(mean_squared_error(pred_gdsc, GDSC_aac_drug.values)))
    print("gCSI Pearson correlation and p-value:", pearsonr(pred_gcsi, gCSI_aac_drug.values))
    print("gCSI RMSE:", sqrt(mean_squared_error(pred_gcsi, gCSI_aac_drug.values)))
    print("*****************************************************************")
    print("GDSC Spearman correlation and p-value:", spearmanr(pred_gdsc, GDSC_aac_drug.values))
    print("gCSI Spearman correlation and p-value:", spearmanr(pred_gcsi, gCSI_aac_drug.values))
    print("GDSC Kendall correlation and p-value:", kendalltau(pred_gdsc, GDSC_aac_drug.values))
    print("gCSI Kendall correlation and p-value:", kendalltau(pred_gcsi, gCSI_aac_drug.values))    
    print("-----------------------------------------------------------------")

Bortezomib
GDSC Pearson correlation and p-value: (0.18081495803086495, 0.0058513998343033575)
GDSC RMSE: 0.07991088844528416
gCSI Pearson correlation and p-value: (0.311525704747294, 1.0233028036075855e-07)
gCSI RMSE: 0.14998513650555248
*****************************************************************
GDSC Spearman correlation and p-value: SpearmanrResult(correlation=0.19194006892657567, pvalue=0.0034039295245807525)
gCSI Spearman correlation and p-value: SpearmanrResult(correlation=0.3115327637196538, pvalue=1.0225885153086091e-07)
GDSC Kendall correlation and p-value: KendalltauResult(correlation=0.12840203274985884, pvalue=0.0036691191631342854)
gCSI Kendall correlation and p-value: KendalltauResult(correlation=0.21336405529953922, pvalue=1.0256663772964481e-07)
-----------------------------------------------------------------
Entinostat
GDSC Pearson correlation and p-value: (0.5352420167781098, 3.752616428628334e-18)
GDSC RMSE: 0.14335165322935417
gCSI Pearson correlation and p-va

Pictilisib
GDSC Pearson correlation and p-value: (0.24193003166749605, 0.00018648052836797593)
GDSC RMSE: 0.09789114961184937
gCSI Pearson correlation and p-value: (0.3135605070491468, 9.304328808589086e-08)
gCSI RMSE: 0.11995608175129373
*****************************************************************
GDSC Spearman correlation and p-value: SpearmanrResult(correlation=0.22084717705208987, pvalue=0.0006681435552194354)
gCSI Spearman correlation and p-value: SpearmanrResult(correlation=0.24103280463967888, pvalue=4.891719314817481e-05)
GDSC Kendall correlation and p-value: KendalltauResult(correlation=0.15234217380140125, pvalue=0.0005211981072574222)
gCSI Kendall correlation and p-value: KendalltauResult(correlation=0.16038961087654122, pvalue=6.721472171671624e-05)
-----------------------------------------------------------------


In [5]:
#Cross-domain IC50

for drug in drugs:
    CTRP_ic50_drug = CTRP_ic50.loc[drug].dropna()
    GDSC_ic50_drug = GDSC_ic50.loc[drug].dropna()
    gCSI_ic50_drug = gCSI_ic50.loc[drug].dropna()

    idx_ctrp = CTRP_exprs.columns.intersection(CTRP_ic50_drug.index)
    idx_ctrp = [x for x in idx_ctrp if x not in idx_other_ctrp]
    idx_gdsc = GDSC_exprs.columns.intersection(GDSC_ic50_drug.index)
    idx_gdsc = [x for x in idx_gdsc if x not in idx_other_gdsc]
    idx_gcsi = gCSI_exprs.columns.intersection(gCSI_ic50_drug.index)
    idx_gcsi = [x for x in idx_gcsi if x not in idx_other_gcsi]

    CTRP_exprs_drug = pd.DataFrame.transpose(CTRP_exprs.loc[:,idx_ctrp])
    CTRP_ic50_drug = CTRP_ic50_drug.loc[idx_ctrp]
    GDSC_exprs_drug = pd.DataFrame.transpose(GDSC_exprs.loc[:,idx_gdsc])
    GDSC_ic50_drug = GDSC_ic50_drug.loc[idx_gdsc]
    gCSI_exprs_drug = pd.DataFrame.transpose(gCSI_exprs.loc[:,idx_gcsi])
    gCSI_ic50_drug = gCSI_ic50_drug.loc[idx_gcsi]

    X_train_N = CTRP_exprs_drug.values
    y_train = CTRP_ic50_drug.values

    pred_gdsc, pred_gcsi = RF(X_train_N, y_train, [GDSC_exprs_drug.values, gCSI_exprs_drug.values],
                   n_estimators, depth, mtry, folds, seeds)
    print(drug)
    print("GDSC Pearson correlation and p-value:", pearsonr(pred_gdsc, GDSC_ic50_drug.values))
    print("GDSC RMSE:", sqrt(mean_squared_error(pred_gdsc, GDSC_ic50_drug.values)))
    print("gCSI Pearson correlation and p-value:", pearsonr(pred_gcsi, gCSI_ic50_drug.values))
    print("gCSI RMSE:", sqrt(mean_squared_error(pred_gcsi, gCSI_ic50_drug.values)))
    print("*****************************************************************")
    print("GDSC Spearman correlation and p-value:", spearmanr(pred_gdsc, GDSC_ic50_drug.values))
    print("gCSI Spearman correlation and p-value:", spearmanr(pred_gcsi, gCSI_ic50_drug.values))
    print("GDSC Kendall correlation and p-value:", kendalltau(pred_gdsc, GDSC_ic50_drug.values))
    print("gCSI Kendall correlation and p-value:", kendalltau(pred_gcsi, gCSI_ic50_drug.values))    
    print("-----------------------------------------------------------------")

Bortezomib
GDSC Pearson correlation and p-value: (0.26106209851923684, 7.399398654147473e-05)
GDSC RMSE: 1.2829655983634736
gCSI Pearson correlation and p-value: (0.23714044955954597, 6.322115687860671e-05)
gCSI RMSE: 2.17292030499894
*****************************************************************
GDSC Spearman correlation and p-value: SpearmanrResult(correlation=0.24581586257802063, pvalue=0.00019608850992349739)
gCSI Spearman correlation and p-value: SpearmanrResult(correlation=0.2125827448640166, pvalue=0.00034909478163805527)
GDSC Kendall correlation and p-value: KendalltauResult(correlation=0.1677705216866411, pvalue=0.00018514027800471442)
gCSI Kendall correlation and p-value: KendalltauResult(correlation=0.14277610169928576, pvalue=0.00037806766063414477)
-----------------------------------------------------------------
Entinostat
GDSC Pearson correlation and p-value: (0.547634731406748, 2.5611396186111403e-15)
GDSC RMSE: 0.9291647630358516
gCSI Pearson correlation and p-value

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Sirolimus
GDSC Pearson correlation and p-value: (0.09929599538394632, 0.6151645623868786)
GDSC RMSE: 8.295585532889406
gCSI Pearson correlation and p-value: (0.1764994461133595, 0.2960288365837803)
gCSI RMSE: 8.501139430077828
*****************************************************************
GDSC Spearman correlation and p-value: SpearmanrResult(correlation=0.07031929523840467, pvalue=0.7221620416603671)
gCSI Spearman correlation and p-value: SpearmanrResult(correlation=0.179296978765496, pvalue=0.2883162866150426)
GDSC Kendall correlation and p-value: KendalltauResult(correlation=0.05823804648455101, pvalue=0.711043444207748)
gCSI Kendall correlation and p-value: KendalltauResult(correlation=0.12272768240132814, pvalue=0.2927047700703984)
-----------------------------------------------------------------
Docetaxel
GDSC Pearson correlation and p-value: (0.3081716282956794, 1.1713147166294507e-05)
GDSC RMSE: 3.8390835654362268
gCSI Pearson correlation and p-value: (0.3783493453868809, 3.