In [1]:
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import math
from math import sqrt
import sklearn.preprocessing as sk
import seaborn as sns
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import random
from random import randint
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr, spearmanr, kendalltau
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

  from numpy.core.umath_tests import inner1d


In [15]:
def RG(X_train, Y_train, X_test, alpha, cv, seed):
    """Tune a standardized Ridge regression by grid search and predict ``X_test``.

    A ``StandardScaler`` -> ``Ridge`` pipeline is wrapped in ``GridSearchCV``.
    The search tries every value in ``alpha`` for the Ridge ``alpha``
    parameter, scores candidates with ``'neg_mean_squared_error'`` and splits
    the training data with a shuffled ``KFold``.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``GridSearchCV.fit``.
    Y_train
        Training targets, passed unchanged to ``GridSearchCV.fit``.
    X_test
        Features to predict for.
    alpha
        Candidate values for the Ridge ``alpha`` hyper-parameter.
    cv
        Number of ``KFold`` splits.
    seed
        ``random_state`` used by the shuffled ``KFold``.

    Returns
    -------
    The result of ``GridSearchCV.predict(X_test)``. With scikit-learn's
    default ``refit=True`` this comes from the best pipeline refit on the
    whole training set.
    """
    # Scaling lives inside the pipeline, so the grid search fits the scaler
    # together with the regressor rather than on data prepared beforehand.
    ridge_pipeline = Pipeline([('scaler', StandardScaler()), ('RdG', Ridge())])
    search = GridSearchCV(
        ridge_pipeline,
        param_grid={"RdG__alpha": alpha},
        scoring='neg_mean_squared_error',
        cv=KFold(n_splits=cv, shuffle=True, random_state=seed),
    )
    search.fit(X_train, Y_train)
    return search.predict(X_test)

# Settings handed to RG() by the evaluation loop in this notebook.
# alph  -> candidate values for the Ridge ``alpha`` hyper-parameter
# folds -> number of KFold splits used by the grid search
# seeds -> random_state for the shuffled KFold
alph = [
    0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1,
]
folds = 10
seeds = 42

In [16]:

# Drugs iterated over by the per-drug loops in this notebook.
drugs = [
    "Bortezomib", "Entinostat", "Sirolimus", "Docetaxel", "Gemcitabine",
    "Crizotinib", "Lapatinib", "Vorinostat", "Erlotinib", "Paclitaxel",
    "Pictilisib",
]


def _read_tsv(path):
    """Read a tab-separated file into a DataFrame, using its first column as the index."""
    return pd.read_csv(path, sep="\t", index_col=0)


# Expression tables. The loops below select their *columns* by the labels
# found in the per-drug response index.
GDSCv1_exprs = _read_tsv("Data_All/GDSCv1.exprsALL.tsv")
gCSI_exprs = _read_tsv("Data_All/gCSI.exprsALL.tsv")

# Response tables; the loops below look up one row per drug with ``.loc[drug]``.
gCSI_aac = _read_tsv("Data_All/gCSI.aacALL.tsv")
GDSCv1_aac = _read_tsv("Data_All/GDSCv1.aacALL.tsv")

# logIC50 tables are loaded here but not used by the cells visible in this section.
gCSI_ic50 = _read_tsv("Data_All/gCSI.logIC50.tsv")
GDSCv1_ic50 = _read_tsv("Data_All/GDSCv1.logIC50.tsv")

# Info tables, plus the index labels of rows whose "Tumor" column equals 1.
# The loops below exclude those labels from the analysis.
gCSI_info = _read_tsv("Data_All/gCSI.infoALL.tsv")
idx_other_gcsi = gCSI_info.index[gCSI_info["Tumor"].eq(1)]
GDSCv1_info = _read_tsv("Data_All/GDSCv1.infoALL.tsv")
idx_other_gdscv1 = GDSCv1_info.index[GDSCv1_info["Tumor"].eq(1)]

In [18]:
# For every drug: train Ridge on GDSCv1 (expression -> response), predict the
# gCSI samples, and print agreement between predicted and measured gCSI response.
for drug in drugs:
    # Response row for this drug in each dataset; missing measurements are dropped.
    GDSCv1_aac_drug = GDSCv1_aac.loc[drug].dropna()
    gCSI_aac_drug = gCSI_aac.loc[drug].dropna()

    # Keep labels that have both expression and response, then remove the
    # ones flagged in the info tables ("Tumor" == 1).
    idx_gdscv1 = GDSCv1_exprs.columns.intersection(GDSCv1_aac_drug.index)
    idx_gdscv1 = [x for x in idx_gdscv1 if x not in idx_other_gdscv1]
    idx_gcsi = gCSI_exprs.columns.intersection(gCSI_aac_drug.index)
    idx_gcsi = [x for x in idx_gcsi if x not in idx_other_gcsi]

    # Transpose so the selected labels become rows, and reorder the response
    # with the same label list so features and targets stay aligned.
    GDSCv1_exprs_drug = GDSCv1_exprs.loc[:, idx_gdscv1].T
    GDSCv1_aac_drug = GDSCv1_aac_drug.loc[idx_gdscv1]

    gCSI_exprs_drug = gCSI_exprs.loc[:, idx_gcsi].T
    gCSI_aac_drug = gCSI_aac_drug.loc[idx_gcsi]

    X_train_N = GDSCv1_exprs_drug.values
    y_train = GDSCv1_aac_drug.values
    y_gcsi = gCSI_aac_drug.values

    # NOTE(review): the model is fit on GDSCv1 columns and applied to gCSI
    # columns by position. This assumes both expression tables have the same
    # rows in the same order; nothing here checks it. TODO confirm.
    pred_gcsi = RG(X_train_N, y_train, gCSI_exprs_drug.values,
                   alph, folds, seeds)

    print(drug)
    print("gCSI Pearson correlation and p-value:", pearsonr(pred_gcsi, y_gcsi))
    # mean_squared_error is documented as (y_true, y_pred); the value is the
    # same either way, but the call now follows the documented order.
    print("gCSI RMSE:", sqrt(mean_squared_error(y_gcsi, pred_gcsi)))
    print("****************************************************************")
    print("gCSI Spearman correlation and p-value:", spearmanr(pred_gcsi, y_gcsi))
    print("gCSI Kendall correlation and p-value:", kendalltau(pred_gcsi, y_gcsi))
    print("-----------------------------------------------------------------")

Bortezomib
gCSI Pearson correlation and p-value: (0.1439001118508592, 0.015965986198139825)
gCSI RMSE: 0.2112326027143306
****************************************************************
gCSI Spearman correlation and p-value: SpearmanrResult(correlation=0.1833434464360879, pvalue=0.0020675676627401416)
gCSI Kendall correlation and p-value: KendalltauResult(correlation=0.12585765488991296, pvalue=0.001693103427735283)
-----------------------------------------------------------------
Entinostat
gCSI Pearson correlation and p-value: (-0.05876800051301458, 0.3271726467684985)
gCSI RMSE: 0.3034206712761239
****************************************************************
gCSI Spearman correlation and p-value: SpearmanrResult(correlation=-0.05209597111309115, pvalue=0.38516307388794435)
gCSI Kendall correlation and p-value: KendalltauResult(correlation=-0.03525030632684026, pvalue=0.3948637250389031)
-----------------------------------------------------------------
Sirolimus
gCSI Pearson corr

In [21]:
# Baseline: for every drug, correlate the measured GDSCv1 response with the
# measured gCSI response over the labels present in both datasets.
baseline_reports = (
    ("Baseline Pearson correlation between GDSCv1 and gCSI:", pearsonr),
    ("Baseline Spearman correlation between GDSCv1 and gCSI:", spearmanr),
    ("Baseline Kendall correlation between GDSCv1 and gCSI:", kendalltau),
)

for drug in drugs:
    GDSCv1_aac_drug = GDSCv1_aac.loc[drug].dropna()
    gCSI_aac_drug = gCSI_aac.loc[drug].dropna()

    # Labels with a response in both datasets, further restricted to those
    # that also appear as columns in both expression tables.
    shared_response = GDSCv1_aac_drug.index.intersection(gCSI_aac_drug.index)
    shared_in_gcsi_exprs = gCSI_exprs.columns.intersection(shared_response)
    idx_gdscv1gcsi = GDSCv1_exprs.columns.intersection(shared_in_gcsi_exprs)

    # Remove labels flagged ("Tumor" == 1) in either info table.
    for flagged in (idx_other_gcsi, idx_other_gdscv1):
        idx_gdscv1gcsi = [x for x in idx_gdscv1gcsi if x not in flagged]

    # Index both responses with the same label list so values are paired.
    GDSCv1_aac_drug_2 = GDSCv1_aac_drug.loc[idx_gdscv1gcsi]
    gCSI_aac_drug_2 = gCSI_aac_drug.loc[idx_gdscv1gcsi]

    print(drug)
    for message, corr_fn in baseline_reports:
        print(message, corr_fn(GDSCv1_aac_drug_2.values, gCSI_aac_drug_2.values))
    print("sample size of GDSCv1_gCSI:", len(idx_gdscv1gcsi))

    print("------------------------------------------------")


Bortezomib
Baseline Pearson correlation between GDSCv1 and gCSI: (0.37925605477463586, 0.2797607414594532)
Baseline Spearman correlation between GDSCv1 and gCSI: SpearmanrResult(correlation=0.33333333333333326, pvalue=0.34659350708733405)
Baseline Kendall correlation between GDSCv1 and gCSI: KendalltauResult(correlation=0.24444444444444444, pvalue=0.38071979717813054)
sample size of GDSCv1_gCSI: 10
------------------------------------------------
Entinostat
Baseline Pearson correlation between GDSCv1 and gCSI: (0.2233154859491209, 0.5351386181685034)
Baseline Spearman correlation between GDSCv1 and gCSI: SpearmanrResult(correlation=0.531509589558614, pvalue=0.11385241441710078)
Baseline Kendall correlation between GDSCv1 and gCSI: KendalltauResult(correlation=0.4057984286223453, pvalue=0.11499183293112152)
sample size of GDSCv1_gCSI: 10
------------------------------------------------
Sirolimus
Baseline Pearson correlation between GDSCv1 and gCSI: (0.5646231264355911, 0.089040036372112