In [11]:
import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV

In [2]:
# Dataset and ontology tags; they select which input files are read below.
dataset = 'gdsc2'
ont = 'ctg'

# Tab-separated gene index file, read as two columns labelled 'I' and 'G'.
gene_index_file = '../data/gene2ind_' + ont + '_' + dataset + '.txt'
gene_index = pd.read_csv(gene_index_file, sep='\t', header=None, names=['I', 'G'])
gene_list = gene_index['G']

# Tab-separated cell index file; cell_map takes each 'C' value to its 'I' value.
cell_index_file = '../data/cell2ind_' + dataset + '.txt'
cell_index = pd.read_csv(cell_index_file, sep='\t', header=None, names=['I', 'C'])
cell_map = {cell: idx for cell, idx in zip(cell_index['C'], cell_index['I'])}

# Feature table read with the default separator; its columns are labelled with gene_list.
cell_features_file = '../data/cell2mutation_' + ont + '_' + dataset + '.txt'
cell_features = pd.read_csv(cell_features_file, header=None, names=gene_list)

In [3]:
def prepare_data(train_df, test_df, features=None, cell_to_row=None):
    """Build feature matrices and target vectors for one train/test split.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a 'cell' column (keys of the cell-to-row mapping) and an
        'auc' column (returned as the target vector).
    features : pandas.DataFrame, optional
        Per-cell feature table addressed by row position. Defaults to the
        notebook-level ``cell_features``.
    cell_to_row : mapping, optional
        Maps a cell identifier to its row position in ``features``. Defaults
        to the notebook-level ``cell_map``.

    Returns
    -------
    train_X, train_Y, test_X, test_Y : numpy.ndarray
        ``*_X`` are float matrices with one row per sample (in the frame's row
        order) and one column per feature; ``*_Y`` are the 'auc' values in the
        same order.

    Raises
    ------
    KeyError
        If a cell in either frame is missing from the mapping.
    """
    if features is None:
        features = cell_features
    if cell_to_row is None:
        cell_to_row = cell_map

    def _feature_matrix(df):
        # Resolve rows by position in df, never by index label: the targets are
        # taken positionally, so using labels would misalign X and Y (or index
        # out of bounds) for any frame without a default 0..n-1 index.
        rows = [int(cell_to_row[cell]) for cell in df['cell']]
        # A single positional gather replaces the per-row copy loop and leaves
        # no uninitialised rows behind.
        return features.iloc[rows].to_numpy(dtype=float)

    train_Y = np.array(train_df['auc'])
    test_Y = np.array(test_df['auc'])

    train_X = _feature_matrix(train_df)
    test_X = _feature_matrix(test_df)

    return train_X, train_Y, test_X, test_Y

In [6]:
def run_elastic_net(dataset, ont, drug, n_folds=10):
    """Fit ElasticNetCV on each pre-split fold for one drug and score the pooled predictions.

    For fold numbers 1..n_folds the train/test files under
    ``../data/training_files/`` are read, converted with ``prepare_data``, and
    an ``ElasticNetCV`` model is fit on the training part. Test predictions
    from all folds are concatenated in fold order, written to
    ``../models/elastic_net/`` and compared with the true 'auc' values.

    Parameters
    ----------
    dataset : str
        Dataset tag used in the input and output file names.
    ont : str
        Not used by this function; kept so existing calls continue to work.
    drug : str
        Drug name used in the input and output file names.
    n_folds : int, optional
        Number of fold files to process (default 10).

    Returns
    -------
    float
        Spearman correlation between pooled predictions and pooled true values.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")

    # Create the output directory up front so a missing folder cannot discard
    # the results after every fold has already been fit.
    modeldir = "../models/elastic_net/"
    os.makedirs(modeldir, exist_ok=True)

    columns = ['cell', 'smiles', 'auc']
    fold_pred = []
    fold_test = []
    for fold in range(1, n_folds + 1):

        train_df = pd.read_csv("../data/training_files/" + str(fold) + "_train_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=columns)
        test_df = pd.read_csv("../data/training_files/" + str(fold) + "_test_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=columns)
        train_X, train_Y, test_X, test_Y = prepare_data(train_df, test_df)
        fold_test.append(test_Y)

        regr = ElasticNetCV(fit_intercept=False, cv=10, max_iter=3000, tol=1e-3, n_jobs=-2)
        regr.fit(train_X, train_Y)
        fold_pred.append(regr.predict(test_X))

    # Concatenate once instead of re-copying the growing arrays on every fold.
    all_pred = np.concatenate(fold_pred, axis=0)
    all_test = np.concatenate(fold_test, axis=0)

    corr = stats.spearmanr(all_pred, all_test)[0]

    # The file name keeps its existing suffix, the number of the last fold,
    # but states it explicitly instead of reading the loop variable after the
    # loop. The file itself holds the predictions of every fold.
    np.savetxt(modeldir + "predict_" + dataset + '_' + drug + '_' + str(n_folds) + ".txt", all_pred, fmt = '%.4e')
    return corr


In [12]:
def run_random_forest(dataset, ont, drug, n_folds=10):
    """Fit a RandomForestRegressor on each pre-split fold for one drug and score the pooled predictions.

    For fold numbers 1..n_folds the train/test files under
    ``../data/training_files/`` are read, converted with ``prepare_data``, and
    a ``RandomForestRegressor`` (``random_state=0``) is fit on the training
    part. Test predictions from all folds are concatenated in fold order,
    written to ``../models/random_forest/`` and compared with the true 'auc'
    values.

    Parameters
    ----------
    dataset : str
        Dataset tag used in the input and output file names.
    ont : str
        Not used by this function; kept so existing calls continue to work.
    drug : str
        Drug name used in the input and output file names.
    n_folds : int, optional
        Number of fold files to process (default 10).

    Returns
    -------
    float
        Spearman correlation between pooled predictions and pooled true values.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")

    # Create the output directory up front so a missing folder cannot discard
    # the results after every fold has already been fit.
    modeldir = "../models/random_forest/"
    os.makedirs(modeldir, exist_ok=True)

    columns = ['cell', 'smiles', 'auc']
    fold_pred = []
    fold_test = []
    for fold in range(1, n_folds + 1):

        train_df = pd.read_csv("../data/training_files/" + str(fold) + "_train_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=columns)
        test_df = pd.read_csv("../data/training_files/" + str(fold) + "_test_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=columns)
        train_X, train_Y, test_X, test_Y = prepare_data(train_df, test_df)
        fold_test.append(test_Y)

        regr = RandomForestRegressor(random_state=0, n_jobs=-2)
        regr.fit(train_X, train_Y)
        fold_pred.append(regr.predict(test_X))

    # Concatenate once instead of re-copying the growing arrays on every fold.
    all_pred = np.concatenate(fold_pred, axis=0)
    all_test = np.concatenate(fold_test, axis=0)

    corr = stats.spearmanr(all_pred, all_test)[0]

    # The file name keeps its existing suffix, the number of the last fold,
    # but states it explicitly instead of reading the loop variable after the
    # loop. The file itself holds the predictions of every fold.
    np.savetxt(modeldir + "predict_" + dataset + '_' + drug + '_' + str(n_folds) + ".txt", all_pred, fmt = '%.4e')
    return corr


In [7]:
# Drugs evaluated by the baseline models in this notebook.
drugs = [
    "Palbociclib",
    "nutlin-3A",
    "paclitaxel",
    "Trametinib",
    "Dabrafenib",
    "vincristine",
    "Sorafenib",
    "docetaxel",
    "Epirubicin",
    "Cediranib",
]

# Run the elastic-net baseline per drug and print the correlation it returns.
for drug in drugs:
    corr = run_elastic_net(dataset, ont, drug)
    summary = "Corr for {}: {:.4f}".format(drug, corr)
    print(summary)

Corr for Palbociclib: 0.0615
Corr for nutlin-3A: 0.0186
Corr for paclitaxel: 0.0213
Corr for Trametinib: 0.0131
Corr for Dabrafenib: 0.0051
Corr for vincristine: 0.0286
Corr for Sorafenib: -0.0771
Corr for docetaxel: 0.0145
Corr for Epirubicin: 0.0540
Corr for Cediranib: 0.0964


In [14]:
# Run the random-forest baseline per drug and print the correlation it returns.
for drug in drugs:
    corr = run_random_forest(dataset, ont, drug)
    summary = "Corr for {}: {:.4f}".format(drug, corr)
    print(summary)

Corr for Palbociclib: 0.1281
Corr for nutlin-3A: 0.0521
Corr for paclitaxel: 0.0271
Corr for Trametinib: 0.1217
Corr for Dabrafenib: 0.0343
Corr for vincristine: 0.1400
Corr for Sorafenib: 0.0488
Corr for docetaxel: 0.1028
Corr for Epirubicin: 0.1566
Corr for Cediranib: -0.0283
