In [1]:
import sys
from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/MyDrive/Colab_Notebooks/DeepSynergy"
import os
os.chdir(path)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import gzip
import pickle
import matplotlib.pyplot as plt
import keras as K
import tensorflow as tf
from keras import backend
from keras.models import load_model

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"
tf.test.gpu_device_name()

'/device:GPU:0'

In [4]:
def prediction(model_name, data_file):
  model = load_model(model_name)
  file = gzip.open(data_file, 'rb')
  X_tr, X_val, X_train, X_test, y_tr, y_val, y_train, y_test = pickle.load(file)
  file.close()

  prediction_test = model.predict(X_test)
  prediction_test = prediction_test.reshape(len(prediction_test),)

  return prediction_test, y_test, model, X_val, y_val

In [5]:
def performance(prediction_test, y_test, threshold_true, model, X_val, y_val):
  #Pearson
  from scipy.stats import pearsonr
  print('Pearson:', pearsonr(prediction_test, y_test)[0])


  y_test_class = y_test > threshold_true  #30
  #ROC PR
  from sklearn.metrics import roc_auc_score
  roc_auc = roc_auc_score(y_test_class, prediction_test)
  print('ROC:', roc_auc)

  from sklearn.metrics import average_precision_score
  pr_auc = average_precision_score(y_test_class, prediction_test)
  print('PR:', pr_auc)

  # accuracy (ACC), balanced accuracy (BACC), precision (PREC), sensitivity (TPR), specificity (TNR) and Cohen’s Kappa
  # appropriate threshold (optimized for BACC on validation set)
  prediction_val = model.predict(X_val)
  prediction_val = prediction_val.reshape(len(prediction_val),)
  y_val_class = y_val > threshold_true  #30

  from sklearn.metrics import balanced_accuracy_score
  BACC_cand = []
  for t in range(0,30):
    BACC = balanced_accuracy_score(y_val_class, prediction_val > t)
    #print(BACC)
    BACC_cand.append(BACC)

  threshold = BACC_cand.index(max(BACC_cand))+10
  print(threshold)
  prediction_test_class = prediction_test > threshold

  from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
  # ACC
  print('ACC: %.3f' % accuracy_score(y_test_class, prediction_test_class))

  # BACC
  print('BACC: %.3f' % balanced_accuracy_score(y_test_class, prediction_test_class))

  # PREC
  print('PREC: %.3f' % precision_score(y_test_class, prediction_test_class))

  # sensitivity (TPR)
  print('Recall: %.3f' % recall_score(y_test_class, prediction_test_class))

  # specificity (TNR)
  from sklearn.metrics import confusion_matrix
  conf_matrix = confusion_matrix(y_test_class, prediction_test_class)
  tn, fp, fn, tp = confusion_matrix(y_test_class, prediction_test_class).ravel()
  specificity = tn / (tn+fp)
  print('Specificity: %.3f' % specificity)

  # Kappa
  from sklearn.metrics import cohen_kappa_score
  kappa_value = cohen_kappa_score(y_test_class, prediction_test_class)
  print('Kappa: %.3f' % kappa_value)

## Check if same cell line produce similar synergy score regardless of different drug combinations.

In [None]:
model = load_model('my_model_15_emb.h5')
"""
file = gzip.open('dataset/data_testfold0_tanh.p.gz', 'rb')
X_tr, X_val, X_train, X_test, y_tr, y_val, y_train, y_test = pickle.load(file)
file.close()
"""
file = gzip.open('./dataset/X_15_emb.p', 'rb')
X = pickle.load(file)
file.close()

In [None]:
labels_15 = pd.read_csv('dataset/labels_15.csv')
labels_15 = pd.concat([labels_15, labels_15])
A2058_ind = labels_15[labels_15.cell_line == 'A2058'].index
A2058_X = X[A2058_ind]
A2058_X.shape

(154, 50)

In [None]:
prediction = model.predict(A2058_X)
prediction

## Performance

In [6]:
# experiment in paper
prediction_test, y_test, model, X_val, y_val = prediction('my_model.h5', 'dataset/data_testfold0_tanh.p.gz')
performance(prediction_test, y_test, 30, model, X_val, y_val)

Pearson: 0.717695423683101
ROC: 0.9285714442826538
PR: 0.5534515228632259
35
ACC: 0.951
BACC: 0.622
PREC: 0.723
Recall: 0.251
Specificity: 0.994
Kappa: 0.353


In [7]:
# pca 12 embedding for cell lines
prediction_test_c12, y_test_c12, model_c12, X_val_c12, y_val_c12 = prediction('my_model_15_emb_c12.h5', 'dataset/data_15_emb_c12_testfold0.p')
threshold_true_c12 = y_test_c12[-((len(y_test_c12*0.1))-1)]
performance(prediction_test_c12, y_test_c12, threshold_true_c12, model_c12, X_val_c12, y_val_c12)

Pearson: 0.6075466541635063
ROC: 0.8042117974238876
PR: 0.941379275477126
10
ACC: 0.448
BACC: 0.639
PREC: 0.962
Recall: 0.335
Specificity: 0.942
Kappa: 0.132


In [8]:
# pca 33 embedding for cell lines
prediction_test_c33, y_test_c33, model_c33, X_val_c33, y_val_c33 = prediction('my_model_15_emb_c33.h5', 'dataset/data_15_emb_c33_testfold0.p')
threshold_true_c33 = y_test_c33[-((len(y_test_c33*0.1))-1)]
performance(prediction_test_c33, y_test_c33, threshold_true_c33, model_c33, X_val_c33, y_val_c33)

Pearson: 0.6043559293669647
ROC: 0.7923878439695551
PR: 0.933497770573317
10
ACC: 0.412
BACC: 0.616
PREC: 0.956
Recall: 0.290
Specificity: 0.942
Kappa: 0.107


In [9]:
# no preprocessing and embedding for drugs and cell lines
prediction_test_15, y_test_15, model_15, X_val_15, y_val_15 = prediction('my_model_15.h5', 'dataset/data_15_testfold0.p')
threshold_true_15 = y_test_15[-((len(y_test_15*0.1))-1)]
performance(prediction_test_15, y_test_15, threshold_true_15, model_15, X_val_15, y_val_15)

Pearson: 0.07297341018787547
ROC: 0.4939393662177986
PR: 0.8213371739961118
22
ACC: 0.242
BACC: 0.493
PREC: 0.791
Recall: 0.093
Specificity: 0.893
Kappa: -0.006
