In [1]:
# import libraries, set random seed for reproducibility
# recommended TensorFlow version >= 1.12.0
# recommended Python version >= 3.5

# if libraries are missing:
# !pip install h5py
# !pip install keras
# !pip install matplotlib
# !pip install numpy
# !pip install pandas
# !pip install seaborn
# !pip install scikit-learn
# !pip install shap
# !pip install tensorflow
# !pip install tqdm

import os
import platform
import random
import shutil
import sys

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import sklearn.metrics
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm

from keras import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# Seed every random number generator in play, not just Python's `random`:
# NumPy and TensorFlow keep their own generators, which random.seed() does
# not touch, so seeding only `random` does not make runs repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
if hasattr(tf, "set_random_seed"):
    # TensorFlow 1.x graph-level seed
    tf.set_random_seed(SEED)
else:
    # TensorFlow 2.x moved the call
    tf.random.set_seed(SEED)

print("TensorFlow", tf.__version__)
# `platform.sys` is just the `sys` module; use the direct import instead.
print("Python", sys.version)

TensorFlow 1.13.1
Python 3.7.1 (v3.7.1:260ec2c36a, Oct 20 2018, 03:13:28) 
[Clang 6.0 (clang-600.0.57)]


In [None]:
# TODO: load the feature array here (placeholder left unfilled).
# The unpacking below requires X to be 3-D: (nsamples, nx, ny).
X = ###
nsamples, nx, ny = X.shape
# Flatten each sample's (nx, ny) grid into a single row of nx*ny values.
X = X.reshape((nsamples,nx*ny))

# TODO: set the feature count (placeholder left unfilled). build_regressor
# passes num_fts to the first Dense layer as input_dim.
# NOTE(review): presumably this should equal nx*ny, the flattened width of X — confirm.
num_fts = ###

In [None]:
# TODO: load the regression targets here (placeholders left unfilled).
# Each one is passed to train_test_split together with X further down.
auc_values = ###
ic50_values = ###

In [None]:
# TODO: decide whether the features and/or targets need to be scaled (e.g. standardized) before training

In [None]:
# Use scikit-learn to split the data into training (70%) and test (30%) sets.
# Both calls pass the same X and random_state=0.
train_ft_auc, test_ft_auc, train_lab_auc, test_lab_auc = train_test_split(
    X, auc_values, test_size=0.3, random_state=0)
train_ft_ic50, test_ft_ic50, train_lab_ic50, test_lab_ic50 = train_test_split(
    X, ic50_values, test_size=0.3, random_state=0)

# The modelling cells refer to X_train / X_test / y_train / y_test, which were
# never defined anywhere in the notebook. Bind them to one of the splits so a
# fresh "Restart & Run All" does not stop with a NameError.
# NOTE(review): AUC is assumed to be the target being modelled — swap in the
# *_ic50 variables to model IC50 instead.
X_train, X_test = train_ft_auc, test_ft_auc
y_train, y_test = train_lab_auc, test_lab_auc


In [None]:
# code from here
# https://medium.com/datadriveninvestor/building-neural-network-using-keras-for-regression-ceee5a9eadff
# https://datascienceplus.com/keras-regression-based-neural-networks/

In [None]:
def build_regressor(input_dim=None, output_dim=38):
    """Build and compile the fully connected regression network.

    Architecture: Dense(100, relu) -> Dense(50, relu) -> Dense(output_dim, linear),
    compiled with the Adam optimizer and a mean-squared-error loss.

    Args:
        input_dim: Number of input features. When omitted, the notebook-level
            ``num_fts`` is used, exactly as before.
        output_dim: Width of the linear output layer (defaults to the
            previously hard-coded 38).

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = num_fts

    regressor = Sequential()
    regressor.add(Dense(units=100, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    regressor.add(Dense(units=50, activation='relu'))
    regressor.add(Dense(units=output_dim, activation='linear'))
    # 'accuracy' was removed from the metrics: it is a classification metric
    # and carries no meaning for continuous regression targets.
    regressor.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])
    return regressor

In [None]:
# Wrap the model factory in the scikit-learn style estimator, configured with
# batch_size=20 and epochs=100.
regressor = KerasRegressor(
    build_fn=build_regressor,
    epochs=100,
    batch_size=20,
)

In [None]:
# Hold out a fraction of the training data for validation so that the returned
# History contains 'val_loss'. The loss-curve cells read
# results.history['val_loss'], which is not recorded when fit() receives no
# validation data.
results = regressor.fit(X_train, y_train, validation_split=0.2)

In [None]:
# Run the fitted regressor on X_test.
y_pred = regressor.predict(X_test)

In [None]:
# Scatter of predictions against y_test, with a dashed black diagonal spanning
# the range of y_test as the "perfect prediction" reference.
lowest, highest = y_test.min(), y_test.max()

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

In [None]:
# Print the architecture of the fitted network. KerasRegressor itself has no
# summary(); the underlying Keras model is available as `.model` after fit().
# summary() prints on its own and returns None, so it is not wrapped in print().
regressor.model.summary()

# let's plot the performance curve
history = results.history
fig, ax = plt.subplots()
# 'val_loss' is only recorded when fit() was given validation data; skip the
# curve instead of raising KeyError when it is absent.
if 'val_loss' in history:
    ax.plot(history['val_loss'], label='val_loss')
ax.plot(history['loss'], label='loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.legend()
plt.show()

In [None]:
print(results.history.keys())
# "Loss"
history = results.history
fig, ax = plt.subplots()
# Label each curve where it is drawn so the legend cannot drift out of sync
# with the lines that are actually plotted.
ax.plot(history['loss'], label='train')
# 'val_loss' is only recorded when fit() was given validation data; skip the
# curve instead of raising KeyError when it is absent.
if 'val_loss' in history:
    ax.plot(history['val_loss'], label='validation')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# use shapley additive feature attribution
# code from here: https://towardsdatascience.com/interpretability-of-deep-learning-models-9f52e54d72ab

In [None]:
# do interpretability
import shap

#initialize js methods for visualization
shap.initjs()

# np.asarray accepts both a DataFrame and a plain ndarray, so this cell no
# longer depends on X_train having a `.values` attribute.
X_train_arr = np.asarray(X_train)

# create an instance of the DeepSHAP which is called DeepExplainer.
# It has to be given the underlying Keras model, not the scikit-learn wrapper;
# KerasRegressor exposes that model as `.model` once fit() has run.
# NOTE(review): the whole training set is used as background data here, which
# can be slow — consider passing a smaller sample.
explainer_shap = shap.DeepExplainer(model=regressor.model,
                                    data=X_train_arr)

# Explain a subset of the data (you can try all of it, but it gets slower).
# ranked_outputs takes the number of top-ranked outputs to explain; the
# original passed True, which is spelled out here as the integer 1.
# With ranked_outputs set on a multi-output model, the result is a pair
# (list of SHAP arrays, output indexes), which is why later cells index
# shap_values[0][0].
shap_values = explainer_shap.shap_values(X=X_train_arr[:500],
                                         ranked_outputs=1)

In [None]:
# now let's inspect some individual explanations inferred by DeepSHAP
explained_features = np.asarray(X_train)[:500]      # the rows passed to shap_values()
feature_names = getattr(X_train, 'columns', None)   # None when X_train is a plain ndarray
top_output_shap = shap_values[0][0]                 # SHAP values for the top-ranked output
record_idx = 1                                      # which explained row to inspect on its own

# NOTE(review): for a multi-output model expected_value presumably holds one
# base value per output — confirm that force_plot accepts it as passed here,
# or select the entry that belongs to the explained output.

# Only the last expression of a cell is rendered automatically, so each plot
# is passed to display() explicitly (display is provided by IPython/Jupyter).

# Stacked force plot over all explained rows.
display(shap.force_plot(explainer_shap.expected_value,
                        top_output_shap,
                        feature_names=feature_names))

# Force plot for a single row. The SHAP values and the feature values now come
# from the same row; the original paired SHAP row 1 with feature row 0. The
# original also repeated this call verbatim a second time; the duplicate is
# dropped.
display(shap.force_plot(explainer_shap.expected_value,
                        top_output_shap[record_idx],
                        explained_features[record_idx],
                        feature_names=feature_names))

In [None]:
# to get the output value and base value
record = 1 # this is just to pick one record in the dataset
record_shap = shap_values[0][0][record]   # SHAP values of that record (top-ranked output)

# The original read `explainer_2.expected_value`; the only explainer created
# in this notebook is `explainer_shap`.
base_value = explainer_shap.expected_value
# NOTE(review): for a multi-output model expected_value presumably holds one
# base value per output — confirm which entry belongs to the explained output.
output = base_value + np.sum(record_shap)
print('base value: ',base_value)
print('output value: ',output)

#sanity check that the output value is equal to the actual prediction
# The original called `model.predict`; the fitted estimator is `regressor`.
# np.asarray accepts both a DataFrame and a plain ndarray for X_train.
print(np.round(output,decimals=1) == np.round(regressor.predict(np.asarray(X_train))[record],decimals=1))


# to get the SHAP value of each feature for this record
# (the original zipped the feature names with base_value, which is not per-feature)
feature_names = getattr(X_train, 'columns', None)
if feature_names is None:
    # X_train is a plain ndarray: fall back to positional names
    feature_names = ['Feature %d' % i for i in range(len(record_shap))]
shap_df = pd.DataFrame({'features': list(feature_names),
                        'shapvals': record_shap}).sort_values(by='shapvals', ascending=True)
print(shap_df)

In [None]:
# get the overall mean contribution of each feature variable
# np.asarray / getattr keep this working whether X_train is a DataFrame or a
# plain ndarray (which has neither `.values` nor `.columns`).
shap.summary_plot(shap_values[0],
                  np.asarray(X_train)[:500],
                  feature_names=getattr(X_train, 'columns', None))