### Deployed in Colabs with TPU accelerator

In [1]:
# Connect to GDrive

import gc
import pickle
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# import the necessary libraries to execute this code
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.model_selection import RandomizedSearchCV as RSCV

# build NN for class
from tensorflow.keras.layers import  Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import Dense,Dropout
import tensorflow
from tensorflow import keras


In [3]:
# create a function that will build and compile a Keras model

def NN_builder(n_hidden=1, optimizer = 'rmsprop', units=40, learning_rate = 0.001, input_shape=[14], 
               regularization=0.001, dropout=0.2, activation = 'sigmoid'):
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=input_shape))
  
    for layer in range (n_hidden):
        model.add(keras.layers.Dense(units=40, activation=activation, activity_regularizer=l1_l2(regularization)))
        model.add(Dropout(dropout))
  
    model.add(keras.layers.Dense(units=1,activation='sigmoid'))
    optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)

    model.compile(loss="mean_absolute_error", optimizer=optimizer)
    return model

NN = tensorflow.keras.wrappers.scikit_learn.KerasRegressor(NN_builder)



In [6]:
datafile = "/content/Dataset_14_feat.xlsx"
df = pd.read_excel(datafile)

model = NN
p_grid ={"n_hidden" : [1,2,3,4,5],
         "units" : [10,20,30,40,50],
         "learning_rate": [0.0001,0.001,0.01],
         "regularization":[1e-2,1e-3,1e-4],
         "dropout":[0.0,0.1,0.2,0.3],
         "batch_size":[32, 64, 128, 256],
         "activation": ['relu', 'tanh', 'sigmoid']}

X = df.drop(['Experimental_index','DP_Group','Release'],axis='columns')
stdScale = StandardScaler().fit(X)
X=stdScale.transform(X)
Y = df['Release']
G = df['DP_Group']
E = df['Experimental_index']
T = df['Time']    

In [7]:
# Number of epochs
epochs = 50

# Define callaback
callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=5)]

NUM_TRIALS = 10

itr_number = [] # create new empty list for itr number 
outer_results = []
inner_results = []
model_params = []
G_test_list = []
y_test_list = []
E_test_list = []
T_test_list = []
pred_list = []
pred_var_list = []

for i in range(NUM_TRIALS): #configure the cross-validation procedure - outer loop (test set) 

      cv_outer = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=i) #hold back 20% of the groups for test set

      # split data using GSS
      for train_index, test_index in cv_outer.split(X, Y, G):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        G_train, G_test = G[train_index], G[test_index]
        E_train, E_test = E[train_index], E[test_index]
        T_train, T_test = T[train_index], T[test_index]

        # store test set information
        G_test = np.array(G_test) #prevents index from being brought from dataframe
        G_test_list.append(G_test)
        E_test = np.array(E_test) #prevents index from being brought from dataframe
        E_test_list.append(E_test)
        T_test = np.array(T_test) #prevents index from being brought from dataframe
        T_test_list.append(T_test)
        y_test = np.array(y_test) #prevents index from being brought from dataframe
        y_test_list.append(y_test)

        # configure the cross-validation procedure - inner loop (validation set/HP optimization)
        cv_inner = GroupKFold(n_splits=10) #should be 10 fold group split for inner loop

        # define search space
        search = RSCV(model, p_grid, n_iter=50, verbose=3, scoring='neg_mean_absolute_error', cv=cv_inner, refit=True) # should be 100

        # execute search
        result = search.fit(X_train, y_train, groups=G_train, callbacks=callbacks, epochs=epochs, verbose=0)

        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_

        # get the score for the best performing model and store
        best_score = abs(result.best_score_)
        inner_results.append(best_score)

        # evaluate model and estimate epistemic uncertainty on the hold out dataset
        predictions = []
        for _ in range(100):
            predictions += [best_model.predict(X_test, verbose=0)]
        
        yhat, pred_unbiased = np.mean(np.array(predictions), axis=0), np.std(np.array(predictions), axis=0)

        # store drug release predictions
        pred_list.append(yhat)
            
        # store prediction variance
        pred_var_list.append(pred_unbiased)

        # evaluate the model
        acc = mean_absolute_error(y_test, yhat)

        # store the result
        itr_number.append(i+1)
        outer_results.append(acc)
        model_params.append(result.best_params_)

      # report progress at end of each inner loop
      print('\n################################################################\n\nSTATUS REPORT:') 
      print('Iteration '+str(i+1)+' of '+str(NUM_TRIALS)+' runs completed') 
      print('Test_Score: %.3f, Best_Valid_Score: %.3f, \n\nBest_Model_Params: \n%s' % (acc, best_score, result.best_params_))
      print("\n################################################################\n ")


Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV 1/10] END activation=relu, batch_size=128, dropout=0.1, learning_rate=0.0001, n_hidden=1, regularization=0.001, units=50;, score=-0.286 total time=   2.4s
[CV 2/10] END activation=relu, batch_size=128, dropout=0.1, learning_rate=0.0001, n_hidden=1, regularization=0.001, units=50;, score=-0.260 total time=   3.5s
[CV 3/10] END activation=relu, batch_size=128, dropout=0.1, learning_rate=0.0001, n_hidden=1, regularization=0.001, units=50;, score=-0.262 total time=   2.5s
[CV 4/10] END activation=relu, batch_size=128, dropout=0.1, learning_rate=0.0001, n_hidden=1, regularization=0.001, units=50;, score=-0.187 total time=   3.3s




[CV 5/10] END activation=relu, batch_size=128, dropout=0.1, learning_rate=0.0001, n_hidden=1, regularization=0.001, units=50;, score=-0.314 total time=   2.6s




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 10/10] END activation=sigmoid, batch_size=256, dropout=0.3, learning_rate=0.0001, n_hidden=3, regularization=0.001, units=20;, score=-0.339 total time=   3.8s
[CV 1/10] END activation=relu, batch_size=128, dropout=0.0, learning_rate=0.01, n_hidden=2, regularization=0.0001, units=10;, score=-0.257 total time=   2.5s
[CV 2/10] END activation=relu, batch_size=128, dropout=0.0, learning_rate=0.01, n_hidden=2, regularization=0.0001, units=10;, score=-0.215 total time=   3.4s
[CV 3/10] END activation=relu, batch_size=128, dropout=0.0, learning_rate=0.01, n_hidden=2, regularization=0.0001, units=10;, score=-0.220 total time=   2.8s
[CV 4/10] END activation=relu, batch_size=128, dropout=0.0, learning_rate=0.01, n_hidden=2, regularization=0.0001, units=10;, score=-0.247 total time=   3.4s
[CV 5/10] END activation=relu, batch_size=128, dropout=0.0, learning_rate=0.01, n_hidden=2, regularization=0.0001, units=10;, score=-0.195 t

In [8]:
#create dataframe with results of nested CV
list_of_tuples = list(zip(itr_number, inner_results, outer_results, model_params, G_test_list, E_test_list, T_test_list, y_test_list, pred_list, pred_var_list))
CV_dataset = pd.DataFrame(list_of_tuples, columns = ['Iter', 'Valid Score', 'Test Score', 'Model Parms', 'DP_Groups', "Experimental Index", "Time", 'Experimental_Release', 'Predicted_Release','Prediction_Variance'])
CV_dataset['Score_difference'] = abs(CV_dataset['Valid Score'] - CV_dataset['Test Score']) #Groupby dataframe model iterations that best fit the data (i.e., validitaion <= test)
CV_dataset.sort_values(by=['Score_difference', 'Test Score'], ascending=True, inplace=True) 
CV_dataset = CV_dataset.reset_index(drop=True) # Reset index of dataframe
CV_dataset.to_pickle("drive/My Drive/zero_shot_14_feat_NN.pkl") # save dataframe as pickle file


In [9]:
import pickle
# assign the best model paramaters
best_model_params = CV_dataset.iloc[0,3]
# set params from the best model to a class object
best_model = model.set_params(**best_model_params)
best_model = best_model.fit(X, Y)
with open('drive/My Drive/zero_shot_14_feat_NN_model.pkl', 'wb') as file: # Save the Model to pickle file
          pickle.dump(best_model, file)



In [10]:
CV_dataset.describe()

Unnamed: 0,Iter,Valid Score,Test Score,Score_difference
count,10.0,10.0,10.0,10.0
mean,5.5,0.199359,0.220878,0.036656
std,3.02765,0.016593,0.041411,0.044351
min,1.0,0.160964,0.168039,0.002012
25%,3.25,0.193094,0.203141,0.012898
50%,5.5,0.200395,0.214897,0.02472
75%,7.75,0.210006,0.232844,0.032439
max,10.0,0.217717,0.312406,0.151442
