In [1]:
import ast
import random as rn

import numpy as np
import tensorflow as tf

# Setting PYTHONHASHSEED is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926
# NOTE: the interpreter reads PYTHONHASHSEED at start-up, so the assignment
# below only reaches processes spawned from this kernel; export the variable
# before launching Jupyter to fix hashing inside the kernel itself.
my_seed = 2024
import os
os.environ['PYTHONHASHSEED'] = '0'

# Start Numpy generated random numbers in a well-defined initial state.
np.random.seed(my_seed)

# Start core Python generated random numbers in a well-defined state.
rn.seed(my_seed)

# Force TensorFlow to use a single thread.
# Multiple threads are a potential source of non-reproducible results.
# Both thread pools must be set to 1: a value of 0 lets TensorFlow pick the
# number of threads itself, which is faster but re-introduces the
# non-determinism this cell is meant to remove.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.set_random_seed(my_seed)

# Register a session built from the single-threaded configuration as the
# one the Keras backend uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Re-seed core Python's generator after the Keras import and session setup,
# presumably so later cells start from the same state regardless of any
# draws made above -- TODO confirm this second call is still needed.
rn.seed(my_seed)
#--- all other imports
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
import keras.backend
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.optimizers import SGD, Adam
from numpy.random import seed
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from sklearn.model_selection import cross_validate
from sklearn.metrics import average_precision_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import copy
from bioinformatics_helpers.utils import get_mendelian_dataset
from bioinformatics_helpers.utils import hingesig_tf

Using TensorFlow backend.


In [2]:
# Read the cross-validation results of the previous experiment and rank the
# candidates by mean training AU_PRC, best first.
cv_results = pd.read_csv("cv_results_scaler_adam.csv").sort_values(
    "mean_train_AU_PRC", ascending=False
)
# Show the ten best-ranked parameter sets.
cv_results[["params", "mean_train_AU_PRC"]].head(10)

Unnamed: 0,params,mean_train_AU_PRC
7,"{'model__architecture': (100, 80)}",0.993681
8,"{'model__architecture': (100, 40)}",0.984577
16,"{'model__architecture': (100, 80, 40)}",0.979844
17,"{'model__architecture': (100, 40, 20)}",0.966692
9,"{'model__architecture': (100, 10)}",0.954238
18,"{'model__architecture': (80, 40, 20)}",0.949957
23,"{'model__architecture': (100, 80, 50, 20)}",0.938269
6,"{'model__architecture': (100,)}",0.92805
19,"{'model__architecture': (80, 20, 10)}",0.927387
10,"{'model__architecture': (40, 20)}",0.925833


In [3]:
# Build the grid from the first ten rows of cv_results, i.e. the ten
# architectures ranked best by the cell that loaded and sorted the frame.
# The CSV stores each architecture as text such as "(100, 80)".
# ast.literal_eval only parses Python literals, so unlike eval it cannot
# execute code that might be embedded in the file.
params = {
            "model__architecture": [
                ast.literal_eval(architecture)
                for architecture in cv_results.head(10)["param_model__architecture"]
            ],
            "model__dropout_rate": [0, 0.2, 0.4, 0.6, 0.8],
            "model__epochs": [150]
         }
# Srivastava et al. (2014) suggest it is better to set the dropout rate between 0.4 and 0.8,
# but our dataset is quite different from the ones they used, so it is worth seeing how it works here.
# A dropout rate of 0 is included to have an immediate no-dropout comparison.

In [None]:
# Number of input features per example; create_model passes it as input_dim
# to the first Dense layer.
feature_per_example = 26
# Batch size handed to KerasClassifier when the estimator is constructed.
batch_size = 5000

In [None]:
def create_model(architecture=(100,80), dropout_rate=0.2):
    """Build and compile a feed-forward binary classifier.

    Parameters
    ----------
    architecture : iterable of int
        Number of units of each hidden layer, in order. Every hidden layer
        is a ReLU Dense layer followed by a Dropout layer.
    dropout_rate : float
        Rate given to every Dropout layer.

    Returns
    -------
    A compiled Sequential model ending in a single sigmoid unit, using the
    project loss hingesig_tf and an Adam optimizer with default settings.
    """
    network = Sequential()
    # The same seeded initializer objects are shared by all layers.
    kernel_init = keras.initializers.glorot_normal(seed=my_seed)
    hidden_bias_init = keras.initializers.RandomNormal(mean=0.1, stddev=0.05, seed=my_seed)

    for position, units in enumerate(architecture):
        # Only the first hidden layer is told the feature count; later
        # layers get input_dim=None and Keras infers their input size.
        layer_input_dim = feature_per_example if position == 0 else None
        hidden_layer = Dense(
            units,
            input_dim=layer_input_dim,
            kernel_initializer=kernel_init,
            bias_initializer=hidden_bias_init,
            activation="relu",
        )
        network.add(hidden_layer)
        network.add(Dropout(rate=dropout_rate, seed=my_seed))

    # Output layer: one sigmoid unit with zero-initialized bias.
    output_layer = Dense(
        1,
        kernel_initializer=kernel_init,
        bias_initializer=keras.initializers.zeros(),
        activation='sigmoid',
    )
    network.add(output_layer)

    network.compile(loss=hingesig_tf, optimizer=Adam())
    return network

In [None]:
# Load the data through the project helper, which yields four objects.
# Presumably these are features and labels for the train and test splits;
# verify against bioinformatics_helpers.utils.get_mendelian_dataset.
train_X, train_y, test_X, test_y = get_mendelian_dataset()

def prc_score(y_true, y_pred):
    """Return the area under the precision-recall curve.

    The curve is computed by precision_recall_curve from the true labels
    and the predicted scores; recall is then passed to auc as the x axis
    and precision as the y axis.
    """
    curve = precision_recall_curve(y_true=y_true, probas_pred=y_pred)
    precision_values, recall_values = curve[0], curve[1]
    return auc(x=recall_values, y=precision_values)

# Metrics reported by the grid search. Each one is wrapped by make_scorer
# with needs_threshold=True.
scoring = {
    scorer_name: make_scorer(metric_fn, needs_threshold=True)
    for scorer_name, metric_fn in (
        ('AVG_PREC', average_precision_score),
        ('AU_PRC', prc_score),
        ('AU_ROC', roc_auc_score),
    )
}

In [None]:
# Wrap the Keras network builder so scikit-learn can use it as an estimator.
model = KerasClassifier(
    build_fn=create_model,
    epochs=150,
    batch_size=batch_size,
    shuffle=True,
    verbose=0,
)
# Scaling is a pipeline step, so it is fitted together with the model.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("model", model)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=my_seed)

# refit=False: only the cross-validation scores are wanted, no final model.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring=scoring,
    cv=cv,
    refit=False,
    return_train_score=True,
)
grid_search.fit(train_X, train_y)

# Save cv_results_ as a CSV file.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.to_csv("cv_results_scaler_adam_dropout_top_ten_train.csv")