In [1]:
import os
# Environment variables must be set before TensorFlow is imported below.
# Enumerate CUDA devices in PCI bus order.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# An empty device list hides every GPU, so all computation runs on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
# assigning it here presumably only affects child processes; export it before
# launching the kernel if this process must be covered -- confirm.
os.environ['PYTHONHASHSEED'] = '0'
import numpy as np
import tensorflow as tf
import random as rn

# References for the reproducibility recipe used in this cell:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926
#
# Single seed shared by NumPy, Python's `random`, TensorFlow and the Keras
# weight initializer.
my_seed = 2024

np.random.seed(my_seed)

rn.seed(my_seed)

# Device-placement logging is off unless the token "log_device_tf" appears in
# sys.argv.
# NOTE(review): inside a Jupyter kernel sys.argv holds the kernel's own launch
# arguments, so this switch presumably only matters when the code is run as a
# script -- confirm.
log_device_placement = False
import sys
if "log_device_tf" in sys.argv: 
    log_device_placement = True

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
# device_count={"GPU": 0} additionally tells the session to use no GPU devices.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1, device_count = {"GPU" : 0},
                              log_device_placement=log_device_placement)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(my_seed)

# Build the session on the default graph with the single-threaded, CPU-only
# configuration and register it as the session Keras uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
import keras
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.optimizers import SGD
from numpy.random import seed
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from sklearn.model_selection import cross_validate
from sklearn.metrics import average_precision_score
import copy
import matplotlib.pyplot as plt
import theano.tensor as T
import seaborn as sns
from bioinformatics_helpers.utils import interpolated_precision_recall_curve as pr_curve
from bioinformatics_helpers.utils import hingesig_tf
from bioinformatics_helpers.utils import get_mendelian_dataset
from bioinformatics_helpers.balanced_generator import BalancedGenerator
from bioinformatics_helpers.utils import CustomKerasClassifier
from bioinformatics_helpers.utils import GetAUPRCCallback

Using TensorFlow backend.


In [2]:
# Width of the network's input layer (number of features per example).
feature_per_example = 26


def create_model():
    """Build and compile the two-layer sigmoid network.

    The network is a Keras ``Sequential`` model with a 300-unit sigmoid
    hidden layer fed by ``feature_per_example`` inputs, followed by a single
    sigmoid output unit. Both layers share one Glorot-uniform initializer
    seeded with ``my_seed``. The model is compiled with the ``hingesig_tf``
    loss and plain SGD (learning rate 0.01, no decay, no momentum).

    Returns:
        The compiled ``Sequential`` model.
    """
    net = Sequential()
    glorot = keras.initializers.glorot_uniform(seed=my_seed)

    hidden_layer = Dense(
        units=300,
        activation="sigmoid",
        kernel_initializer=glorot,
        input_dim=feature_per_example,
    )
    net.add(hidden_layer)

    output_layer = Dense(
        units=1,
        activation='sigmoid',
        kernel_initializer=glorot,
    )
    net.add(output_layer)

    net.compile(
        optimizer=SGD(lr=0.01, decay=0, momentum=0, nesterov=False),
        loss=hingesig_tf,
    )
    return net


In [3]:
# Load the data: the four objects returned by get_mendelian_dataset() are
# unpacked as train/test features (X) and labels (y).
# NOTE(review): shapes and dtypes are decided by the helper -- presumably the
# feature matrices have 26 columns to match the model's input_dim; confirm.
train_X, train_y, test_X, test_y = get_mendelian_dataset()


In [6]:
import ast

# Load the saved cross-validation results and rank the rows by mean training
# AU-PRC, best first.
cv_results_sample_negative = pd.read_csv("../data/cv_results_mb_gen.csv")
ordered_cv_results = cv_results_sample_negative.sort_values("mean_train_AU_PRC", ascending=False)

# The "params" column stores each hyper-parameter dict as its string repr.
# Parse it with ast.literal_eval rather than eval(): literal_eval accepts only
# Python literals, so a tampered or corrupted CSV cannot execute arbitrary code.
# Each parsed dict is then expanded into one column per hyper-parameter.
series = ordered_cv_results["params"].apply(ast.literal_eval).apply(pd.Series)

# One column per hyper-parameter, still ordered best-first. (The previous
# concat-with-"params"-then-drop-"params" round trip produced exactly this.)
sliced_results = series.copy()

# head() keeps the first five rows, i.e. the five best-ranked combinations.
top_5 = sliced_results.head()

In [None]:
# Folder that receives three CSV files per hyper-parameter combination.
base_data_folder = "300_epochs_data_best_train_scores"
# Create the folder up front: without it the first to_csv() call would fail
# only after a complete 300-epoch training run, losing that run's results.
os.makedirs(base_data_folder, exist_ok=True)

for (idx, row) in top_5.iterrows():
    # NOTE(review): iterrows() yields each row as a single-dtype Series, so
    # integer hyper-parameters may arrive here as floats -- confirm that
    # CustomKerasClassifier accepts them.
    params = row.to_dict()
    print("BEGIN TRAINING WITH: ", params)
    gen = BalancedGenerator
    model = CustomKerasClassifier(build_fn=create_model, generator=gen, epochs=300, verbose=1, shuffle=False, **params)
    # The callback is given both splits; its train_AUPRC / test_AUPRC
    # attributes are read after fit() (presumably one value per epoch).
    callback = GetAUPRCCallback(train_X=train_X, train_y=train_y, test_X=test_X, test_y=test_y)
    history = model.fit(train_X, train_y, callbacks=[callback])
    # Keep column 1 of predict_proba (assumed to be the positive-class
    # probability, following the scikit-learn convention).
    test_probas = model.predict_proba(test_X)[:, 1]
    train_probas = model.predict_proba(train_X)[:, 1]

    # Suffix shared by the three output files of this run.
    file_suffix = "npr_{}_psp_{}_np_{}.csv".format(
        params["np_ratio"],
        params["positive_sample_perc"],
        params["negative_perc"]
    )

    # saving train history (loss and AU-PRC curves, plus the parameters used)
    data = pd.DataFrame.from_dict({
        "loss": history.history["loss"],
        "train_history": callback.train_AUPRC,
        "test_history": callback.test_AUPRC,
        **params
    })
    filename = "MLP_gen_" + file_suffix
    data.to_csv(os.path.join(base_data_folder, filename), index=False)

    # saving test scores
    filename = "MLP_gen_test_scores_" + file_suffix
    pd.Series(test_probas).to_csv(os.path.join(base_data_folder, filename), index=False)

    # saving train scores
    filename = "MLP_gen_train_scores_" + file_suffix
    pd.Series(train_probas).to_csv(os.path.join(base_data_folder, filename), index=False)