# Second deep learning experiment

* MLP with a single hidden layer of 300 units
* Optimizer: SGD with a learning rate of 0.01
* Custom batch generator: balanced minibatch generator with the following parameters:
    * positive_sample_perc: positives to sample
    * np_ratio: negative-positive ratio in minibatch
    * negative_perc: whether to randomly undersample negative data
* Hyperparameters:
    * positive_sample_perc
    * np_ratio
    * negative_perc
* Sigmoid output activation function
* Hidden layer activation: sigmoid
* Loss function: hinge loss (`hingesig_tf`)
* Weights initializer: glorot uniform
* Epochs: 10

The training and test sets are not altered.

### Libraries
* keras:2.2.0
* scikit-learn:0.19.1
* pandas:0.23.0
* numpy:1.14.5


In [1]:
import os
# Environment variables are set before TensorFlow is imported below.
# Order GPU devices by PCI bus ID.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# An empty device list hides every GPU from CUDA; together with
# device_count={"GPU": 0} in the session config below, the run is CPU-only.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926
os.environ['PYTHONHASHSEED'] = '0'
import numpy as np
import tensorflow as tf
import random as rn

# Seed shared by the NumPy, Python `random` and TensorFlow generators below
# (and by the weight initializer in create_model).
my_seed = 2024

np.random.seed(my_seed)

rn.seed(my_seed)

# Device-placement logging is off by default; it is switched on when the
# literal argument "log_device_tf" is present on the command line.
log_device_placement = False
import sys
if "log_device_tf" in sys.argv: 
    log_device_placement = True

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1, device_count = {"GPU" : 0},
                              log_device_placement=log_device_placement)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(my_seed)

# Create the session with the single-threaded, CPU-only configuration and
# register it as the session Keras uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
import keras
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.optimizers import SGD
from numpy.random import seed
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from sklearn.model_selection import cross_validate
from sklearn.metrics import average_precision_score
import copy
import matplotlib.pyplot as plt
import theano.tensor as T
import seaborn as sns
from bioinformatics_helpers.utils import interpolated_precision_recall_curve as pr_curve
from bioinformatics_helpers.utils import hingesig_tf
from bioinformatics_helpers.utils import get_mendelian_dataset
from bioinformatics_helpers.balanced_generator import BalancedGenerator
from bioinformatics_helpers.utils import CustomKerasClassifier

Using TensorFlow backend.


Definiamo la griglia dei parametri da ricercare e il numero di feature per ogni esempio (il seed numpy per i numeri casuali è già stato impostato nella prima cella).

In [2]:
# Hyperparameter grid explored by the grid search: one entry per
# balanced-minibatch-generator parameter listed in the notebook header.
params = dict(
    positive_sample_perc=[0.25, 0.5, 0.75, 1, 1.25, 1.5],
    np_ratio=[0.5, 1, 1.5, 3, 5],
    negative_perc=[0.25, 0.50, 1],
)

# Number of input features per example (input dimension of the network).
feature_per_example = 26



Keras per funzionare con scikit mette a disposizione la classe `KerasClassifier` che fa da wrapper. Ha bisogno di una funzione che crea e compila il modello

In [3]:
def create_model():
    """Build and compile the one-hidden-layer MLP used in this experiment.

    The network has a 300-unit sigmoid hidden layer fed with
    ``feature_per_example`` inputs, followed by a single sigmoid output
    unit. Both layers use a Glorot-uniform kernel initializer seeded with
    ``my_seed``. The model is compiled with plain SGD (learning rate 0.01,
    no decay, no momentum) and the ``hingesig_tf`` loss.

    Returns:
        The compiled ``Sequential`` model.
    """
    weight_init = keras.initializers.glorot_uniform(seed=my_seed)

    hidden_layer = Dense(
        300,
        input_dim=feature_per_example,
        kernel_initializer=weight_init,
        activation="sigmoid",
    )
    output_layer = Dense(
        1,
        kernel_initializer=weight_init,
        activation='sigmoid',
    )

    network = Sequential()
    for layer in (hidden_layer, output_layer):
        network.add(layer)

    sgd = SGD(lr=0.01, decay=0, momentum=0, nesterov=False)
    network.compile(loss=hingesig_tf, optimizer=sgd)
    return network

In [4]:
# Load the predefined train/test split through the project helper.
# NOTE(review): presumably *_X are feature matrices with `feature_per_example`
# columns and *_y the matching binary labels — confirm against
# bioinformatics_helpers.utils.get_mendelian_dataset.
train_X, train_y, test_X, test_y = get_mendelian_dataset()

In [5]:
# Stack the train and test portions into one array pair; the original split
# is kept as two explicit index arrays (train rows first, test rows after).
cv_X = np.concatenate((train_X, test_X))
cv_y = np.concatenate((train_y, test_y))
train_idx = np.arange(len(train_X))
test_idx = np.arange(len(train_X), len(cv_X))

# Sanity checks: the index arrays must map back onto the original rows.
assert (cv_X[train_idx[-1]] == train_X[-1]).all()
assert (cv_X[test_idx[-1]] == test_X[-1]).all()
assert (cv_X[test_idx[0]] == test_X[0]).all()

Creiamo la funzione di scoring, calcolando l'AUC per la precision recall curve; lasciamo `reorder=False` (il valore predefinito di `auc`) perché non è una curva "ascendente", come nel caso della curva ROC.

In [6]:
def prc_score(y_true, y_pred):
    """Return the area under the precision-recall curve.

    Args:
        y_true: ground-truth labels, forwarded to ``precision_recall_curve``.
        y_pred: predicted scores, forwarded as ``probas_pred``.

    Returns:
        The value of ``auc`` computed with recall on the x axis and
        precision on the y axis.
    """
    curve = precision_recall_curve(y_true=y_true, probas_pred=y_pred)
    # The third element (thresholds) is not needed for the area.
    precision_values, recall_values = curve[0], curve[1]
    return auc(x=recall_values, y=precision_values)

# Scorers handed to the grid search; every metric is wrapped with
# needs_threshold=True so it receives continuous scores, not class labels.
scoring = {
    label: make_scorer(metric, needs_threshold=True)
    for label, metric in (
        ('AU_PRC', prc_score),
        ('AU_ROC', roc_auc_score),
        ('AVG_PREC', average_precision_score),
    )
}

In [None]:
# Wrap the Keras model builder so scikit-learn can drive it; the generator
# class (not an instance) is passed to the wrapper.
gen = BalancedGenerator
model = CustomKerasClassifier(
    build_fn=create_model,
    generator=gen,
    verbose=1,
    shuffle=False,
)

# The same fixed train/test split is supplied four times as the "folds".
grid_search = GridSearchCV(
    model,
    params,
    cv=[(train_idx, test_idx)] * 4,
    scoring=scoring,
    refit=False,
    return_train_score=True,
    n_jobs=1,
)
grid_search.fit(cv_X, cv_y)

# Persist the full cv_results_ table for later analysis.
cv_results = pd.DataFrame.from_dict(grid_search.cv_results_)
cv_results.to_csv("cv_results_mb_gen.csv", index=False)
