# Un MLP più semplice per la predizione

L'MLP di questo esperimento avrà fissati i seguenti parametri:
* Ottimizzatore: SGD con i parametri di default
* funzione di perdita: hinge loss
* funzione di attivazione: sigmoid
* architettura della rete: { [(2), (5), (10), (20), (40), (80),(100),(100, 80), (100, 40), (100, 10), (40, 20), (40,10), (20,10), (20,5), (10, 5), (10,2), (100, 80, 40), (100,40,20),(80,40,20), (80, 20,10), (40,20,10),(20,10,5), (10,5,2), (100,80,50,20), (100,50,25,10), (80, 60, 20,10), (50, 30, 20, 10),(30,15,7,3) ]}
                           
* numero di epoche: 150
* dimensione batch: 5000, così abbiamo una maggiore probabilità di avere un positivo nel minibatch

Il training set verrà normalizzato in modo da avere media 0 e varianza unitaria. Se l'esperimento va bene, è consigliabile provare ad effettuare una feature reduction.
    

In [None]:
# Reproducibility / device setup. The order of the statements below matters:
# the random seeds and the CUDA environment variables are set before
# TensorFlow and Keras are imported and before the session is created.
my_seed = 2024
import numpy as np

# Seed NumPy's global random generator.
np.random.seed(my_seed)
import random as rn
# Seed Python's built-in random module.
rn.seed(my_seed)
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# An empty CUDA_VISIBLE_DEVICES hides every GPU from this process.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it
# here presumably only affects child processes, not this one -- confirm.
os.environ['PYTHONHASHSEED'] = '0'
import tensorflow as tf
# Seed TensorFlow's graph-level random generator (TF1 API).
tf.set_random_seed(my_seed)

# Device-placement logging is off unless "log_device_tf" is given on the
# command line.
log_device_placement = False
import sys
if "log_device_tf" in sys.argv: 
    log_device_placement = True

# Session configuration: no GPU devices, thread pools sized by TensorFlow.
# NOTE(review): a thread count of 0 lets TensorFlow choose the pool size;
# multi-threaded execution may make runs non-deterministic despite the seeds
# above -- confirm whether single-threaded execution was intended.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=0, inter_op_parallelism_threads=0, device_count = {"GPU" : 0},
                              log_device_placement=log_device_placement)
import keras
from keras import backend as K

# Make Keras use a session built from the configuration above.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

from keras.optimizers import Adam, SGD, Adadelta
from keras.wrappers.scikit_learn import KerasClassifier
from bioinformatics_helpers.utils import get_mendelian_dataset
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from keras import Sequential
from keras.layers import Dense, Activation, LeakyReLU
from bioinformatics_helpers.utils import hingesig_tf
from bioinformatics_helpers.utils import ExhaustiveSearch
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Hyper-parameter grid for the search. The "model__" prefix addresses the
# `architecture` argument of the pipeline step named "model"; each tuple lists
# the number of units of every hidden layer, in order.
params = {
    "model__architecture": [
        # one hidden layer
        (2,), (5,), (10,), (20,), (40,), (80,), (100,),
        # two hidden layers
        (100, 80), (100, 40), (100, 10),
        (40, 20), (40, 10),
        (20, 10), (20, 5),
        (10, 5), (10, 2),
        # three hidden layers
        (100, 80, 40), (100, 40, 20),
        (80, 40, 20), (80, 20, 10),
        (40, 20, 10), (20, 10, 5), (10, 5, 2),
        # four hidden layers
        (100, 80, 50, 20), (100, 50, 25, 10),
        (80, 60, 20, 10), (50, 30, 20, 10), (30, 15, 7, 3),
    ],
}

# Number of input features per example (used as the network's input size).
feature_per_example = 26

# Mini-batch size used when fitting the network.
batch_size = 5000

In [None]:

def create_model(architecture=(10,)):
    """Build and compile a fully connected binary classifier.

    Parameters
    ----------
    architecture : sequence of int
        Number of units of each hidden layer, in order. An empty sequence
        yields a model made of the output unit only.

    Returns
    -------
    keras.Sequential
        A compiled model: ReLU hidden layers followed by a single sigmoid
        output unit, trained with default-parameter SGD on the project's
        ``hingesig_tf`` loss.

    Notes
    -----
    Reads the module-level ``my_seed`` (initializer seed) and
    ``feature_per_example`` (input size).
    """
    # NOTE(review): hidden layers use ReLU and only the output unit is a
    # sigmoid -- confirm this is the activation intended for the experiment.
    model = Sequential()
    weights_initializer = keras.initializers.glorot_normal(seed=my_seed)
    bias_init = keras.initializers.RandomNormal(mean=0.1, stddev=0.05, seed=my_seed)

    # Only the first layer added to a Sequential model needs the input size;
    # later layers infer it from the previous layer's output.
    first_layer_kwargs = {"input_dim": feature_per_example}

    for units in architecture:
        model.add(
            Dense(
                units,
                kernel_initializer=weights_initializer,
                bias_initializer=bias_init,
                activation="relu",
                **first_layer_kwargs
            )
        )
        # The input size has been declared; do not repeat it on deeper layers.
        first_layer_kwargs = {}

    # Output unit. It still carries the input size when `architecture` is
    # empty, so the model always knows its input shape.
    model.add(
        Dense(
            1,
            kernel_initializer=weights_initializer,
            bias_initializer=keras.initializers.zeros(),
            activation='sigmoid',
            **first_layer_kwargs
        )
    )

    optimizer = SGD()
    model.compile(loss=hingesig_tf, optimizer=optimizer)
    return model

In [None]:
# Load the dataset through the project helper, which returns four values.
# Presumably these are the train/test features and labels, as the names
# suggest -- verify against bioinformatics_helpers.utils.get_mendelian_dataset.
train_X, train_y, test_X, test_y = get_mendelian_dataset()


In [None]:
def prc_score(y_true, y_pred):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Continuous scores for the positive class (the scorer built from this
        function is created with ``needs_threshold=True``).

    Returns
    -------
    float
        Area under the precision-recall curve, integrated with the
        trapezoidal rule over the (recall, precision) points.
    """
    # The scores are passed positionally because their keyword name differs
    # between scikit-learn releases (`probas_pred` was renamed to `y_score`).
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(x=recall, y=precision)

# Metrics evaluated on every cross-validation fold. All of them are computed
# from continuous decision scores rather than hard class predictions.
scoring = {
    metric_name: make_scorer(metric_fn, needs_threshold=True)
    for metric_name, metric_fn in (
        ('AVG_PREC', average_precision_score),
        ('AU_PRC', prc_score),
        ('AU_ROC', roc_auc_score),
    )
}

In [None]:
# Wrap the Keras model builder so it can be used as a scikit-learn estimator.
model = KerasClassifier(
    build_fn=create_model,
    epochs=150,
    batch_size=batch_size,
    shuffle=True,
    verbose=1,
)

# Single-step pipeline; the step name "model" is what the "model__" prefix in
# the parameter grid refers to.
pipe = Pipeline([("model", model)])

# Seeded, shuffled, stratified 5-fold cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=my_seed)

# Exhaustive search over the grid. refit=False: no final model is refitted,
# only the per-fold scores are collected.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring=scoring,
    cv=cv,
    refit=False,
    return_train_score=True,
)
grid_search.fit(train_X, train_y)

# Keep the cross-validation results as a DataFrame.
cv_results = pd.DataFrame.from_dict(grid_search.cv_results_)



In [None]:
# Rebuild the results table from the fitted search and write it to disk as
# CSV, without the row index.
cv_results = pd.DataFrame.from_dict(
    grid_search.cv_results_
)
cv_results.to_csv(
    "cv_results_no_scaler_SGD.csv",
    index=False,
)