In [1]:
import pandas as pd
print("pandas version: {}". format(pd.__version__))

import numpy as np
print("numpy version: {}". format(np.__version__))

import sklearn
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix, roc_auc_score
print("sklearn version: {}". format(sklearn.__version__))

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
print("keras version: {}". format(keras.__version__))

import tensorflow as tf
print("tensorflow version: {}". format(tf.__version__))

import optuna
print("optuna version: {}". format(optuna.__version__))

import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from mlflow.tracking import MlflowClient
print("mlflow version: {}". format(mlflow.__version__))

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import os
import io
import warnings
warnings.simplefilter('ignore')

pandas version: 1.4.3
numpy version: 1.23.2
sklearn version: 1.0.2
keras version: 2.10.0
tensorflow version: 2.10.0


  from .autonotebook import tqdm as notebook_tqdm


optuna version: 3.0.1
mlflow version: 1.28.0


In [2]:
RANDOM_STATE=42
N_TRAILS=15 # run for x runs
TIMEOUT=600 # run for max 10 minutes (if the last run at 9 minutes runs for 1h, the active run is not killed)

In [3]:
client = MlflowClient()
try:
    experiment = client.create_experiment("Titanic")
except:
    experiment = client.get_experiment_by_name("Titanic").experiment_id

parent_run = client.create_run(experiment_id=experiment)

In [4]:
def load_data():
    # load prepared training and test dataset
    df_train = pd.read_pickle('../03_DataPreprocessing/df_train_prepared_reduced.pkl')
    df_test = pd.read_pickle('../03_DataPreprocessing/df_test_prepared_reduced.pkl')

    # split the training and test dataset to the input features (x_train, x_test) and the survival class (y_train)
    y_train = df_train['Survived']
    x_train = df_train.drop(['Survived'], axis=1)
    x_test = df_test

    x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.3, stratify=y_train, random_state=RANDOM_STATE)

    return x_train, y_train, x_validate, y_validate, x_test

x_train, y_train, x_validate, y_validate, x_test = load_data()

In [5]:
def evaluate_model(y_validate, y_validate_pred, y_validate_scores, child_run):
    """
    evaluate the classification model with
    - classification report
    - precision-recall-curve
    - ROC curve
    """


    def plot_confusion_matrix(y_validate, y_validate_pred):
        group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
        group_counts = ["{0:0.0f}".format(value) for value in
                        confusion_matrix(y_validate, y_validate_pred).flatten()]
        group_percentages = ["{0:.2%}".format(value) for value in
                             confusion_matrix(y_validate, y_validate_pred).flatten()/np.sum(confusion_matrix(y_validate, y_validate_pred))]
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
                  zip(group_names,group_counts,group_percentages)]
        labels = np.asarray(labels).reshape(2,2)

        fig2, ax2 = plt.subplots()
        sns.heatmap(confusion_matrix(y_validate, y_validate_pred), annot=labels, fmt="", cmap='Blues')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        client.log_figure(child_run.info.run_id, fig2, 'plot_confusion_matrix.png')
        plt.close()


    def plot_precision_recall_vs_threshold(y_validate, y_scores, child_run):
        precisions, recalls, thresholds = precision_recall_curve(y_validate, y_scores)

        # convert to f score
        fscore = (2 * precisions * recalls) / (precisions + recalls)
        # locate the index of the largest f score
        ix = np.argmax(fscore)
        
        client.log_metric(child_run.info.run_id, "f1_score", round(fscore[ix], 5))

        fig3, ax3 = plt.subplots()
        ax3.plot(thresholds, precisions[:-1], "b", label="Precision")
        ax3.plot(thresholds, recalls[:-1], "g", label="Recall")
        ax3.plot(thresholds, fscore[:-1], "r", label="F1 Score")
        ax3.axvline(x=thresholds[ix], color='red', linestyle='--')
        plt.axhline(y=precisions[ix], color='b', linestyle='--')
        plt.axhline(y=recalls[ix], color='g', linestyle='--')
        ax3.set_xlabel("Threshold")
        ax3.legend(loc="upper left")
        ax3.set_ylim([0,1])
        client.log_figure(child_run.info.run_id, fig3, 'plot_f1.png')
        plt.close()

        fig4, ax4 = plt.subplots()
        ax4.plot(recalls, precisions, marker='.', label='Logistic')
        ax4.scatter(recalls[ix], precisions[ix], 200, marker='o', color='red', label='Best')
        ax4.set_xlabel('Recall')
        ax4.set_ylabel('Precision')
        client.log_figure(child_run.info.run_id, fig4, 'plot_precision_recall.png')
        plt.close()
        

    def plot_roc_curve(y_validate, y_scores, child_run):
        fpr, tpr, thresholds = roc_curve(y_validate, y_scores)

        roc_auc = round(roc_auc_score(y_validate, y_scores), 3)
        
        optimal_idx = np.argmax(tpr - fpr)

        fig5, ax5 = plt.subplots()
        ax5.plot(fpr, tpr, linewidth=2)
        ax5.plot([0,1], [0,1], 'k--')
        ax5.axis([0,1,0,1])
        ax5.scatter(fpr[optimal_idx], tpr[optimal_idx], 200, marker='o', color='red', label='Best')
        ax5.set_xlabel('False Positive Rate')
        ax5.set_ylabel('True Positive Rate')
        client.log_figure(child_run.info.run_id, fig5, 'plot_roc_curve.png')
        plt.close()

        client.log_metric(child_run.info.run_id, "roc_auc", roc_auc)

        

    plot_confusion_matrix(y_validate, y_validate_pred)
    plot_precision_recall_vs_threshold(y_validate, y_validate_scores, child_run)
    plot_roc_curve(y_validate, y_validate_scores, child_run)

In [6]:
def create_model(trial):

    child_run = client.create_run(
        experiment_id=experiment,
        tags={
            MLFLOW_PARENT_RUN_ID: parent_run.info.run_id
        }
    )


    model = Sequential()
    model.add(keras.Input(shape=(x_train.shape[1],)))

    n_layers = trial.suggest_int('n_layers', 2, 4)
    client.log_param(child_run.info.run_id, "n_layers", n_layers-1)

    for layer in range(1,n_layers):
        n_units = trial.suggest_int('n_units_'+str(layer), 2, 20)
        client.log_param(child_run.info.run_id, 'n_units_'+str(layer), n_units)
        activation = trial.suggest_categorical('activation_'+str(layer), ['relu', 'tanh', 'elu'])
        client.log_param(child_run.info.run_id, 'activation_'+str(layer), activation)
        dropout = trial.suggest_float('dropout', 0.1, 0.6, log=False)
        client.log_param(child_run.info.run_id, 'dropout_'+str(layer), dropout)

        model.add(Dense(units=n_units))
        model.add(BatchNormalization())
        model.add(Activation(activation))
        model.add(Dropout(dropout))

    model.add(Dense(1, activation='sigmoid'))


    lr =  trial.suggest_float('lr', 0.001, 1, log=True)
    client.log_param(child_run.info.run_id, 'lr', lr)
    beta_1 = trial.suggest_float('beta_1', 0.07, 0.999, log=False)
    client.log_param(child_run.info.run_id, 'beta_1', beta_1)
    beta_2 = trial.suggest_float('beta_2', 0.07, 0.999, log=False)
    client.log_param(child_run.info.run_id, 'beta_2', beta_2)

    adam = keras.optimizers.Adam(
        lr=lr,
        beta_1=beta_1,
        beta_2=beta_2,
        amsgrad=False
        )

    # compile the keras model
    model.compile(
        loss='binary_crossentropy',
        optimizer=adam,
        metrics=['accuracy']
        )

    return model, child_run

In [7]:
class Objective:
    
    def __init__(self):
        self.best_model = None
        self._model = None

    
    def __call__(self, trial):
    
        model, child_run = create_model(trial)
        self._model = model

        tensorboard_identifier = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        logdir = os.path.join('tensorboard', tensorboard_identifier)
        client.log_param(child_run.info.run_id, "tensorboard", tensorboard_identifier)
        # file_writer_cm = tf.summary.create_file_writer(logdir + '/cm')

        tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)


        def log_confusion_matrix(epoch, logs):
            y_validate_scores = model.predict(x_validate)
            y_validate_pred = tf.greater(y_validate_scores, .5)

            # con_mat = tf.math.confusion_matrix(labels=y_validate, predictions=y_validate_pred).numpy()
            con_mat = tf.math.confusion_matrix(labels=y_validate, predictions=y_validate_pred)
            con_mat_norm = np.around(con_mat.astype('float') / con_mat.sum(axis=1)[:, np.newaxis], decimals=2)

            con_mat_df = pd.DataFrame(con_mat_norm, index=[0,1], columns=[0,1])

            figure = plt.figure(figsize=(8, 8))
            # sns.heatmap(con_mat_df, annot=True,cmap=plt.cm.Blues)
            sns.heatmap(con_mat_df, annot=True,cmap=plt.cm.Blues)
            plt.tight_layout()
            plt.ylabel('True label')
            plt.xlabel('Predicted label')

            plt.close()

            return figure

        cm_callback = keras.callbacks.LambdaCallback(on_epoch_end=log_confusion_matrix)

        # early stopping
        earlystopping = keras.callbacks.EarlyStopping(
            monitor='accuracy',
            patience=50,
            verbose=1,
            mode='min'
            )


        # fit the keras model on the dataset
        model.fit(
            x_train, y_train,
            epochs=500,
            verbose=0,
            batch_size=10,
            callbacks=[earlystopping, tensorboard_callback],
            validation_data=(x_validate, y_validate)
        )


        # evaluate the keras model
        _, accuracy = model.evaluate(x_validate, y_validate)
        client.log_metric(child_run.info.run_id, "cv_score", accuracy)

        # Keras maodel predicts the probabilities instead of the classes
        y_validate_scores = model.predict(x_validate)

        # set the thresholds for the classes to 0.5 to get the prediction from the probabilities
        y_validate_pred = tf.greater(y_validate_scores, .5)

        evaluate_model(y_validate, y_validate_pred, y_validate_scores, child_run)



        return accuracy

    def callback(self, study, trial):
        if study.best_trial == trial:
            self.best_model = self._model


In [8]:
objective = Objective()

study = optuna.create_study(direction="maximize")

study.optimize(
  objective,
  n_trials=N_TRAILS,
  timeout=TIMEOUT,
  n_jobs=-1,
  callbacks=[objective.callback]
  )

print("Study statistics: ")
print("Number of finished trials: ", len(study.trials))

print("Best trial:")
print(study.best_value)
print(study.best_params)

client.log_metric(parent_run.info.run_id, "best_cv_score", round(study.best_value, 3))

for param in study.best_params:
  client.log_param(parent_run.info.run_id, param, study.best_params[param])


mlflow.end_run()

[32m[I 2022-12-21 16:01:36,172][0m A new study created in memory with name: no-name-881711d0-0c45-4e58-854a-072b827898d6[0m


Epoch 51: early stopping


[32m[I 2022-12-21 16:03:17,064][0m Trial 2 finished with value: 0.8208954930305481 and parameters: {'n_layers': 3, 'n_units_1': 18, 'activation_1': 'elu', 'dropout': 0.3779191572184276, 'n_units_2': 9, 'activation_2': 'elu', 'lr': 0.0010111987333253133, 'beta_1': 0.143768801912389, 'beta_2': 0.8853502781328588}. Best is trial 2 with value: 0.8208954930305481.[0m


Epoch 51: early stopping
Epoch 51: early stopping
Epoch 51: early stopping

[32m[I 2022-12-21 16:03:50,690][0m Trial 1 finished with value: 0.8320895433425903 and parameters: {'n_layers': 4, 'n_units_1': 16, 'activation_1': 'elu', 'dropout': 0.5683430274460587, 'n_units_2': 10, 'activation_2': 'elu', 'n_units_3': 12, 'activation_3': 'relu', 'lr': 0.0037239342799583195, 'beta_1': 0.7324227022960905, 'beta_2': 0.4064271364405164}. Best is trial 1 with value: 0.8320895433425903.[0m




[32m[I 2022-12-21 16:03:53,029][0m Trial 3 finished with value: 0.8246268630027771 and parameters: {'n_layers': 4, 'n_units_1': 4, 'activation_1': 'elu', 'dropout': 0.5895037358489195, 'n_units_2': 7, 'activation_2': 'relu', 'n_units_3': 12, 'activation_3': 'tanh', 'lr': 0.005747690769275089, 'beta_1': 0.12269449796367704, 'beta_2': 0.8512358165851126}. Best is trial 1 with value: 0.8320895433425903.[0m
[32m[I 2022-12-21 16:03:53,716][0m Trial 0 finished with value: 0.38432836532592773 and parameters: {'n_layers': 4, 'n_units_1': 15, 'activation_1': 'tanh', 'dropout': 0.2536531284787026, 'n_units_2': 18, 'activation_2': 'relu', 'n_units_3': 19, 'activation_3': 'elu', 'lr': 0.34846146998420796, 'beta_1': 0.3059587284215278, 'beta_2': 0.16876721565856231}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 51: early stopping


[32m[I 2022-12-21 16:05:13,504][0m Trial 4 finished with value: 0.7985074520111084 and parameters: {'n_layers': 3, 'n_units_1': 9, 'activation_1': 'elu', 'dropout': 0.29750147594934895, 'n_units_2': 6, 'activation_2': 'relu', 'lr': 0.15970982734446193, 'beta_1': 0.3853410690185057, 'beta_2': 0.5811693357912586}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 51: early stopping


[32m[I 2022-12-21 16:06:12,218][0m Trial 6 finished with value: 0.8283582329750061 and parameters: {'n_layers': 4, 'n_units_1': 16, 'activation_1': 'relu', 'dropout': 0.1934842975066838, 'n_units_2': 19, 'activation_2': 'tanh', 'n_units_3': 10, 'activation_3': 'tanh', 'lr': 0.022156683988772834, 'beta_1': 0.22350593063400473, 'beta_2': 0.9203210781045341}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 101: early stopping


[32m[I 2022-12-21 16:06:46,543][0m Trial 5 finished with value: 0.6156716346740723 and parameters: {'n_layers': 2, 'n_units_1': 7, 'activation_1': 'tanh', 'dropout': 0.4488819880300666, 'lr': 0.21375310386529037, 'beta_1': 0.7818501873952599, 'beta_2': 0.2747653319555869}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 70: early stopping


[32m[I 2022-12-21 16:07:04,727][0m Trial 7 finished with value: 0.6156716346740723 and parameters: {'n_layers': 4, 'n_units_1': 10, 'activation_1': 'elu', 'dropout': 0.4709470046803286, 'n_units_2': 14, 'activation_2': 'elu', 'n_units_3': 11, 'activation_3': 'relu', 'lr': 0.3396087698935784, 'beta_1': 0.964796220425802, 'beta_2': 0.7750375432278382}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 59: early stopping


[32m[I 2022-12-21 16:07:43,569][0m Trial 8 finished with value: 0.6156716346740723 and parameters: {'n_layers': 4, 'n_units_1': 2, 'activation_1': 'tanh', 'dropout': 0.14632122345738033, 'n_units_2': 13, 'activation_2': 'relu', 'n_units_3': 2, 'activation_3': 'relu', 'lr': 0.4035938299156219, 'beta_1': 0.10942466587181997, 'beta_2': 0.445072843797815}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 51: early stopping


[32m[I 2022-12-21 16:08:22,288][0m Trial 9 finished with value: 0.8320895433425903 and parameters: {'n_layers': 4, 'n_units_1': 19, 'activation_1': 'elu', 'dropout': 0.13983720885701026, 'n_units_2': 3, 'activation_2': 'relu', 'n_units_3': 14, 'activation_3': 'tanh', 'lr': 0.0023934986093148026, 'beta_1': 0.54165851190686, 'beta_2': 0.18093510265496526}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 51: early stopping


[32m[I 2022-12-21 16:08:55,079][0m Trial 10 finished with value: 0.8320895433425903 and parameters: {'n_layers': 4, 'n_units_1': 20, 'activation_1': 'relu', 'dropout': 0.39593075007526035, 'n_units_2': 9, 'activation_2': 'tanh', 'n_units_3': 14, 'activation_3': 'elu', 'lr': 0.0292863122058019, 'beta_1': 0.1976044375875237, 'beta_2': 0.8171166867458859}. Best is trial 1 with value: 0.8320895433425903.[0m


Epoch 51: early stopping


[32m[I 2022-12-21 16:09:40,916][0m Trial 13 finished with value: 0.8358209133148193 and parameters: {'n_layers': 2, 'n_units_1': 13, 'activation_1': 'relu', 'dropout': 0.5930212250055682, 'lr': 0.016921437444966523, 'beta_1': 0.6038741027426374, 'beta_2': 0.46375704065030976}. Best is trial 13 with value: 0.8358209133148193.[0m


Epoch 59: early stopping


[32m[I 2022-12-21 16:10:11,525][0m Trial 12 finished with value: 0.6156716346740723 and parameters: {'n_layers': 4, 'n_units_1': 18, 'activation_1': 'elu', 'dropout': 0.2985476157071377, 'n_units_2': 5, 'activation_2': 'relu', 'n_units_3': 8, 'activation_3': 'tanh', 'lr': 0.45298553213259596, 'beta_1': 0.9484008114205389, 'beta_2': 0.6660456847883547}. Best is trial 13 with value: 0.8358209133148193.[0m


Epoch 51: early stopping


[32m[I 2022-12-21 16:10:22,571][0m Trial 14 finished with value: 0.8171641826629639 and parameters: {'n_layers': 3, 'n_units_1': 14, 'activation_1': 'elu', 'dropout': 0.5897613566173392, 'n_units_2': 3, 'activation_2': 'elu', 'lr': 0.0015648931714226583, 'beta_1': 0.6211998306894324, 'beta_2': 0.07487705569906473}. Best is trial 13 with value: 0.8358209133148193.[0m


Epoch 94: early stopping


[32m[I 2022-12-21 16:10:28,661][0m Trial 11 finished with value: 0.33208954334259033 and parameters: {'n_layers': 4, 'n_units_1': 10, 'activation_1': 'relu', 'dropout': 0.37945740575737663, 'n_units_2': 14, 'activation_2': 'relu', 'n_units_3': 4, 'activation_3': 'elu', 'lr': 0.48449485245617707, 'beta_1': 0.6353848847220178, 'beta_2': 0.44247648956535984}. Best is trial 13 with value: 0.8358209133148193.[0m


Study statistics: 
Number of finished trials:  15
Best trial:
0.8358209133148193
{'n_layers': 2, 'n_units_1': 13, 'activation_1': 'relu', 'dropout': 0.5930212250055682, 'lr': 0.016921437444966523, 'beta_1': 0.6038741027426374, 'beta_2': 0.46375704065030976}
