In [1]:
import os

import numpy as np
import pandas as pd
from scipy.stats import median_absolute_deviation
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras 
import tensorflow_addons as tfa
import random
from keras.regularizers import l2
random_state = 1234
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot as plt
import optuna
from pathlib import Path
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [13]:
parent_dir = Path(os.getcwd()).parent.parent
cross_validation_dir = os.path.join(parent_dir, "Data", "train_test_indices.npy")
data_dir = os.path.join(parent_dir, "Data" , "BCAST_train")
train_test_indices = np.load(cross_validation_dir, allow_pickle = True)

In [4]:
def list_files_in_dir(dirname):
    dir_files = list()
    for root, _, files in os.walk(dirname):
        for file in files:
            dir_files.append(os.path.join(root, file))
    
    return dir_files

In [5]:
def build_mlp_model(input_shape=(96,), n_hidden_layers=2, n_hidden_nodes=16, 
                    activation="relu", learning_rate=0.001, weight_decay = 0, l2_kernel = 0.01, l2_bias = 0.01):
    # optimizer parameters
    loss = "binary_crossentropy"
    optimizer = tfa.optimizers.AdamW(learning_rate=learning_rate, weight_decay = weight_decay)
    metrics = keras.metrics.AUC(name='auc')
    
    # ANN model
    model = keras.models.Sequential()
    model.add(keras.layers.Flatten(input_shape=input_shape))
    for _ in range(n_hidden_layers):
        model.add(keras.layers.Dense(n_hidden_nodes, activation=activation))
    
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    
    # optimizer
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    
    return model

In [6]:
def get_row_indices_with_sum_zero(X):
    return X.index[(X.sum(axis=1) == 0)].tolist()

In [7]:
def scale_rows(X):
    
    return X.div(X.sum(axis=1), axis=0)

In [8]:
def dataset_generator(data_dir, num_files, y_col):

  for datafile in random.sample(list_files_in_dir(data_dir), num_files):
      data = pd.read_csv(datafile)
      X = data.iloc[:, :96]
      y = (data[y_col]).astype(np.int_)
      X.columns = X.columns.str.replace('[', '').str.replace(']', '').str.replace('>', '')
#           print(X.sum(axis=0))
      X = scale_rows(X)
#           print(X.sum(axis=1))
      yield X, y

In [9]:
def scale_data(df):
    # Define the scaler 
    scaler = StandardScaler().fit(df)
    # Scales each individual row   
    df[df.columns] = scaler.fit_transform(df[df.columns])
    
    return df
    

In [10]:
def train_val_test_generator(data_dir, num_samples, y_col, test_frac=0.1, n_folds = 10):
    val_frac = test_frac/(1.0 - test_frac)
    
    for X, y in dataset_generator(data_dir, num_samples, y_col):
        fold_data = list()
        skf = StratifiedKFold(n_splits=n_folds, shuffle = True)
#         skf = StratifiedKFold(n_splits=num_folds)
        
        for train_val_index, test_index in skf.split(X, y):
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
            X_train, X_val, y_train, y_val = train_test_split(
                X.iloc[train_val_index,:], y.iloc[train_val_index], 
                test_size=val_frac, 
                random_state=random_state, 
                stratify=y.iloc[train_val_index]
            )
            
            #X_train, X_val, X_test = scale_columns(X_train, X_val, X_test)
            fold_data.append(((X_train, y_train), 
                              (X_val, y_val), 
                              (X_test, y_test)))
            
        yield fold_data

In [11]:
def train_val_test_generator_default(data_dir, num_samples, y_col, test_frac=0.1):
    
    val_frac = test_frac/(1.0 - test_frac)
    
    for X, y in dataset_generator(data_dir, num_samples, y_col):
        fold_data = list()
        for train_val_index, test_index in train_test_indices:
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
            X_train, X_val, y_train, y_val = train_test_split(
                X.iloc[train_val_index,:], y.iloc[train_val_index], 
                test_size=val_frac, 
                random_state=random_state, 
                stratify=y.iloc[train_val_index]
            )
#             print(train_val_index, test_index)
            
            #X_train, X_val, X_test = scale_columns(X_train, X_val, X_test)
            fold_data.append(((X_train, y_train), 
                              (X_val, y_val), 
                              (X_test, y_test)))
            
        yield fold_data

In [12]:
def compute_mlp_performance(trial, input_shape=(96,), data_dir=data_dir, n_folds=10, class_weight_0 = 1, class_weight_1 = 1, num_samples = 1, y_col = 'is_sig3_20'):
    aucs = list()
    models = list()
    
    # 60-20-20 split
    test_frac=1.0/float(n_folds)
#     fpr_list = []
#     tpr_list = []
#     roc_auc_list = []
    
    for folds_data in train_val_test_generator(data_dir, num_samples=num_samples, y_col=y_col):
        fold_aucs = list()
        fold_models = list()
        
        for fold_data in folds_data:
            # get data
            (X_train, y_train), (X_val, y_val), (X_test, y_test) = fold_data
            # build model and ensure that parameters passed in are within the normal range
            # if we don't type cast as integers, bayesian optimizer will guess float values
            model = build_mlp_model(input_shape, 
                                    trial.suggest_int('n_hidden_layers', 1,20), 
                                    trial.suggest_int('n_hidden_nodes', 20, 300), 
                                    trial.suggest_categorical("activation", ["relu", "sigmoid", "softmax"]), 
                                    trial.suggest_float('learning_rate', 1e-9, 1e-1),
                                    weight_decay = 0,
                                    l2_kernel = 0,
                                    l2_bias = 0)
            model.fit(X_train, y_train, 
                      validation_data=(X_val, y_val), 
                      epochs=1000, batch_size=32, verbose=0,
                      callbacks=[keras.callbacks.EarlyStopping(monitor='val_auc', patience=20)])
            
            # evaluate
            y_score = model.evaluate(X_test, y_test, verbose=0)[1]
            fold_aucs.append(y_score)
            fold_models.append(model)
        aucs.append(fold_aucs)
        models.append(fold_models)
        
    # Gets median index value for all the different samples (rows)  
    medianIndices = [indices[len(aucs[0])//2] for indices in np.argsort(aucs, axis=1)]
    medianValues = [values[index] for values, index in zip(aucs, medianIndices)]
    
    # Gets the file which contains the median of median value
    fileInd = np.argsort(medianValues)[len(medianValues)//2]
    
    aucs = np.array(aucs)
    
    median_of_median_model = models[fileInd][medianIndices[fileInd]]
    median_of_median_auc = np.median(np.median(aucs, axis=1))
    mad_of_mad_auc = median_absolute_deviation(aucs, axis=1)
#     return median_of_median_auc, mad_of_mad_auc
    return median_of_median_auc



In [14]:
study_is_sig3 = optuna.create_study(direction='maximize')
study_is_sig3.optimize(compute_mlp_performance, n_trials=150)

[32m[I 2022-07-14 11:30:20,005][0m A new study created in memory with name: no-name-e7ce165c-8c58-4d80-b904-c31327427906[0m
[32m[I 2022-07-14 11:30:45,322][0m Trial 0 finished with value: 0.5 and parameters: {'n_hidden_layers': 3, 'n_hidden_nodes': 259, 'activation': 'softmax', 'learning_rate': 0.025736919527866744}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-07-14 11:31:23,614][0m Trial 1 finished with value: 0.5 and parameters: {'n_hidden_layers': 20, 'n_hidden_nodes': 193, 'activation': 'sigmoid', 'learning_rate': 0.031496948003997514}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-07-14 11:31:51,480][0m Trial 2 finished with value: 0.6509235501289368 and parameters: {'n_hidden_layers': 4, 'n_hidden_nodes': 242, 'activation': 'relu', 'learning_rate': 0.03486786389623924}. Best is trial 2 with value: 0.6509235501289368.[0m
[32m[I 2022-07-14 11:32:28,996][0m Trial 3 finished with value: 0.5 and parameters: {'n_hidden_layers': 20, 'n_hidden_nodes': 289, 'activation

[32m[I 2022-07-14 11:45:23,663][0m Trial 32 finished with value: 0.6723501086235046 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 68, 'activation': 'relu', 'learning_rate': 0.09200015466246662}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 11:45:46,404][0m Trial 33 finished with value: 0.5 and parameters: {'n_hidden_layers': 4, 'n_hidden_nodes': 109, 'activation': 'relu', 'learning_rate': 0.0901886553189289}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 11:46:08,442][0m Trial 34 finished with value: 0.6411582827568054 and parameters: {'n_hidden_layers': 3, 'n_hidden_nodes': 81, 'activation': 'relu', 'learning_rate': 0.0804960878992818}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 11:46:32,427][0m Trial 35 finished with value: 0.6787581741809845 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 35, 'activation': 'relu', 'learning_rate': 0.08779351293548358}. Best is trial 14 with val

[32m[I 2022-07-14 11:58:30,397][0m Trial 64 finished with value: 0.6696363389492035 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 117, 'activation': 'relu', 'learning_rate': 0.08273938598480439}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 11:59:00,271][0m Trial 65 finished with value: 0.6554085910320282 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 20, 'activation': 'relu', 'learning_rate': 0.06662398804565119}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 11:59:28,440][0m Trial 66 finished with value: 0.5 and parameters: {'n_hidden_layers': 11, 'n_hidden_nodes': 49, 'activation': 'relu', 'learning_rate': 0.06214895989420262}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 11:59:56,326][0m Trial 67 finished with value: 0.6638462543487549 and parameters: {'n_hidden_layers': 3, 'n_hidden_nodes': 66, 'activation': 'relu', 'learning_rate': 0.07274077966342128}. Best is trial 14 with 

[32m[I 2022-07-14 12:11:48,553][0m Trial 96 finished with value: 0.676470547914505 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 110, 'activation': 'relu', 'learning_rate': 0.07535288828876766}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 12:12:12,160][0m Trial 97 finished with value: 0.6700372099876404 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 116, 'activation': 'relu', 'learning_rate': 0.07850999480709339}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 12:12:36,460][0m Trial 98 finished with value: 0.6415033340454102 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 77, 'activation': 'relu', 'learning_rate': 0.06873575603504913}. Best is trial 14 with value: 0.7137255072593689.[0m
[32m[I 2022-07-14 12:13:06,097][0m Trial 99 finished with value: 0.672577440738678 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 65, 'activation': 'relu', 'learning_rate': 0.06471151806293074}. Best is t

[32m[I 2022-07-14 12:25:56,365][0m Trial 128 finished with value: 0.6886082589626312 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 119, 'activation': 'softmax', 'learning_rate': 0.04916102667958574}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:26:20,456][0m Trial 129 finished with value: 0.5 and parameters: {'n_hidden_layers': 9, 'n_hidden_nodes': 190, 'activation': 'softmax', 'learning_rate': 0.04833490089478498}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:26:38,606][0m Trial 130 finished with value: 0.5 and parameters: {'n_hidden_layers': 3, 'n_hidden_nodes': 121, 'activation': 'softmax', 'learning_rate': 0.028730544703269807}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:27:11,210][0m Trial 131 finished with value: 0.609477162361145 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 115, 'activation': 'softmax', 'learning_rate': 0.00413645261360541}. Best is trial 11

In [15]:
study_is_sig3.optimize(compute_mlp_performance, n_trials=150)

[32m[I 2022-07-14 12:34:50,675][0m Trial 150 finished with value: 0.6198848783969879 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 261, 'activation': 'relu', 'learning_rate': 0.053063599474797024}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:35:19,953][0m Trial 151 finished with value: 0.6887255012989044 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 283, 'activation': 'relu', 'learning_rate': 0.0854453168614925}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:35:48,296][0m Trial 152 finished with value: 0.6287582218647003 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 284, 'activation': 'relu', 'learning_rate': 0.0814079694140132}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:36:21,564][0m Trial 153 finished with value: 0.655430793762207 and parameters: {'n_hidden_layers': 2, 'n_hidden_nodes': 293, 'activation': 'relu', 'learning_rate': 0.09413627181702301}. 

[32m[I 2022-07-14 12:48:02,175][0m Trial 182 finished with value: 0.6780193150043488 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 298, 'activation': 'relu', 'learning_rate': 0.08790079530717204}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:48:26,295][0m Trial 183 finished with value: 0.676679253578186 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 300, 'activation': 'relu', 'learning_rate': 0.08459602366709067}. Best is trial 115 with value: 0.7187482714653015.[0m
[32m[I 2022-07-14 12:48:49,728][0m Trial 184 finished with value: 0.686601310968399 and parameters: {'n_hidden_layers': 1, 'n_hidden_nodes': 300, 'activation': 'relu', 'learning_rate': 0.08661472761685511}. Best is trial 115 with value: 0.7187482714653015.[0m


KeyboardInterrupt: 

In [16]:
study_is_sig3.best_trial

FrozenTrial(number=115, values=[0.7187482714653015], datetime_start=datetime.datetime(2022, 7, 14, 12, 19, 52, 623336), datetime_complete=datetime.datetime(2022, 7, 14, 12, 20, 22, 462128), params={'n_hidden_layers': 1, 'n_hidden_nodes': 134, 'activation': 'relu', 'learning_rate': 0.010608419991241444}, distributions={'n_hidden_layers': IntUniformDistribution(high=20, low=1, step=1), 'n_hidden_nodes': IntUniformDistribution(high=300, low=20, step=1), 'activation': CategoricalDistribution(choices=('relu', 'sigmoid', 'softmax')), 'learning_rate': UniformDistribution(high=0.1, low=1e-09)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=115, state=TrialState.COMPLETE, value=None)