In [4]:
import numpy as np
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import MaxPooling1D, Conv1D, Flatten

def filter_set_by_base_quality(data, quality_threshold=0.05, drop_base_quality=True,
                               quality_row=4, center_index=7):
    """Keep only the samples whose base-quality value exceeds a threshold.

    The quality value of sample ``i`` is read from
    ``features[i, quality_row, center_index]``.

    Args:
        data: Mapping-like object (a dict or a loaded ``.npz`` archive) that
            provides a 3-D ``'features'`` array and a ``'labels'`` array whose
            first axis lines up with the samples in ``features``.
        quality_threshold: A sample is kept only when its quality value is
            strictly greater than this number.
        drop_base_quality: When True, row ``quality_row`` is removed from
            axis 1 of the returned features.
        quality_row: Index along axis 1 holding the base-quality values.
        center_index: Index along axis 2 at which the quality is inspected
            (presumably the centre of the window -- confirm against the data).

    Returns:
        Tuple ``(features, labels)`` restricted to the samples that passed.
        When nothing passes, both arrays are empty along axis 0.
    """
    features = data['features']
    labels = data['labels']

    # A boolean mask replaces the Python loop that built an index list. It is
    # vectorised and still works when no sample passes; indexing with an empty
    # (float64) index array raises IndexError.
    keep = features[:, quality_row, center_index] > quality_threshold

    kept_features = features[keep]
    kept_labels = labels[keep]

    if drop_base_quality:
        kept_features = np.delete(kept_features, quality_row, axis=1)

    return kept_features, kept_labels

def set_num_nucleotides(features, num = 7, max_num = 7):
    """Trim the same number of positions from both ends of axis 2.

    ``max_num - num`` entries are removed from the start and from the end of
    the last axis. If that axis holds ``2 * max_num + 1`` positions, the result
    keeps the centre position plus ``num`` positions on each side.

    Args:
        features: Array with at least three dimensions; axis 2 is trimmed.
        num: Number of flanking positions to keep on each side.
        max_num: Number of flanking positions present in ``features``.

    Returns:
        A view of ``features`` sliced along axis 2.

    Raises:
        ValueError: If ``num`` is negative or greater than ``max_num``.
            Previously an oversized ``num`` printed a message and returned
            ``None``, which only failed later at the caller.
    """
    if num > max_num:
        raise ValueError(f'Cannot have more than {max_num} nucleotides.')
    if num < 0:
        raise ValueError('Number of nucleotides cannot be negative.')

    num_to_remove = max_num - num
    return features[:, :, num_to_remove:features.shape[2] - num_to_remove]

def lstm_hybrid_run(trial):
    """Optuna objective for the LSTM + Conv1D hybrid binary classifier.

    Samples ``num_nucleotides`` (1-7) from ``trial``, builds the training and
    validation sets with ``filter_set_by_base_quality`` and
    ``set_num_nucleotides``, then trains the network with ten single-epoch
    ``fit`` calls, printing the metrics after each one.

    Args:
        trial: Optuna trial used to sample the hyperparameter.

    Returns:
        float: Validation accuracy measured after the last epoch.
    """
    num_nucleotides = trial.suggest_int("num_nucleotides", 1, 7)

    # np.load on an .npz returns a lazily-read archive that keeps its file
    # handle open; the context manager closes it once the arrays are extracted.
    with np.load('../data/ml_data/HG002_2_3_00_400k_train.npz') as train_set:
        train_features, y_train = filter_set_by_base_quality(train_set)
    with np.load('../data/ml_data/HG002_2_3_00_400k_val.npz') as val_set:
        val_features, y_val = filter_set_by_base_quality(val_set)

    # Swap axes 1 and 2 so the window positions become the time axis that the
    # Keras sequence layers expect: (samples, timesteps, channels).
    X_train = set_num_nucleotides(train_features, num_nucleotides).transpose((0, 2, 1))
    X_val = set_num_nucleotides(val_features, num_nucleotides).transpose((0, 2, 1))

    # Window = centre position + num_nucleotides on each side.
    # Assumes 5 channels remain after the quality row is dropped -- TODO confirm.
    full_window = num_nucleotides * 2 + 1
    input_shape = (full_window, 5)

    # NOTE(review): with num_nucleotides=1 the window is 3, and two
    # pool_size=2 poolings with the default 'valid' padding shrink it
    # 3 -> 1 -> 0, so that trial looks likely to fail at model build.
    # Verify, and consider raising the lower bound of the search range.
    model = Sequential()

    # lstm layer (keeps the full sequence so the conv layers can slide over it)
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))

    # convolutional layers & max pooling
    model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(64, kernel_size=3, activation='relu', padding='same'))
    model.add(MaxPooling1D(pool_size=2))

    # flatten
    model.add(Flatten())

    # dense/output layer
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 10

    for epoch in range(epochs):
        # Each fit call runs a single epoch, so every metric list has one entry.
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32)
        val_accuracy = history.history['val_accuracy'][0]

        print(f"Epoch {epoch + 1}/{epochs} - "
            f"Train Loss: {history.history['loss'][0]:.4f}, "
            f"Train Acc: {history.history['accuracy'][0]:.4f}, "
            f"Val Loss: {history.history['val_loss'][0]:.4f}, "
            f"Val Acc: {history.history['val_accuracy'][0]:.4f}")

    # Return a scalar (not the one-element list stored per epoch) so the
    # study receives a plain float objective value.
    return float(val_accuracy)

In [3]:
# Search num_nucleotides for the hybrid model: 10 trials, maximising the
# validation accuracy returned by lstm_hybrid_run.
print("Training the LSTM-CNN hybrid model.")

study = optuna.create_study(direction="maximize")
study.optimize(func=lstm_hybrid_run, n_trials=10)

# Slice plot: objective value against each sampled hyperparameter.
fig = optuna.visualization.plot_slice(study=study)
fig.show()

[I 2023-09-21 11:53:45,572] A new study created in memory with name: no-name-56fa272e-832d-45bc-bbce-95f8f6d52eba


Training the LSTM-CNN hybrid model.


[W 2023-09-21 11:53:49,374] Trial 0 failed with parameters: {'num_nucleotides': 7} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\mmm\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\mmm\AppData\Local\Temp\ipykernel_18692\2064583559.py", line 88, in lstm_hybrid_run
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32)
  File "c:\Users\mmm\anaconda3\lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs)
  File "c:\Users\mmm\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1742, in fit
    tmp_logs = self.train_function(iterator)
  File "c:\Users\mmm\anaconda3\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "c:\Users\mmm\anaconda3\lib\site-packages\tensorflow\python\e

In [10]:
"""
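Hyperparameters to optimize (reference values only: `objective` below samples
each one from a range spanning these values, and additionally tunes
`num_nucleotides` in [1, 7]):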
num_hidden_layers = [1]
num_units = [32, 64, 128]
dropout_rates = [0.0, 0.1, 0.2]
activation_functions = ['relu', 'tanh']
learning_rates = [0.05, 0.1, 0.15]
momentum_values = [0.0, 0.1, 0.5, 0.9]
batch_sizes = [32, 64, 128]
"""

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM, Dense, Dropout
import optuna

def objective(trial):
    """Optuna objective for the LSTM-only binary classifier.

    Samples the window size, LSTM width, dropout rate, LSTM activation, Adam
    learning rate / ``beta_1`` and batch size from ``trial``, trains for five
    single-epoch ``fit`` calls and prints the validation accuracy after each.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: Validation accuracy measured after the last epoch.
    """
    epochs = 5

    # hyperparameter tuning
    num_hidden = 1
    num_nucleotides = trial.suggest_int("num_nucleotides", 1, 7)
    num_unit = trial.suggest_int("num_unit", 32, 128)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.2)
    activation_function = trial.suggest_categorical("activation_function", ['relu', 'tanh'])
    # NOTE(review): Adam's default learning rate is 1e-3; this range is 50-150x
    # larger, which may explain trials that collapse to a constant accuracy.
    # Confirm the range is intended.
    learning_rate = trial.suggest_float("learning_rate", 0.05, 0.15)
    momentum = trial.suggest_float("momentum_values", 0.0, 0.9)
    batch_size = trial.suggest_int("batch_size", 32, 128)

    # np.load on an .npz returns a lazily-read archive that keeps its file
    # handle open; the context manager closes it once the arrays are extracted.
    with np.load('../data/ml_data/HG002_2_3_00_400k_train.npz') as train_set:
        train_features, y_train = filter_set_by_base_quality(train_set)
    with np.load('../data/ml_data/HG002_2_3_00_400k_val.npz') as val_set:
        val_features, y_val = filter_set_by_base_quality(val_set)

    # Swap axes 1 and 2 so the window positions become the time axis that the
    # Keras sequence layers expect: (samples, timesteps, channels).
    X_train = set_num_nucleotides(train_features, num_nucleotides).transpose((0, 2, 1))
    X_val = set_num_nucleotides(val_features, num_nucleotides).transpose((0, 2, 1))

    # Window = centre position + num_nucleotides on each side.
    # Assumes 5 channels remain after the quality row is dropped -- TODO confirm.
    full_window = num_nucleotides * 2 + 1
    input_shape = (full_window, 5)

    model = Sequential()
    for layer_idx in range(num_hidden):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if layer_idx == 0 else {}
        model.add(LSTM(
            num_unit,
            # The sampled activation was previously never passed to the model,
            # so the hyperparameter had no effect on the trial.
            activation=activation_function,
            # Every LSTM except the last must emit the whole sequence for the
            # next LSTM; the last emits only its final state for the Dense head.
            return_sequences=layer_idx < num_hidden - 1,
            **first_layer_kwargs,
        ))
        model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation='sigmoid'))

    # The sampled "momentum" is applied as Adam's first-moment decay (beta_1).
    optimizer = Adam(learning_rate=learning_rate, beta_1=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    for epoch in range(epochs):
        # Each fit call runs a single epoch, so the history holds one value.
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=batch_size, verbose=0)
        val_accuracy = history.history['val_accuracy'][-1]

        print(
            f"Epoch {epoch + 1}/{epochs}, " 
            f"Val Acc: {val_accuracy:.4f}")

    return float(val_accuracy)
            


In [11]:
# Run a 10-trial Optuna search over `objective`, maximising validation
# accuracy. `objective` builds the LSTM-only model (no convolutional layers),
# so the progress message names that model rather than the hybrid one.
print("Training the LSTM model.")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Slice plot: objective value against each sampled hyperparameter.
fig = optuna.visualization.plot_slice(study)
fig.show()

[I 2023-09-21 12:16:41,396] A new study created in memory with name: no-name-8183a1c0-4091-4967-a49a-f463a5f21794


Training the LSTM-CNN hybrid model.
Epoch 1/5, Val Acc: 0.6675
Epoch 2/5, Val Acc: 0.6507
Epoch 3/5, Val Acc: 0.6558
Epoch 4/5, Val Acc: 0.6734


[I 2023-09-21 12:19:02,532] Trial 0 finished with value: 0.44846463203430176 and parameters: {'num_nucleotides': 6, 'num_unit': 71, 'dropout_rate': 0.19606921742184083, 'activation_function': 'tanh', 'learning_rate': 0.12283255202157496, 'momentum_values': 0.0873671823944049, 'batch_size': 69}. Best is trial 0 with value: 0.44846463203430176.


Epoch 5/5, Val Acc: 0.4485
Epoch 1/5, Val Acc: 0.6348
Epoch 2/5, Val Acc: 0.6350
Epoch 3/5, Val Acc: 0.6354
Epoch 4/5, Val Acc: 0.6352


[I 2023-09-21 12:20:10,166] Trial 1 finished with value: 0.6345345973968506 and parameters: {'num_nucleotides': 2, 'num_unit': 42, 'dropout_rate': 0.12961859456463837, 'activation_function': 'relu', 'learning_rate': 0.059628786122183974, 'momentum_values': 0.1259470191293182, 'batch_size': 81}. Best is trial 1 with value: 0.6345345973968506.


Epoch 5/5, Val Acc: 0.6345
Epoch 1/5, Val Acc: 0.5520
Epoch 2/5, Val Acc: 0.6528
Epoch 3/5, Val Acc: 0.4485
Epoch 4/5, Val Acc: 0.5726


[I 2023-09-21 12:23:35,688] Trial 2 finished with value: 0.44848862290382385 and parameters: {'num_nucleotides': 6, 'num_unit': 79, 'dropout_rate': 0.08082566261212867, 'activation_function': 'tanh', 'learning_rate': 0.14803323126868423, 'momentum_values': 0.5696618897571414, 'batch_size': 68}. Best is trial 1 with value: 0.6345345973968506.


Epoch 5/5, Val Acc: 0.4485
Epoch 1/5, Val Acc: 0.6840
Epoch 2/5, Val Acc: 0.6814
Epoch 3/5, Val Acc: 0.6549
Epoch 4/5, Val Acc: 0.6798


[I 2023-09-21 12:28:58,416] Trial 3 finished with value: 0.6504525542259216 and parameters: {'num_nucleotides': 7, 'num_unit': 102, 'dropout_rate': 0.05139723858068071, 'activation_function': 'tanh', 'learning_rate': 0.08619968940653848, 'momentum_values': 0.46550197083747064, 'batch_size': 67}. Best is trial 3 with value: 0.6504525542259216.


Epoch 5/5, Val Acc: 0.6505
Epoch 1/5, Val Acc: 0.6243
Epoch 2/5, Val Acc: 0.6240
Epoch 3/5, Val Acc: 0.6239
Epoch 4/5, Val Acc: 0.6232


[I 2023-09-21 12:29:53,250] Trial 4 finished with value: 0.6230102777481079 and parameters: {'num_nucleotides': 1, 'num_unit': 50, 'dropout_rate': 0.048200551435453036, 'activation_function': 'tanh', 'learning_rate': 0.08835721346607256, 'momentum_values': 0.5449804479171821, 'batch_size': 128}. Best is trial 3 with value: 0.6504525542259216.


Epoch 5/5, Val Acc: 0.6230
Epoch 1/5, Val Acc: 0.6450
Epoch 2/5, Val Acc: 0.4485
Epoch 3/5, Val Acc: 0.5879
Epoch 4/5, Val Acc: 0.5821


[I 2023-09-21 12:34:55,064] Trial 5 finished with value: 0.45127367973327637 and parameters: {'num_nucleotides': 6, 'num_unit': 116, 'dropout_rate': 0.19314120274170113, 'activation_function': 'relu', 'learning_rate': 0.08787995946260105, 'momentum_values': 0.29943719320901824, 'batch_size': 56}. Best is trial 3 with value: 0.6504525542259216.


Epoch 5/5, Val Acc: 0.4513
Epoch 1/5, Val Acc: 0.6839
Epoch 2/5, Val Acc: 0.6850
Epoch 3/5, Val Acc: 0.6593
Epoch 4/5, Val Acc: 0.6729


[I 2023-09-21 12:38:01,496] Trial 6 finished with value: 0.6795035004615784 and parameters: {'num_nucleotides': 7, 'num_unit': 43, 'dropout_rate': 0.061032036369686175, 'activation_function': 'tanh', 'learning_rate': 0.12786706519673818, 'momentum_values': 0.6093418450790119, 'batch_size': 52}. Best is trial 6 with value: 0.6795035004615784.


Epoch 5/5, Val Acc: 0.6795
Epoch 1/5, Val Acc: 0.4485
Epoch 2/5, Val Acc: 0.5515
Epoch 3/5, Val Acc: 0.4485
Epoch 4/5, Val Acc: 0.5515


[I 2023-09-21 12:40:27,751] Trial 7 finished with value: 0.44848862290382385 and parameters: {'num_nucleotides': 3, 'num_unit': 115, 'dropout_rate': 0.031950165722858026, 'activation_function': 'tanh', 'learning_rate': 0.13522953379898509, 'momentum_values': 0.7864917224204658, 'batch_size': 109}. Best is trial 6 with value: 0.6795035004615784.


Epoch 5/5, Val Acc: 0.4485
Epoch 1/5, Val Acc: 0.6460
Epoch 2/5, Val Acc: 0.6470
Epoch 3/5, Val Acc: 0.6435
Epoch 4/5, Val Acc: 0.5515


[I 2023-09-21 12:43:12,940] Trial 8 finished with value: 0.6114619374275208 and parameters: {'num_nucleotides': 3, 'num_unit': 122, 'dropout_rate': 0.07603006096807197, 'activation_function': 'relu', 'learning_rate': 0.13386449526643612, 'momentum_values': 0.7327007311789797, 'batch_size': 89}. Best is trial 6 with value: 0.6795035004615784.


Epoch 5/5, Val Acc: 0.6115
Epoch 1/5, Val Acc: 0.6845
Epoch 2/5, Val Acc: 0.6815
Epoch 3/5, Val Acc: 0.6842
Epoch 4/5, Val Acc: 0.6848


[I 2023-09-21 12:45:31,434] Trial 9 finished with value: 0.6842572689056396 and parameters: {'num_nucleotides': 7, 'num_unit': 39, 'dropout_rate': 0.17692511579391315, 'activation_function': 'relu', 'learning_rate': 0.132139087064944, 'momentum_values': 0.42337716426124655, 'batch_size': 87}. Best is trial 9 with value: 0.6842572689056396.


Epoch 5/5, Val Acc: 0.6843


In [12]:
# Rank the tuned hyperparameters by their estimated influence on the objective.
fig = optuna.visualization.plot_param_importances(study=study)
fig.show()

In [31]:
# Report the best objective value, then display the winning hyperparameters
# as the cell's rich output.
print('Best trial', f"accuracy: {study.best_trial.value}", sep='\n')
study.best_trial.params

Best trial
accuracy: 0.6842572689056396


{'num_nucleotides': 7,
 'num_unit': 39,
 'dropout_rate': 0.17692511579391315,
 'activation_function': 'relu',
 'learning_rate': 0.132139087064944,
 'momentum_values': 0.42337716426124655,
 'batch_size': 87}