In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
import numpy as np

def make_LSTM(
    X,
    y,
    n_LSTM_layers: int = 1,
    n_units: int = 64,
    dropout: float = 0,
    optimizer: type = Adam,
    learning_rate: float = 0.001,
    loss: str = 'mean_squared_error',
):
    """Build and compile a stacked-LSTM model topped by a Dense output layer.

    Args:
        X: Indexable whose first element is an array with at least three
            dimensions. ``X[0].shape[1]`` and ``X[0].shape[2]`` form the
            per-sample input shape (presumably ``(look_back, n_features)``
            -- confirm against the data-splitting helper).
        y: Indexable whose first element is an array with at least two
            dimensions. ``y[0].shape[1]`` is the number of output units.
        n_LSTM_layers: Number of LSTM layers to stack.
        n_units: Units in the first LSTM layer. After each layer the count is
            halved (integer division) as long as it is greater than 8.
        dropout: Used for both ``dropout`` and ``recurrent_dropout`` of every
            LSTM layer.
        optimizer: Optimizer class (not an instance); it is called as
            ``optimizer(learning_rate=learning_rate)``.
        learning_rate: Learning rate handed to the optimizer.
        loss: Loss identifier forwarded to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model.
    """
    input_shape = (X[0].shape[1], X[0].shape[2])
    output_shape = y[0].shape[1]

    model = Sequential()
    for n_LSTM_layers_left in range(n_LSTM_layers, 0, -1):
        layer_kwargs = dict(
            units=n_units,
            # Every LSTM except the last must emit the full sequence so the
            # next LSTM layer receives 3-D input.
            return_sequences=n_LSTM_layers_left > 1,
            dropout=dropout,
            recurrent_dropout=dropout,
        )
        if n_LSTM_layers_left == n_LSTM_layers:
            # Only the first layer of a Sequential model defines the input
            # shape; deeper layers infer theirs from the previous layer.
            layer_kwargs['input_shape'] = input_shape
        model.add(LSTM(**layer_kwargs))
        n_units = n_units // 2 if n_units > 8 else n_units
    model.add(Dense(units=output_shape))

    # NOTE(review): weighted_metrics=[] is presumably there to keep Keras quiet
    # when sample weights are supplied to fit() -- confirm before removing.
    model.compile(loss=loss, optimizer=optimizer(learning_rate=learning_rate), weighted_metrics=[])

    return model

def fit_LSTM(model, X_train, y_train, X_test, y_test, sw_train, sw_test, epochs: int = 100, batch_size: int = 32, return_history: bool = False):
    """Train ``model`` with early stopping and return a scalar test score.

    The score is a weighted mean of two terms:

    * the test RMSE of ``model.predict(X_test)`` against ``y_test``
      (per-output RMSE, averaged uniformly over outputs), and
    * "HLRMSE": the root of the mean squared gap between training and
      validation loss over the last ``patience`` recorded epochs.

    With the current weights ``[10, 0]`` the second term is switched off, so
    the score equals the test RMSE.

    Args:
        model: Compiled Keras model exposing ``fit`` and ``predict``.
        X_train, y_train: Training inputs and targets.
        X_test, y_test: Hold-out inputs and targets; also used as the
            validation data monitored by early stopping.
        sw_train, sw_test: Sample weights. Accepted for interface
            compatibility but currently unused (the sample-weighting
            experiments were abandoned).
        epochs: Maximum number of training epochs.
        batch_size: Batch size for both training and prediction.
        return_history: If true, also return the Keras history dict.

    Returns:
        ``(test_loss, history_dict)`` when ``return_history`` is true,
        otherwise ``(test_loss, None)``.
    """
    patience = 30
    # Stop once val_loss has not improved by at least min_delta for `patience`
    # epochs, then roll back to the best weights seen.
    callbacks = [EarlyStopping(monitor='val_loss', patience=patience, min_delta=0.1, restore_best_weights=True)]
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
        callbacks=callbacks,
        shuffle=False,  # presumably to keep the samples in their sequential order
    )
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=0)

    # `squared=False` is deprecated/removed in recent scikit-learn releases.
    # Taking the root of the per-output MSE and averaging the roots matches
    # what `squared=False, multioutput='uniform_average'` returns in current
    # versions, and works on old and new releases alike.
    per_output_mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
    test_rmse = np.mean(np.sqrt(per_output_mse))

    # HLRMSE: how far apart training and validation loss were over the last
    # `patience` epochs. Slicing clamps, so shorter histories simply use every
    # recorded epoch.
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    squared_gaps = [(t - v) ** 2 for t, v in zip(train_loss, val_loss)]
    historical_loss_rmse = np.sqrt(np.nanmean(squared_gaps[-patience:]))

    # Arbitrary weights for [test RMSE, HLRMSE]; [10, 0] disables HLRMSE.
    hist_test_weights = np.array([10, 0], dtype=float)
    scores = np.array([test_rmse, historical_loss_rmse], dtype=float)
    # Leave zero-weighted terms out entirely: NaN * 0 is NaN, so a NaN in a
    # disabled term would otherwise poison the whole score.
    active = hist_test_weights != 0
    test_loss = (scores[active] * hist_test_weights[active]).sum() / hist_test_weights[active].sum()

    if return_history:
        return test_loss, history.history
    return test_loss, None

def objective_LSTM(trial):
    """Optuna objective: sample hyperparameters, train one LSTM, return its score.

    Steps:
        1. Sample the data-loading options (de-drifting method and its
           method-specific settings) and load the dataset.
        2. Sample the model/training hyperparameters.
        3. Split the data (with PCA), build the model, fit it and plot the
           training history.

    Args:
        trial: Optuna trial used to draw every hyperparameter.

    Returns:
        The score returned by ``fit_LSTM`` (to be minimised by the study).
    """
    # ---- data-loading options -------------------------------------------
    method = trial.suggest_categorical('dedrifting_method', ['SavGol', 'exp', "none"])
    data_load_params = {'dedrifting_method': method}
    envelope_options = ['multienv', 'topenv']

    if method == 'SavGol':
        # Window length and envelope are tuned; alpha is pinned to 1.
        data_load_params.update(
            window_length=trial.suggest_int('window_length', 150, 500, step=10),
            envelope_choice=trial.suggest_categorical('envelope_choice', envelope_options),
            alpha=1,
        )
    elif method == 'exp':
        # Alpha and envelope are tuned; window length is pinned to 1.
        data_load_params.update(
            alpha=trial.suggest_float('alpha', 0.001, 0.1, log=True),
            window_length=1,
            envelope_choice=trial.suggest_categorical('envelope_choice', envelope_options),
        )
    elif method == 'none':
        # No de-drifting: nothing to tune, every option gets a fixed value.
        data_load_params.update(envelope_choice='none', window_length=1, alpha=1)

    # ---- model / training hyperparameters -------------------------------
    # Drawn one by one, in a fixed order, before being collected in `params`.
    look_back = trial.suggest_int('look_back', 20, 57, log=False)
    n_components = trial.suggest_int('n_components', 15, 150)
    n_units = trial.suggest_int('n_units', 16, 128, log=False)
    dropout = trial.suggest_float('dropout', 0.005, 0.5, log=False)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5, log=True)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])

    params = {
        'look_back': look_back,
        'n_components': n_components,
        'do_PCA': True,        # fixed, not tuned
        'n_LSTM_layers': 1,    # fixed, not tuned
        'n_units': n_units,
        'dropout': dropout,
        'learning_rate': learning_rate,
        'epochs': 150,         # fixed, not tuned
        'batch_size': batch_size,
    }

    # ---- data ------------------------------------------------------------
    # Project helper that loads and preprocesses the data.
    dataset = load_full_dedrifted_dataset(**data_load_params)
    # Project helper: train/test split for sequential data, with optional PCA.
    train_X, test_X, train_y, test_y, train_SW, test_SW = train_test_RNN(
        dataset,
        look_back=params['look_back'],
        n_components=params['n_components'],
        do_PCA=params['do_PCA'],
        start=8,
    )

    # ---- model -----------------------------------------------------------
    model = make_LSTM(
        train_X,
        train_y,
        n_LSTM_layers=params['n_LSTM_layers'],
        n_units=params['n_units'],
        dropout=params['dropout'],
        optimizer=Adam,
        learning_rate=params['learning_rate'],
        loss='mean_squared_error',
    )

    # Only the first element of each split is handed to the fit routine.
    score, training_history = fit_LSTM(
        model,
        train_X[0], train_y[0],
        test_X[0], test_y[0],
        train_SW, test_SW,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        return_history=True,
    )
    # Project helper that plots the training results.
    plot_history(training_history, params)

    return score

import optuna
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): the objective in this file does not appear to call
# trial.report() / trial.should_prune(), so this pruner presumably never
# triggers -- confirm. (SuccessiveHalvingPruner and a MedianPruner with
# n_warmup_steps=10 were tried as alternatives.)
median_pruner = optuna.pruners.MedianPruner(
    n_startup_trials=10,
    n_warmup_steps=0,
    interval_steps=1,
    n_min_trials=1,
)

# 'minimize' for regression losses; use 'maximize' for classification scores.
study = optuna.create_study(direction='minimize', pruner=median_pruner)

# Run until either 10000 trials or 8 hours have elapsed, whichever comes
# first, using all available cores (n_jobs=-1).
time_budget_s = 8*60*60
study.optimize(
    objective_LSTM,
    n_trials=10000,
    timeout=time_budget_s,
    n_jobs=-1,
    show_progress_bar=False,
)