# **This notebook groups the full execution of RNN models**

# **Install and import libraries**

In [None]:
!pip3 install mlflow

**Preparing Databricks environment** 

In [None]:
# Run the Databricks CLI `configure` command against the Community Edition host.
# NOTE(review): this command presumably prompts for credentials interactively — confirm
# before running the notebook unattended.
!databricks configure --host https://community.cloud.databricks.com/

**Initializing the experiment on Databricks using mlflow**

In [None]:
import mlflow
# Use the "databricks" tracking URI for all subsequent MLflow tracking calls.
mlflow.set_tracking_uri("databricks")
# Select the experiment that the runs started later in this notebook are recorded under.
# NOTE(review): the user part of this path looks like a placeholder — replace it with the
# real workspace user before running.
mlflow.set_experiment("/Users/username@entity.ex/Experiment")

# **Data Exploration**

**Importing the libraries we will be working with.**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras import Sequential, layers, callbacks
from tensorflow.keras.layers import Dense, Dropout, GRU, LSTM, Bidirectional

import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

## **Load data**

**We will be working with Apple's dataset, covering the period from 17 March 2015 to 30 March 2021.**

**Note:** The dataset has been shifted already.

In [None]:
# Load the dataset; the first CSV column is parsed as dates and used as the index.
# `squeeze=True` was dropped: it only affects single-column files (this one has many
# columns) and the keyword no longer exists in pandas >= 2.0, where it raises TypeError.
df = pd.read_csv('new_apple_f.csv', header=0, parse_dates=[0], index_col=0)
df

Unnamed: 0_level_0,average_true_range40,ema_indicator40,ema_indicator25,sma_indicator50,sma_indicator45,open,ema_indicator50,ema_indicator10,acc_dist_index,adj_close,sma_indicator10,average_true_range45,sma_indicator40,high,low,ema_indicator15,average_true_range25,average_true_range35,ema_indicator30,daily_return,ema_indicator35,sma_indicator15,ema_indicator20,sma_indicator20,sma_indicator30,average_true_range50,average_true_range30,ema_indicator45,close,sma_indicator25,sma_indicator5,ema_indicator5,cumulative_return,sma_indicator35
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
2015-03-17,0.699112,30.724633,31.251490,30.00315,30.313722,31.475000,30.396444,31.432506,4.093498e+07,28.953447,31.386750,0.698566,30.714937,31.830000,31.412500,31.465652,0.691004,0.698556,31.081118,1.672672,30.901729,31.693166,31.391584,31.917875,31.487500,0.699026,0.696796,30.555396,32.117500,31.817000,31.113499,31.355336,16.198665,31.126000
2015-03-18,0.699072,30.792578,31.318106,30.11425,30.420500,31.750000,30.463936,31.557051,1.728797e+08,29.279362,31.385000,0.698542,30.838375,32.290001,31.592501,31.547133,0.691264,0.698526,31.147981,1.125630,30.969272,31.687833,31.460719,31.914750,31.569333,0.698996,0.696819,30.623313,31.875000,31.881500,31.424999,31.609391,17.506631,31.264071
2015-03-19,0.693158,30.845379,31.360944,30.22045,30.516500,32.187500,30.519272,31.614860,9.451222e+06,29.058285,31.412250,0.693297,30.950562,32.312500,31.850000,31.588116,0.682114,0.691782,31.194886,-0.755040,31.019590,31.639166,31.500175,31.902875,31.635500,0.694266,0.689009,30.677735,31.475000,31.907700,31.577500,31.697927,16.619409,31.351143
2015-03-20,0.696079,30.876092,31.369718,30.31120,30.605944,32.062500,30.556752,31.589431,-1.398124e+08,28.693634,31.394750,0.695890,31.034937,32.099998,31.290001,31.573977,0.687229,0.695160,31.212958,-1.254902,31.044891,31.596500,31.497777,31.857875,31.685166,0.696580,0.693042,30.712398,31.802500,31.902100,31.693000,31.623618,15.155950,31.401143
2015-03-23,0.690864,30.921283,31.403009,30.38780,30.719222,31.780001,30.605605,31.628170,-1.341414e+08,28.992193,31.396500,0.691259,31.123875,31.962500,31.629999,31.602542,0.679240,0.689227,31.250993,1.040508,31.086980,31.565166,31.526798,31.785500,31.754166,0.692399,0.686190,30.759794,31.672501,31.903400,31.806000,31.683245,16.354157,31.472928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-24,3.694247,125.380130,123.938653,128.91260,128.934666,122.820000,125.770880,122.013795,8.195381e+09,119.886360,122.384999,3.687888,127.700250,122.900002,120.070000,122.547871,3.690351,3.697411,124.541857,-1.999351,125.022664,121.521999,123.243753,122.002499,124.892000,3.678568,3.696772,125.624960,120.589996,123.073600,121.307999,121.617514,339.367039,126.507714
2021-03-25,3.668391,125.146465,123.681064,128.74840,128.680444,119.540001,125.567708,121.754923,8.214703e+09,120.385513,122.247999,3.665046,127.163500,121.660004,119.000000,122.303136,3.649137,3.667771,124.286899,0.416354,124.776404,121.552666,122.991014,121.982499,124.398666,3.658196,3.662213,125.406049,121.209999,122.708799,121.319998,121.275008,341.196363,126.126285
2021-03-26,3.640681,124.954442,123.490982,128.55480,128.332444,120.349998,125.396817,121.655846,8.288931e+09,121.004463,122.265999,3.640490,126.766500,121.480003,118.919998,122.166494,3.605572,3.636121,124.088389,0.514141,124.578271,121.538666,122.821394,121.979999,123.934666,3.636233,3.625473,125.223612,121.389999,122.362399,121.563998,121.253338,343.464736,125.663999
2021-03-29,3.595914,124.780567,123.329368,128.40440,127.939555,121.650002,125.239687,121.607510,8.265777e+09,121.184158,122.005999,3.600701,126.502249,122.580002,120.730003,122.069432,3.535349,3.585089,123.914299,0.148503,124.401145,121.873999,122.685070,121.659999,123.468666,3.600508,3.566290,125.056933,119.900002,122.177999,121.163998,121.298892,344.123293,125.224857


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1521, 34)

# **Get and prepare the Data**

In [None]:
# split a multivariate dataset into samples to get Xtrain and ytrain | Xtest and ytest
def split_data(X_dataset, y_dataset, n_steps_in):
    """
    Cut a multivariate series into overlapping windows suited to RNN models.

    Window ``i`` holds rows ``i .. i + n_steps_in - 1`` of ``X_dataset``; its target is
    the row of ``y_dataset`` aligned with the LAST row of that window.

    Example with n_steps_in = 3:
        X_dataset = [[10, 15],        y_dataset = [[ 25],
                     [20, 25],                     [ 45],
                     [30, 35],                     [ 65],
                     [40, 45],                     [ 85],
                     [50, 55],                     [105],
                     [60, 65]]                     [125]]

        X[0] = [[10, 15], [20, 25], [30, 35]]
        y[0] = [[65]]

    Arguments:
    - X_dataset: 2-D array, one row per time step and one column per feature.
    - y_dataset: array with one entry (row) per time step, aligned with X_dataset.
    - n_steps_in: number of consecutive rows in each window (must be >= 1).

    Returns:
    - X: array of shape (n_windows, n_steps_in, n_features).
    - y: array with one single-row slice per window, so ``y[i][0]`` is the target of
      window ``i``. For a (n_rows, 1) ``y_dataset`` the shape is (n_windows, 1, 1).
      Both arrays are empty when the dataset is shorter than one window.

    Raises:
    - ValueError: if n_steps_in is smaller than 1.
    """
    if n_steps_in < 1:
        raise ValueError("n_steps_in must be at least 1, got %r" % (n_steps_in,))

    X, y = [], []
    # Only start positions that leave room for a complete window are visited.
    for start_ix in range(len(X_dataset) - n_steps_in + 1):
        end_ix = start_ix + n_steps_in

        X.append(X_dataset[start_ix:end_ix, :])
        # Keep just the row aligned with the window's last step. Appending the whole
        # tail (``y_dataset[end_ix-1:]``) produced a ragged list: quadratic memory and a
        # ValueError from np.array() on recent NumPy versions. The one-row slice keeps
        # ``y[i][0]`` identical for callers.
        y.append(y_dataset[end_ix - 1:end_ix])

    return np.array(X), np.array(y)

In [None]:
def get_data(full_dataset, cols_list, input_scaler, output_scaler, n_steps, percentage):
    """
    Build the scaled, windowed train/test arrays consumed by the RNN models
    (LSTM, Bi-LSTM, GRU and Bi-GRU).

    The rows are split by position: the first `percentage` share goes to training and
    the remainder to testing. Both scalers are fitted on the training part only and then
    applied to both parts. Finally split_data() cuts each part into windows of
    `n_steps` rows.

    Arguments:
    - full_dataset: full dataset with all features and the target column 'close'.
    - cols_list: list of feature column names to use as model input.
    - input_scaler, output_scaler: scalers for the feature set and for the target,
      respectively (objects exposing fit() and transform()).
    - n_steps: window size for the prediction models.
    - percentage: fraction of the rows assigned to the training set.

    Returns:
    - Xtrain, ytrain: windowed training features and their targets.
    - Xtest, ytest: windowed test features and their targets.
    - output_scaler: the scaler fitted on the training target; use it to invert the
      scaling of the model's predictions.
    """

    # Target first, then the selected feature columns
    target_frame = full_dataset[['close']]
    feature_frame = full_dataset[cols_list]

    # Row position separating the training rows from the test rows
    cut = int(len(target_frame) * percentage)

    train_features = feature_frame.iloc[:cut].values
    train_target = target_frame.iloc[:cut].values
    test_features = feature_frame.iloc[cut:].values
    test_target = target_frame.iloc[cut:].values

    # Features: fit on the training rows only, then scale both parts
    input_scaler.fit(train_features)
    scaled_train_X = input_scaler.transform(train_features)
    scaled_test_X = input_scaler.transform(test_features)

    # Target: same procedure, on a single-column view of the values
    train_target_col = train_target.reshape(-1, 1)
    output_scaler.fit(train_target_col)
    scaled_train_y = output_scaler.transform(train_target_col)
    scaled_test_y = output_scaler.transform(test_target.reshape(-1, 1))

    # Window each part so it has the shape the RNN layers expect
    Xtrain, ytrain = split_data(scaled_train_X, scaled_train_y, n_steps)
    Xtest, ytest = split_data(scaled_test_X, scaled_test_y, n_steps)

    return Xtrain, ytrain, Xtest, ytest, output_scaler

# **Modeling**

In [None]:
def Training(model_name, Xtrain, ytrain, Xtest, ytest, n_steps, Epochs, Batch_size, use_dropout = False):
    """
    Build the requested two-layer recurrent network and train it.

    Every architecture has the same layout: a 50-unit recurrent layer returning the full
    sequence, a 100-unit recurrent layer returning only its last output, and a final
    Dense(1) output. "Bi" variants wrap both recurrent layers in Bidirectional. When
    dropout is enabled, a Dropout(0.2) layer follows each recurrent layer.

    Arguments:
    - model_name: name from the list ["LSTM", "BiLSTM", "GRU", "BiGRU"]
    - Xtrain, ytrain: the training data and its corresponding target values.
    - Xtest, ytest: the test data and its corresponding target values; they are passed
      to fit() as validation data, so early stopping is driven by them.
    - n_steps: window size for the prediction models.
    - Epochs: maximum number of epochs.
    - Batch_size: the chosen batch size.
    - use_dropout: boolean indicating the use of Dropout in the architecture.

    Returns:
    The trained model, and its history after training.
    """

    # Make sure the model name is one of the available models
    assert model_name in ["LSTM", "BiLSTM", "GRU", "BiGRU"]

    # Number of features = size of the last axis of the windowed input
    n_features = Xtrain.shape[2]

    # Resolve the architecture once instead of repeating one branch per model name
    recurrent_cls = LSTM if model_name in ("LSTM", "BiLSTM") else GRU
    wrap_bidirectional = model_name in ("BiLSTM", "BiGRU")
    dropout_enabled = (use_dropout == True)

    def make_recurrent(units, **layer_kwargs):
        # Plain or bidirectional recurrent layer, depending on the requested model
        layer = recurrent_cls(units, **layer_kwargs)
        if wrap_bidirectional:
            return Bidirectional(layer)
        return layer

    model = Sequential()

    # 1st hidden layer
    model.add(make_recurrent(50, return_sequences = True, input_shape = (n_steps, n_features)))
    if dropout_enabled:
        model.add(Dropout(0.2))

    # 2nd hidden layer
    model.add(make_recurrent(100, return_sequences = False))
    if dropout_enabled:
        model.add(Dropout(0.2))

    # Output layer: one predicted value per window
    model.add(Dense(1))
    model.compile(loss = 'mse', optimizer = 'adam', metrics = ['mse', 'mae', 'mape'])

    # Stop once 'val_loss' (mse) has not improved for 10 consecutive epochs
    early_stop = callbacks.EarlyStopping(monitor = 'val_loss', patience = 10)

    history = model.fit(
        Xtrain, ytrain,
        epochs = Epochs,
        batch_size = Batch_size,
        validation_data = (Xtest, ytest),
        verbose = 0,
        callbacks = [early_stop],
    )

    return model, history

In [None]:
# Make predictions
def prediction(model, Xtest, outscaler):
    """
    Predict on the test set and map the result back to the original target scale.

    Arguments:
    - model: the trained model returned by the Training() function.
    - Xtest: the test dataset returned by the get_data() function.
    - outscaler: the scaler of the target variable, returned by the get_data() function.

    Returns: the model's predictions after inverting the target scaling.
    """
    scaled_predictions = model.predict(Xtest)
    return outscaler.inverse_transform(scaled_predictions)

In [None]:
def model_DoItAll(model_name, full_dataset, cols_list, n_steps, Epochs, Batch_size, use_dropout, percentage):
    """
    Run the whole pipeline for one configuration: prepare the data, train, predict.

    Arguments:
    - model_name: name from the list ["LSTM", "BiLSTM", "GRU", "BiGRU"]
    - full_dataset: full dataset with all features and target variable.
    - cols_list: list of features.
    - n_steps: window size for the prediction models.
    - Epochs: number of epochs for the model.
    - Batch_size: the chosen batch size for the model.
    - use_dropout: boolean indicating the use of Dropout in the architecture.
    - percentage: splitting percentage of the dataset.

    Returns:
    - The test targets and the predictions, both mapped back to the original scale.
    - The trained model and its training history
      (returned so they can be saved on mlflow by the caller).
    """

    # Make sure the model name is one of the available models
    assert model_name in ["LSTM", "BiLSTM", "GRU", "BiGRU"]

    # One fresh scaler for the features and one for the target
    from sklearn.preprocessing import MinMaxScaler
    feature_scaler, target_scaler = MinMaxScaler(), MinMaxScaler()

    # Scaled and windowed train/test data, plus the fitted target scaler
    Xtrain, ytrain, Xtest, ytest, fitted_target_scaler = get_data(
        full_dataset, cols_list, feature_scaler, target_scaler, n_steps, percentage
    )

    # Keep only the first entry of every target window
    ytrain = np.array([window[0] for window in ytrain])
    ytest = np.array([window[0] for window in ytest])

    # Train the model
    model, model_history = Training(
        model_name, Xtrain, ytrain, Xtest, ytest, n_steps, Epochs, Batch_size, use_dropout
    )

    # Predict, then bring the test targets back to the original scale as well
    ypred = prediction(model, Xtest, fitted_target_scaler)
    ytest = fitted_target_scaler.inverse_transform(ytest)

    return ytest, ypred, model, model_history

In [None]:
def evaluate(y_test, y_pred):
    """
    Evaluate the performance of the model.

    Arguments:
    - y_test: test data of the target variable.
    - y_pred: predictions made by the model.

    Returns: model performance as the tuple (RMSE, MAPE, MAE):
    - RMSE: Root Mean Squared Error.
    - MAPE: Mean Absolute Percentage Error, expressed in percent.
    - MAE: Mean Absolute Error.
    """

    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    # For a single target column (the case in this notebook) the result is identical.
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    # scikit-learn returns MAPE as a fraction; scale it to percent.
    MAPE = mean_absolute_percentage_error(y_test, y_pred) * 100
    MAE = mean_absolute_error(y_test, y_pred)

    return RMSE, MAPE, MAE

In [None]:
def turn_to_dataframe(model_name, dataset, ytest, ypred, percentage, step):
    """
    Wrap the test targets and the predictions in DataFrames indexed like the test rows
    (used when creating plots).

    The index is taken from `dataset`: the first `step - 1` rows are dropped, then the
    rows from position int(len(dataset) * percentage) onwards are kept.

    Arguments:
    - model_name: name from the list ["LSTM", "BiLSTM", "GRU", "BiGRU"]; used as the
      prefix of the predictions column.
    - dataset: full dataset containing the 'close' column.
    - ytest, ypred: test targets and model predictions.
    - percentage: splitting percentage of the dataset.
    - step: window size used by the model.

    Returns: (ytest_df, ypred_df) with columns 'test_close' and
    model_name + 'predictions' respectively.
    """

    # Make sure the model name is one of the available models
    assert model_name in ["LSTM", "BiLSTM", "GRU", "BiGRU"]

    close_frame = dataset[['close']]
    split_position = int(len(close_frame) * percentage)

    # Skip the rows consumed by the first window, then keep the test portion
    after_first_window = close_frame[step-1:]
    test_index = after_first_window.iloc[split_position:].index

    actual_frame = pd.DataFrame(data=ytest, index=test_index, columns=['test_close'])
    predicted_frame = pd.DataFrame(data=ypred, index=test_index, columns=[model_name + 'predictions'])

    return actual_frame, predicted_frame

In [None]:
def make_plot(dataset, ytest, ypred, percentage, model_name, step):
    """
    Plot the actual price against the predictions of the chosen model, save the figure
    to 'fig.png' and display it.

    Arguments:
    - dataset: full dataset containing the 'close' column; provides the date index.
    - ytest: test data of the target variable.
    - ypred: predictions made by the model.
    - percentage: splitting percentage of the dataset.
    - model_name: name of the model used for the predictions.
      Only "LSTM", "BiLSTM", "GRU", and "BiGRU".
    - step: window size used by the model.
    """

    # Make sure the model name is one of the available models
    assert model_name in ["LSTM", "BiLSTM", "GRU", "BiGRU"]
    ytest_df, ypred_df = turn_to_dataframe(model_name, dataset, ytest, ypred, percentage, step)

    fig, ax = plt.subplots(figsize=(20, 15), dpi=500)
    try:
        ax.grid(True)
        ax.set_title("Prediction VS Actual for " + model_name + " model")

        ax.plot(ytest_df, color='red', label='Actual Price')
        ax.plot(ypred_df, color='green', marker='.', label='Predicted Price')
        ax.set_xlabel('Date')
        ax.set_ylabel('Close Price ($)')

        ax.legend()
        fig.savefig('fig.png', format='png', dpi=500)

        plt.show()
    finally:
        # This function is called once per run inside long sweeps; release the large
        # figure explicitly so open figures cannot pile up in memory.
        plt.close(fig)
# **Iterate and get the performance for all the models**

In [None]:
# Candidate feature subsets. Each list holds dataset column names; the comment above a
# list names the selection method it came from.

# F1: Most correlated to the target variable
F1 = [
    'open', 'high', 'low', 'adj_close',
    'sma_indicator5', 'ema_indicator5',
    'sma_indicator10', 'ema_indicator10',
    'sma_indicator15', 'ema_indicator15',
    'sma_indicator20', 'ema_indicator20',
    'sma_indicator25', 'ema_indicator25', 'average_true_range25',
    'sma_indicator30', 'ema_indicator30', 'average_true_range30',
    'sma_indicator35', 'ema_indicator35', 'average_true_range35',
    'sma_indicator40', 'ema_indicator40', 'average_true_range40',
    'sma_indicator45', 'ema_indicator45', 'average_true_range45',
    'sma_indicator50', 'ema_indicator50', 'average_true_range50',
    'cumulative_return',
]

# F2: F-regression test
F2 = [
    'open', 'high', 'low', 'adj_close',
    'sma_indicator5', 'ema_indicator5',
    'sma_indicator10', 'ema_indicator10',
    'sma_indicator15', 'ema_indicator15',
    'sma_indicator20', 'ema_indicator20',
    'ema_indicator25',
    'ema_indicator30',
    'cumulative_return',
]

# F3: RandomForestRegressor
F3 = [
    'open', 'high', 'low', 'adj_close',
    'ema_indicator5',
    'acc_dist_index',
    'cumulative_return',
]

# F4: Lasso's Regularization
F4 = [
    'ema_indicator10', 'ema_indicator15', 'ema_indicator20', 'ema_indicator50',
    'acc_dist_index',
]

# F5: Bi-directional elimination (Step-wise Selection)
F5 = ['cumulative_return', 'low', 'open', 'high']

In [None]:
# Search space iterated over by the experiment loops
models_name = ["LSTM", "BiLSTM", "GRU", "BiGRU"]   # architectures to train
use_dropout = [True, False]                        # with / without Dropout layers
batch_size = [128, 256, 512, 1024]                 # batch sizes to compare
steps = list(range(5, 31, 5))                      # window sizes: 5, 10, ..., 30
features_lists = [F1, F2, F3, F4, F5]              # candidate feature subsets

In [None]:
## Test to get the best batch size and best set of features
# Settings held fixed during this sweep. They are named once so that the values passed
# to the model and the values logged to MLflow cannot drift apart.
SPLIT_PERCENTAGE = 0.8
SWEEP_N_STEPS = 5
MAX_EPOCHS = 1000

for name in models_name:
    for drop in use_dropout:
        for batch in batch_size:
            for idx, feature in enumerate(features_lists):
                feature_set = 'F' + str(idx + 1)
                model_name = name + '_batch' + str(batch) + '_' + feature_set + '_DropOut' + str(drop)

                # The context manager ends the run on exit (also on error), so no
                # explicit mlflow.end_run() is needed inside the block.
                with mlflow.start_run(run_name = model_name):
                    ytest, ypred, model, model_history = model_DoItAll(
                        name, df, feature, SWEEP_N_STEPS, MAX_EPOCHS, batch, drop, SPLIT_PERCENTAGE
                    )
                    print("\nModel " + model_name + " Completed.\n")

                    mlflow.log_param("percentage", SPLIT_PERCENTAGE)
                    mlflow.log_param("n_steps", SWEEP_N_STEPS)
                    mlflow.log_param("epochs", MAX_EPOCHS)
                    mlflow.log_param("batch_size", batch)
                    mlflow.log_param("use_dropout", drop)
                    # Record the feature set as a parameter too, so runs can be
                    # filtered on it instead of parsing the run name.
                    mlflow.log_param("features", feature_set)
                    print('percentage: ', SPLIT_PERCENTAGE ,'; n_steps: ', SWEEP_N_STEPS ,'; epochs: ', MAX_EPOCHS ,'; batch_size: ', batch ,'; use_dropout: ', drop)

                    # NOTE(review): model logging is disabled in this sweep — confirm this is intentional.
                    #mlflow.keras.log_model(model, model_name)
                    RMSE, MAPE, MAE = evaluate(ytest, ypred)

                    mlflow.log_metric("RMSE", RMSE)
                    mlflow.log_metric("MAPE", MAPE)
                    mlflow.log_metric("MAE", MAE)
                    print('RMSE: ', RMSE ,'; MAPE: ', MAPE ,'; MAE: ', MAE)

                    print("\n Comparison Plot:\n")
                    make_plot(df, ytest, ypred, SPLIT_PERCENTAGE, name, SWEEP_N_STEPS)

                    # Store the predictions and the figure written by make_plot() with the run
                    np.save('ypred.npy', ypred)
                    mlflow.log_artifact('ypred.npy')
                    mlflow.log_artifact("fig.png")

In [None]:
## Test with all parameters
# Settings held fixed during this sweep. They are named once so that the values passed
# to the model and the values logged to MLflow cannot drift apart.
SPLIT_PERCENTAGE = 0.8
MAX_EPOCHS = 1000
N_REPEATS = 10   # each configuration is trained this many times

for name in models_name:
    for drop in use_dropout:
        for batch in batch_size:
            for step in steps:
                for repeat in range(1, N_REPEATS + 1):
                    for idx, feature in enumerate(features_lists):
                        # The run name must reflect the feature set actually used. It was
                        # hard-coded to '_F5' although the loop trains on every list, so
                        # all five feature sets were recorded under the same name.
                        feature_set = 'F' + str(idx + 1)
                        model_name = (name + '_batch' + str(batch) + '_' + feature_set
                                      + '_DropOut' + str(drop) + '_step' + str(step) + '_#' + str(repeat))

                        # The context manager ends the run on exit (also on error), so no
                        # explicit mlflow.end_run() is needed inside the block.
                        with mlflow.start_run(run_name = model_name):
                            ytest, ypred, model, model_history = model_DoItAll(
                                name, df, feature, step, MAX_EPOCHS, batch, drop, SPLIT_PERCENTAGE
                            )

                            print("\nModel " + model_name + " Completed.\n")
                            mlflow.log_param("percentage", SPLIT_PERCENTAGE)
                            mlflow.log_param("n_steps", step)
                            mlflow.log_param("epochs", MAX_EPOCHS)
                            mlflow.log_param("batch_size", batch)
                            mlflow.log_param("use_dropout", drop)
                            # Record the feature set as a parameter too, so runs can be
                            # filtered on it instead of parsing the run name.
                            mlflow.log_param("features", feature_set)
                            print('percentage: ', SPLIT_PERCENTAGE ,'; n_steps: ', step ,'; epochs: ', MAX_EPOCHS ,'; batch_size: ', batch ,'; use_dropout: ', drop)

                            mlflow.keras.log_model(model, model_name)
                            RMSE, MAPE, MAE = evaluate(ytest, ypred)

                            mlflow.log_metric("RMSE", RMSE)
                            mlflow.log_metric("MAPE", MAPE)
                            mlflow.log_metric("MAE", MAE)
                            print('RMSE: ', RMSE ,'; MAPE: ', MAPE ,'; MAE: ', MAE)

                            print("\n Comparison Plot:\n")
                            make_plot(df, ytest, ypred, SPLIT_PERCENTAGE, name, step)

                            # Store the predictions and the figure written by make_plot() with the run
                            np.save('ypred.npy', ypred)
                            mlflow.log_artifact('ypred.npy')
                            mlflow.log_artifact("fig.png")