### This is the code to evaluate the best RNN model against different subset datasets and outliers removal thresholds.
#### References:
- Peixeiro, M. (2022). Time series forecasting in Python. Manning. Includes the codes fromi its GitHub repo (https://github.com/marcopeix/AppliedTimeSeriesForecastingInPython).   
Contribution: The technique for converting the series into sequenced samples and the idea for scale and reeschale data after predictions.
- Discolll, N. (2024, January 12). Harnessing RNNs for Financial Time Series Analysis: A Python Approach. Medium. https://medium.com/@redeaddiscolll/harnessing-rnns-for-financial-time-series-analysis-a-python-approach-0669b3a25c7a.   
Contribution: EarlyStopping function for the RNN model.

#### Libraries
- Package Pandas (2.2). (2024). [Python]. https://pandas.pydata.org/
- Package NumPy (1.23). (2023). [Pyhton]. https://numpy.org/ - Harris, C. R., Millman, K. J., Van Der Walt, S. J., Gommers, R., Virtanen, P., Cournapeau, D., Wieser, E., Taylor, J., Berg, S., Smith, N. J., Kern, R., Picus, M., Hoyer, S., Van Kerkwijk, M. H., Brett, M., Haldane, A., Del Río, J. F., Wiebe, M., Peterson, P., … Oliphant, T. E. (2020). Array programming with NumPy. Nature, 585(7825), 357–362. https://doi.org/10.1038/s41586-020-2649-2
- Droettboom, J. D. H., Michael. (2024). Package matplotlib (3.8.4) [Python]. https://matplotlib.org
- Package scikit-learn (1.4). (2024). [Pyhton]. https://scikit-learn.org/stable/index.html
- Package Tensorflow (2.16). (2024). [Python]. https://github.com/tensorflow


In [2]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error)
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Input, SimpleRNN
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import l1_l2

import useful_functions as uf

# Possible datasets to test - uncomment the desired dataset according to the model that will be loaded
file_paths = [
    #'../data/data_orig_parameters.csv'
    #'../data/data_cleaned_RF.csv',
    '../data/data_cleaned_LASSO.csv'
    #'../data/data_cleaned_RFE.csv'
]

# List of outlier thresholds to test
outlier_thresholds = [np.nan, 0.05, 0.10, 0.15, 0.20]

# Dictionary to store the errors
errors_dict = {}

# Load the model from the file
best_model = load_model('models_parameters/best_rnn_model_grid_Lasso_annual_10.keras')

# Define the manual hyperparameters
epochs = 20
batch_size = 128
patience = 10

# Loop through the files and outlier thresholds
for file_path in file_paths:
    print(f"REading File: {file_path}")
    for remove_outliers_threshold in outlier_thresholds:
        print(f"Outlier Threshold: {remove_outliers_threshold}")
        
        # Load  data
        df_raw = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
        target_variable = df_raw.columns[0] # Set the target variable as the first column of the dataframe
        
        # Convert all columns to float
        df_raw = df_raw.astype('float64')   
        df = df_raw.copy()

        # Remove outliers using the threshold
        if not pd.isna(remove_outliers_threshold): # if the threshold is not NaN
            df_cleaned = uf.remove_outliers(df.copy(), threshold=remove_outliers_threshold)
        else: # if the threshold is NaN, we will not remove the outliers
            df_cleaned = df.copy()

        # After removing the outliers, we need to fill the missing values again
        df_adjusted = uf.fill_missing_values(df_cleaned) 
        
        # Define test, train and validation set sizes
        val_size = 48 # 48 months or 4 years
        test_size = 48 # 48 months or 4 years

        # Split the data into train and test sets
        train_raw_total = df_adjusted[:-test_size] # This total trainning set will be used to train the final model
        df_train = train_raw_total[:-val_size]
        df_val = train_raw_total[-val_size:]
        df_test = df_adjusted[-test_size:]

        # Let´s scale the dfs
        # Create the scaler
        scaler = MinMaxScaler(feature_range=(0,1))
        scaled_train = scaler.fit_transform(df_train) # Fit the scaler to the train set and transform it
        scaled_val = scaler.transform(df_val) # Transform the validation set
        scaled_test = scaler.transform(df_test) # Transform the test set
        # include df columns names in the train and test sets
        train = pd.DataFrame(scaled_train, columns=df_train.columns)
        val = pd.DataFrame(scaled_val, columns=df_val.columns)
        test = pd.DataFrame(scaled_test, columns=df_test.columns)
        # Include the index in the train and test sets
        train.index = df_train.index
        val.index = df_val.index
        test.index = df_test.index

        # Converting the series to samples
        # We will use the past 12 months to predict the next 12 months
        def createXY(dataset, n_past, n_future):
            dataX, dataY = [], []
            # Loop for the entire dataset
            for i in range(n_past, len(dataset) - n_future + 1):
                dataX.append(dataset.iloc[i - n_past:i].values)  # Past n months
                dataY.append(dataset.iloc[i + n_future - 1, 0])  #
            return np.array(dataX), np.array(dataY)

        n_past = 12  # Number of past months to use
        n_future = 12  # Number of future months to predict

        # Create the samples
        X_train, Y_train = createXY(train, n_past, n_future)
        X_val, Y_val = createXY(val, n_past, n_future)
        X_test, Y_test = createXY(test, n_past, n_future)

        # Define EarlyStopping callback
        early_stopping = EarlyStopping(monitor='val_loss', 
                                       patience=patience, 
                                       restore_best_weights=True) # Stop training when the validation loss is no longer decreasing after X epochs

        # train the model
        history = best_model.fit(X_train, Y_train,
                                 validation_data=(X_val, Y_val),
                                 epochs=epochs, batch_size=batch_size,
                                 verbose=0,callbacks=[early_stopping])
        
        # Let's predict the test set using the best model
        predictions_test_scaled = best_model.predict(X_test)

        # Let's reshape the predictions and Y_val to revert the scaling
        # Reshape predictions to 2D
        predictions_test_scaled_2d = predictions_test_scaled.reshape(-1, 1)
        
        # Get the last timestep of X_test
        X_test_last_timestep = X_test[:, -1, :]
        
        # Replace the first column of X_test_last_timestep with the scaled predictions.
        X_test_last_timestep[:, 0] = predictions_test_scaled_2d[:, 0]
        
        # unscale the predictions
        predictions_test_rescaled = scaler.inverse_transform(X_test_last_timestep)[:, 0]

        # Let's convert the predictions and Y_test to a dataframe usind the index from test
        predictions_test_df = pd.DataFrame(predictions_test_rescaled, index=test.index[-len(predictions_test_rescaled):], columns=[target_variable])
        predictions = predictions_test_df.copy()
        # Get the real values of Y_test to compare with the predictions
        Y_test = df_adjusted[-len(predictions):][target_variable]

        # Calculate the error
        mape_best_RNN = mean_absolute_percentage_error(Y_test, predictions).round(2)
        rmse_best_RNN = np.sqrt(mean_squared_error(Y_test, predictions)).round(2)
        mae_best_RNN = mean_absolute_error(Y_test, predictions).round(2)

        # Store the errors in the dictionary
        errors_dict[(file_path, remove_outliers_threshold)] = {'MAPE': mape_best_RNN, 'RMSE': rmse_best_RNN, 'MAE': mae_best_RNN}

# Print the errors
for key, value in errors_dict.items():
    print(f"File: {key[0]}, Outlier Threshold: {key[1]} -> Errors: {value}")

  trackable.load_own_variables(weights_store.get(inner_path))


REading File: ../data/data_cleaned_LASSO.csv
Outlier Threshold: nan
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step
Outlier Threshold: 0.05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Outlier Threshold: 0.1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Outlier Threshold: 0.15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Outlier Threshold: 0.2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
File: ../data/data_cleaned_LASSO.csv, Outlier Threshold: nan -> Errors: {'MAPE': 17.81, 'RMSE': 161587.75, 'MAE': 127844.89}
File: ../data/data_cleaned_LASSO.csv, Outlier Threshold: 0.05 -> Errors: {'MAPE': 8.68, 'RMSE': 127069.23, 'MAE': 112575.6}
File: ../data/data_cleaned_LASSO.csv, Outlier Threshold: 0.1 -> Errors: {'MAPE': 7.16, 'RMSE': 115088.43, 'MAE': 100662.52}
File: ../data/data_cleaned_LASSO.csv, Outlier Threshold: 0.15 -> Errors: {'MAPE': 7.97, 'RMSE': 121827.9