In [16]:
import pandas as pd
import numpy as np
import useful_functions as uf
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import l1_l2

# Possible datasets to test
file_paths = [
    #'../data/data_orig_parameters.csv'
    # '../data/BR_param_EDA.csv',
    #'../data/data_cleaned_RF.csv',
    '../data/data_cleaned_LASSO.csv',
    #'../data/data_cleaned_RFE.csv'
]

# List of outlier thresholds to test
outlier_thresholds = [np.nan, 0.05, 0.10, 0.15, 0.20]

# Dictionary to store the errors
errors_dict = {}

# Load the model from the file
best_model = load_model('best_rnn_model_grid_lasso_6.keras')
# from manual hyperparameter tuning
epochs = 75
batch_size = 128
patience = 6

# Loop through the files and outlier thresholds
for file_path in file_paths:
    print(f"REading File: {file_path}")
    for remove_outliers_threshold in outlier_thresholds:
        print(f"Outlier Threshold: {remove_outliers_threshold}")
        # Load  data
        df_raw = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
        target_variable = df_raw.columns[0]
        # Convert all columns to float
        df_raw = df_raw.astype('float64')   
        df = df_raw.copy()

        # Remove outliers using the threshold
        if not pd.isna(remove_outliers_threshold):
            df_cleaned = uf.remove_outliers(df.copy(), threshold=remove_outliers_threshold)
        else:
            df_cleaned = df.copy()

        # Fill missing values
        df_adjusted = uf.fill_missing_values(df_cleaned)

        # Define test, train and validation set sizes
        val_size = 48 # 48 months or 4 years
        test_size = 48 # 48 months or 4 years

        # Split the data into train and test sets
        train_raw_total = df_adjusted[:-test_size] # This total trainning set will be used to train the final model
        df_train = train_raw_total[:-val_size]
        df_val = train_raw_total[-val_size:]
        df_test = df_adjusted[-test_size:]

        # Let´s scale the dfs

        scaler = MinMaxScaler(feature_range=(0,1))
        scaled_train = scaler.fit_transform(df_train)
        scaled_val = scaler.transform(df_val)
        scaled_test = scaler.transform(df_test)
        # include df columns names in the train and test sets
        train = pd.DataFrame(scaled_train, columns=df_train.columns)
        val = pd.DataFrame(scaled_val, columns=df_val.columns)
        test = pd.DataFrame(scaled_test, columns=df_test.columns)
        # Include the index in the train and test sets
        train.index = df_train.index
        val.index = df_val.index
        test.index = df_test.index

        # Converting the series to samples
        # We will use the past 12 months to predict the next 12 months
        def createXY(dataset, n_past, n_future):
            dataX, dataY = [], []
            # Loop for the entire dataset
            for i in range(n_past, len(dataset) - n_future + 1):
                dataX.append(dataset.iloc[i - n_past:i].values)  # Past n months
                dataY.append(dataset.iloc[i + n_future - 1, 0])  #
            return np.array(dataX), np.array(dataY)

        n_past = 12  # Number of past months to use
        n_future = 12  # Number of future months to predict

        # Create the samples
        X_train, Y_train = createXY(train, n_past, n_future)
        X_val, Y_val = createXY(val, n_past, n_future)
        X_test, Y_test = createXY(test, n_past, n_future)

        # Define EarlyStopping callback
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True) # Stop training when the validation loss is no longer decreasing after X epochs


        # train the model
        history = best_model.fit(X_train, Y_train, 
                                 validation_data=(X_val, Y_val), 
                                 epochs=epochs, batch_size=batch_size, 
                                 verbose=2,
                                 callbacks=[early_stopping])
        # Let's predict the test set using the best model
        predictions_test_scaled = best_model.predict(X_test)

        # Let's reshape the predictions and Y_val to revert the scaling
        # Reshape predictions to 2D
        predictions_test_scaled_2d = predictions_test_scaled.reshape(-1, 1)
        # Get the last timestep of X_test
        X_test_last_timestep = X_test[:, -1, :]
        # Replace the first column of X_test_last_timestep with the scaled predictions.
        X_test_last_timestep[:, 0] = predictions_test_scaled_2d[:, 0]
        # unscale the predictions
        predictions_test_rescaled = scaler.inverse_transform(X_test_last_timestep)[:, 0]

        # Let's convert the predictions and Y_test to a dataframe usind the index from test
        predictions_test_df = pd.DataFrame(predictions_test_rescaled, index=test.index[-len(predictions_test_rescaled):], columns=[target_variable])

        # Reverse the decomposition of the time series
        #predictions = recompose_time_series(predictions_test_df, decomp_dict)
        predictions = predictions_test_df.copy()
        Y_test = df_adjusted[-len(predictions):][target_variable]

        # Calculate the error
        mape_best_RNN = mean_absolute_percentage_error(Y_test, predictions)
        rmse_best_RNN = np.sqrt(mean_squared_error(Y_test, predictions))
        mae_best_RNN = mean_absolute_error(Y_test, predictions)

        #print(f'MAPE best RNN: {mape_best_RNN}')
        #print(f'RMSE best RNN: {rmse_best_RNN}')
        #print(f'MAE best RNN: {mae_best_RNN}')

        # Armazenamento dos erros no dicionário
        errors_dict[(file_path, remove_outliers_threshold)] = {'MAPE': mape_best_RNN, 'RMSE': rmse_best_RNN, 'MAE': mae_best_RNN}

# Exibição dos resultados
for key, value in errors_dict.items():
    print(f"File: {key[0]}, Outlier Threshold: {key[1]} -> Errors: {value}")


  trackable.load_own_variables(weights_store.get(inner_path))


REading File: ../data/data_cleaned_LASSO.csv
Outlier Threshold: nan
Epoch 1/75
2/2 - 1s - 408ms/step - loss: 32.3761 - val_loss: 31.5074
Epoch 2/75
2/2 - 0s - 26ms/step - loss: 31.5465 - val_loss: 31.4737
Epoch 3/75
2/2 - 0s - 26ms/step - loss: 31.3161 - val_loss: 30.8903
Epoch 4/75
2/2 - 0s - 26ms/step - loss: 30.6776 - val_loss: 30.2680
Epoch 5/75
2/2 - 0s - 31ms/step - loss: 30.2077 - val_loss: 29.7851
Epoch 6/75
2/2 - 0s - 26ms/step - loss: 29.7369 - val_loss: 29.3188
Epoch 7/75
2/2 - 0s - 31ms/step - loss: 29.1936 - val_loss: 28.8842
Epoch 8/75
2/2 - 0s - 25ms/step - loss: 28.6705 - val_loss: 28.3743
Epoch 9/75
2/2 - 0s - 26ms/step - loss: 28.1609 - val_loss: 27.7843
Epoch 10/75
2/2 - 0s - 24ms/step - loss: 27.5934 - val_loss: 27.2062
Epoch 11/75
2/2 - 0s - 32ms/step - loss: 27.0807 - val_loss: 26.6614
Epoch 12/75
2/2 - 0s - 35ms/step - loss: 26.5586 - val_loss: 26.1279
Epoch 13/75
2/2 - 0s - 33ms/step - loss: 25.9826 - val_loss: 25.6135
Epoch 14/75
2/2 - 0s - 34ms/step - loss: 25

In [18]:
print(errors_dict)

{('../data/data_cleaned_LASSO.csv', nan): {'MAPE': 1.2320473, 'RMSE': 50846.7, 'MAE': 38909.207}, ('../data/data_cleaned_LASSO.csv', 0.05): {'MAPE': 1.1181465, 'RMSE': 43593.84, 'MAE': 33909.953}, ('../data/data_cleaned_LASSO.csv', 0.1): {'MAPE': 3.8761685, 'RMSE': 26224.848, 'MAE': 22626.865}, ('../data/data_cleaned_LASSO.csv', 0.15): {'MAPE': 3.3037739, 'RMSE': 24961.42, 'MAE': 21776.828}, ('../data/data_cleaned_LASSO.csv', 0.2): {'MAPE': 1.3807606, 'RMSE': 21849.766, 'MAE': 19136.52}}
