In [18]:
import pandas as pd
import numpy as np
import useful_functions as uf
import random as python_random
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.random import set_seed
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import l1_l2

# Possible datasets to test
file_paths = [
    #'../data/data_orig_parameters.csv'
    # '../data/BR_param_EDA.csv',
    #'../data/data_cleaned_RF.csv',
    '../data/data_cleaned_LASSO.csv',
    #'../data/data_cleaned_RFE.csv'
]

# List of outlier thresholds to test
outlier_thresholds = [np.nan, 0.05, 0.10, 0.15, 0.20]
#outlier_thresholds = [0.20]

# Dictionary to store the errors
errors_dict = {}

# Load the model from the file
best_model = load_model('models_parameters/best_gru_model_grid_lasso_10.keras')
# from manual hyperparameter tuning
epochs = 75
batch_size = 32
patience = 10

# Let´s define the seed for reproducibility

def func_set_seed(seed=42):
    np.random.seed(seed)
    python_random.seed(seed)
    set_seed(seed) #tensorflow.random.set_seed(seed)

# Call the function to set the seed
func_set_seed(42)

# Loop through the files and outlier thresholds
for file_path in file_paths:
    print(f"REading File: {file_path}")
    for remove_outliers_threshold in outlier_thresholds:
        print(f"Outlier Threshold: {remove_outliers_threshold}")
        # Load  data
        df_raw = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
        target_variable = df_raw.columns[0]
        # Convert all columns to float
        df_raw = df_raw.astype('float64')   
        df = df_raw.copy()

        # Remove outliers using the threshold
        if not pd.isna(remove_outliers_threshold):
            df_cleaned = uf.remove_outliers(df.copy(), threshold=remove_outliers_threshold)
        else:
            df_cleaned = df.copy()

        # Fill missing values
        df_adjusted = uf.fill_missing_values(df_cleaned)

        # Define test, train and validation set sizes
        val_size = 48 # 48 months or 4 years
        test_size = 48 # 48 months or 4 years

        # Split the data into train and test sets
        train_raw_total = df_adjusted[:-test_size] # This total trainning set will be used to train the final model
        df_train = train_raw_total[:-val_size]
        df_val = train_raw_total[-val_size:]
        df_test = df_adjusted[-test_size:]

        # Let´s scale the dfs

        scaler = MinMaxScaler(feature_range=(0,1))
        scaled_train = scaler.fit_transform(df_train)
        scaled_val = scaler.transform(df_val)
        scaled_test = scaler.transform(df_test)
        # include df columns names in the train and test sets
        train = pd.DataFrame(scaled_train, columns=df_train.columns)
        val = pd.DataFrame(scaled_val, columns=df_val.columns)
        test = pd.DataFrame(scaled_test, columns=df_test.columns)
        # Include the index in the train and test sets
        train.index = df_train.index
        val.index = df_val.index
        test.index = df_test.index

        # Converting the series to samples
        # We will use the past 12 months to predict the next 12 months
        def createXY(dataset, n_past, n_future):
            dataX, dataY = [], []
            # Loop for the entire dataset
            for i in range(n_past, len(dataset) - n_future + 1):
                dataX.append(dataset.iloc[i - n_past:i].values)  # Past n months
                dataY.append(dataset.iloc[i + n_future - 1, 0])  #
            return np.array(dataX), np.array(dataY)

        n_past = 12  # Number of past months to use
        n_future = 12  # Number of future months to predict

        # Create the samples
        X_train, Y_train = createXY(train, n_past, n_future)
        X_val, Y_val = createXY(val, n_past, n_future)
        X_test, Y_test = createXY(test, n_past, n_future)

        # Define EarlyStopping callback
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True) # Stoppatience=patience, restore_best_weights training when the validation loss is no longer decreasing after X epochs


        # train the model
        history = best_model.fit(X_train, Y_train, 
                                 validation_data=(X_val, Y_val), 
                                 epochs=epochs, batch_size=batch_size, 
                                 verbose=2,
                                 callbacks=[early_stopping])
        # Let's predict the test set using the best model
        predictions_test_scaled = best_model.predict(X_test)

        # Let's reshape the predictions and Y_val to revert the scaling
        # Reshape predictions to 2D
        predictions_test_scaled_2d = predictions_test_scaled.reshape(-1, 1)
        # Get the last timestep of X_test
        X_test_last_timestep = X_test[:, -1, :]
        # Replace the first column of X_test_last_timestep with the scaled predictions.
        X_test_last_timestep[:, 0] = predictions_test_scaled_2d[:, 0]
        # unscale the predictions
        predictions_test_rescaled = scaler.inverse_transform(X_test_last_timestep)[:, 0]

        # Let's convert the predictions and Y_test to a dataframe usind the index from test
        predictions_test_df = pd.DataFrame(predictions_test_rescaled, index=test.index[-len(predictions_test_rescaled):], columns=[target_variable])

        # Reverse the decomposition of the time series
        #predictions = recompose_time_series(predictions_test_df, decomp_dict)
        predictions = predictions_test_df.copy()
        Y_test = df_adjusted[-len(predictions):][target_variable]

        # Calculate the error
        mape_best_GRU = mean_absolute_percentage_error(Y_test, predictions)
        rmse_best_GRU = np.sqrt(mean_squared_error(Y_test, predictions))
        mae_best_GRU = mean_absolute_error(Y_test, predictions)

        #print(f'MAPE best GRU: {mape_best_GRU}')
        #print(f'RMSE best GRU: {rmse_best_GRU}')
        #print(f'MAE best GRU: {mae_best_GRU}')

        # Armazenamento dos erros no dicionário
        errors_dict[(file_path, remove_outliers_threshold)] = {'MAPE': mape_best_GRU, 'RMSE': rmse_best_GRU, 'MAE': mae_best_GRU}

# Exibição dos resultados
for key, value in errors_dict.items():
    print(f"File: {key[0]}, Outlier Threshold: {key[1]} -> Errors: {value}")


REading File: ../data/data_cleaned_LASSO.csv
Outlier Threshold: nan
Epoch 1/75
5/5 - 2s - 327ms/step - loss: 0.3660 - val_loss: 0.0932
Epoch 2/75
5/5 - 0s - 25ms/step - loss: 0.0234 - val_loss: 0.0517
Epoch 3/75
5/5 - 0s - 19ms/step - loss: 0.0240 - val_loss: 0.0549
Epoch 4/75
5/5 - 0s - 16ms/step - loss: 0.0256 - val_loss: 0.0735
Epoch 5/75
5/5 - 0s - 21ms/step - loss: 0.0782 - val_loss: 0.0877
Epoch 6/75
5/5 - 0s - 19ms/step - loss: 0.0398 - val_loss: 0.0699
Epoch 7/75
5/5 - 0s - 20ms/step - loss: 0.0470 - val_loss: 0.0515
Epoch 8/75
5/5 - 0s - 35ms/step - loss: 0.0171 - val_loss: 0.0436
Epoch 9/75
5/5 - 0s - 20ms/step - loss: 0.0194 - val_loss: 0.1514
Epoch 10/75
5/5 - 0s - 22ms/step - loss: 0.0854 - val_loss: 0.0413
Epoch 11/75
5/5 - 0s - 19ms/step - loss: 0.0160 - val_loss: 0.0480
Epoch 12/75
5/5 - 0s - 17ms/step - loss: 0.0343 - val_loss: 0.0577
Epoch 13/75
5/5 - 0s - 21ms/step - loss: 0.0311 - val_loss: 0.1351
Epoch 14/75
5/5 - 0s - 18ms/step - loss: 0.0441 - val_loss: 0.0462
Ep

In [19]:
print(errors_dict)

{('../data/data_cleaned_LASSO.csv', nan): {'MAPE': 1.8524871, 'RMSE': 49376.723, 'MAE': 37868.414}, ('../data/data_cleaned_LASSO.csv', 0.05): {'MAPE': 1.5311929, 'RMSE': 45971.51, 'MAE': 35612.805}, ('../data/data_cleaned_LASSO.csv', 0.1): {'MAPE': 1.3755577, 'RMSE': 25924.264, 'MAE': 20804.95}, ('../data/data_cleaned_LASSO.csv', 0.15): {'MAPE': 2.2431517, 'RMSE': 23950.594, 'MAE': 19591.242}, ('../data/data_cleaned_LASSO.csv', 0.2): {'MAPE': 1.224133, 'RMSE': 18526.266, 'MAE': 15323.695}}
