In [1]:
from pmdarima import auto_arima
import pandas as pd
import numpy as np
import useful_functions as uf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
# Lista dos caminhos dos arquivos
file_paths = [
    '../data/data_orig_parameters.csv',
    '../data/BR_param_EDA.csv',
    '../data/data_cleaned_RF.csv',
    '../data/data_cleaned_LASSO.csv',
    '../data/data_cleaned_RFE.csv'
]

# Lista dos limiares de remoção de outliers
outlier_thresholds = [np.nan, 0.05, 0.10, 0.15, 0.20]

# Dicionário para armazenar os resultados dos erros
errors_dict = {}

# Loop pelos caminhos dos arquivos e pelos limiares de outliers
for file_path in file_paths:
    print(f"REading File: {file_path}")
    for remove_outliers_threshold in outlier_thresholds:
        print(f"Outlier Threshold: {remove_outliers_threshold}")
        # Load  data
        df_raw = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
        target_variable = df_raw.columns[0]
        #df = df_raw[[target_variable]]

        # Remove outliers using the threshold
        if not pd.isna(remove_outliers_threshold):
            df_cleaned = uf.remove_outliers(df_raw.copy(), threshold=remove_outliers_threshold)
        else:
            df_cleaned = df_raw.copy()

        # Preenchimento de valores faltantes
        df_adjusted = uf.fill_missing_values(df_cleaned)

        # Definição dos conjuntos de treinamento e teste
        test_size = 48  # meses
        df_train = df_adjusted[:-test_size]
        df_test = df_adjusted[-test_size:]

        # Let´s scale the dfs

        #scaled_train, scaled_test, params = uf.scale_data(train_raw, test_raw, 'normalize')

        scaler = MinMaxScaler(feature_range=(0,1))
        scaled_train = scaler.fit_transform(df_train)
        scaled_test = scaler.transform(df_test)
        # include df columns names in the train and test sets
        train = pd.DataFrame(scaled_train, columns=df_adjusted.columns)
        test = pd.DataFrame(scaled_test, columns=df_adjusted.columns)
        # Include the index in the train and test sets
        train.index = df_adjusted.index[:-test_size]
        test.index = df_adjusted.index[-test_size:]
        # define the exogenous variables as all except the target variable
        # Subset df to get only the first column
        exog_var_train = train.iloc[:, 1:].ffill() # fill NAs with the last valid observation
        exog_var_test = test.iloc[:, 1:].ffill()# fill NAs with the last valid observation
        # Modelagem e previsão
        auto_model = auto_arima(train[target_variable], 
                              start_p=0, start_q=0, 
                              max_p=4, max_q=4, 
                              m=12, 
                              start_P=0, start_Q=0, 
                              max_P=4, max_Q=4, 
                              seasonal=True, 
                              d=None, D=1,  
                              trace=False,
                              error_action='ignore',  
                              suppress_warnings=True, 
                              stepwise=True,
                              exog = exog_var_train, 
                              max_order=5)
        
        model = SARIMAX(train[target_variable], order=auto_model.order, 
                        seasonal_order=auto_model.seasonal_order, exog = exog_var_train)
        
        model_fit = model.fit(disp=False, maxiter=50)
        predictions = model_fit.forecast(steps=len(test[target_variable]), exog = exog_var_test)

        # Let's reverse the scaling to get the real values
        original_data_test = df_adjusted[-test_size:][target_variable]
        # Convert Pandas Series to NumPy arrays and reshape
        forecasts_on_test_scaled_np = predictions.to_numpy().reshape(-1, 1)
        forecasts_on_test_scaled_np = np.repeat(forecasts_on_test_scaled_np,test.shape[1], axis=-1)

        # Inverse transform to get the real values
        forecasts_on_test_all = scaler.inverse_transform(forecasts_on_test_scaled_np)

        # Subset the forecast to get only the first column
        forecasts_on_test = forecasts_on_test_all[:,0]

        # Convert to pandas dataframe and include the index
        forecasts_on_test = pd.DataFrame(forecasts_on_test, index=test.index, columns=[target_variable])


        # Cálculo dos erros
        mape = mean_absolute_percentage_error(original_data_test, forecasts_on_test)
        rmse = np.sqrt(mean_squared_error(original_data_test, forecasts_on_test))
        mae = mean_absolute_error(original_data_test, forecasts_on_test)

        # Armazenamento dos erros no dicionário
        errors_dict[(file_path, remove_outliers_threshold)] = {'MAPE': mape, 'RMSE': rmse, 'MAE': mae}



REading File: ../data/data_orig_parameters.csv
Outlier Threshold: nan


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.05


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.1


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.15


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.2


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


REading File: ../data/BR_param_EDA.csv
Outlier Threshold: nan


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.05


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.1


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.15


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.2


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


REading File: ../data/data_cleaned_RF.csv
Outlier Threshold: nan


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.05


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.1


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.15


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.2


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


REading File: ../data/data_cleaned_LASSO.csv
Outlier Threshold: nan


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.05


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.1


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.15


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.2


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


REading File: ../data/data_cleaned_RFE.csv
Outlier Threshold: nan


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.05


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.1


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.15


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Outlier Threshold: 0.2


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [2]:
print(errors_dict)

{('../data/data_orig_parameters.csv', nan): {'MAPE': 2.0816466635781286, 'RMSE': 57651.846987597826, 'MAE': 41942.11840976812}, ('../data/data_orig_parameters.csv', 0.05): {'MAPE': 2.2490469668650412, 'RMSE': 52023.501200104154, 'MAE': 37074.09551355226}, ('../data/data_orig_parameters.csv', 0.1): {'MAPE': 5.755532971004488, 'RMSE': 40987.53463561887, 'MAE': 30267.969263924784}, ('../data/data_orig_parameters.csv', 0.15): {'MAPE': 11.91920310506717, 'RMSE': 35080.464665931744, 'MAE': 27426.326005784194}, ('../data/data_orig_parameters.csv', 0.2): {'MAPE': 3.2282908298674484, 'RMSE': 29390.64515967309, 'MAE': 22687.149457123265}, ('../data/BR_param_EDA.csv', nan): {'MAPE': 56.854825274084504, 'RMSE': 588786.592166604, 'MAE': 454692.3387583306}, ('../data/BR_param_EDA.csv', 0.05): {'MAPE': 50.22361796653365, 'RMSE': 531507.4749419176, 'MAE': 404129.691475207}, ('../data/BR_param_EDA.csv', 0.1): {'MAPE': 20.174267238081153, 'RMSE': 171109.2927900669, 'MAE': 150044.37566768492}, ('../data/

In [3]:
for key, value in errors_dict.items():
    # Supondo que `value` seja um dicionário com as chaves 'MAPE', 'RMSE', 'MAE'
    mape = value['MAPE']
    rmse = value['RMSE']
    mae = value['MAE']
    print(f"Model: SARIMAX., File: {key[0]}, Outlier Threshold: {key[1]} ->, MAPE: {mape:.2f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}")

Model: SARIMAX., File: ../data/data_orig_parameters.csv, Outlier Threshold: nan ->, MAPE: 2.08, RMSE: 57651.85, MAE: 41942.12
Model: SARIMAX., File: ../data/data_orig_parameters.csv, Outlier Threshold: 0.05 ->, MAPE: 2.25, RMSE: 52023.50, MAE: 37074.10
Model: SARIMAX., File: ../data/data_orig_parameters.csv, Outlier Threshold: 0.1 ->, MAPE: 5.76, RMSE: 40987.53, MAE: 30267.97
Model: SARIMAX., File: ../data/data_orig_parameters.csv, Outlier Threshold: 0.15 ->, MAPE: 11.92, RMSE: 35080.46, MAE: 27426.33
Model: SARIMAX., File: ../data/data_orig_parameters.csv, Outlier Threshold: 0.2 ->, MAPE: 3.23, RMSE: 29390.65, MAE: 22687.15
Model: SARIMAX., File: ../data/BR_param_EDA.csv, Outlier Threshold: nan ->, MAPE: 56.85, RMSE: 588786.59, MAE: 454692.34
Model: SARIMAX., File: ../data/BR_param_EDA.csv, Outlier Threshold: 0.05 ->, MAPE: 50.22, RMSE: 531507.47, MAE: 404129.69
Model: SARIMAX., File: ../data/BR_param_EDA.csv, Outlier Threshold: 0.1 ->, MAPE: 20.17, RMSE: 171109.29, MAE: 150044.38
Mod