In [4]:
from pmdarima import auto_arima
import pandas as pd
import numpy as np
import useful_functions as uf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

# Accumulator for the error metrics; filled by the evaluation loop with one
# entry per (file_path, outlier_threshold) pair.
errors_dict = dict()

# Outlier-removal thresholds to try. NaN is the sentinel for "do not remove
# outliers"; np.nan itself is used (not float('nan')) so every key built from
# it shares the same object.
outlier_thresholds = [np.nan, 0.05, 0.10, 0.15, 0.20]

# CSV files to evaluate.
file_paths = [
    '../data/data_orig_parameters.csv',
    '../data/BR_param_EDA.csv',
    '../data/data_cleaned_RF.csv',
    '../data/data_cleaned_LASSO.csv',
    '../data/data_cleaned_RFE.csv',
]

# Evaluate a seasonal ARIMA forecast for every (file, outlier threshold)
# combination and record MAPE / RMSE / MAE on the held-out tail of the series.
test_size = 48  # months held out at the end of the series as the test set

for file_path in file_paths:
    print(f"REading File: {file_path}")

    # The raw series does not depend on the outlier threshold, so the file is
    # read once per path instead of once per (path, threshold) pair.
    df_raw = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    target_variable = df_raw.columns[0]  # first column is the series to forecast
    df = df_raw[[target_variable]]

    for remove_outliers_threshold in outlier_thresholds:
        print(f"Outlier Threshold: {remove_outliers_threshold}")

        # A NaN threshold means "keep the series as is". Each branch works on a
        # copy so the per-file frame `df` is never modified between thresholds.
        if pd.isna(remove_outliers_threshold):
            df_cleaned = df.copy()
        else:
            df_cleaned = uf.remove_outliers(df.copy(), threshold=remove_outliers_threshold)

        # Fill missing values before modelling.
        df_adjusted = uf.fill_missing_values(df_cleaned)

        # Train / test split: the last `test_size` rows are the test set.
        df_train = df_adjusted[:-test_size]
        df_test = df_adjusted[-test_size:]

        # Order selection with a 12-period season and one seasonal difference.
        # NOTE(review): stepwise=False requests a non-stepwise (grid) search,
        # which is presumably why this cell runs long (the captured run was
        # interrupted) - confirm whether the full grid is really needed.
        auto_model = auto_arima(df_train[target_variable],
                                start_p=0, start_q=0,
                                max_p=4, max_q=4,
                                m=12,
                                start_P=0, start_Q=0,
                                max_P=4, max_Q=4,
                                seasonal=True,
                                d=None, D=1,
                                trace=False,
                                error_action='ignore',
                                suppress_warnings=True,
                                stepwise=False, max_order=10)

        # Refit a SARIMAX model with the selected orders and forecast the test horizon.
        model = SARIMAX(df_train, order=auto_model.order, seasonal_order=auto_model.seasonal_order)
        model_fit = model.fit(disp=False, maxiter=200)
        predictions = model_fit.forecast(steps=len(df_test))

        # Error metrics on the held-out set.
        mape = mean_absolute_percentage_error(df_test, predictions)
        rmse = np.sqrt(mean_squared_error(df_test, predictions))
        mae = mean_absolute_error(df_test, predictions)

        # Store the metrics under the (file, threshold) key.
        errors_dict[(file_path, remove_outliers_threshold)] = {'MAPE': mape, 'RMSE': rmse, 'MAE': mae}

# Display the results, one line per (file, outlier threshold) combination.
# Each value is a dict of metrics, and a dict rejects a float format spec such
# as ':.2f' (TypeError), so every metric is formatted individually.
for (source_path, threshold), metrics in errors_dict.items():
    formatted_metrics = ", ".join(f"{name}: {score:.2f}" for name, score in metrics.items())
    print(f"File: {source_path}, Outlier Threshold: {threshold} -> Errors: {formatted_metrics}")


REading File: ../data/data_orig_parameters.csv
Outlier Threshold: nan


KeyboardInterrupt: 

In [3]:
# Bare expression as the cell's last statement: the notebook displays the
# current contents of errors_dict (the metrics stored so far).
errors_dict

{('../data/data_orig_parameters.csv', nan): {'MAPE': 1.5790905414448069,
  'RMSE': 47686.606094597155,
  'MAE': 31993.247371648304},
 ('../data/data_orig_parameters.csv', 0.05): {'MAPE': 1.574335230482456,
  'RMSE': 39835.52105717258,
  'MAE': 28753.94634625768}}