In [None]:
# Best Model Selection Function

def best_arma_models_order(series, max_order=5, significance=0.05):
    """
    List every ARMA order whose fit passes two diagnostic screens.

    Each ARMA(p, q) with 0 <= p, q <= max_order (the empty (0, 0) model
    excluded) is fitted as ARIMA(p, 0, q) with robust standard errors.
    An order is kept when both of the following hold:

    * the Ljung-Box test on the residuals, evaluated at lag p + q + 1 with
      ``model_df = p + q``, has a p-value >= ``significance`` (residual
      autocorrelation is not detected);
    * every parameter p-value except the last one (the innovation variance
      in statsmodels' ARIMA parameter ordering) is <= ``significance``.

    The surviving orders are returned as-is; no ranking by information
    criterion is applied.

    Args:
        series (pandas.Series): The time series data.
        max_order (int, optional): The maximum order to consider for both
            the AR and the MA component. Defaults to 5.
        significance (float, optional): The significance level used for
            both screens. Defaults to 0.05.

    Returns:
        pandas.Series | str: A Series named ``"order"`` holding the
        admissible ``(p, 0, q)`` tuples, indexed by their position in the
        enumeration (all p >= 1 models first, then the pure MA models).
        When no order is admissible, an explanatory message string is
        returned instead.
    """
    import warnings

    import numpy as np
    import pandas as pd
    import statsmodels.stats.diagnostic as smsd
    import statsmodels.tsa.arima.model as smta

    # Enumerate in the historical order (p >= 1 block, then pure MA models)
    # so the index labels of the returned Series keep their meaning.
    candidate_orders = [
        (p, 0, q)
        for p in range(1, max_order + 1)
        for q in range(max_order + 1)
    ]
    candidate_orders += [(0, 0, q) for q in range(1, max_order + 1)]

    is_admissible = []
    # Silence UserWarning-derived warnings only while fitting; the caller's
    # warning filters are restored when the block exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        for p, _, q in candidate_orders:
            fitted = smta.ARIMA(series, order=(p, 0, q)).fit(cov_type="robust")

            # The statistic needs more lags than estimated ARMA terms,
            # hence the single test lag p + q + 1.
            test_lag = p + q + 1
            ljung_box = smsd.acorr_ljungbox(
                fitted.resid, lags=test_lag, model_df=p + q, boxpierce=False)
            white_noise_pvalue = ljung_box.loc[test_lag, "lb_pvalue"]

            # Drop the trailing variance term; only mean-equation
            # coefficients are screened for significance.
            coefficient_pvalues = np.asarray(fitted.pvalues)[:-1]

            # Comparisons involving NaN evaluate to False, so a model with
            # an undefined p-value is rejected.
            is_admissible.append(bool(
                white_noise_pvalue >= significance
                and np.all(coefficient_pvalues <= significance)))

    print(f"{len(candidate_orders)} distinct models considered...")

    # An explicit boolean mask keeps the selection well defined even when
    # nothing passes, and avoids assigning onto a filtered DataFrame slice.
    orders = pd.Series(candidate_orders, name="order", dtype=object)
    optimal_orders = orders[np.asarray(is_admissible, dtype=bool)]

    if optimal_orders.empty:
        return "None is good enough according to the criteria of this function."
    return optimal_orders


# Example usage (requires `inflation_direct_data` to be defined beforehand)
if __name__ == "__main__":

    # NOTE(review): elsewhere in this file the column is spelled
    # "CPI_ALL_ITEMS%" (no space) — confirm which label the data uses.
    m2v = inflation_direct_data["CPI_ALL_ITEMS %"].dropna()
    # The selector defined above is `best_arma_models_order`; the previous
    # spelling `best_arma_model_order` is not defined at module level.
    best_orders = best_arma_models_order(m2v, max_order=5)
    print(best_orders)

In [None]:
# Best ARIMA model selection (assuming the maximum d is 1). NOTE: flagged by the author as probably incorrect.
def Best_Arima_Model(timeseries, IC="bic", max_order=5, significance=0.05):
    """
    Choose d and a deterministic trend with ADF tests, then fit the best ARIMA.

    The procedure has two stages:

    1. A sequence of Augmented Dickey-Fuller tests (trend "ct", then "c",
       then "n") interleaved with AR regressions on the first difference
       decides the integration order ``d`` (0 or 1) and a trend label.
    2. Every ARIMA(p, d, q) with 0 <= p, q <= max_order (except p = q = 0)
       is fitted with robust standard errors. A candidate is admissible
       when the Ljung-Box p-value at lag p + q + 1 (``model_df = p + q``)
       is >= ``significance`` and every parameter p-value except the last
       one (the innovation variance in statsmodels' ordering) is
       <= ``significance``. The admissible candidate with the lowest AIC
       wins.

    When d = 1 the trend label refers to the differenced series. Because
    statsmodels rejects trend terms of order lower than d in an integrated
    model, the label is translated before fitting: a constant in the
    differences becomes a linear trend ("t") in levels, and constant plus
    linear trend in the differences becomes t + t**2 in levels.

    Args:
        timeseries (pandas.Series or single-column DataFrame): The data;
            it must support ``.diff()``.
        IC (str): Criterion passed to ``arch``'s ADF as ``method`` for lag
            selection, e.g. "bic" or "aic".
        max_order (int): Maximum AR and MA order to consider.
        significance (float): Significance level used by every test.

    Returns:
        The fitted statsmodels ARIMA results object of the selected model,
        or ``None`` (after printing a message) when no candidate is
        admissible.
    """
    import warnings

    import numpy as np
    import pandas as pd
    import statsmodels.stats.diagnostic as smsd
    import statsmodels.tsa.arima.model as smta
    from arch.unitroot import ADF

    def announce(d, trend):
        """Print the (d, trend) decision in the established format and return it."""
        if d == 0:
            print("Output d selection (trend=c is just cte):")
        else:
            print("Output d selection:")
        print(f"     d={d},trend='{trend}'")
        return d, trend

    def select_d_and_trend():
        """Run the ADF cascade and return ``(d, trend_label)``."""
        differenced = timeseries.diff()

        # NOTE(review): every stationary outcome reports trend 'c', even
        # when the rejecting test was run with 'ct' or 'n' — confirm that
        # this is the intended specification.
        adf = ADF(timeseries, trend="ct", method=IC)
        if adf.pvalue < significance:
            return announce(0, "c")

        # Unit root not rejected: check for a linear trend in the
        # differences, using the lag length the ADF test selected.
        with_trend = smta.ARIMA(endog=differenced, order=(adf.lags, 0, 0),
                                trend="ct").fit(cov_type="robust")
        if with_trend.pvalues["x1"] < significance:
            return announce(1, "ct")

        adf = ADF(timeseries, method=IC, trend="c")
        if adf.pvalue < significance:
            return announce(0, "c")

        # Check for a drift (constant) in the differences.
        with_const = smta.ARIMA(endog=differenced, order=(adf.lags, 0, 0),
                                trend="c").fit(cov_type="robust")
        if with_const.pvalues["const"] < significance:
            return announce(1, "c")

        adf = ADF(timeseries, method=IC, trend="n")
        if adf.pvalue < significance:
            return announce(0, "c")
        return announce(1, "n")

    def trend_for_levels(d, trend):
        """Translate a trend label for the d-th difference into a level-model spec."""
        if d == 0 or trend == "n":
            return trend
        if trend == "c":
            return "t"
        # 'ct' in first differences: polynomial with t and t**2 terms only.
        return [0, 1, 1]

    # Silence UserWarning-derived warnings only for the duration of this
    # call; the caller's warning filters are restored afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)

        d, trend = select_d_and_trend()
        model_trend = trend_for_levels(d, trend)

        candidate_orders = [
            (p, d, q)
            for p in range(1, max_order + 1)
            for q in range(max_order + 1)
        ]
        candidate_orders += [(0, d, q) for q in range(1, max_order + 1)]

        fits = []
        is_admissible = []
        for order in candidate_orders:
            p, _, q = order
            fitted = smta.ARIMA(timeseries, order=order,
                                trend=model_trend).fit(cov_type="robust")
            fits.append(fitted)

            # The statistic needs more lags than estimated ARMA terms,
            # hence the single test lag p + q + 1.
            test_lag = p + q + 1
            ljung_box = smsd.acorr_ljungbox(
                fitted.resid, lags=test_lag, model_df=p + q, boxpierce=False)
            white_noise_pvalue = ljung_box.loc[test_lag, "lb_pvalue"]

            # Drop the trailing variance term before screening.
            coefficient_pvalues = np.asarray(fitted.pvalues)[:-1]

            # Comparisons involving NaN evaluate to False, so a model with
            # an undefined p-value is rejected.
            is_admissible.append(bool(
                white_noise_pvalue >= significance
                and np.all(coefficient_pvalues <= significance)))

    print(f"{len(candidate_orders)} distinct models considered...")

    # Keep AIC as a float Series: idxmin is not defined for object dtype.
    aic = pd.Series([fit.aic for fit in fits], dtype=float)
    admissible_aic = aic[np.asarray(is_admissible, dtype=bool)]

    if admissible_aic.empty:
        # No candidate passed both screens.
        print("No good model with this caracteristics")
        return None

    best_position = admissible_aic.idxmin()
    optimal_order = candidate_orders[best_position]
    print("Order of best model selected:")
    print(f"     {optimal_order} + trned={trend}")
    # Reuse the stored fit instead of estimating the same model again.
    return fits[best_position]


# Run the selection on `inflation`, searching AR and MA orders up to 5.
Best_Arima_Model(timeseries=inflation, max_order=5)

In [None]:
import numpy as np
import pmdarima as pm
import pmdarima.arima as pa
from matplotlib import pyplot as plt
from pmdarima import model_selection

# Drop incomplete rows once, on the whole frame, so the target and the
# regressors keep exactly the same rows. Dropping NaNs from each part
# separately can leave them with different lengths or shifted dates.
complete_rows = inflation_direct_data.dropna()
inflation = complete_rows[["CPI_ALL_ITEMS%"]]
X = complete_rows.loc[:, complete_rows.columns != "CPI_ALL_ITEMS%"]

y_train, y_test = model_selection.train_test_split(inflation, train_size=0.8)
x_train, x_test = model_selection.train_test_split(X, train_size=0.8)

# Fit a simple auto_arima model
arima = pm.auto_arima(y=y_train,
                      X=x_train,
                      trace=True,
                      seasonal=False,
                      # stationary=True  # Declares the series already stationary, so d=0 is used without testing
                      # m=4  # Seasonal period; 4 would suit quarterly data that is NOT seasonally adjusted
                      alpha=0.05,  # Presumably the significance level of the stationarity test — confirm
                      # test=""  # Which unit-root test is used to choose d
                      # Stepwise search: faster and less prone to overfitting. Might be worth it.
                      stepwise=True,
                      information_criterion="bic",
                      max_p=6,
                      max_q=6,
                      method="lbfgs",  # Optimization algorithm
                      maxiter=50,
                      suppress_warnings=True,  # Hide the noisy ARIMA warnings
                      error_action="warn",  # Warn when there is a problem
                      )

preds, conf_int = arima.predict(
    n_periods=y_test.shape[0], X=x_test, return_conf_int=True)

# Dates for the training span and for the forecast horizon that follows it.
x_date = inflation.index
n_train = y_train.shape[0]
n_forecast = preds.shape[0]
train_dates = x_date[:n_train]
forecast_dates = x_date[n_train:n_train + n_forecast]

plt.plot(train_dates, y_train, alpha=0.75)

plt.plot(forecast_dates, preds, alpha=0.75)  # Forecasts

plt.scatter(forecast_dates, y_test,
            alpha=0.4, marker='x')  # Real data

plt.fill_between(forecast_dates,
                 conf_int[:, 0], conf_int[:, 1], alpha=0.44, color='b')
plt.title("Forecasts")
