In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed dataset from a Parquet file using the pyarrow engine.
df = pd.read_parquet("data/dataset_procesada.parquet", engine="pyarrow")
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0_level_0,exportaciones,importaciones,precio_petroleo,igae,pib,consumo_final,inversion_total,tasa_inflacion
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008-10-01,-0.454482,-0.299724,-4.956194,0.037897,0.038064,0.581546,0.414413,-0.060587
2009-01-01,-1.812458,-0.796024,-1.256884,-0.886494,-0.886777,-0.798865,-0.811319,-1.995298
2009-04-01,0.356706,0.603206,1.470491,1.032076,1.032046,0.188213,0.485071,4.119722
2009-07-01,0.512052,0.556132,0.794157,-0.273556,-0.273605,-0.096622,-0.070220,-1.670576
2009-10-01,0.117188,0.069532,0.725736,0.123238,0.123502,0.652331,0.510964,-0.177565
...,...,...,...,...,...,...,...,...
2023-10-01,-0.949204,-0.144119,-0.283497,1.441632,1.441919,1.999727,0.736056,-0.302286
2024-01-01,-1.801562,-1.529920,-0.049535,-1.893434,-1.893470,-1.957061,-1.796246,0.082974
2024-04-01,1.673592,0.960185,0.375641,0.491718,0.491684,-0.176902,1.100735,0.528649
2024-07-01,-0.321870,0.777987,-0.406048,-0.286716,-0.286890,-0.132628,-0.039382,-0.271221


In [3]:
def buscar_mejor_arima(y, exog=None, max_p=5, max_d=2, max_q=5):
    """
    Grid-search the ARIMA(X) order (p, d, q) that yields the lowest AIC.

    Every combination with 0 <= p <= max_p, 0 <= d <= max_d and
    0 <= q <= max_q is fitted on ``y`` (with ``exog`` as regressors when
    given). Combinations whose construction or fit raises an error are
    skipped.

    Parameters
    ----------
    y : array-like
        Endogenous series handed to ``ARIMA``.
    exog : array-like, optional
        Exogenous regressors handed to ``ARIMA``.
    max_p, max_d, max_q : int
        Inclusive upper bounds of the search grid.

    Returns
    -------
    tuple
        ``(best_order, best_aic)``. When no candidate could be fitted the
        result is ``(None, np.inf)``; callers should check for ``None``
        before using the order.
    """
    mejor_aic = np.inf
    mejores_params = None

    # NOTE(review): AIC values of fits with different d are computed on
    # differently differenced data; ranking across d may not be meaningful.
    for p in range(max_p + 1):
        for d in range(max_d + 1):
            for q in range(max_q + 1):
                orden = (p, d, q)
                try:
                    aic = ARIMA(y, exog=exog, order=orden).fit().aic
                except Exception:
                    # Skip orders that cannot be fitted. Catching Exception
                    # (not a bare ``except``) lets KeyboardInterrupt and
                    # SystemExit stop the search.
                    continue

                if aic < mejor_aic:
                    mejor_aic = aic
                    mejores_params = orden

    return mejores_params, mejor_aic

In [4]:
def entrenar_arimax(y_train, exog_train, order):
    """
    Fit an ARIMAX model of the requested order and return the fitted result.

    ``y_train`` is passed to ``ARIMA`` as the endogenous series,
    ``exog_train`` as its ``exog`` argument and ``order`` as its ``order``
    argument; the object returned by ``.fit()`` is handed back unchanged.
    """
    return ARIMA(y_train, exog=exog_train, order=order).fit()

In [5]:
def predecir_arimax(modelo_fit, steps, exog_forecast=None):
    """
    Produce out-of-sample predictions from a fitted ARIMAX model.

    Delegates to ``modelo_fit.forecast`` with ``steps`` as the number of
    periods and ``exog_forecast`` as the ``exog`` argument, returning
    whatever that call returns.
    """
    return modelo_fit.forecast(steps=steps, exog=exog_forecast)

In [6]:
def evaluar_modelo(y_real, y_pred):
    """
    Compute forecast error metrics for a set of predictions.

    Both inputs are converted to float NumPy arrays first, so observations
    are paired by position for every metric. (Element-wise pandas
    arithmetic would instead align on index labels and produce NaN for
    labels present in only one of the inputs.)

    Parameters
    ----------
    y_real : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_real``.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'MAPE'``. MAPE is a
        percentage computed only over observations whose actual value is
        non-zero; it is NaN when every actual value is zero.
    """
    real = np.asarray(y_real, dtype=float)
    pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(real, pred)
    mse = mean_squared_error(real, pred)
    rmse = np.sqrt(mse)

    # The percentage error is undefined where the actual value is zero;
    # those points are left out instead of producing inf/NaN.
    no_cero = real != 0
    if no_cero.any():
        errores_rel = (real[no_cero] - pred[no_cero]) / real[no_cero]
        mape = np.mean(np.abs(errores_rel)) * 100
    else:
        mape = np.nan

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'MAPE': mape
    }

In [7]:
def diagnosticos_modelo(modelo_fit, lags=10):
    """
    Collect goodness-of-fit statistics and a residual autocorrelation check.

    Parameters
    ----------
    modelo_fit : fitted model result
        Object exposing ``resid``, ``aic``, ``bic`` and ``llf``.
    lags : int, optional
        Value forwarded as the ``lags`` argument of ``acorr_ljungbox``
        (default 10, the value previously hard-coded).

    Returns
    -------
    dict
        ``'AIC'``, ``'BIC'`` and ``'Log_Likelihood'`` as reported by the
        fitted model, plus ``'Ljung_Box_pvalue'``: the smallest
        ``lb_pvalue`` in the Ljung-Box result table.
    """
    # Ljung-Box test for autocorrelation in the residuals.
    ljung_box = acorr_ljungbox(modelo_fit.resid, lags=lags, return_df=True)

    return {
        'AIC': modelo_fit.aic,
        'BIC': modelo_fit.bic,
        # The minimum p-value is the most pessimistic reading of the table.
        'Ljung_Box_pvalue': ljung_box['lb_pvalue'].min(),
        'Log_Likelihood': modelo_fit.llf
    }

In [8]:
def generarModelo(df, objetivo='pib', frac_train=0.8):
    """
    Run the full ARIMAX workflow on ``df`` and print a report.

    Steps: split target and regressors, split chronologically into train
    and test, search the (p, d, q) order, fit the final model, forecast the
    test horizon, and print metrics, diagnostics and the model summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column; every other column is used as an
        exogenous regressor. Rows are assumed to be in chronological order
        (the split is positional) -- TODO confirm with the data owner.
    objetivo : str, optional
        Name of the target column (default ``'pib'``).
    frac_train : float, optional
        Fraction of the leading rows used for training (default 0.8).

    Returns
    -------
    tuple
        ``(modelo_final, predicciones, metricas)``.

    Raises
    ------
    ValueError
        If the parameter search could not fit any candidate model.
    """
    # 2. Separate the target variable from the exogenous variables
    y = df[objetivo]
    exog = df.drop(objetivo, axis=1)

    # 3. Train/test split by position; .iloc makes that explicit regardless
    #    of the index type
    n_train = int(len(df) * frac_train)

    y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]
    exog_train, exog_test = exog.iloc[:n_train], exog.iloc[n_train:]

    # 4. Search for the best parameters
    print("Buscando mejores parámetros ARIMA...")
    mejores_params, mejor_aic = buscar_mejor_arima(y_train, exog_train)
    print(f"Mejores parámetros: {mejores_params}")
    print(f"AIC: {mejor_aic}")

    # The search yields None when every candidate failed to fit; stop here
    # with a clear message instead of failing later inside ARIMA.
    if mejores_params is None:
        raise ValueError(
            "La búsqueda de parámetros no pudo ajustar ningún modelo ARIMA"
        )

    # 5. Train the final model
    print("\nEntrenando modelo ARIMAX...")
    modelo_final = entrenar_arimax(y_train, exog_train, mejores_params)
    print("Modelo entrenado exitosamente")

    # 6. Forecast the test horizon using the held-out exogenous values
    print("\nRealizando predicciones...")
    predicciones = predecir_arimax(modelo_final, len(y_test), exog_test)

    # 7. Evaluate the model
    print("\nEvaluando modelo...")
    metricas = evaluar_modelo(y_test, predicciones)

    print("\n=== MÉTRICAS DE EVALUACIÓN ===")
    for metrica, valor in metricas.items():
        print(f"{metrica}: {valor:.4f}")

    # 8. Model diagnostics
    print("\n=== DIAGNÓSTICOS DEL MODELO ===")
    diagnosticos = diagnosticos_modelo(modelo_final)

    for diagnostico, valor in diagnosticos.items():
        print(f"{diagnostico}: {valor:.4f}")

    # 9. Model summary
    print("\n=== RESUMEN DEL MODELO ===")
    print(modelo_final.summary())

    return modelo_final, predicciones, metricas

In [10]:
# Remove 'igae' before modelling; generarModelo uses every remaining non-'pib'
# column as an exogenous regressor.
# NOTE(review): in the displayed rows 'igae' matches 'pib' to about three
# decimals, which is presumably why it is excluded -- confirm.
df1 = df.drop('igae', axis=1)
# The call is the last expression, so its returned tuple is also rendered as
# the cell output after the printed report.
generarModelo(df1)

Buscando mejores parámetros ARIMA...
Mejores parámetros: (4, 0, 3)
AIC: -56.62380139919824

Entrenando modelo ARIMAX...
Modelo entrenado exitosamente

Realizando predicciones...

Evaluando modelo...

=== MÉTRICAS DE EVALUACIÓN ===
MAE: 0.1616
MSE: 0.0353
RMSE: 0.1879
MAPE: 46.4361

=== DIAGNÓSTICOS DEL MODELO ===
AIC: -56.6238
BIC: -27.3551
Ljung_Box_pvalue: 0.0108
Log_Likelihood: 43.3119

=== RESUMEN DEL MODELO ===
                               SARIMAX Results                                
Dep. Variable:                    pib   No. Observations:                   52
Model:                 ARIMA(4, 0, 3)   Log Likelihood                  43.312
Date:                Wed, 10 Sep 2025   AIC                            -56.624
Time:                        20:00:49   BIC                            -27.355
Sample:                    10-01-2008   HQIC                           -45.403
                         - 07-01-2021                                         
Covariance Type:           

(<statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x2cb095d3520>,
 2021-10-01    1.564040
 2022-01-01   -1.668711
 2022-04-01    0.334499
 2022-07-01   -0.081352
 2022-10-01    1.201688
 2023-01-01   -1.627021
 2023-04-01    0.249238
 2023-07-01   -0.018690
 2023-10-01    1.697531
 2024-01-01   -2.261294
 2024-04-01    0.534983
 2024-07-01   -0.421973
 2024-10-01    1.157561
 Freq: QS-OCT, Name: predicted_mean, dtype: float64,
 {'MAE': 0.1616263064859051,
  'MSE': 0.03529322929198132,
  'RMSE': 0.18786492299517046,
  'MAPE': 46.436082827812065})

In [11]:
# Variant with fewer regressors: besides 'igae', 'exportaciones' and
# 'inversion_total' are also removed before running the same workflow.
df3 = df.drop(['igae', 'exportaciones', 'inversion_total'], axis=1)
# The call is the last expression, so its returned tuple is also rendered as
# the cell output after the printed report.
generarModelo(df3)

Buscando mejores parámetros ARIMA...
Mejores parámetros: (4, 0, 0)
AIC: -61.198824676124175

Entrenando modelo ARIMAX...
Modelo entrenado exitosamente

Realizando predicciones...

Evaluando modelo...

=== MÉTRICAS DE EVALUACIÓN ===
MAE: 0.1788
MSE: 0.0438
RMSE: 0.2093
MAPE: 40.6736

=== DIAGNÓSTICOS DEL MODELO ===
AIC: -61.1988
BIC: -41.6864
Ljung_Box_pvalue: 0.0016
Log_Likelihood: 40.5994

=== RESUMEN DEL MODELO ===
                               SARIMAX Results                                
Dep. Variable:                    pib   No. Observations:                   52
Model:                 ARIMA(4, 0, 0)   Log Likelihood                  40.599
Date:                Wed, 10 Sep 2025   AIC                            -61.199
Time:                        20:01:47   BIC                            -41.686
Sample:                    10-01-2008   HQIC                           -53.718
                         - 07-01-2021                                         
Covariance Type:          

(<statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x2cb0962a590>,
 2021-10-01    1.578426
 2022-01-01   -1.715335
 2022-04-01    0.345158
 2022-07-01   -0.072203
 2022-10-01    1.250868
 2023-01-01   -1.654390
 2023-04-01    0.265237
 2023-07-01    0.015987
 2023-10-01    1.743011
 2024-01-01   -2.275842
 2024-04-01    0.540600
 2024-07-01   -0.429624
 2024-10-01    1.186608
 Freq: QS-OCT, Name: predicted_mean, dtype: float64,
 {'MAE': 0.17875977590961722,
  'MSE': 0.04381461585059246,
  'RMSE': 0.2093194110697631,
  'MAPE': 40.673603374628684})