In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

import sys

In [None]:
"""Data Loader.

The functions in this script load the data and create a time series split.

Usage:
    Either run the whole pipeline (see src/main.py) or
    import the functions.
"""


from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd


def load_data(path_to_file: Path) -> pd.DataFrame:
    """Read the main csv file and index it by its parsed ``time`` column.

    Parameters
    -------
    path_to_file : Path
        Location of the main csv.

    Returns
    -------
    df : pd.DataFrame
        Contents of the csv without the ``time`` column; the parsed
        ``time`` values are used as the index instead.
    """
    frame = pd.read_csv(path_to_file)

    # take the raw "time" strings out of the columns and parse them
    timestamps = pd.to_datetime(frame.pop("time"))

    # the parsed timestamps become the row index
    frame.index = timestamps

    return frame


def time_split(
    df: pd.DataFrame, n_folds: int = 6, test_size: int = 9
) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Creates an extending time series split for data.

    The last ``n_folds * test_size`` rows are cut into consecutive test
    windows of ``test_size`` rows. For each fold, the train indices are all
    row positions that come before that fold's test window, so the train
    set grows by ``test_size`` rows from one fold to the next.

    Parameters
    -------
    df : pd.DataFrame
        Data as a dataframe. Only its length is used.
    n_folds : int, optional
        Number of time series folds, default is 6.
    test_size : int, optional
        Number of rows in one test set, default is 9.

    Returns
    -------
    all_splits : List[Tuple[np.ndarray, np.ndarray]]
        Splits of train and test positional indices per fold.

    Raises
    -------
    ValueError
        If ``df`` has fewer than ``n_folds * test_size`` rows.
    """
    # NOTE: the return annotation uses plain ``np.ndarray`` because
    # subscripting it (``np.ndarray[int]``) is evaluated when the function is
    # defined and raises on NumPy builds where ndarray is not subscriptable.
    first_test_start = len(df) - n_folds * test_size
    if first_test_start < 0:
        # Without this guard the split would silently contain negative
        # positions and an empty train set.
        raise ValueError(
            f"Cannot create {n_folds} folds of {test_size} rows from "
            f"{len(df)} rows of data."
        )

    all_splits = []
    for fold in range(n_folds):
        test_start = first_test_start + fold * test_size

        # train on everything before the test window of this fold
        train_ids = np.arange(0, test_start)
        test_ids = np.arange(test_start, test_start + test_size)

        all_splits.append((train_ids, test_ids))

    return all_splits

In [None]:
# Data folder, addressed relative to the current working directory
# (two levels up, then into the project data directory).
DATA_DIR = Path(
    "..",
    "..",
    "hfactory_magic_folders",
    "plastic_cost_prediction",
    "data",
)
# File name of the csv that is read from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# Read the main csv; load_data returns it indexed by the parsed "time" column.
df = load_data(path_to_file=DATA_DIR / MAIN_FILE)

In [None]:
# Keep every column except "iNATGAS", then discard rows with any missing value.
# The column mask is built from `df` itself: the previous mask referenced
# `test_df`, which is not defined anywhere above this cell and raised a
# NameError on a fresh kernel. `.dropna()` is chained (instead of
# `inplace=True` on a slice) so the result is an independent frame.
first_try_df = df.loc[:, df.columns != "iNATGAS"].dropna()
first_try_df

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.varmax import VARMAX
from sklearn.model_selection import ParameterGrid


def varmax_grid_search(train_df, order_range, seasonal_order_range):
    """
    Perform a grid search to find the best parameters for VARMAX model based on AIC.

    Every combination of ``order`` and ``seasonal_order`` in the grid is
    built and fitted on ``train_df``. A combination that raises while the
    model is being constructed or fitted is reported with ``print`` and
    skipped, so a single bad combination does not abort the search.

    Parameters:
    - train_df: pd.DataFrame, the training data
    - order_range: tuple, inclusive (low, high) bounds; both p and q of the
      non-seasonal order (p, q) run over this range
    - seasonal_order_range: tuple, inclusive (low, high) bounds; each of
      P, D and Q of the seasonal order (P, D, Q, s) runs over this range,
      while s always runs over 1..12

    Returns:
    - best_params: dict with keys "order" and "seasonal_order" holding the
      combination with the lowest AIC, or None if no combination could be
      fitted
    """
    # Both bounds are inclusive, hence the "+ 1".
    order_values = range(order_range[0], order_range[1] + 1)
    seasonal_values = range(
        seasonal_order_range[0], seasonal_order_range[1] + 1
    )

    # Create a grid of parameter combinations
    param_grid = {
        "order": [(p, q) for p in order_values for q in order_values],
        "seasonal_order": [
            (P, D, Q, s)
            for P in seasonal_values
            for D in seasonal_values
            for Q in seasonal_values
            for s in range(1, 13)
        ],
    }

    best_aic = np.inf
    best_params = None

    for params in ParameterGrid(param_grid):
        try:
            # The model is constructed inside the try block so that a
            # combination rejected by the constructor is skipped in the same
            # way as one that fails during fitting.
            # NOTE(review): `seasonal_order` is not part of VARMAX's
            # documented signature and is forwarded via **kwargs - confirm
            # that it actually influences the fitted model.
            model = VARMAX(
                train_df,
                order=params["order"],
                seasonal_order=params["seasonal_order"],
                trend="c",
            )
            current_aic = model.fit(disp=False).aic
        except Exception as e:
            # Deliberate best-effort: report the failure and keep searching.
            print(f"Error fitting model with parameters {params}: {e}")
            continue

        # Strict "<" keeps the first combination on ties; a NaN AIC never
        # compares as lower and is therefore never selected.
        if current_aic < best_aic:
            best_aic = current_aic
            best_params = params

    return best_params

In [None]:
# (low, high) bounds handed to the grid search for the non-seasonal and
# seasonal order components.
order_range = (1, 3)
seasonal_order_range = (1, 3)

# Run the search on the prepared frame and report the selected combination.
best_params = varmax_grid_search(
    train_df=first_try_df,
    order_range=order_range,
    seasonal_order_range=seasonal_order_range,
)
print("Best Parameters:", best_params)