In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sys

sys.path.append("../..")

from src.data_preprocessing.data_pipeline import data_pipeline
from src.modeling.evaluation import smape, mae
from src.modeling.multivariate_modeling import adfuller_test, get_var_model

In [None]:
# Location of the input data, relative to this notebook's working directory:
# three levels up, then into the shared project folder.
DATA_DIR = Path(
    "..",
    "..",
    "..",
    "hfactory_magic_folders",
    "plastic_cost_prediction",
    "data",
)
# File name of the dataset that is loaded from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# Load the main dataset through the project's data pipeline. The pipeline returns
# two values; only the first (the DataFrame) is kept, the second is discarded.
df, _ = data_pipeline(DATA_DIR / MAIN_FILE)

In [None]:
# Keep only complete rows: any row containing at least one NaN is discarded.
df = df.dropna()

### Check for stationarity

In [None]:
# Differencing order per column: the number of times `.diff()` is applied to the
# column when the differenced frame is built (0 leaves the column untouched).
D = dict(
    [
        ("best_price_compound", 1),
        ("PA6 GLOBAL_ EMEAS _ EUR per TON", 3),
        ("CRUDE_PETRO", 1),
        ("CRUDE_BRENT", 1),
        ("CRUDE_DUBAI", 1),
        ("CRUDE_WTI", 1),
        ("Benzene_price", 1),
        ("Caprolactam_price", 1),
        ("Cyclohexane_price", 1),
        ("Electricty_Price_Netherlands", 0),
        ("Electricty_Price_France", 0),
        ("Electricty_Price_Italy", 0),
        ("Electricty_Price_Poland", 2),
        ("Electricty_Price_Germany", 2),
        ("NGAS_EUR", 0),
        ("NGAS_US", 1),
        ("NGAS_JP", 2),
        ("iNATGAS", 0),
        ("Inflation_rate_france", 2),
        ("Automotive Value", 1),
    ]
)

In [None]:
# Build the differenced frame: each column listed in D is differenced D[column]
# times; columns with order 0 (and columns not listed in D) are left as they are.
df_diff = df.copy()
for column, value in D.items():
    for _ in range(value):
        df_diff[column] = df_diff[column].diff()

# Each pass of .diff() turns one more leading row into NaN, so the number of
# unusable leading rows equals the largest order in D. dropna() removes exactly
# those rows, which avoids a hard-coded row offset that has to be kept in sync with D.
df_diff = df_diff.dropna()
df_diff


In [None]:
# Run the project's ADF helper on every column of the differenced frame,
# passing the column label along as `name`.
for name, column in df_diff.items():
    adfuller_test(column, name=name)

### VAR Modelling

In [None]:
from src.data_preprocessing.data_loader import time_split

# Make sure no NaN rows reach the splitter (this mutates df_diff in place).
df_diff.dropna(inplace=True)
spl = time_split(df_diff)

# Walk through every split produced by time_split. Each pass overwrites
# train/test, so later cells only see the frames of the final split.
for train_idx, test_idx in spl:
    train, test = df_diff.iloc[train_idx], df_diff.iloc[test_idx]

In [None]:
# adfuller_test is re-imported here alongside grid_search_var; it is not used in this cell.
from src.modeling.multivariate_modeling import adfuller_test, grid_search_var

# Run the project's grid search on the training split; it returns the selected
# model together with its order.
# NOTE(review): the search space is defined inside grid_search_var and is not visible here.
best_model, best_order = grid_search_var(train)
print(best_model.summary())

In [None]:
# Number of lags used by the selected VAR model.
lag_order = best_model.k_ar
# The model was fitted on `train`, so the forecast must be seeded with the last
# `lag_order` training observations. Seeding from the tail of the full df_diff
# would start the forecast from rows the model may never have been fitted on
# (df_diff also contains the held-out rows).
forecast_input = train.values[-lag_order:]

# Forecast over the held-out horizon, i.e. one step per row of `test`,
# instead of len(train) steps.
# NOTE(review): this assumes `test` directly follows `train` in time — confirm
# against time_split.
nobs = len(test)
fc = best_model.forecast(y=forecast_input, steps=nobs)

In [None]:
# Display the raw forecast values returned by best_model.forecast.
fc

In [None]:
# Wrap the raw forecast in a DataFrame: same column labels as df_diff, indexed
# by the labels of df_diff's last `nobs` rows.
df_forecast = pd.DataFrame(
    data=fc,
    columns=df_diff.columns,
    index=df_diff.index[-nobs:],
)

In [None]:
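# NOTE: disabled draft of the inverse-differencing step; `df_predic` is not created
# while this stays commented out. As written, repeating the assignment `value` times
# recomputes the same single cumsum, so columns differenced more than once would not
# be restored to their original level.
# df_predic = df_forecast.copy()

# for column, value in D.items():
#     for i in range(value):
#         df_predic[column] = train[column].iloc[-1] + df_forecast[column].cumsum()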


In [None]:
    # Plot the VAR forecast against the differenced actuals, one panel per column.
    #
    # The forecast columns are relabelled from df_diff's column names instead of
    # appending a suffix to whatever df_forecast currently holds, so re-running this
    # cell does not stack "_forecast_forecast" suffixes.
    df_forecast = df_forecast.copy()
    df_forecast.columns = [f"{col}_forecast" for col in df_diff.columns]

    # Two panels per row; round up so an odd number of columns keeps its last panel.
    n_rows = (len(df_diff.columns) + 1) // 2
    fig, axes = plt.subplots(nrows=n_rows, ncols=2, dpi=150, figsize=(20, 20))
    for col, ax in zip(df_diff.columns, axes.flatten()):
        # Forecast first, then the differenced actuals on the same axes. Both frames
        # share df_diff's index labels, so the series line up on the x-axis.
        df_forecast[f"{col}_forecast"].plot(legend=True, ax=ax)
        df_diff[col].plot(legend=True, ax=ax)
        ax.set_title(col + ": Forecast vs Actuals")
        ax.xaxis.set_ticks_position("none")
        ax.yaxis.set_ticks_position("none")
        ax.spines["top"].set_alpha(0)
        ax.tick_params(labelsize=6)

    # Hide the spare panel that is left over when the column count is odd.
    for ax in axes.flatten()[len(df_diff.columns):]:
        ax.set_visible(False)

    plt.tight_layout()


In [None]:
# `df_var` is not defined in any visible cell, so the call failed with a NameError.
# The undifferenced frame `df` is passed instead, mirroring the other
# get_var_model call in this notebook, which also passes `df` first.
# NOTE(review): confirm that get_var_model expects (original frame, differenced frame).
var_model = get_var_model(df, df_diff)

In [None]:
# NOTE(review): `df_3diff` is not defined in any cell visible here, so this call fails
# on a fresh kernel — presumably a leftover from an earlier session; define it or
# remove this cell. It also overwrites the `var_model` assigned in the previous cell.
var_model = get_var_model(df, df_3diff)

### VARMA

In [None]:
# NOTE(review): `df_2diff` is not defined in any cell visible here — presumably a
# twice-differenced frame from an earlier session. It must be created before this
# point for the VARMA section to run after a kernel restart.
df_2diff

In [None]:
from src.data_preprocessing.data_loader import time_split

# Remove rows with missing values before splitting (this mutates df_2diff in place).
df_2diff.dropna(inplace=True)
spl = time_split(df_2diff)

# Walk through every split produced by time_split. Each pass overwrites
# train/test, so the VARMA cells below only see the frames of the final split.
for train_idx, test_idx in spl:
    train, test = df_2diff.iloc[train_idx], df_2diff.iloc[test_idx]

In [None]:
from statsmodels.tsa.statespace.varmax import VARMAX

# Fit a VARMAX model with order=(1, 1) on the current training split;
# disp=False keeps the optimizer's convergence output quiet.
model = VARMAX(train, order=(1, 1))
model_fit = model.fit(disp=False)

In [None]:
# Forecast as many steps ahead as there are rows in the held-out test split,
# then display the result.
pred = model_fit.forecast(steps=len(test))
pred

In [None]:
# Inspect the training split that the VARMAX model was fitted on.
train

In [None]:
# Put the forecast of best_price_compound next to the observed test values.
# Taking `.values` drops the test index, so the two series are matched by position
# and the resulting frame keeps the forecast's index.
res = (
    pred["best_price_compound"]
    .to_frame(name="Pred")
    .assign(Act=test["best_price_compound"].values)
)
res

In [None]:
# Line chart of predicted vs. actual values from `res`, drawn through the explicit
# Axes interface. `plt` is already imported in the notebook's first cell.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(res.index, res['Pred'], label='Predicted')
ax.plot(res.index, res['Act'], label='Actual')

ax.set_title('Predicted vs Actual Values Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Values')
ax.legend()
plt.show()
