## Stationarity Notebook

In [None]:
from pathlib import Path
from typing import List, Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from scipy.stats import boxcox
import warnings

In [None]:
# Install a global filter that ignores every Python warning raised from this
# point on (presumably to keep the notebook output uncluttered).
warnings.filterwarnings("ignore")

## Initializing the data

In [None]:
# Relative path to the data folder: three directory levels up, then into the
# project's data directory.
DATA_DIR = Path("..", "..", "..").joinpath(
    "hfactory_magic_folders", "plastic_cost_prediction", "data"
)
# File name of the dataset read from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# Read the CSV file and convert its "time" column from strings to datetimes.
df = pd.read_csv(DATA_DIR / MAIN_FILE).assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)

## Stationarity analysis

In a first stage, we want to find which variables are already stationary and which ones need to be transformed in order to obtain stationarity. For this we will use the Augmented Dickey-Fuller test.

In [None]:
def adf_test_all_columns(
    df: pd.DataFrame, significance_level: float = 0.05
) -> Tuple[pd.DataFrame, List[str]]:
    """Perform the Augmented Dickey-Fuller (ADF) test on all columns in the DataFrame.

    Missing values are dropped from each column before it is tested. A column
    is reported as stationary when its ADF p-value is less than or equal to
    ``significance_level``. The decision is taken on the exact p-value; the
    p-value is rounded to three decimals only for the returned table.

    Parameters:
    -----------
    df: pd.DataFrame
        Dataframe for which we perform ADF.
    significance_level: float
        significance level for the test (default is 0.05).

    Returns
    --------
    df_results: pd.DataFrame
        DataFrame with one row per column, holding the column name
        ("Column") and its rounded p-value ("ADF p-value").
    stationary_columns: List[str]
        List of column names that are stationary.
    """
    # one row per tested column; the table is built once after the loop
    # instead of concatenating a new DataFrame at every iteration
    rows = []
    stationary_columns = []

    # obtaining the p-value for each column
    for column in df.columns:
        # adfuller returns a tuple whose second element is the p-value
        p_value = adfuller(df[column].dropna())[1]
        rows.append({"Column": column, "ADF p-value": round(p_value, 3)})

        # compare the unrounded p-value: rounding first would classify
        # e.g. 0.0504 as stationary at the 0.05 level
        if p_value <= significance_level:
            stationary_columns.append(column)

    # passing the column names keeps the expected layout even when df has no columns
    df_results = pd.DataFrame(rows, columns=["Column", "ADF p-value"])

    print("Columns that are stationary based on the ADF test:")
    print(stationary_columns)

    return df_results, stationary_columns


# Run the ADF test on every column of the dataset, keep the names of the
# stationary columns for the following sections, and show the p-value table.
results, stationary_cols = adf_test_all_columns(df)
display(results)

## Stationary Variable analysis

For the variables that are already stationary, we want to check their time series, as well as their ACF and PACF plots, in order to better understand how these variables behave.

In [None]:
def plot_acf_pacf_before_transformation(
    series: pd.Series, title: str, col: str
):
    """Show a series together with its ACF and PACF, without any transformation.

    The figure is laid out on a four-row grid of which the first two rows are
    used: the series spans the first row, the ACF and PACF share the second.

    Parameters:
    -----------
    series: pd.Series
        Series to plot and to compute the ACF/PACF from.
    title: str
        Label shown in parentheses in the ACF and PACF titles.
    col: str
        Column name, shown in the title of the series plot.
    """
    fig = plt.figure(figsize=(18, 8))

    # First row, full width: the series itself
    series_ax = fig.add_subplot(4, 1, 1)
    series_ax.plot(series)
    series_ax.set_title(f"Original Series ({col})")

    # Second row, left: autocorrelation of the series (20 lags)
    acf_ax = fig.add_subplot(4, 2, 3)
    plot_acf(series, lags=20, ax=acf_ax)
    acf_ax.set_title(f"ACF ({title})")

    # Second row, right: partial autocorrelation of the series (20 lags)
    pacf_ax = fig.add_subplot(4, 2, 4)
    plot_pacf(series, lags=20, method="ywm", ax=pacf_ax)
    pacf_ax.set_title(f"PACF ({title})")

    fig.tight_layout()
    plt.show()

In [None]:
## Stationary variable Analysis
# Missing values are dropped first, as is done before the ADF test, so that the
# ACF/PACF are computed on the same observations and are not turned into NaN.
for col in stationary_cols:
    plot_acf_pacf_before_transformation(
        df[col].dropna(), "No transformation", col
    )

## Non-stationary variable analysis

For the non-stationary variables, we want to find a transformation that turns the variable into a stationary one. Since most of our variables can take both positive and negative values, some of the typical transformations, like the log or square root transformation, cannot be used. Four different transformations will be tested (note that the Box-Cox transformation requires strictly positive data, so it is only applicable to variables that are always positive), and if several of them work, we will select one in the following order (based on the risk of overfitting, ease of integration into the data engineering pipeline, etc.):

1. First difference transformation
2. Second difference transformation
3. Cubic Root Transformation
4. Boxcox transformation

After finding the appropriate transformation, we will analyse these variables in the same way as the stationary variables above. We will plot the time series of the original variable and of the transformed variable, as well as the ACF and PACF plots of the transformed variable.

In [None]:
def transform_and_test(
    series: pd.Series, transformation: str, significance_level: float = 0.05
) -> Tuple[pd.Series, float]:
    """Apply one transformation to a series and run the ADF test on the result.

    Parameters
    ----------
    series: pd.Series
        Series which will be transformed.
    transformation: str
        Transformation that will be performed on the series. One of
        "First difference", "Second difference", "Cubic Root" or "Boxcox".
    significance_level: float
        significance level for the ADF test (default is 0.05). It is not used
        inside this function: the p-value is returned and the caller decides.
        The parameter is kept so existing calls remain valid.

    Returns
    -------
    transformed_series: pd.Series
        Transformed series, with missing values removed.
    adf_p_value: float
        ADF test p-value of the transformed series.

    Raises
    ------
    ValueError
        If ``transformation`` is not one of the supported names.
        NOTE(review): per the SciPy documentation, ``boxcox`` itself also
        raises ``ValueError`` when the data is not strictly positive.
    """
    possible_trans = [
        "First difference",
        "Second difference",
        "Cubic Root",
        "Boxcox",
    ]
    # checking if the transformation given as argument is allowed
    if transformation not in possible_trans:
        raise ValueError(
            "The transformation given was not expected. Expected one of: %s"
            % possible_trans
        )

    # doing the transformation; every branch yields a NaN-free pd.Series
    if transformation == "First difference":
        transformed_series = series.diff().dropna()
    elif transformation == "Second difference":
        transformed_series = series.diff().diff().dropna()
    elif transformation == "Cubic Root":
        # drop missing values like the differencing branches do, so the
        # ADF test below does not receive NaN
        transformed_series = np.cbrt(series).dropna()
    else:  # "Boxcox" is the only remaining allowed value
        non_missing = series.dropna()
        # boxcox returns (transformed values, fitted lambda); only the values are needed
        boxcox_values, _ = boxcox(non_missing)
        # boxcox yields a plain array: wrap it so the documented return type
        # holds and the original index is preserved
        transformed_series = pd.Series(
            boxcox_values, index=non_missing.index, name=non_missing.name
        )

    # running the ADF test on the transformed variable (second element is the p-value)
    adf_p_value = adfuller(transformed_series)[1]

    return transformed_series, adf_p_value

In [None]:
def plot_acf_pacf_after_transformation(series, transformed_series, title, col):
    """Show a series, its transformed version, and the ACF/PACF of the latter.

    The figure is laid out on a four-row, two-column grid of which the first
    two rows are used: original and transformed series on the first row, ACF
    and PACF of the transformed series on the second.

    Parameters:
    -----------
    series: pd.Series
        Original series.
    transformed_series: pd.Series
        Series after transformation.
    title: str
        Label shown in parentheses in the ACF and PACF titles.
    col: str
        Column name, shown in the titles of the two series plots.
    """
    fig = plt.figure(figsize=(18, 8))

    # First row, left: the series before transformation
    original_ax = fig.add_subplot(4, 2, 1)
    original_ax.plot(series)
    original_ax.set_title(f"Original Series ({col})")

    # First row, right: the series after transformation
    transformed_ax = fig.add_subplot(4, 2, 2)
    transformed_ax.plot(transformed_series)
    transformed_ax.set_title(f"Transformed Series ({col})")

    # Second row, left: autocorrelation of the transformed series (20 lags)
    acf_ax = fig.add_subplot(4, 2, 3)
    plot_acf(transformed_series, lags=20, ax=acf_ax)
    acf_ax.set_title(f"ACF ({title})")

    # Second row, right: partial autocorrelation of the transformed series (20 lags)
    pacf_ax = fig.add_subplot(4, 2, 4)
    plot_pacf(transformed_series, lags=20, method="ywm", ax=pacf_ax)
    pacf_ax.set_title(f"PACF ({title})")

    fig.tight_layout()
    plt.show()

## Non stationary variable results

In [None]:
# Results per non-stationary column: (transformation name, ADF p-value) of the
# first transformation that makes the column stationary, or an empty list when
# none of the tested transformations does.
transformation_results = {}
# list of all the transformations we want to test, in order of preference
transformations_to_test = [
    "First difference",
    "Second difference",
    "Cubic Root",
    "Boxcox",
]

for column_name in df.columns:
    # only the non stationary columns are processed; "time" is excluded explicitly
    if column_name == "time" or column_name in stationary_cols:
        continue

    original_series = df[column_name].dropna()

    print(f"Column: {column_name}")

    # stays an empty list if no transformation reaches stationarity
    transformation_results[column_name] = []

    # trying the different transformations, stopping at the first one that works
    for transformation in transformations_to_test:
        try:
            trans_series, adf_p_value = transform_and_test(
                original_series, transformation
            )
        except ValueError as error:
            # a transformation may be inapplicable to this column (e.g. Box-Cox
            # on data that is not strictly positive); skip it instead of
            # aborting the analysis of all the remaining columns
            print(f"Skipping {transformation}: {error}")
            continue

        # checking if the transformation worked. If so, store the result and
        # make the plots described above
        if adf_p_value < 0.05:
            transformation_results[column_name] = (
                transformation,
                adf_p_value,
            )
            plot_acf_pacf_after_transformation(
                original_series,
                trans_series,
                transformation + " Transformation",
                column_name,
            )
            break

# displaying the results
print("Transformation Results:")
display(transformation_results)