# Exploratory Data Analysis

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from scipy.stats import boxcox

In [None]:
# Relative location of the project data: two directory levels up, then into
# the "hfactory_magic_folders" tree.
DATA_DIR = Path(
    "..",
    "..",
    "hfactory_magic_folders",
    "plastic_cost_prediction",
    "data",
)
# File name of the dataset inside DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# Load the dataset from the CSV file at DATA_DIR / MAIN_FILE.
df = pd.read_csv(DATA_DIR / MAIN_FILE)

In [None]:
# List the column names of the loaded dataset.
df.columns

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Convert the "time" column from strings to pandas datetimes so that it can
# serve as the time axis of the plots.
df["time"] = pd.to_datetime(df["time"])

## Time series plots of features

First, we look at the individual columns that do not belong to a common group of related features.

In [None]:
def plot_time_series(col: str) -> None:
    """Plot one column of the global ``df`` against its ``time`` column.

    The figure is titled after the column, the y-axis is labelled with the
    column name, the x tick labels are rotated by 45 degrees, and the figure
    is shown immediately.
    """
    plt.plot(df["time"], df[col])

    # Title and axis labels are set in a single call on the current axes.
    plt.gca().set(
        title=f"Time Series Plot - {col}",
        xlabel="time",
        ylabel=col,
    )
    plt.xticks(rotation=45)

    plt.show()

In [None]:
# Columns that are plotted individually, one figure per column.
single_cols = [
    "PA6 GLOBAL_ EMEAS _ EUR per TON",
    "best_price_compound",
    "Inflation_rate_france",
    "Automotive Value",
]

In [None]:
# Draw a separate time series figure for each standalone column.
for col in single_cols:
    plot_time_series(col)

Next, the columns that belong to the same group are displayed in the same plot.

In [None]:
# Natural gas columns, drawn together on one figure for comparison.
gas_cols = ["NGAS_US", "NGAS_EUR", "NGAS_JP", "iNATGAS"]

plt.figure()

timestamps = df["time"]
for column in gas_cols:
    plt.plot(timestamps, df[column], label=column)

plt.gca().set(
    title="Time Series Plot - Natural Gas Prices",
    xlabel="time",
    ylabel="natural gas prices",
)
# The trailing semicolon suppresses the notebook's echo of the Legend object.
plt.legend();

In [None]:
# Crude oil columns, drawn together on one figure for comparison.
crude_cols = ["CRUDE_PETRO", "CRUDE_BRENT", "CRUDE_DUBAI", "CRUDE_WTI"]

plt.figure()

timestamps = df["time"]
for column in crude_cols:
    plt.plot(timestamps, df[column], label=column)

plt.gca().set(
    title="Time Series Plot - Crude Oil Prices",
    xlabel="time",
    ylabel="crude oil prices",
)
# The trailing semicolon suppresses the notebook's echo of the Legend object.
plt.legend();

In [None]:
# Electricity price columns, drawn together on one figure for comparison.
# NOTE: the "Electricty" spelling in the column names comes from the dataset
# itself and must be kept as-is; only the displayed title is spelled correctly.
electricity_cols = [
    "Electricty_Price_France",
    "Electricty_Price_Italy",
    "Electricty_Price_Poland",
    "Electricty_Price_Netherlands",
    "Electricty_Price_Germany",
]

plt.figure()

for column in electricity_cols:
    plt.plot(df["time"], df[column], label=column)

plt.title("Time Series Plot - Electricity Prices")
plt.xlabel("time")
plt.ylabel("electricity prices")
# The trailing semicolon suppresses the notebook's echo of the Legend object.
plt.legend();

In [None]:
# Hydrocarbon price columns, drawn together on one figure for comparison.
hydrocarbons_cols = ["Benzene_price", "Caprolactam_price", "Cyclohexane_price"]

plt.figure()

timestamps = df["time"]
for column in hydrocarbons_cols:
    plt.plot(timestamps, df[column], label=column)

plt.gca().set(
    title="Time Series Plot - Hydrocarbons Prices",
    xlabel="time",
    ylabel="hydrocarbons prices",
)
# The trailing semicolon suppresses the notebook's echo of the Legend object.
plt.legend();

## (Partial) Autocorrelation

In [None]:
# Plot the ACF and PACF (first 10 lags) of every feature column.
#
# Columns are excluded by name rather than by position: a positional slice
# gives different results depending on whether the "Unnamed: 0" index column
# has already been dropped, and would otherwise pick up the "time" column,
# which is a timestamp and not a feature series.
excluded_cols = {"Unnamed: 0", "time"}

for col in df.columns:
    if col in excluded_cols:
        continue

    # Drop missing values for every column, not just one hard-coded name;
    # dropna() is a no-op on columns that have no missing values.
    data = df[col].dropna()

    f, ax = plt.subplots(nrows=2, ncols=1)
    f.subplots_adjust(hspace=0.4, top=0.85)
    f.suptitle(col)
    plot_acf(data, lags=10, ax=ax[0])
    plot_pacf(data, lags=10, ax=ax[1])
    plt.tight_layout()
    plt.show()

## Augmented Dickey-Fuller (ADF) Test

In [None]:
# Remove the "Unnamed: 0" column. errors="ignore" keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column is
# already gone.
df.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

In [None]:
# Display the DataFrame to check the result of the column removal.
df

In [None]:
def adf_test_all_columns(
    dataframe: pd.DataFrame, significance_level: float = 0.05
):
    """
    Perform the Augmented Dickey-Fuller (ADF) test on every numeric column.

    Missing values are dropped per column before testing. Non-numeric columns
    (for example a datetime column) are skipped, because the test is only
    meaningful for numeric series.

    Parameters:
    - dataframe: pandas DataFrame
    - significance_level: significance level for the test (default is 0.05)

    Returns:
    - df_results: DataFrame with one row per tested column, holding the
      column name ("Column") and its p-value rounded to 3 decimals
      ("ADF p-value")
    - stationary_columns: List of column names whose p-value is less than or
      equal to significance_level
    """

    rows = []
    stationary_columns = []

    numeric_data = dataframe.select_dtypes(include="number")

    for column in numeric_data.columns:
        p_value = adfuller(numeric_data[column].dropna())[1]

        # Round only for display. The decision below uses the exact p-value,
        # otherwise e.g. 0.0504 would round to 0.05 and wrongly pass.
        rows.append({"Column": column, "ADF p-value": round(p_value, 3)})

        if p_value <= significance_level:
            stationary_columns.append(column)

    # Build the result table once instead of concatenating inside the loop.
    df_results = pd.DataFrame(rows, columns=["Column", "ADF p-value"])

    print("Columns that are stationary based on the ADF test:")
    print(stationary_columns)

    return df_results, stationary_columns


# Run the ADF test on the loaded dataset and display the table of p-values.
# The names of the columns found to be stationary are kept in stationary_cols.
results, stationary_cols = adf_test_all_columns(df)
display(results)

In [None]:
def transform_and_test(series, transformation="log", significance_level=0.05):
    """
    Apply a transformation to the series and run the ADF test on the result.

    Parameters:
    - series: pandas Series (the caller in this notebook passes a series with
      missing values already dropped)
    - transformation: 'log', 'sqrt', 'boxcox', 'diff', 'logdiff', or None
      (default is 'log'); None leaves the series unchanged
    - significance_level: not used by this function; kept so that existing
      calls passing it continue to work

    Returns:
    - transformed_series: pandas Series after transformation
    - adf_p_value: ADF test p-value

    Raises:
    - ValueError: if transformation is not one of the supported options
    """

    def _boxcox(values):
        # scipy returns a bare ndarray; wrap it so the result keeps the
        # original index and matches the documented return type.
        transformed, _ = boxcox(values)
        return pd.Series(
            transformed,
            index=getattr(values, "index", None),
            name=getattr(values, "name", None),
        )

    transformations = {
        "log": np.log,
        "sqrt": np.sqrt,
        "boxcox": _boxcox,
        "diff": lambda values: values.diff().dropna(),
        "logdiff": lambda values: np.log(values).diff().dropna(),
        None: lambda values: values,
    }

    # Reject unknown names instead of silently testing the untransformed
    # series, which would hide a typo in the transformation argument.
    if transformation not in transformations:
        supported = ", ".join(repr(key) for key in transformations)
        raise ValueError(
            f"Unknown transformation {transformation!r}; "
            f"expected one of: {supported}"
        )

    transformed_series = transformations[transformation](series)

    # The second element of adfuller's result is the p-value.
    adf_p_value = adfuller(transformed_series)[1]

    return transformed_series, adf_p_value


def plot_acf_pacf_after_transformation(series, transformed_series, title):
    """
    Plot the original series together with the ACF and PACF of its
    transformed version.

    The figure has two rows: the original series spans the top row, and the
    ACF (left) and PACF (right) of the transformed series share the bottom
    row.

    Parameters:
    - series: pandas Series (original series)
    - transformed_series: pandas Series after transformation
    - title: Title for the plot (name of the applied transformation)
    """

    # Up to 20 lags are shown. For short series the number is capped below
    # half the sample size; statsmodels is assumed to reject PACF lags at or
    # above that limit. NOTE(review): confirm against the installed version.
    max_lags = max(1, min(20, len(transformed_series) // 2 - 1))

    plt.figure(figsize=(18, 8))

    # Top row: original series across the full width.
    plt.subplot(2, 1, 1)
    plt.plot(series)
    plt.title("Original Series")

    # Bottom-left: ACF of the transformed series.
    plt.subplot(2, 2, 3)
    plot_acf(transformed_series, lags=max_lags, ax=plt.gca())
    plt.title(f"ACF of Transformed Series ({title})")

    # Bottom-right: PACF of the transformed series.
    plt.subplot(2, 2, 4)
    plot_pacf(transformed_series, lags=max_lags, method="ywm", ax=plt.gca())
    plt.title(f"PACF of Transformed Series ({title})")

    plt.tight_layout()
    plt.show()


# For every column except "time": apply each candidate transformation, print
# the ADF p-value, plot the ACF/PACF of the transformed series, and record the
# transformations whose p-value is below 0.05.

# (transformation key, label used in the printed message and the plot title)
candidate_transformations = [
    ("log", "Log Transformation"),
    ("sqrt", "Square Root Transformation"),
    ("boxcox", "Box-Cox Transformation"),
    ("diff", "First Differencing"),
    # "logdiff" takes a single difference of the log series, so it is labelled
    # "Difference of Log" and not "Second Difference of Log".
    ("logdiff", "Difference of Log"),
]

# Maps each column name to the list of transformation keys that passed.
transformation_results = {}

for column_name in df.columns:
    if column_name == "time":
        continue

    original_series = df[column_name].dropna()

    print(f"Column: {column_name}")

    transformation_results[column_name] = []

    for transformation_key, label in candidate_transformations:
        transformed_series, adf_p_value = transform_and_test(
            original_series, transformation=transformation_key
        )
        if adf_p_value < 0.05:
            transformation_results[column_name].append(transformation_key)
        print(f"ADF P-Value ({label}): {adf_p_value}")
        plot_acf_pacf_after_transformation(
            original_series, transformed_series, label
        )

    print("\n")

# Display the dictionary of transformation results
print("Transformation Results:")
display(transformation_results)

In [None]:
# NGAS_JP: test the twice-differenced log series with the ADF test.
plot_title = "Second Difference of Log"
original_series = df["NGAS_JP"].dropna()

# log -> difference -> difference again; the leading NaNs produced by the two
# diff() calls are removed at the end.
transformed_series = (
    np.log(original_series)
    .diff()
    .diff()
    .dropna()
)

# The p-value is the second element of the ADF result.
result_adf = adfuller(transformed_series)
adf_p_value = result_adf[1]

# Show the original series with the ACF/PACF of the transformed one.
plot_acf_pacf_after_transformation(
    original_series, transformed_series, plot_title
)

print(f"ADF P-Value ({plot_title}): {adf_p_value}")

In [None]:
# ACF and PACF (first 10 lags) of the first difference of best_price_compound.
# diff() leaves a NaN in the first position, which dropna() removes.
data = df["best_price_compound"].diff().dropna()

f, ax = plt.subplots(2, 1)
acf_axis, pacf_axis = ax
f.subplots_adjust(hspace=0.4, top=0.85)
f.suptitle("Stationary series for best_price_compound, Y(t) - Y(t-1)")

plot_acf(data, lags=10, ax=acf_axis)
plot_pacf(data, lags=10, ax=pacf_axis)

plt.tight_layout()
plt.show()