# Exploratory Data Analysis

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Folder holding the project data, given relative to the working directory
# (two levels up, then into the shared project folder).
DATA_DIR = Path(
    "..",
    "..",
    "hfactory_magic_folders",
    "plastic_cost_prediction",
    "data",
)
# File name of the CSV that is loaded from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# Load the main CSV from the data folder into a DataFrame.
df = pd.read_csv(DATA_DIR.joinpath(MAIN_FILE))

In [None]:
# Show the column labels of the loaded table.
df.columns

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Convert the "time" column from strings to pandas datetime values, in place;
# the column is used as the x-axis of the time series plots.
df["time"] = pd.to_datetime(df["time"])

## Time series plots of features

First, we look at the individual columns that do not belong to a common group.

In [None]:
def plot_time_series(col: str) -> None:
    """Plot one column of the module-level ``df`` against its ``"time"`` column.

    A new figure is opened for every call, then titled, labelled and shown.

    Args:
        col: Name of the ``df`` column to draw on the y-axis.
    """
    # Start from a fresh figure so the line never lands on a figure left over
    # from an earlier plot (relevant on backends where plt.show() does not
    # close the figure, which would stack successive series on one axes).
    plt.figure()
    plt.plot(df["time"], df[col])

    # Add title and axis labels
    plt.title(f"Time Series Plot - {col}")
    plt.xlabel("time")
    plt.ylabel(col)
    # Tilt the x tick labels by 45 degrees.
    plt.xticks(rotation=45)

    # Display the plot
    plt.show()

In [None]:
# Columns that are passed one at a time to plot_time_series.
single_cols = [
    "PA6 GLOBAL_ EMEAS _ EUR per TON",
    "best_price_compound",
    "Inflation_rate_france",
    "Automotive Value",
]

In [None]:
# Draw one time series plot per entry of single_cols.
for column_name in single_cols:
    plot_time_series(column_name)

Next, the columns that belong to the same group are displayed in the same plot.

In [None]:
# Natural gas price columns, drawn together on one shared figure.
gas_cols = ["NGAS_US", "NGAS_EUR", "NGAS_JP", "iNATGAS"]

plt.figure()

# One line per column; the column name doubles as the legend entry.
for column in gas_cols:
    plt.plot(df["time"], df[column], label=column)

# Plain string literal: the title has no placeholders, so no f-prefix.
plt.title("Time Series Plot - Natural Gas Prices")
plt.xlabel("time")
plt.ylabel("natural gas prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

In [None]:
# Crude oil price columns, drawn together on one shared figure.
crude_cols = ["CRUDE_PETRO", "CRUDE_BRENT", "CRUDE_DUBAI", "CRUDE_WTI"]

plt.figure()

# One line per column; the column name doubles as the legend entry.
for column in crude_cols:
    plt.plot(df["time"], df[column], label=column)

# Plain string literal: the title has no placeholders, so no f-prefix.
plt.title("Time Series Plot - Crude Oil Prices")
plt.xlabel("time")
plt.ylabel("crude oil prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

In [None]:
# Electricity price columns, drawn together on one shared figure.
# The "Electricty" spelling in the keys is kept as-is: these strings are used
# to index df, so they must stay exactly as written.
electricity_cols = [
    "Electricty_Price_France",
    "Electricty_Price_Italy",
    "Electricty_Price_Poland",
    "Electricty_Price_Netherlands",
    "Electricty_Price_Germany",
]

plt.figure()

# One line per column; the column name doubles as the legend entry.
for column in electricity_cols:
    plt.plot(df["time"], df[column], label=column)

# Plain string literal (no placeholders), with the displayed title spelled
# "Electricity" to match the y-axis label.
plt.title("Time Series Plot - Electricity Prices")
plt.xlabel("time")
plt.ylabel("electricity prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

In [None]:
# Benzene, caprolactam and cyclohexane price columns, drawn together on one
# shared figure.
hydrocarbons_cols = ["Benzene_price", "Caprolactam_price", "Cyclohexane_price"]

plt.figure()

# One line per column; the column name doubles as the legend entry.
for column in hydrocarbons_cols:
    plt.plot(df["time"], df[column], label=column)

# Plain string literal: the title has no placeholders, so no f-prefix.
plt.title("Time Series Plot - Hydrocarbons Prices")
plt.xlabel("time")
plt.ylabel("hydrocarbons prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

## (Partial) Autocorrelation

In [None]:
# For every column after the first, show its autocorrelation (top) and partial
# autocorrelation (bottom) for 10 lags in one two-row figure.
# NOTE(review): df.columns[1:] presumably skips the "time" column — confirm
# that "time" really is the first column of the CSV.
for col in df.columns[1:]:
    # Only "best_price_compound" has its NaN entries removed before plotting.
    series = df[col].dropna() if col == "best_price_compound" else df[col]

    fig, (acf_axis, pacf_axis) = plt.subplots(nrows=2, ncols=1)
    # NOTE(review): plt.tight_layout() below may override this spacing — confirm.
    fig.subplots_adjust(hspace=0.4, top=0.85)
    fig.suptitle(col)

    plot_acf(series, lags=10, ax=acf_axis)
    plot_pacf(series, lags=10, ax=pacf_axis)

    plt.tight_layout()
    plt.show()