# Exploratory Data Analysis

In [None]:
from pathlib import Path
from typing import List, Optional

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot
from pmdarima.arima import CHTest
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Relative location of the data folder: three directory levels up, then
# into the project's "hfactory_magic_folders" tree.
DATA_DIR = Path("..", "..", "..").joinpath(
    "hfactory_magic_folders",
    "plastic_cost_prediction",
    "data",
)
# File name of the dataset read from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# Load the CSV file located at DATA_DIR / MAIN_FILE into a DataFrame.
df = pd.read_csv(DATA_DIR / MAIN_FILE)

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Show the dtype of each column.
df.dtypes

In [None]:
# Convert the "time" column from string to datetime; the converted values
# replace the original column in df.
df["time"] = pd.to_datetime(df["time"])

## Time series plots of features

First, we have a look at the single columns that don't belong to the same topic.

In [None]:
def plot_time_series(
    df: pd.DataFrame, col: str, time_col: str = "time"
) -> None:
    """
    Plots one column of a DataFrame against its time column.

    Parameters
    ----------
    df : pd.DataFrame
        Data as a dataframe. Must contain both `time_col` and `col`.
    col : str
        The name of the column to be plotted from the DataFrame.
    time_col : str, optional
        The name of the column providing the x-axis values. It is also
        used as the x-axis label. Defaults to "time".

    Returns
    -------
    None
        The plot is displayed with `plt.show()`; nothing is returned.
    """
    # Create the plot through the pyplot state machine
    plt.plot(df[time_col], df[col])

    # Add title and axis labels
    plt.title(f"Time Series Plot - {col}")
    plt.xlabel(time_col)
    plt.ylabel(col)
    # Rotate the x tick labels by 45 degrees
    plt.xticks(rotation=45)

    # Display the plot
    plt.show()

In [None]:
# Columns that are plotted on their own, one figure per column.
single_cols = [
    "PA6 GLOBAL_ EMEAS _ EUR per TON",
    "best_price_compound",
    "Inflation_rate_france",
    "Automotive Value",
]

In [None]:
# Draw one time-series figure for every column listed in single_cols.
for col in single_cols:
    plot_time_series(df, col)

Next, the columns that belong to the same group are displayed in the same plot.

In [None]:
# Every column whose name contains the (case-sensitive) substring "GAS".
gas_cols = [col for col in df.columns if "GAS" in col]

plt.figure()

# Overlay all selected columns on the same axes, one labelled line each.
for column in gas_cols:
    plt.plot(df["time"], df[column], label=column)

plt.title("Time Series Plot - Natural Gas Prices")
plt.xlabel("time")
plt.ylabel("natural gas prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

In [None]:
# Every column whose name contains the (case-sensitive) substring "CRUDE".
crude_cols = [col for col in df.columns if "CRUDE" in col]

plt.figure()

# Overlay all selected columns on the same axes, one labelled line each.
for column in crude_cols:
    plt.plot(df["time"], df[column], label=column)

plt.title("Time Series Plot - Crude Oil Prices")
plt.xlabel("time")
plt.ylabel("crude oil prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

In [None]:
# Every column whose name contains the substring "Electricty".
# NOTE(review): the misspelling is kept on purpose here because it is
# matched against the dataset's column names, which presumably carry the
# same spelling - confirm before correcting it.
electricity_cols = [col for col in df.columns if "Electricty" in col]

plt.figure()

# Overlay all selected columns on the same axes, one labelled line each.
for column in electricity_cols:
    plt.plot(df["time"], df[column], label=column)

plt.title("Time Series Plot - Electricity Prices")
plt.xlabel("time")
plt.ylabel("electricity prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

In [None]:
# Price columns grouped under the "hydrocarbons" heading.
hydrocarbons_cols = ["Benzene_price", "Caprolactam_price", "Cyclohexane_price"]

plt.figure()

# Overlay all listed columns on the same axes, one labelled line each.
for column in hydrocarbons_cols:
    plt.plot(df["time"], df[column], label=column)

plt.title("Time Series Plot - Hydrocarbons Prices")
plt.xlabel("time")
plt.ylabel("hydrocarbons prices")
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

## Correlation analysis

In [None]:
# Restrict the correlation to numeric/boolean columns explicitly: df also
# holds the datetime "time" column, and the default of DataFrame.corr's
# `numeric_only` argument differs between pandas versions.
numeric_df = df.select_dtypes(include=["number", "bool"])
corr = numeric_df.corr()
# Render the matrix as a table with a "coolwarm" colour gradient.
corr.style.background_gradient(cmap="coolwarm")

## Seasonality and outliers

### Outliers Analysis

Following the Covid-19 pandemic and its related restrictions, at the beginning of 2020 there was a drop in demand for energy in general. As a result, natural gas, electricity and oil prices fell.

The recovery of economic activities was translated into increased energy demand, and natural gas prices regained their pre-pandemic levels by Q3/Q4 2020. The upward trend continued in 2021.

In 2022, Russia’s war on Ukraine and decision to suspend deliveries of gas to some EU member states have pushed up the price of gas, which has also caused record high prices for electricity in the EU.

Heatwaves during summer 2022 put additional pressure on energy markets, on the one hand causing increased demand for energy for cooling, and on the other decreased energy supply due to drought and the consequent reduction in the supply of hydropower.

In [None]:
def create_box_plot(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    color_column: str,
    y_scale_domain: Optional[List[int]] = None,
) -> alt.Chart:
    """Creates a box plot using Altair visualization library.

    One box is drawn per year of `x_column`; the y-axis is always titled
    "Price".

    Parameters
    ----------
    data : pd.DataFrame
        The input DataFrame containing the data to be visualized.
    x_column : str
        The column name for the x-axis. Its year is extracted and used as
        an ordinal axis.
    y_column : str
        The column name for the y-axis (numeric).
    color_column : str
        The column name used for coloring the boxes (treated as nominal).
    y_scale_domain : List[int], optional
        The domain for the y-axis scale. When omitted, no explicit scale
        is passed to the y encoding.

    Returns
    -------
    chart : alt.Chart
        Altair Chart object representing the box plot.
    """
    # Build the y-axis options once; the scale is only added on request so
    # both cases share a single chart definition.
    y_options = {"title": "Price"}
    if y_scale_domain is not None:
        y_options["scale"] = alt.Scale(domain=y_scale_domain)

    chart = (
        alt.Chart(data)
        .mark_boxplot(
            opacity=0.3,
            size=50,
        )
        .encode(
            x=alt.X(f"year({x_column}):O", title="Year"),
            y=alt.Y(f"{y_column}:Q", **y_options),
            color=f"{color_column}:N",
        )
    )
    return chart


def create_swarm_plot(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    color_column: str,
    y_scale_domain: Optional[List[int]] = None,
) -> alt.Chart:
    """Creates a swarm plot using Altair visualization library.

    Every row is drawn as a circle positioned at the year of `x_column`.
    No jitter is applied, so points of the same year share one vertical
    line.

    Parameters
    ----------
    data : pd.DataFrame
        The input DataFrame containing the data to be visualized.
    x_column : str
        The column name for the x-axis. Its year is extracted and used as
        an ordinal axis.
    y_column : str
        The column name for the y-axis (numeric).
    color_column : str
        The column name used for coloring the points (treated as nominal).
    y_scale_domain : List[int], optional
        The domain for the y-axis scale. When omitted, no explicit scale
        is passed to the y encoding.

    Returns
    -------
    chart : alt.Chart
        Altair Chart object representing the swarm plot.
    """
    # The scale is only added on request so both cases share a single
    # chart definition.
    y_options = {}
    if y_scale_domain is not None:
        y_options["scale"] = alt.Scale(domain=y_scale_domain)

    # NOTE(review): the mark-level color="black" is presumably overridden
    # by the `color` encoding below - confirm which one is intended.
    chart = (
        alt.Chart(data)
        .mark_circle(size=30, opacity=0.7, color="black")
        .encode(
            x=alt.X(f"year({x_column}):O", title="Year"),
            y=alt.Y(f"{y_column}:Q", **y_options),
            color=f"{color_column}:N",
        )
    )
    return chart


def create_outliers_distribution_chart(
    dataframe: pd.DataFrame,
    date_column: str,
    features_list: List[str],
    title: str,
    y_scale_domain: Optional[List[int]] = None,
) -> alt.FacetChart:
    """Creates a combined chart with box plots and swarm plots for multiple features.

    The features are melted into long format ("variable" / "value"
    columns), a box plot and a swarm plot are layered on top of each
    other, and the layered chart is faceted into one column per feature.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input DataFrame containing the data to be visualized.
    date_column : str
        The column name representing the date or time.
    features_list : List[str]
        List of column names for the features to be visualized.
    title : str
        The title for the combined chart. It is set on the layered chart
        before faceting.
    y_scale_domain : List[int], optional
        The domain for the y-axis scale, if provided. It is forwarded to
        both the box plot and the swarm plot.

    Returns
    -------
    final_chart : alt.FacetChart
        Altair chart object representing the combined box plots and swarm
        plots, faceted by feature.
    """
    # Long format: one row per (date, feature) pair. reset_index() turns
    # the index into a regular column before melting.
    melted_df = pd.melt(
        dataframe.reset_index(),
        id_vars=[date_column],
        value_vars=features_list,
    )

    box_plot = create_box_plot(
        melted_df, date_column, "value", "variable", y_scale_domain
    )
    swarm_plot = create_swarm_plot(
        melted_df, date_column, "value", "variable", y_scale_domain
    )

    # Combine box plot and swarm plot
    chart = (box_plot + swarm_plot).properties(
        width=600, height=400, title=title
    )

    # One facet column per melted feature name
    final_chart = chart.facet(
        column=alt.Column("variable:N"),
    )

    return final_chart

In [None]:
# Yearly distribution of the two PA6 price columns, with the y-axis
# domain fixed to [1000, 4000].
chart = create_outliers_distribution_chart(
    dataframe=df,
    date_column="time",
    features_list=["PA6 GLOBAL_ EMEAS _ EUR per TON", "best_price_compound"],
    title="Price Distribution Over Years",
    y_scale_domain=[1000, 4000],
)
chart

In [None]:
# Yearly distribution of every column collected in crude_cols.
chart = create_outliers_distribution_chart(
    dataframe=df,
    date_column="time",
    features_list=crude_cols,
    title="Crude Prices Distribution Over Years",
)
chart

In [None]:
# Yearly distribution of every column collected in gas_cols.
chart = create_outliers_distribution_chart(
    dataframe=df,
    date_column="time",
    features_list=gas_cols,
    title="Natural Gas Prices Distribution Over Years",
)
chart

Following the Covid-19 pandemic and its related restrictions, at the beginning of 2020 there was a drop in demand for natural gas and energy in general. As a result, natural gas prices fell to record lows.

The recovery of economic activities was translated into increased energy demand, and natural gas prices regained their pre-pandemic levels by Q3/Q4 2020. The upward trend continued in 2021. 

Natural gas prices in Europe were affected by the war much more than natural gas prices in Japan and the United States.

In [None]:
# Yearly distribution of every column collected in electricity_cols.
chart = create_outliers_distribution_chart(
    dataframe=df,
    date_column="time",
    features_list=electricity_cols,
    title="Electricity Prices Distribution Over Years",
)
chart

Poland's success in containing prices was due, among other things, to Poland's electricity mix, which is still based on coal sourced from domestic mines. Therefore, Poland was less affected by the severe increases in global commodity prices observed in 2022.

Source: https://pkee.pl/en/aktualnosci/wojna-o-ceny-energii-podsumowanie-dzialan-oslonowych-na-rynkach-w-polscei-europie/#:~:text=As%20he%20points%20out%2C%20Poland's,commodity%20prices%20observed%20in%202022.

In [None]:
# Yearly distribution of the single "Inflation_rate_france" column.
chart = create_outliers_distribution_chart(
    dataframe=df,
    date_column="time",
    features_list=["Inflation_rate_france"],
    title="France Inflation Rate Distribution Over Years",
)
chart

In [None]:
# Yearly distribution of the single "Automotive Value" column. The title
# names the plotted column; the previous "Compounds Price" title had been
# copied from the hydrocarbons chart.
chart = create_outliers_distribution_chart(
    df,
    "time",
    ["Automotive Value"],
    "Automotive Value Distribution Over Years",
)
chart

In [None]:
# Yearly distribution of every column listed in hydrocarbons_cols.
chart = create_outliers_distribution_chart(
    dataframe=df,
    date_column="time",
    features_list=hydrocarbons_cols,
    title="Compounds Price Distribution Over Years",
)
chart

### Seasonality Analysis

First, we'll have a look at the target column.

In [None]:
# Decompose the target column (missing values removed) under both the
# multiplicative and the additive model.
# NOTE(review): period=30 is taken as given - confirm it matches the
# sampling frequency of the data.
target_series = df["best_price_compound"].dropna()
multiplicative_decomposition, additive_decomposition = (
    seasonal_decompose(target_series, model=model_kind, period=30)
    for model_kind in ("multiplicative", "additive")
)

# One figure per decomposition, each with its own super-title.
plt.rcParams.update({"figure.figsize": (7, 5)})
for heading, decomposition in (
    ("Multiplicative Decomposition", multiplicative_decomposition),
    ("Additive Decomposition", additive_decomposition),
):
    decomposition.plot().suptitle(heading, fontsize=16)
    # Leave room at the top of the figure for the super-title.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.show()

In [None]:
# Visual seasonality check: autocorrelation plot of the target column
# (missing values removed).
plt.rcParams.update({"figure.figsize": (8, 3), "figure.dpi": 120})
target_observations = df["best_price_compound"].dropna().tolist()
autocorrelation_plot(target_observations)

In [None]:
# CH test for seasonality on the target column, indexed by time and with
# missing values removed.
# NOTE(review): m=2 is taken as given, while the decompositions in this
# notebook use period=30 - confirm which seasonal period is intended.
price_values = df["best_price_compound"].values
time_series = pd.Series(price_values, index=df["time"]).dropna()

ch_test = CHTest(m=2)
ch_test.estimate_seasonal_differencing_term(time_series)

Second, let's have a look at the decomposition of seasonal columns.

In [None]:
# The figure size does not change between iterations, so set it once.
plt.rcParams.update({"figure.figsize": (7, 5)})

# "time" is the time axis and "best_price_compound" is decomposed in its
# own cell, so both are left out here.
skipped_cols = {"time", "best_price_compound"}

for col in df.columns:
    if col in skipped_cols:
        continue

    # Multiplicative Decomposition. Missing values are dropped first, as
    # is done for the target column, because seasonal_decompose does not
    # handle NaNs.
    # NOTE(review): the multiplicative model also requires strictly
    # positive values - confirm this holds for every column.
    multiplicative_decomposition = seasonal_decompose(
        df[col].dropna(), model="multiplicative", period=30
    )

    # Plot
    multiplicative_decomposition.plot()
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    plt.show()