In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from constants import intermediate_names, processed_names
import constants.constants as cst
from constants.paths import INTERMEDIATE_PC_PRICE_DIR, INTERMEDIATE_PHENOL_ACETONE_DIR
from src.data_pipelines.multi_intermediate_to_processed import (
    build_multivariate_dataset,
)
from src.data_pipelines.uni_intermediate_to_processed import build_univariate_dataset
import src.utils.feature_engineering as fe_utils

# Analysing intermediate data files

In [None]:
# PC price exports, all read from the intermediate PC price directory.
pc_price_eu, pc_price_eu_grouped, pc_price_asia = (
    pd.read_csv(INTERMEDIATE_PC_PRICE_DIR / csv_name)
    for csv_name in (
        "intermediate_pc_price_eu.csv",
        "intermediate_pc_price_eu_grouped.csv",
        "intermediate_pc_price_asia.csv",
    )
)

# BPA capacity loss export, read from the phenol/acetone directory.
bpa_capacity_loss = pd.read_csv(
    INTERMEDIATE_PHENOL_ACETONE_DIR / "intermediate_bpa_capacity_loss.csv"
)

In [None]:
# Display the Asia "pc_gf_best_price" column on its own for inspection.
pc_price_asia["pc_gf_best_price"]

# Feature Engineering approach

There are 2 approaches to feature engineering depending on the modeling technique we want to use:
1. PC type specific models: Use a wide format dataset (1 column per PC type).
2. Global multivariate model: Use a long format dataset (1 row per PC type per country per month) with all features included.

## 1 - PC type specific models 

In this approach, we will create a wide format dataset where each PC type has its own column. Features will be engineered specifically for each PC type. We first create a large dataset with all possible features for each PC type, and select a subset of that dataset at modeling time.

The base dataset is created using the `create_wide_format` function in `src/utils/feature_engineering.py`. This function simply concatenates all datasets (PC prices and exogenous variables) into a wide format dataframe.

In [None]:
# Build the wide format base dataset and preview its first rows.
wide_df = fe_utils.create_wide_format()
wide_df.head()

The `asia_pc_gf_best_price` column looks odd: its values may not all be expressed in the same unit. This comes from the source data, so there is little we can do about it.

### Feature engineering

This wide dataset is the starting point for the pipeline in `src/data_pipelines/uni_intermediate_to_processed.py`, which creates the final processed dataset used for modeling. The final dataset is built by adding features to the base dataset using functions from `src/utils/feature_engineering.py`.
The feature engineering steps are as follows:
- Calendar features (month, quarter, year) with cyclical encoding for month and quarter.
- Lag features for PC prices.
- Rolling window statistics for PC prices.
- Rate of change features for PC prices.
- Lag features for exogenous variables.
- Rolling window statistics for exogenous variables.

The next cell runs this pipeline through `build_univariate_dataset`, here with `horizon=3`, and previews the resulting dataset.

In [None]:
# Build the full univariate dataset with horizon=3 and preview its first rows.
full_wide_df = build_univariate_dataset(horizon=3)
full_wide_df.head()

Once this dataset is created, we can select the relevant columns for each PC type and use it for modeling.

In [None]:
def select_features(wide_df: pd.DataFrame, pc_type: cst.PCType) -> pd.DataFrame:
    """Select relevant features for the univariate model for the selected PC type.

    These features include, in this order:

    - every column whose name contains ``pc_type.value``;
    - every column whose name contains one of the names listed in
      ``intermediate_names.EXOGENOUS_COLUMNS``;
    - the date column ``processed_names.WIDE_DATE``.

    Each column appears at most once in the result, even when its name
    matches several of the rules above.

    Args:
        wide_df (pd.DataFrame): The full wide format dataframe
        pc_type (PCType): The PC type to select features for

    Returns:
        pd.DataFrame: A dataframe containing only the relevant features for the
        specified PC type.

    Raises:
        ValueError: If ``pc_type`` is not a member of the ``PCType`` Enum.
    """
    if not isinstance(pc_type, cst.PCType):
        raise ValueError(
            f"pc_type must be an instance of {cst.PCType} Enum, got {type(pc_type)}"
        )

    pc_type_columns = [col for col in wide_df.columns if pc_type.value in col]

    exogenous_features = [
        col
        for exogenous_col in intermediate_names.EXOGENOUS_COLUMNS
        for col in wide_df.columns
        if exogenous_col in col
    ]

    # A column can match both the PC type and an exogenous name (or several
    # exogenous names). Selecting the same label twice would duplicate the
    # column in the output, so keep only the first occurrence of each name
    # while preserving the selection order.
    relevant_features = list(
        dict.fromkeys(
            pc_type_columns + exogenous_features + [processed_names.WIDE_DATE]
        )
    )
    return wide_df[relevant_features]

In [None]:
# Example: feature subset for the CRYSTAL PC type. Only the first rows are
# shown, like the other dataset previews in this notebook.
select_features(wide_df=full_wide_df, pc_type=cst.PCType.CRYSTAL).head()

## 2 - Global multivariate model

In this approach, we will create a long format dataset where each row corresponds to a specific PC type in a specific region at a specific month.

```
| date       | region | pc_type | price | ... features ... |
|------------|--------|---------|-------|------------------|
| 2020-01-01 | europe | crystal | 2.5   | ...              |
| 2020-01-01 | europe | gf10    | 2.8   | ...              |
| 2020-01-01 | asia   | gp      | 2.3   | ...              |
```

In [None]:
# Build the long format base dataset and preview its first rows.
long_df = fe_utils.create_long_format()
long_df.head()

In [None]:
# Distinct PC types found in the rows whose region is "europe".
# NOTE(review): later cells filter with processed_names.LONG_REGION / cst.EUROPE;
# consider using the same constants here once confirmed equal to these literals.
long_df.loc[long_df["region"] == "europe", "pc_type"].unique()

In [None]:
# How often each price value occurs among the rows whose PC type is "gf".
long_df.loc[long_df["pc_type"] == "gf", "pc_price"].value_counts()

In [None]:
# Box plots of the PC price per PC type, one panel per region.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Split the long dataset by region.
eu_data = long_df[long_df[processed_names.LONG_REGION] == cst.EUROPE]
asia_data = long_df[long_df[processed_names.LONG_REGION] == cst.ASIA]

# Both panels are drawn the same way; only the data and the title differ.
for panel_ax, region_data, panel_title in (
    (axes[0], eu_data, "Europe PC Price Distributions"),
    (axes[1], asia_data, "Asia PC Price Distributions"),
):
    region_data.boxplot(
        column=processed_names.LONG_PC_PRICE,
        by=processed_names.LONG_PC_TYPE,
        ax=panel_ax,
    )
    # Set after boxplot() so these labels are the ones left on the panel.
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("PC Type")
    panel_ax.set_ylabel("Price (USD/kg)")
    # plt.xticks acts on the current axes, so make this panel current first.
    plt.sca(panel_ax)
    plt.xticks(rotation=45, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Imbalanced data across PC types: number of rows per PC type, one bar chart
# per region.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Split the long dataset by region.
eu_data = long_df[long_df[processed_names.LONG_REGION] == cst.EUROPE]
asia_data = long_df[long_df[processed_names.LONG_REGION] == cst.ASIA]

# Row counts per PC type, largest first.
eu_counts = (
    eu_data[processed_names.LONG_PC_TYPE].value_counts().sort_values(ascending=False)
)
asia_counts = (
    asia_data[processed_names.LONG_PC_TYPE].value_counts().sort_values(ascending=False)
)

# Both panels are drawn the same way; only the counts and the title differ.
for panel_ax, type_counts, panel_title in (
    (axes[0], eu_counts, "Europe: Number of Samples per PC Type"),
    (axes[1], asia_counts, "Asia: Number of Samples per PC Type"),
):
    bar_positions = range(len(type_counts))
    panel_ax.bar(bar_positions, type_counts.values)
    panel_ax.set_xticks(bar_positions)
    panel_ax.set_xticklabels(type_counts.index, rotation=45, ha="right")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("PC Type")
    panel_ax.set_ylabel("Number of Samples")

plt.tight_layout()
plt.show()

In [None]:
# Build the full multivariate dataset with horizon=3. Only the first rows are
# shown, like the other dataset previews in this notebook.
full_long_df = build_multivariate_dataset(horizon=3)
full_long_df.head()

In [None]:
# For each PC type, correlation between the price and each of its lag features.
fig, ax = plt.subplots(figsize=(12, 6))

price_col = processed_names.LONG_PC_PRICE
lag_cols = [name for name in full_long_df.columns if "pc_price_lag" in name]

# Correlation matrices of the price and lag columns, computed per PC type.
grouped_corr = full_long_df.groupby(processed_names.LONG_PC_TYPE)[
    [price_col, *lag_cols]
].corr()

# Keep the correlations against the price only, and drop the entry of the
# price with itself (second index level).
corr_by_type = grouped_corr[price_col].drop(price_col, level=1)

# One bar group per lag feature, one bar per PC type.
corr_by_type.unstack().T.plot(kind="bar", ax=ax)
ax.set_title("Lag Feature Correlations with Target Price (by PC Type)")
ax.set_xlabel("Lag Feature")
ax.set_ylabel("Correlation with Price")
# Place the legend outside the plotting area, to the right.
ax.legend(title="PC Type", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

## Feature selection rationale

----