In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from constants import processed_names
from constants.paths import INTERMEDIATE_PC_PRICE_DIR, INTERMEDIATE_PHENOL_ACETONE_DIR
from src.data_pipelines.multi_intermediate_to_processed import (
    build_multivariate_dataset,
)
from src.data_pipelines.uni_intermediate_to_processed import build_univariate_dataset
import src.utils.feature_engineering as fe_utils

# Analysing intermediate data files

In [None]:
# Load the intermediate CSV files for inspection: the Europe and Asia PC price
# tables from the PC price directory, and the BPA capacity-loss table from the
# phenol/acetone directory. Each file is read with pandas' default CSV options.
# NOTE(review): no dtype/date parsing is requested here, so any date column is
# presumably read as plain strings — confirm before using these frames for time ops.
pc_price_eu = pd.read_csv(INTERMEDIATE_PC_PRICE_DIR / "intermediate_pc_price_eu.csv")
pc_price_asia = pd.read_csv(
    INTERMEDIATE_PC_PRICE_DIR / "intermediate_pc_price_asia.csv"
)
bpa_capacity_loss = pd.read_csv(
    INTERMEDIATE_PHENOL_ACETONE_DIR / "intermediate_bpa_capacity_loss.csv"
)

# Feature Engineering approach

There are 2 approaches to feature engineering depending on the modeling technique we want to use:
1. PC type specific models: Use a wide format dataset (1 column per PC type).
2. Global multivariate model: Use a long format dataset (1 row per PC type per region per month) with all features included.

## 1 - PC type specific models 

In this approach, we will create a wide format dataset where each PC type has its own column. Features will be engineered specifically for each PC type. We first create a large dataset with all possible features for each PC type, and select a subset of that dataset at modeling time.

The base dataset is created using the `build_univariate_dataset` function in `src/data_pipelines/uni_intermediate_to_processed.py`. This function simply concatenates all datasets (PC prices and exogenous variables) into a wide format dataframe.

In [None]:
# Build the wide-format dataset through the feature-engineering utilities.
# The bare call is the cell's last expression, so the notebook renders whatever
# it returns as the cell output.
# NOTE(review): the return type is not visible here — presumably a DataFrame; confirm.
fe_utils.create_wide_format()

The `asia_pc_gf_best_price` column looks anomalous: the values within the column may not all be expressed in the same unit. This comes from the source data itself, so there is nothing we can really do about it here.

### Feature engineering

This is the base for the pipeline that creates the final processed dataset used for modeling in `src/data_pipelines/uni_intermediate_to_processed.py`. The final dataset is built by adding features to the base dataset using functions from `src/utils/feature_engineering.py`.
The feature engineering steps are as follows:
- Calendar features (month, quarter, year) with cyclical encoding for month and quarter.
- Lag features for PC prices.
- Rolling window statistics for PC prices.
- Rate of change features for PC prices.
- Lag features for exogenous variables.
- Rolling window statistics for exogenous variables.

In [None]:
# Run the univariate pipeline with horizon=3 and display its return value as the
# cell output (the call is the last expression in the cell).
# NOTE(review): `horizon` is presumably the forecast horizon in months — confirm
# against src/data_pipelines/uni_intermediate_to_processed.py.
build_univariate_dataset(horizon=3)

## 2 - Global multivariate model

In this approach, we will create a long format dataset where each row corresponds to a specific PC type in a specific region at a specific month.

```
| date       | region | pc_type | price | ... features ... |
|------------|--------|---------|-------|------------------|
| 2020-01-01 | europe | crystal | 2.5   | ...              |
| 2020-01-01 | europe | gf10    | 2.8   | ...              |
| 2020-01-01 | asia   | gp      | 2.3   | ...              |
```

In [None]:
# Build the long-format dataset and keep it as `long_df`; the plotting cell below
# filters it by region and groups it by PC type.
# NOTE(review): assumed to be a DataFrame with the region / PC type / price columns
# named in `processed_names` — confirm against fe_utils.create_long_format.
long_df = fe_utils.create_long_format()

In [None]:
# Price distributions by PC type, one boxplot panel per region.
# Both panels are drawn by the same loop so their styling cannot drift apart.

# Per-region slices of the long-format data (names kept for later cells).
eu_data = long_df[long_df[processed_names.LONG_REGION] == processed_names.EUROPE]
asia_data = long_df[long_df[processed_names.LONG_REGION] == processed_names.ASIA]

region_panels = [
    (eu_data, "Europe PC Price Distributions"),
    (asia_data, "Asia PC Price Distributions"),
]

fig, axes = plt.subplots(1, len(region_panels), figsize=(15, 5))

for ax, (region_data, panel_title) in zip(axes, region_panels):
    region_data.boxplot(
        column=processed_names.LONG_PC_PRICE,
        by=processed_names.LONG_PC_TYPE,
        ax=ax,
    )
    # Set labels after plotting: boxplot(by=...) writes its own title/xlabel first.
    ax.set_title(panel_title)
    ax.set_xlabel("PC Type")
    ax.set_ylabel("Price (USD/kg)")
    # Rotate the tick labels on this axes explicitly instead of going through
    # the pyplot "current axes" state.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

# boxplot(by=...) also adds a figure-level "Boxplot grouped by ..." suptitle that
# duplicates the x-axis label and can collide with the panel titles; clear it.
fig.suptitle("")

plt.tight_layout()
plt.show()

In [None]:
# Run the multivariate pipeline with horizon=3 and keep the result as
# `full_long_df`; the next cell looks for its "pc_price_lag" columns.
# NOTE(review): `horizon` is presumably the forecast horizon in months — confirm
# against src/data_pipelines/multi_intermediate_to_processed.py.
full_long_df = build_multivariate_dataset(horizon=3)

In [None]:
# Correlation of each lagged-price feature with the target price, per PC type.
fig, ax = plt.subplots(figsize=(12, 6))

target_col = processed_names.LONG_PC_PRICE

# Every column whose name contains "pc_price_lag" is treated as a lag feature.
lag_cols = [name for name in full_long_df.columns if "pc_price_lag" in name]

# One correlation matrix per PC type over the target plus its lag features.
per_type_corr = full_long_df.groupby(processed_names.LONG_PC_TYPE)[
    [target_col] + lag_cols
].corr()

# Keep only the correlations against the target, and drop the target's
# correlation with itself from the inner (feature) index level.
corr_with_target = per_type_corr[target_col]
corr_by_type = corr_with_target.drop(target_col, level=1)

# Rows -> lag features, columns -> PC types: one bar group per lag feature.
lag_by_type = corr_by_type.unstack().T
lag_by_type.plot(kind="bar", ax=ax)

ax.set(
    title="Lag Feature Correlations with Target Price (by PC Type)",
    xlabel="Lag Feature",
    ylabel="Correlation with Price",
)
ax.legend(title="PC Type", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()

# TODO - Select features for modelling

- Correlation analysis
- ADF testing (stationarity of features)
- Granger causality tests (past values of features helping predict target)
- Time-lagged mutual information
- Independence testing

----