## 1. Setup and Data Loading

In [None]:
# Import required libraries
from pathlib import Path
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every Python warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas/matplotlib/seaborn - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Set visualization style
# The seaborn palette is applied after the matplotlib style sheet.
# NOTE(review): presumably so the style sheet does not override the palette - confirm.
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")
# IPython magic (not plain Python): selects the inline matplotlib backend for the notebook.
%matplotlib inline

# Configure display options
pd.set_option("display.max_columns", None)  # None = no limit on displayed columns
pd.set_option("display.max_rows", 100)  # cap displayed rows at 100
pd.set_option("display.float_format", lambda x: f"{x:.2f}")  # floats shown with 2 decimals

In [None]:
# Load data
# Both raw tables live in the same directory, one parquet file per chemical.
data_path = Path("../data/raw/phenol_acetone_capacity_loss/")

acetone_file = data_path.joinpath("acetone_consumption_capacity_loss_kt.pq")
phenol_file = data_path.joinpath("phenol_consumption_capacity_loss_kt.pq")
acetone_df = pd.read_parquet(acetone_file)
phenol_df = pd.read_parquet(phenol_file)

# Report (rows, columns) of each loaded table.
for chemical_label, loaded in (("Acetone", acetone_df), ("Phenol", phenol_df)):
    print(f"{chemical_label} data shape:", loaded.shape)

## 2. Data Overview and Structure

In [None]:
# Display first few rows
# Each table gets an 80-character banner; the second banner is preceded by a blank line.
banner = "=" * 80
for lead, title, frame in (
    ("", "ACETONE DATA - First 5 rows", acetone_df),
    ("\n", "PHENOL DATA - First 5 rows", phenol_df),
):
    print(lead + banner)
    print(title)
    print(banner)
    display(frame.head())

In [None]:
# Data info and types
# DataFrame.info() prints its own report, so only the banners are printed here.
rule = "=" * 80

print(rule, "ACETONE DATA INFO", rule, sep="\n")
acetone_df.info()

print("\n" + rule, "PHENOL DATA INFO", rule, sep="\n")
phenol_df.info()

In [None]:
# Statistical summary
# describe() is shown per table under a banner heading.
summary_targets = {
    "ACETONE - Statistical Summary": acetone_df,
    "PHENOL - Statistical Summary": phenol_df,
}
for position, (heading, frame) in enumerate(summary_targets.items()):
    # Every banner after the first is separated from the previous output by a blank line.
    opening = "=" * 80 if position == 0 else "\n" + "=" * 80
    print(opening)
    print(heading)
    print("=" * 80)
    display(frame.describe())

In [None]:
# Check for missing values
# For each table: per-column NaN count and percentage, showing only columns with gaps.
divider = "=" * 80
print(divider)
print("Missing Values Analysis")
print(divider)

print("\nAcetone missing values:")
acetone_missing = acetone_df.isna().sum()
acetone_missing_pct = (acetone_missing / len(acetone_df)) * 100
missing_acetone = pd.DataFrame(
    {"Missing Count": acetone_missing, "Percentage": acetone_missing_pct}
)
acetone_has_gaps = missing_acetone["Missing Count"] > 0
display(missing_acetone[acetone_has_gaps])

print("\nPhenol missing values:")
phenol_missing = phenol_df.isna().sum()
phenol_missing_pct = (phenol_missing / len(phenol_df)) * 100
missing_phenol = pd.DataFrame(
    {"Missing Count": phenol_missing, "Percentage": phenol_missing_pct}
)
phenol_has_gaps = missing_phenol["Missing Count"] > 0
display(missing_phenol[phenol_has_gaps])

In [None]:
# Check missing values specifically for Bisphenol A
def _report_bpa_missing(df, chemical_label):
    """Print a missing-month report for the BISPHENOL A row of one table.

    Args:
        df: Wide-format table with a "Derivative" column, an
            "Avg. Conversion Factor (+)" column and one column per month.
        chemical_label: Chemical name used in the printed headings
            (e.g. "Acetone").

    Returns:
        Tuple ``(bpa_row, month_cols, nan_count, nan_months)``.
        ``nan_count`` is ``None`` and ``nan_months`` is empty when the table
        has no BISPHENOL A row or no month columns.
    """
    # Month columns are everything except the two metadata columns.
    metadata_cols = ["Derivative", "Avg. Conversion Factor (+)"]
    month_cols = [col for col in df.columns if col not in metadata_cols]
    bpa_row = df[df["Derivative"] == "BISPHENOL A"]

    print(f"\n {chemical_label.upper()} - Bisphenol A:")
    print(f"Total months: {len(month_cols)}")

    # Guard: taking the first row of an empty selection raises IndexError, and
    # a table without month columns would make the percentage a 0/0 division.
    if bpa_row.empty or not month_cols:
        print("No Bisphenol A monthly data found - skipping.")
        return bpa_row, month_cols, None, []

    # Boolean mask over the month columns of the first matching row.
    null_mask = bpa_row[month_cols].isnull().values[0]
    nan_count = null_mask.sum()
    print(f"Missing values (NaN): {nan_count}")

    available_months = len(month_cols) - nan_count
    availability_pct = (available_months / len(month_cols)) * 100
    print(f"Data availability: {available_months} months ({availability_pct:.1f}%)")

    nan_months = bpa_row[month_cols].columns[null_mask].tolist()
    if nan_count > 0:
        print(f"\nMissing months for {chemical_label} BPA ({len(nan_months)}):")
        for month in nan_months:
            print(f"  - {month}")

    return bpa_row, month_cols, nan_count, nan_months


print("=" * 80)
print("BISPHENOL A - MISSING VALUES ANALYSIS")
print("=" * 80)

(
    acetone_bpa_row,
    acetone_month_cols,
    acetone_bpa_nans,
    nan_months_acetone,
) = _report_bpa_missing(acetone_df, "Acetone")

print("\n" + "-" * 80)

(
    phenol_bpa_row,
    phenol_month_cols,
    phenol_bpa_nans,
    nan_months_phenol,
) = _report_bpa_missing(phenol_df, "Phenol")

print("\n" + "=" * 80)

## 3. Data Preparation and Reshaping

The data is in **wide format** where:
- Each row represents a derivative type (Grand Total, Bisphenol A, etc.)
- Each column except the first two metadata columns (`Derivative` and `Avg. Conversion Factor (+)`) holds the capacity loss values for one month
- We need to reshape it to **long format** for time series analysis

In [None]:
# Reshape data from wide to long format for time series analysis
def reshape_to_long(df, chemical_name):
    """Melt a wide month-per-column table into one row per derivative and month.

    Args:
        df: Wide table holding "Derivative" and "Avg. Conversion Factor (+)"
            columns plus one column per month.
        chemical_name: Value written to the "Chemical" column of every row.

    Returns:
        Long-format frame with the two metadata columns followed by "Month",
        "Capacity_Loss_kt", "Date" and "Chemical", sorted by derivative and
        date with a fresh 0..n-1 index.
    """
    metadata_cols = ["Derivative", "Avg. Conversion Factor (+)"]

    # A "chemical" column, if present, is neither metadata nor a month column.
    not_months = (*metadata_cols, "chemical")
    period_cols = [name for name in df.columns if name not in not_months]

    melted = pd.melt(
        df,
        id_vars=metadata_cols,
        value_vars=period_cols,
        var_name="Month",
        value_name="Capacity_Loss_kt",
    )

    # Month labels are parsed as e.g. "Jan 2020"; labels that do not match become NaT.
    melted["Date"] = pd.to_datetime(melted["Month"], format="%b %Y", errors="coerce")
    melted["Chemical"] = chemical_name

    ordered = melted.sort_values(by=["Derivative", "Date"])
    return ordered.reset_index(drop=True)


# Reshape both datasets
acetone_long, phenol_long = (
    reshape_to_long(wide_frame, chemical_label)
    for wide_frame, chemical_label in ((acetone_df, "Acetone"), (phenol_df, "Phenol"))
)

# Report the resulting shapes, then preview the first ten rows of each long table.
long_frames = {"Acetone": acetone_long, "Phenol": phenol_long}

print("Reshaped Data Summary:")
for chemical_label, long_frame in long_frames.items():
    print(f"{chemical_label} (long format): {long_frame.shape}")

for chemical_label, long_frame in long_frames.items():
    print(f"\n{chemical_label} long format - first few rows:")
    display(long_frame.head(10))

## 4. Temporal Trend Analysis

Analyze capacity loss trends over time for each derivative and chemical.

In [None]:
# Time series analysis for each chemical
def plot_time_series_by_derivative(df_long, chemical_name):
    """Plot capacity loss over time, one stacked panel per derivative.

    Each panel shows the observed series and, when at least two distinct
    dates are available, a dashed least-squares linear trend. The
    "Grand Total" panel, if present, is drawn last with a shaded background.

    Args:
        df_long: Long-format frame with "Derivative", "Date" and
            "Capacity_Loss_kt" columns.
        chemical_name: Label used as the prefix of each panel title.

    Returns:
        None. The figure is rendered with ``plt.show()``; nothing is drawn
        when ``df_long`` contains no derivatives.
    """
    derivatives_list = df_long["Derivative"].unique().tolist()

    # Reorder to put Grand Total at the bottom
    if "Grand Total" in derivatives_list:
        derivatives_list.remove("Grand Total")
        derivatives_list.append("Grand Total")

    n_derivatives = len(derivatives_list)
    if n_derivatives == 0:
        # plt.subplots rejects a zero-row grid, so stop before building the figure.
        print(f"No derivatives to plot for {chemical_name}.")
        return

    # squeeze=False always returns a 2-D array of axes, so a single panel
    # needs no special-casing.
    _, axes = plt.subplots(
        n_derivatives, 1, figsize=(15, 4 * n_derivatives), squeeze=False
    )

    for ax, derivative in zip(axes[:, 0], derivatives_list, strict=True):
        df_subset = df_long[df_long["Derivative"] == derivative].copy()
        df_subset = df_subset.dropna(subset=["Date", "Capacity_Loss_kt"])

        ax.plot(
            df_subset["Date"],
            df_subset["Capacity_Loss_kt"],
            marker="o",
            linestyle="-",
            linewidth=2,
            markersize=3,
        )

        # Add trend line. The fit uses elapsed days rather than row position:
        # rows with missing values were dropped above, so positions would
        # compress the gaps and bend the line on the date axis.
        if len(df_subset) > 1 and df_subset["Date"].nunique() > 1:
            elapsed_days = (
                (df_subset["Date"] - df_subset["Date"].min()).dt.days.to_numpy()
            )
            coefficients = np.polyfit(
                elapsed_days, df_subset["Capacity_Loss_kt"].to_numpy(), 1
            )
            ax.plot(
                df_subset["Date"],
                np.polyval(coefficients, elapsed_days),
                "r--",
                alpha=0.8,
                linewidth=2,
                label="Trend",
            )

        # Highlight Grand Total with different styling
        if derivative == "Grand Total":
            ax.set_facecolor("#f0f0f0")
            ax.set_title(
                f"{chemical_name} - {derivative} (Overall)",
                fontsize=14,
                fontweight="bold",
                color="darkblue",
            )
        else:
            ax.set_title(
                f"{chemical_name} - {derivative}",
                fontsize=13,
                fontweight="bold",
            )

        ax.set_xlabel("Date")
        ax.set_ylabel("Capacity Loss (kt)")
        ax.grid(True, alpha=0.3)
        # Only the trend line carries a label; skip the legend when it was not drawn.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
        ax.tick_params(axis="x", rotation=45)

    plt.tight_layout()
    plt.show()


# Draw the per-derivative trend panels for each chemical in turn; the second
# heading is preceded by a blank line.
for lead, chemical_label, long_frame in (
    ("", "Acetone", acetone_long),
    ("\n", "Phenol", phenol_long),
):
    print(f"{lead}{chemical_label} Capacity Loss Trends:")
    plot_time_series_by_derivative(long_frame, chemical_label)

## 5. Comparative Analysis: Acetone vs Phenol

Compare capacity loss patterns between the two chemicals across all derivatives.

In [None]:
# Combine data for comparison
# Stack the two long tables; the "Chemical" column tells the rows apart.
combined_long = pd.concat([acetone_long, phenol_long], ignore_index=True)

# Focus on Grand Total for overall comparison
grand_total = combined_long[combined_long["Derivative"] == "Grand Total"].copy()

# Plot comparison: time series on top, distribution box plot below
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Time series comparison
for chemical in ["Acetone", "Phenol"]:
    # Rows without a parsed date or a value cannot be plotted.
    data = grand_total[grand_total["Chemical"] == chemical].dropna(
        subset=["Date", "Capacity_Loss_kt"]
    )
    axes[0].plot(
        data["Date"],
        data["Capacity_Loss_kt"],
        marker="o",
        linestyle="-",
        linewidth=2,
        markersize=4,
        label=chemical,
    )

axes[0].set_title(
    "Grand Total Capacity Loss Comparison: Acetone vs Phenol",
    fontsize=14,
    fontweight="bold",
)
axes[0].set_xlabel("Date")
axes[0].set_ylabel("Capacity Loss (kt)")
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].tick_params(axis="x", rotation=45)

# Box plot comparison
acetone_total = grand_total[grand_total["Chemical"] == "Acetone"][
    "Capacity_Loss_kt"
].dropna()
phenol_total = grand_total[grand_total["Chemical"] == "Phenol"][
    "Capacity_Loss_kt"
].dropna()

bp = axes[1].boxplot([acetone_total, phenol_total], patch_artist=True)
# Name the boxes through the tick labels instead of boxplot's `labels=`
# keyword, which matplotlib 3.9 deprecated in favour of `tick_labels=`; this
# form works on both sides of the rename.
axes[1].set_xticklabels(["Acetone", "Phenol"])

# patch_artist=True makes the boxes fillable patches; one colour per chemical.
for patch, color in zip(bp["boxes"], ["lightblue", "lightcoral"], strict=True):
    patch.set_facecolor(color)

axes[1].set_title(
    "Distribution of Grand Total Capacity Loss",
    fontsize=14,
    fontweight="bold",
)
axes[1].set_ylabel("Capacity Loss (kt)")
axes[1].grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()

# Statistical comparison
print("=" * 80)
print("STATISTICAL COMPARISON - Grand Total Capacity Loss")
print("=" * 80)
print("\nAcetone:")
print(f"  Mean: {acetone_total.mean():.2f} kt")
print(f"  Std Dev: {acetone_total.std():.2f} kt")
print(f"  Min: {acetone_total.min():.2f} kt")
print(f"  Max: {acetone_total.max():.2f} kt")

print("\nPhenol:")
print(f"  Mean: {phenol_total.mean():.2f} kt")
print(f"  Std Dev: {phenol_total.std():.2f} kt")
print(f"  Min: {phenol_total.min():.2f} kt")
print(f"  Max: {phenol_total.max():.2f} kt")

## 6. Derivative-Specific Analysis

Analyze capacity loss patterns for each specific derivative type.

In [None]:
# Analyze Bisphenol A (uses both chemicals)
print("=" * 80)
print("BISPHENOL A ANALYSIS - Uses Both Acetone AND Phenol")
print("=" * 80)

bpa_data = combined_long[combined_long["Derivative"] == "BISPHENOL A"].copy()

# Plot Bisphenol A consumption for both chemicals
plt.figure(figsize=(15, 6))

for chemical in ["Acetone", "Phenol"]:
    data = bpa_data[bpa_data["Chemical"] == chemical].dropna(
        subset=["Date", "Capacity_Loss_kt"]
    )
    if data.empty:
        # Nothing to draw, and .iloc[0] below would raise IndexError.
        print(f"No Bisphenol A data available for {chemical} - skipping.")
        continue
    # The conversion factor is read from the first row and shown in the legend.
    cf_value = data["Avg. Conversion Factor (+)"].iloc[0]
    plt.plot(
        data["Date"],
        data["Capacity_Loss_kt"],
        marker="o",
        linestyle="-",
        linewidth=2,
        markersize=4,
        label=f"{chemical} (CF={cf_value:.2f})",
    )

plt.title(
    "Bisphenol A Production - Capacity Loss for Both Chemicals",
    fontsize=14,
    fontweight="bold",
)
plt.xlabel("Date")
plt.ylabel("Capacity Loss (kt)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Calculate correlation
# Pair the two series on Date and keep only months where both have a value.
# Truncating the raw arrays to a common length would pair up different months
# whenever the series differ, and any NaN would turn the coefficient into NaN.
bpa_by_chemical = {
    chemical: bpa_data.loc[
        bpa_data["Chemical"] == chemical, ["Date", "Capacity_Loss_kt"]
    ].dropna()
    for chemical in ("Acetone", "Phenol")
}
bpa_pairs = (
    bpa_by_chemical["Acetone"]
    .merge(bpa_by_chemical["Phenol"], on="Date", suffixes=("_acetone", "_phenol"))
    .sort_values("Date")
)
acetone_bpa = bpa_pairs["Capacity_Loss_kt_acetone"].to_numpy(dtype=float)
phenol_bpa = bpa_pairs["Capacity_Loss_kt_phenol"].to_numpy(dtype=float)

# Number of months available in both series
min_len = len(bpa_pairs)
if min_len > 1:
    correlation = np.corrcoef(acetone_bpa, phenol_bpa)[0, 1]
    print(
        "\nCorrelation between Acetone and Phenol capacity loss "
        f"for Bisphenol A: {correlation:.3f}"
    )
    print("(High correlation suggests synchronized production disruptions)")
else:
    print("\nNot enough overlapping months to compute a Bisphenol A correlation.")

# Summary statistics by derivative
print("\n" + "=" * 80)
print("DERIVATIVE-SPECIFIC SUMMARY STATISTICS")
print("=" * 80)

for chemical in ["Acetone", "Phenol"]:
    print(f"\n{chemical} Capacity Loss by Derivative:")
    chem_data = combined_long[combined_long["Chemical"] == chemical]

    # One row per derivative; each (name, function) pair becomes a named column.
    summary = (
        chem_data.groupby("Derivative")["Capacity_Loss_kt"]
        .agg(
            [
                ("Mean", "mean"),
                ("Std Dev", "std"),
                ("Min", "min"),
                ("Max", "max"),
                ("Count", "count"),
            ]
        )
        .round(2)
    )

    display(summary)