# Comparison: Shutdown Dataset vs Capacity Loss Dataset

This notebook compares the shutdown/outage dataset with the phenol_acetone_capacity_loss dataset to determine if they contain the same information.

## Hypothesis
The shutdown dataset contains individual outage events with start/end dates and total capacity loss. If we aggregate these by month, they should match the monthly capacity loss values in the capacity_loss dataset.

## Approach
1. Load both datasets
2. Aggregate shutdown data by month (distribute capacity loss across outage period)
3. Compare aggregated shutdown data with capacity_loss monthly data
4. Visualize differences and similarities
5. Identify any discrepancies


## 1. Setup and Data Loading


In [None]:
# Import required libraries
from pathlib import Path
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

# Set visualization style
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")
%matplotlib inline

# Configure display options
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")

In [None]:
# Read the monthly capacity-loss tables (one parquet file per chemical)
capacity_loss_path = Path("../../data/raw/phenol_acetone_capacity_loss/")

acetone_capacity_loss = pd.read_parquet(
    capacity_loss_path.joinpath("acetone_consumption_capacity_loss_kt.pq")
)
phenol_capacity_loss = pd.read_parquet(
    capacity_loss_path.joinpath("phenol_consumption_capacity_loss_kt.pq")
)

# Quick sanity summary: table sizes, then the leading column labels
print(
    "Capacity Loss Data:",
    f"Acetone shape: {acetone_capacity_loss.shape}",
    f"Phenol shape: {phenol_capacity_loss.shape}",
    sep="\n",
)
print("\nAcetone columns (first 10):", acetone_capacity_loss.columns[:10].tolist())
print("Phenol columns (first 10):", phenol_capacity_loss.columns[:10].tolist())

In [None]:
# Read the per-event shutdown tables (one CSV per chemical)
shutdown_path = Path("../../data/raw/shutdown/")

acetone_shutdown = pd.read_csv(shutdown_path.joinpath("Acetone.csv"))
phenol_shutdown = pd.read_csv(shutdown_path.joinpath("Phenol.csv"))

# Parse both outage date columns in both tables; unparseable entries become NaT
for shutdown_events in (acetone_shutdown, phenol_shutdown):
    for date_column in ("Outage Start Date", "Outage End Date"):
        shutdown_events[date_column] = pd.to_datetime(
            shutdown_events[date_column], errors="coerce"
        )

# Event counts, then the earliest start and latest end seen for each chemical
print(
    "Shutdown Data:",
    f"Acetone shutdowns: {len(acetone_shutdown)} events",
    f"Phenol shutdowns: {len(phenol_shutdown)} events",
    sep="\n",
)
print(
    "\nAcetone date range:",
    f"  Start: {acetone_shutdown['Outage Start Date'].min()}",
    f"  End: {acetone_shutdown['Outage End Date'].max()}",
    sep="\n",
)
print(
    "\nPhenol date range:",
    f"  Start: {phenol_shutdown['Outage Start Date'].min()}",
    f"  End: {phenol_shutdown['Outage End Date'].max()}",
    sep="\n",
)

## 2. Understanding the Data Structures


In [None]:
# Peek at each capacity-loss table and list the derivatives it covers
print("=" * 80, "CAPACITY LOSS DATA STRUCTURE", "=" * 80, sep="\n")

for chemical, capacity_loss_table in (
    ("Acetone", acetone_capacity_loss),
    ("Phenol", phenol_capacity_loss),
):
    print(f"\n{chemical} Capacity Loss - First few rows:")
    display(capacity_loss_table.head())

    print(f"\nDerivatives in {chemical} data:")
    print(capacity_loss_table["Derivative"].unique())

In [None]:
# Peek at the first ten shutdown events per chemical, key columns only
shutdown_preview_columns = [
    "Company",
    "Site",
    "Outage Start Date",
    "Outage End Date",
    "Total Outage Days",
    "Total Capacity Loss (kt)",
    "Cause",
]

print("=" * 80, "SHUTDOWN DATA STRUCTURE", "=" * 80, sep="\n")

for chemical, shutdown_events in (
    ("Acetone", acetone_shutdown),
    ("Phenol", phenol_shutdown),
):
    print(f"\n{chemical} Shutdown - Sample records:")
    display(shutdown_events.loc[:, shutdown_preview_columns].head(10))

## 3. Aggregate Shutdown Data by Month

For each shutdown event, we need to distribute the total capacity loss across the months it spans. We use a simple approach: divide the event's total capacity loss by its number of outage days (counting both the start and end dates) to get a uniform daily loss, then sum the daily losses that fall in each calendar month.


In [None]:
def aggregate_shutdowns_by_month(df_shutdown, chemical_name):
    """Aggregate shutdown events into monthly capacity-loss totals.

    Each event's total loss is spread uniformly over its outage days (start
    and end dates both included); the daily amounts are then summed per
    calendar month across all events.

    Args:
        df_shutdown: Event table with the columns "Outage Start Date" and
            "Outage End Date" (already parsed to datetimes) and
            "Total Capacity Loss (kt)".
        chemical_name: Label written to the "Chemical" column of the result.

    Returns:
        DataFrame sorted by month with the columns "Month" ("%b %Y" label),
        "Capacity_Loss_kt", "Date" (first day of the month) and "Chemical".
        When no event is usable the frame is empty but still carries all four
        columns, so callers can keep accessing "Date" without special-casing.
    """
    output_schema = {
        "Month": "object",
        "Capacity_Loss_kt": "float64",
        "Date": "datetime64[ns]",
        "Chemical": "object",
    }

    # Accumulated loss per calendar month, keyed by monthly pd.Period
    monthly_totals = {}

    for _, row in df_shutdown.iterrows():
        start_date = row["Outage Start Date"]
        end_date = row["Outage End Date"]
        total_loss = row["Total Capacity Loss (kt)"]

        # Skip events with a missing date or missing loss figure
        if pd.isna(start_date) or pd.isna(end_date) or pd.isna(total_loss):
            continue

        # +1 so that both the start day and the end day count as outage days
        total_days = (end_date - start_date).days + 1

        # Skip events whose end date precedes their start date
        if total_days <= 0:
            continue

        daily_loss = total_loss / total_days

        # Count the outage days falling in each month instead of materialising
        # one record per day
        outage_days = pd.date_range(start=start_date, end=end_date, freq="D")
        days_per_month = outage_days.to_period("M").value_counts()
        for month, n_days in days_per_month.items():
            monthly_totals[month] = (
                monthly_totals.get(month, 0.0) + daily_loss * n_days
            )

    if not monthly_totals:
        # Same columns as the populated result, so downstream code does not
        # fail with a KeyError on "Date" / "Chemical"
        return pd.DataFrame(
            {name: pd.Series(dtype=dtype) for name, dtype in output_schema.items()}
        )

    months = sorted(monthly_totals)
    monthly_agg = pd.DataFrame(
        {
            "Month": [month.strftime("%b %Y") for month in months],
            "Capacity_Loss_kt": [monthly_totals[month] for month in months],
            "Date": [month.to_timestamp() for month in months],
        }
    )
    monthly_agg["Chemical"] = chemical_name

    return monthly_agg


# Build the monthly shutdown series for each chemical
acetone_shutdown_monthly = aggregate_shutdowns_by_month(
    df_shutdown=acetone_shutdown, chemical_name="Acetone"
)
phenol_shutdown_monthly = aggregate_shutdowns_by_month(
    df_shutdown=phenol_shutdown, chemical_name="Phenol"
)

shutdown_monthly_tables = (
    ("Acetone", acetone_shutdown_monthly),
    ("Phenol", phenol_shutdown_monthly),
)

# Coverage summary: number of months and the span they cover
print("Aggregated Shutdown Data by Month:")
for chemical, monthly_table in shutdown_monthly_tables:
    first_month = monthly_table["Date"].min()
    last_month = monthly_table["Date"].max()
    print(f"\n{chemical}: {len(monthly_table)} months with data")
    print(f"Date range: {first_month} to {last_month}")

# Preview of the first ten months per chemical
for chemical, monthly_table in shutdown_monthly_tables:
    print(f"\n{chemical} shutdown monthly (first 10):")
    display(monthly_table.head(10))

## 4. Extract Monthly Data from Capacity Loss Dataset

Extract the "Grand Total" row from the capacity loss dataset; this row should represent the total capacity loss across all derivatives.


In [None]:
def extract_monthly_capacity_loss(df_capacity_loss, chemical_name):
    """Extract the monthly 'Grand Total' capacity loss in long format.

    The input is a wide table: one row per "Derivative" and one column per
    month (labels such as "Jan 2020"). Only the first row whose "Derivative"
    equals "Grand Total" is used; it is the row expected to match the
    aggregated shutdowns.

    Args:
        df_capacity_loss: Wide capacity-loss table with a "Derivative" column.
        chemical_name: Label written to the "Chemical" column of the result
            and used in the warning message.

    Returns:
        DataFrame sorted by month with the columns "Month" (original column
        label), "Capacity_Loss_kt", "Date" (parsed from the label with
        "%b %Y") and "Chemical". Columns whose label does not parse as a
        month, and months whose Grand Total value is missing, are left out.
        If there is no Grand Total row, or it holds no values, the frame is
        empty but still carries all four columns.
    """
    output_schema = {
        "Month": "object",
        "Capacity_Loss_kt": "float64",
        "Date": "datetime64[ns]",
        "Chemical": "object",
    }

    def _empty_result():
        # Same columns as the populated result, so downstream code does not
        # fail with a KeyError on "Date" / "Capacity_Loss_kt"
        return pd.DataFrame(
            {name: pd.Series(dtype=dtype) for name, dtype in output_schema.items()}
        )

    # Get Grand Total row
    grand_total = df_capacity_loss[df_capacity_loss["Derivative"] == "Grand Total"]

    if grand_total.empty:
        print(f"Warning: No 'Grand Total' found for {chemical_name}")
        return _empty_result()

    # Every column except the known metadata columns is a month candidate
    metadata_cols = ("Derivative", "Avg. Conversion Factor (+)")
    month_cols = [col for col in df_capacity_loss.columns if col not in metadata_cols]

    # Wide -> long, keeping only months that actually have a value
    monthly_data = []
    for col in month_cols:
        value = grand_total[col].iloc[0]
        if pd.notna(value):
            monthly_data.append({"Month": col, "Capacity_Loss_kt": value})

    if not monthly_data:
        # Grand Total exists but every month is missing
        return _empty_result()

    df_monthly = pd.DataFrame(monthly_data)

    # Parse the labels for sorting; non-month columns become NaT and are dropped
    df_monthly["Date"] = pd.to_datetime(
        df_monthly["Month"], format="%b %Y", errors="coerce"
    )
    df_monthly = (
        df_monthly.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)
    )

    df_monthly["Chemical"] = chemical_name

    return df_monthly


# Build the monthly Grand Total series for each chemical
acetone_capacity_loss_monthly = extract_monthly_capacity_loss(
    df_capacity_loss=acetone_capacity_loss, chemical_name="Acetone"
)
phenol_capacity_loss_monthly = extract_monthly_capacity_loss(
    df_capacity_loss=phenol_capacity_loss, chemical_name="Phenol"
)

capacity_loss_monthly_tables = (
    ("Acetone", acetone_capacity_loss_monthly),
    ("Phenol", phenol_capacity_loss_monthly),
)

# Coverage summary: number of months and the span they cover
print("Capacity Loss Monthly Data:")
for chemical, monthly_table in capacity_loss_monthly_tables:
    first_month = monthly_table["Date"].min()
    last_month = monthly_table["Date"].max()
    print(f"\n{chemical}: {len(monthly_table)} months with data")
    print(f"Date range: {first_month} to {last_month}")

# Preview of the first ten months per chemical
for chemical, monthly_table in capacity_loss_monthly_tables:
    print(f"\n{chemical} capacity loss monthly (first 10):")
    display(monthly_table.head(10))

## 5. Compare the Two Datasets


In [None]:
def compare_datasets(df_shutdown_monthly, df_capacity_loss_monthly, chemical_name):
    """Compare shutdown monthly data with capacity loss monthly data.

    Both inputs need the columns "Date", "Month" and "Capacity_Loss_kt".
    Dates are snapped to the first day of their month and the two tables are
    outer-joined on it, so every month present in either source appears once.
    Neither input frame is modified.

    Note: Capacity loss values are negated to match the sign convention
    of shutdown data.

    Args:
        df_shutdown_monthly: Monthly totals aggregated from shutdown events.
        df_capacity_loss_monthly: Monthly totals from the capacity-loss table.
        chemical_name: Not used in the computation; kept so the signature
            stays compatible with existing calls.

    Returns:
        DataFrame sorted by "Date" with the columns "Date", "Month",
        "Capacity_Loss_kt_shutdown", "Capacity_Loss_kt_capacity_loss",
        "has_shutdown_data", "has_capacity_loss_data", "Difference"
        (capacity loss minus shutdown), "Abs_Difference" and "Pct_Difference"
        (difference as a percentage of the absolute capacity-loss value).
        Values missing on one side are reported as 0, with the "has_*" flags
        recording which side was really present. "Pct_Difference" is NaN
        where the capacity-loss value is zero or missing, because a
        percentage of zero is undefined.
    """

    def _to_month_start(dates):
        # Snap every timestamp to the first day of its month so keys line up
        return pd.to_datetime(dates).dt.to_period("M").dt.to_timestamp()

    df_shutdown_monthly = df_shutdown_monthly.copy()
    df_shutdown_monthly["Date"] = _to_month_start(df_shutdown_monthly["Date"])

    df_capacity_loss_monthly = df_capacity_loss_monthly.copy()
    df_capacity_loss_monthly["Date"] = _to_month_start(
        df_capacity_loss_monthly["Date"]
    )

    # Negate capacity loss values to match sign convention (if it's -1, it becomes 1)
    df_capacity_loss_monthly["Capacity_Loss_kt"] = -df_capacity_loss_monthly[
        "Capacity_Loss_kt"
    ]

    # Outer join keeps months that appear in only one of the two sources
    key_columns = ["Date", "Month", "Capacity_Loss_kt"]
    comparison = pd.merge(
        df_shutdown_monthly[key_columns],
        df_capacity_loss_monthly[key_columns],
        on="Date",
        how="outer",
        suffixes=("_shutdown", "_capacity_loss"),
    ).sort_values("Date")

    shutdown_col = "Capacity_Loss_kt_shutdown"
    capacity_col = "Capacity_Loss_kt_capacity_loss"

    # Track which values were originally present (not NaN) before filling
    comparison["has_shutdown_data"] = comparison[shutdown_col].notna()
    comparison["has_capacity_loss_data"] = comparison[capacity_col].notna()

    # Fill missing values with 0 for comparison calculations
    comparison[shutdown_col] = comparison[shutdown_col].fillna(0)
    comparison[capacity_col] = comparison[capacity_col].fillna(0)

    # Calculate difference
    comparison["Difference"] = comparison[capacity_col] - comparison[shutdown_col]
    comparison["Abs_Difference"] = comparison["Difference"].abs()

    # Mask zero denominators to NaN: dividing by a tiny epsilon instead would
    # report meaningless percentages of the order of 1e12
    denominator = comparison[capacity_col].abs()
    comparison["Pct_Difference"] = (
        comparison["Difference"] / denominator.where(denominator > 0) * 100
    )

    # Get month name from either column
    comparison["Month"] = comparison["Month_shutdown"].fillna(
        comparison["Month_capacity_loss"]
    )

    output_columns = [
        "Date",
        "Month",
        shutdown_col,
        capacity_col,
        "has_shutdown_data",
        "has_capacity_loss_data",
        "Difference",
        "Abs_Difference",
        "Pct_Difference",
    ]
    return comparison[output_columns].copy()


# Run the month-by-month comparison for each chemical
acetone_comparison = compare_datasets(
    df_shutdown_monthly=acetone_shutdown_monthly,
    df_capacity_loss_monthly=acetone_capacity_loss_monthly,
    chemical_name="Acetone",
)
phenol_comparison = compare_datasets(
    df_shutdown_monthly=phenol_shutdown_monthly,
    df_capacity_loss_monthly=phenol_capacity_loss_monthly,
    chemical_name="Phenol",
)

# Boolean masks selecting the months for which BOTH sources reported a value
acetone_both = (
    acetone_comparison["has_shutdown_data"]
    & acetone_comparison["has_capacity_loss_data"]
)
phenol_both = (
    phenol_comparison["has_shutdown_data"] & phenol_comparison["has_capacity_loss_data"]
)

# --- Acetone summary ---
print("=" * 80, "ACETONE COMPARISON", "=" * 80, sep="\n")
print(f"Total months compared: {len(acetone_comparison)}")
print(f"Months with shutdown data: {acetone_comparison['has_shutdown_data'].sum()}")
print(
    "Months with capacity loss data: "
    f"{acetone_comparison['has_capacity_loss_data'].sum()}"
)
print(f"Months with data in both: {acetone_both.sum()}")

if acetone_both.any():
    acetone_overlap_mad = acetone_comparison.loc[acetone_both, "Abs_Difference"].mean()
    print(
        f"\nMean absolute difference (where both exist): {acetone_overlap_mad:.2f} kt"
    )
    # The maximum is taken over ALL months, not only the overlapping ones
    print(
        f"Max absolute difference: {acetone_comparison['Abs_Difference'].max():.2f} kt"
    )
    # A correlation needs at least two overlapping months
    if acetone_both.sum() > 1:
        acetone_overlap_corr = acetone_comparison.loc[
            acetone_both, "Capacity_Loss_kt_shutdown"
        ].corr(acetone_comparison.loc[acetone_both, "Capacity_Loss_kt_capacity_loss"])
        print(f"Correlation (where both exist): {acetone_overlap_corr:.3f}")

# --- Phenol summary ---
print("\n" + "=" * 80, "PHENOL COMPARISON", "=" * 80, sep="\n")
print(f"Total months compared: {len(phenol_comparison)}")
print(f"Months with shutdown data: {phenol_comparison['has_shutdown_data'].sum()}")
print(
    "Months with capacity loss data: "
    f"{phenol_comparison['has_capacity_loss_data'].sum()}"
)
print(f"Months with data in both: {phenol_both.sum()}")

if phenol_both.any():
    phenol_overlap_mad = phenol_comparison.loc[phenol_both, "Abs_Difference"].mean()
    print(f"\nMean absolute difference (where both exist): {phenol_overlap_mad:.2f} kt")
    # The maximum is taken over ALL months, not only the overlapping ones
    print(
        f"Max absolute difference: {phenol_comparison['Abs_Difference'].max():.2f} kt"
    )
    # A correlation needs at least two overlapping months
    if phenol_both.sum() > 1:
        phenol_overlap_corr = phenol_comparison.loc[
            phenol_both, "Capacity_Loss_kt_shutdown"
        ].corr(phenol_comparison.loc[phenol_both, "Capacity_Loss_kt_capacity_loss"])
        print(f"Correlation (where both exist): {phenol_overlap_corr:.3f}")

In [None]:
# List, per chemical, the 20 months where the two sources disagree the most
difference_columns = [
    "Month",
    "Capacity_Loss_kt_shutdown",
    "Capacity_Loss_kt_capacity_loss",
    "Difference",
    "Pct_Difference",
]

print("=" * 80, "ACETONE - Top 20 Months with Largest Differences", "=" * 80, sep="\n")
display(acetone_comparison.nlargest(20, "Abs_Difference").loc[:, difference_columns])

print(
    "\n" + "=" * 80,
    "PHENOL - Top 20 Months with Largest Differences",
    "=" * 80,
    sep="\n",
)
display(phenol_comparison.nlargest(20, "Abs_Difference").loc[:, difference_columns])

## 6. Visualizations


In [None]:
# Time series of both sources, one panel per chemical
# Note: Capacity Loss values are negated to match sign convention
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# (column, legend label, marker, line style) for the two lines on each panel
series_specs = (
    ("Capacity_Loss_kt_shutdown", "Shutdown Dataset (Aggregated)", "o", "-"),
    ("Capacity_Loss_kt_capacity_loss", "Capacity Loss Dataset (negated)", "s", "--"),
)

for ax, (chemical, comparison) in zip(
    axes, (("Acetone", acetone_comparison), ("Phenol", phenol_comparison))
):
    for column, legend_label, marker, line_style in series_specs:
        ax.plot(
            comparison["Date"],
            comparison[column],
            marker=marker,
            linestyle=line_style,
            linewidth=2,
            markersize=3,
            label=legend_label,
            alpha=0.7,
        )
    ax.set_title(
        f"{chemical}: Shutdown vs Capacity Loss Comparison",
        fontsize=14,
        fontweight="bold",
    )
    ax.set_xlabel("Date")
    ax.set_ylabel("Capacity Loss (kt)")
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Scatter of shutdown vs capacity-loss values, one panel per chemical
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Only months where both datasets have data are plotted
acetone_both = (
    acetone_comparison["has_shutdown_data"]
    & acetone_comparison["has_capacity_loss_data"]
)
phenol_both = (
    phenol_comparison["has_shutdown_data"] & phenol_comparison["has_capacity_loss_data"]
)

# (axes, chemical, comparison table, overlap mask, extra scatter styling)
scatter_panels = (
    (axes[0], "Acetone", acetone_comparison, acetone_both, {}),
    (axes[1], "Phenol", phenol_comparison, phenol_both, {"color": "orange"}),
)

for ax, chemical, comparison, both_mask, point_style in scatter_panels:
    overlap = comparison[both_mask]
    shutdown_kt = overlap["Capacity_Loss_kt_shutdown"]
    capacity_kt = overlap["Capacity_Loss_kt_capacity_loss"]

    ax.scatter(shutdown_kt, capacity_kt, alpha=0.6, s=50, **point_style)

    # Reference diagonal spanning the plotted value range (needs some points)
    if both_mask.sum() > 0:
        max_val = max(shutdown_kt.max(), capacity_kt.max())
        min_val = min(shutdown_kt.min(), capacity_kt.min())
        ax.plot(
            [min_val, max_val],
            [min_val, max_val],
            "r--",
            linewidth=2,
            label="Perfect Match",
        )

    ax.set_xlabel("Shutdown Dataset (kt)")
    ax.set_ylabel("Capacity Loss Dataset (negated) (kt)")
    ax.set_title(f"{chemical}: Correlation Plot", fontsize=14, fontweight="bold")
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Monthly difference between the two sources over time, one panel per chemical
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# (axes, chemical, comparison table, extra line styling)
difference_panels = (
    (axes[0], "Acetone", acetone_comparison, {}),
    (axes[1], "Phenol", phenol_comparison, {"color": "orange"}),
)

for ax, chemical, comparison, line_style in difference_panels:
    ax.plot(
        comparison["Date"],
        comparison["Difference"],
        marker="o",
        linestyle="-",
        linewidth=1.5,
        markersize=3,
        alpha=0.7,
        **line_style,
    )
    # Horizontal reference at zero, i.e. perfect agreement
    ax.axhline(y=0, color="r", linestyle="--", linewidth=2, label="Zero Difference")
    ax.set_title(
        f"{chemical}: Difference (Capacity Loss - Shutdown) Over Time",
        fontsize=14,
        fontweight="bold",
    )
    ax.set_xlabel("Date")
    ax.set_ylabel("Difference (kt)")
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()