# Compare EIA 930 imputation methods

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from pudl.analysis.timeseries_evaluation import plot_imputation, _filter_df
from pudl.analysis.timeseries_cleaning import melt_imputed_timeseries_matrix
from pudl.etl import defs

Helper functions to load a dataframe with simulated and actual data, rank months by error, and plot individual months.

In [None]:
def _load_comparison_data(
    base_asset: str,
    id_cols: list[str],
    value_col: str = "demand_imputed_pudl_mwh",
) -> pd.DataFrame:
    """Load the simulated and imputed versions of an asset and join them.

    Loads ``_{base_asset}_simulated`` and ``base_asset`` with
    ``defs.load_asset_value``, renames the value column and its
    ``*_imputation_code`` companion to ``simulated_*`` / ``imputed_*`` names,
    prints the value of the ``_{base_asset}_score`` asset, keeps only the
    simulated rows belonging to an (ID, month) combination that has at least
    one row whose imputation code is ``"simulated"``, and inner-joins the two
    frames.

    Args:
        base_asset: Name of the asset holding the imputed data.
        id_cols: Columns that, together with ``datetime_utc``, are used as the
            join key between the simulated and imputed frames.
        value_col: Name of the value column in both loaded assets.

    Returns:
        The merged dataframe. It includes the helper ``month`` column
        (monthly period) added to the simulated frame. Columns present in
        both frames but not in the join key receive pandas' default merge
        suffixes.
    """
    code_col = f"{value_col}_imputation_code"
    simulated_df = defs.load_asset_value(f"_{base_asset}_simulated").rename(
        columns={value_col: "simulated_demand", code_col: "simulated_demand_imputation_code"}
    )
    imputed_df = defs.load_asset_value(base_asset).rename(
        columns={value_col: "imputed_demand", code_col: "imputed_demand_imputation_code"}
    )
    print(f"Mean percent error for {base_asset}: ", defs.load_asset_value(f"_{base_asset}_score"))

    # Get months with simulated data. The month is derived from
    # ``datetime_utc`` -- the same timestamp column used as the join key below
    # and by the other helpers in this notebook.
    simulated_month_id_cols = ["month"] + id_cols
    simulated_df["month"] = simulated_df["datetime_utc"].dt.to_period("M")
    is_simulated = simulated_df["simulated_demand_imputation_code"] == "simulated"
    simulated_months = simulated_df.loc[is_simulated, simulated_month_id_cols].drop_duplicates()
    # Inner merge keeps every row (simulated or not) of the selected months.
    simulated_df = simulated_df.merge(simulated_months, on=simulated_month_id_cols)

    return simulated_df.merge(imputed_df, on=["datetime_utc"] + id_cols)


def _get_highest_error_months(
    comparison_df: pd.DataFrame,
    id_col: str,
    n_months: int = 30,
) -> pd.DataFrame:
    """Rank (ID, month) combinations by mean absolute error.

    The error of a row is ``|simulated_demand - imputed_demand|``. Rows are
    grouped by ``id_col`` and by month start (``freq="MS"`` on
    ``datetime_utc``), and the per-group mean error is sorted from largest to
    smallest.

    Args:
        comparison_df: Dataframe with ``simulated_demand``, ``imputed_demand``,
            ``datetime_utc`` and ``id_col`` columns. It is not modified.
        id_col: Column identifying the entity each row belongs to.
        n_months: Maximum number of rows to return.

    Returns:
        Dataframe with columns ``id_col``, ``datetime_utc`` (month start) and
        ``error``, ordered by descending ``error``.
    """
    abs_error = (comparison_df["simulated_demand"] - comparison_df["imputed_demand"]).abs()
    # Work on a narrow copy so no "error" column is written into the caller's frame.
    errors = comparison_df[[id_col, "datetime_utc"]].assign(error=abs_error)
    monthly_mean_error = errors.groupby(
        [id_col, pd.Grouper(key="datetime_utc", freq="MS")]
    )["error"].mean()
    return monthly_mean_error.sort_values(ascending=False).head(n_months).reset_index()


def _get_last_day_of_month(date_str):
    """Return the last day of the month containing ``date_str`` as ``YYYY-MM-DD``.

    ``date_str`` is anything ``pd.to_datetime`` turns into a single timestamp.
    """
    month_period = pd.to_datetime(date_str).to_period("M")
    # Converting the monthly period back at monthly frequency lands on the
    # final day of that month.
    month_end = month_period.to_timestamp("M")
    return month_end.strftime("%Y-%m-%d")


def _plot_month(comparison_df, simulated_months_df, id_col: str, month_idx: int):
    """Plot one month of ``comparison_df`` for a single entity.

    The entity and the month are read from row ``month_idx`` (positional) of
    ``simulated_months_df``: ``id_col`` gives the entity and ``datetime_utc``
    the start of the plotted range. The range ends on the last day of that
    month. ``imputed_demand`` is passed to ``plot_imputation`` as the reported
    series and ``simulated_demand`` as the imputed series.
    """
    row = simulated_months_df.iloc[month_idx]
    entity, first_day = row[id_col], row["datetime_utc"]
    plot_imputation(
        comparison_df,
        idx_cols=[id_col],
        # NOTE(review): this is a bare scalar, not a one-element tuple --
        # confirm that is what plot_imputation expects for idx_vals.
        idx_vals=entity,
        start_date=first_day,
        end_date=_get_last_day_of_month(first_day),
        reported_col="imputed_demand",
        imputed_col="simulated_demand",
        time_col="datetime_utc",
    )

## Analyze Combined Imputation

In [None]:
# Load the joined simulated/imputed frame for the combined imputation asset, keyed by generic_id.
comparison_df = _load_comparison_data("_out_eia930__combined_imputed_demand", id_cols=["generic_id"])

### Find the months with the largest difference between simulated and actual demand

In [None]:
# Rank (generic_id, month) pairs by mean absolute error, largest first.
simulated_months = _get_highest_error_months(comparison_df, id_col="generic_id")
# Bare expression so the notebook renders the ranking table.
simulated_months

In [None]:
# Plot the first row (position 0) of the ranking table.
_plot_month(comparison_df, simulated_months, "generic_id", 0)

In [None]:
# Plot the second row (position 1) of the ranking table.
_plot_month(comparison_df, simulated_months, "generic_id", 1)

In [None]:
# Plot the third row (position 2) of the ranking table.
_plot_month(comparison_df, simulated_months, "generic_id", 2)

## Analyze BA Imputation

In [None]:
# Load the joined simulated/imputed frame for the hourly operations asset, keyed by balancing authority code.
comparison_df = _load_comparison_data("out_eia930__hourly_operations", id_cols=["balancing_authority_code_eia"])

In [None]:
# Rank (balancing authority, month) pairs by mean absolute error, largest first.
simulated_months = _get_highest_error_months(comparison_df, id_col="balancing_authority_code_eia")
# Bare expression so the notebook renders the ranking table.
simulated_months

In [None]:
# Plot the first row (position 0) of the ranking table.
_plot_month(comparison_df, simulated_months, "balancing_authority_code_eia", 0)

In [None]:
# Plot the second row (position 1) of the ranking table.
_plot_month(comparison_df, simulated_months, "balancing_authority_code_eia", 1)

In [None]:
# Plot the third row (position 2) of the ranking table.
_plot_month(comparison_df, simulated_months, "balancing_authority_code_eia", 2)

## Analyze Subregion Imputation

In [None]:
# Load the joined simulated/imputed frame for the hourly subregion demand asset,
# keyed by balancing authority code plus subregion code.
# NOTE(review): the cells that follow use "combined_subregion_ba_id" as the ID
# column, which is not one of the id_cols joined on here -- confirm it is
# present (un-suffixed) in the merged frame.
comparison_df = _load_comparison_data(
    "out_eia930__hourly_subregion_demand",
    id_cols=["balancing_authority_code_eia", "balancing_authority_subregion_code_eia"],
)

In [None]:
# Rank (combined_subregion_ba_id, month) pairs by mean absolute error, largest first.
# NOTE(review): comparison_df was loaded with id_cols of BA code and subregion
# code; verify "combined_subregion_ba_id" exists in it before relying on this.
simulated_months = _get_highest_error_months(comparison_df, id_col="combined_subregion_ba_id")
# Bare expression so the notebook renders the ranking table.
simulated_months

In [None]:
# Plot the first row (position 0) of the ranking table.
_plot_month(comparison_df, simulated_months, "combined_subregion_ba_id", 0)

In [None]:
# Plot the second row (position 1) of the ranking table.
_plot_month(comparison_df, simulated_months, "combined_subregion_ba_id", 1)

In [None]:
# Plot the third row (position 2) of the ranking table.
_plot_month(comparison_df, simulated_months, "combined_subregion_ba_id", 2)