# Notebook Preamble

## IPython Magic

In [None]:
# Load IPython's autoreload extension so that edits to imported modules can be
# picked up without restarting the kernel.
%load_ext autoreload
# NOTE(review): per the IPython docs, mode 3 reloads all modules and also picks
# up objects newly added to them -- confirm against the installed IPython.
%autoreload 3


## Notebook Imports

In [None]:
# Standard Library Imports:
import os
from pathlib import Path

# 3rd Party Imports:
import duckdb
import matplotlib
import matplotlib.pyplot as plt
import matplotx
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
from dagster import AssetKey

# Local Imports
import pudl
from pudl.etl import defs
from pudl.output.pudltabl import PudlTabl
from pudl.workspace.setup import PudlPaths

# Notebook-wide logger obtained from PUDL's logging helpers under the name "pudl".
logger = pudl.logging_helpers.get_logger("pudl")

## Visualization Settings

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Figure defaults are set first and the matplotx "onedark" style is applied
# afterwards, in that order.
matplotlib.rcParams.update(
    {
        "figure.figsize": (10, 6),
        "figure.dpi": 150,
    }
)
matplotlib.style.use(matplotx.styles.onedark)

# pandas display limits so wide / long frames are not truncated too eagerly.
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 300),
    ("display.max_colwidth", 1000),
):
    pd.set_option(option_name, option_value)

# Data access shortcuts

In [None]:
def get_table(table: str):
    """Load the value of a PUDL asset through the Dagster definitions.

    Args:
        table: Name of the asset; it is wrapped in an ``AssetKey`` before lookup.

    Returns:
        Whatever ``defs.load_asset_value`` returns for that asset key
        (presumably a pandas DataFrame for table assets -- TODO confirm).
    """
    asset_key = AssetKey(table)
    return defs.load_asset_value(asset_key)

def get_parquet(table: str) -> pd.DataFrame:
    """Read a table's Parquet file from the local PUDL output directory.

    The file is looked up at ``$PUDL_OUTPUT/parquet/<table>.parquet``.

    Args:
        table: Table name, used as the Parquet file's stem.

    Returns:
        The result of ``pd.read_parquet`` for that file.

    Raises:
        KeyError: If the ``PUDL_OUTPUT`` environment variable is not set.
    """
    output_root = Path(os.environ["PUDL_OUTPUT"])
    parquet_file = output_root.joinpath(f"parquet/{table}.parquet")
    return pd.read_parquet(parquet_file)

In [None]:
from pudl.analysis.timeseries_evaluation import plot_imputation

In [None]:
# Load the EIA-930 hourly subregion demand output asset; the cells below read
# its reported / imputed demand and imputation-code columns.
eia930_sub = get_table("out_eia930__hourly_subregion_demand")

In [None]:
# Display the imputation-reason codes table for reference.
get_table("core_pudl__codes_imputation_reasons")

# Find some bad data

Calculate the proportion of imputed values for each balancing authority subregion and year to identify areas with a lot of imputation, so we can see what the imputed results look like.

In [None]:
# Fraction of hourly records with a non-null imputation code, per balancing
# authority, subregion and calendar year, sorted from most to least imputed.
# Taking the mean of the boolean flag directly avoids running a Python lambda
# once per group via ``groupby(...).apply``.
imputation_code_col = "demand_imputed_pudl_mwh_imputation_code"
bad_data = (
    eia930_sub[imputation_code_col]
    .notna()
    .groupby(
        [
            eia930_sub["balancing_authority_code_eia"],
            eia930_sub["balancing_authority_subregion_code_eia"],
            eia930_sub["datetime_utc"].dt.year,  # calendar year of each record
        ],
        observed=True,
    )
    .mean()
    .sort_values(ascending=False)
)
# Of the 50 most-imputed groups, show the last 25.
bad_data.head(50).tail(25)

In [None]:
idx_cols = ["balancing_authority_code_eia", "balancing_authority_subregion_code_eia"]
reported_col = "demand_reported_mwh"
imputed_col = "demand_imputed_pudl_mwh"

# Each entry is ((balancing authority, subregion), start_date, end_date) for one
# window to inspect. Add or remove rows here instead of copying the plot call.
imputation_windows = [
    (("CISO", "PGAE"), "2019-02-01", "2019-02-20"),
    (("CISO", "VEA"), "2019-02-01", "2019-02-20"),
    (("SWPP", "INDN"), "2019-12-01", "2019-12-31"),
    (("SWPP", "INDN"), "2024-12-01", "2024-12-31"),
    (("PNM", "KCEC"), "2022-06-15", "2022-07-15"),
    (("CISO", "VEA"), "2019-12-01", "2019-12-31"),
]

# NOTE(review): the return value of plot_imputation is discarded here; this
# assumes the figure is rendered as a side effect of the call -- confirm.
for idx_vals, start_date, end_date in imputation_windows:
    plot_imputation(
        eia930_sub,
        idx_cols=idx_cols,
        idx_vals=idx_vals,
        start_date=start_date,
        end_date=end_date,
        reported_col=reported_col,
        imputed_col=imputed_col,
    )


In [None]:
# NOTE(review): unconditional failure -- this looks like a deliberate stop so
# that "Run All" halts before the cells below; confirm before removing.
assert False

# Time series correlation scatterplot

In [None]:
# Read two FERC-714 hourly demand tables from the "nightly" prefix of the PUDL
# S3 bucket.
# NOTE(review): judging by the variable and table names, `new_imputed` is the
# current planning-area demand output and `old_imputed` the earlier
# imputed-demand table -- confirm.
new_imputed = pd.read_parquet("s3://pudl.catalyst.coop/nightly/out_ferc714__hourly_planning_area_demand.parquet")
old_imputed = pd.read_parquet("s3://pudl.catalyst.coop/nightly/_out_ferc714__hourly_imputed_demand.parquet")

In [None]:
# Outer-join the two tables on (respondent, hour) so every record from either
# table is kept; any column present in both gets a "_new" / "_old" suffix.
ferc714_keys = ["respondent_id_ferc714", "datetime_utc"]
both_imputed = (
    new_imputed.set_index(ferc714_keys)
    .merge(
        old_imputed.set_index(ferc714_keys),
        how="outer",
        left_index=True,
        right_index=True,
        suffixes=("_new", "_old"),
    )
    # Turn the key index levels back into ordinary data columns.
    .reset_index()
)
both_imputed.info()

In [None]:
# Assign a discrete color to each `respondent_id_ferc714`.
# factorize() yields one integer code per row plus the distinct ids in order of
# first appearance, so the per-row colors can be looked up with a single array
# index instead of mapping every row to an RGB tuple in Python.
# Requesting more colors than "tab20" contains makes seaborn cycle the palette,
# so colors repeat across respondents when there are more than 20 of them.
respondent_codes, unique_ids = pd.factorize(both_imputed["respondent_id_ferc714"])
palette = sns.color_palette("tab20", len(unique_ids))
color_map = dict(zip(unique_ids, palette))
colors = np.asarray(palette)[respondent_codes]

# Scatter the old imputed values (x) against the new imputed values (y).
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(
    both_imputed["demand_mwh"],
    both_imputed["demand_imputed_pudl_mwh"],
    c=colors,
    s=0.1,
)

# Log-log axes with identical limits on both axes.
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlim(1e-1, 1e6)
ax.set_ylim(1e-1, 1e6)

# Major and minor gridlines.
ax.grid(True, which="both", linestyle="--", linewidth=0.5)

# Axis labels and title.
ax.set_xlabel("Old Imputed FERC-714 Planning Area Demand [MWh]")
ax.set_ylabel("New Imputed FERC-714 Planning Area Demand [MWh]")
ax.set_title("Log-Log Scatter Plot of Old vs New Imputed Demand")

plt.show()

In [None]:
# NOTE(review): unconditional failure -- this looks like a deliberate stop so
# that "Run All" halts before the cells below; confirm before removing.
assert False

In [None]:
# Load the output ("out_") and core ("core_") EIA-930 hourly subregion demand
# assets for side-by-side inspection.
# NOTE(review): the first line loads the same asset that was already loaded
# into `eia930_sub` earlier in the notebook.
out_sub_eia930 = get_table("out_eia930__hourly_subregion_demand")
core_sub_eia930 = get_table("core_eia930__hourly_subregion_demand")

In [None]:
# Display the core subregion demand table.
core_sub_eia930

## subregion output EIA-930 notes
- There's no EIA imputation for the subregions, but for naming clarity purposes, do we want to rename `demand_imputed_mwh` so that it will be consistent with the names of the PUDL imputed columns that will exist in other tables?
- It would be nice if the table were sorted by BA Code, subregion code, and time to ensure contiguous time series.
- Looking at `CISO` I see that there are **more** NA values in the imputed column than the reported column. Is that expected? I would have thought we'd fill in the missing values.


In [None]:
# Display the output subregion demand table discussed in the notes above.
out_sub_eia930