## Compare imputed assets

In [None]:
from pudl.etl import defs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Validate FERC 714 downstream assets


In [None]:
def _get_nightly_df(
    table_name: str,
    base_url: str = "https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/nightly",
) -> pd.DataFrame:
    """Read one published PUDL parquet table into a DataFrame.

    Args:
        table_name: Name of the table to fetch; ``{table_name}.parquet`` is
            appended to ``base_url``.
        base_url: URL prefix under which the parquet files live. Defaults to
            the nightly build location used throughout this notebook; pass a
            different prefix to compare against another set of outputs.

    Returns:
        The result of ``pd.read_parquet`` for the assembled URL.
    """
    # Strip any trailing slash so a caller-supplied prefix can't produce "//".
    return pd.read_parquet(f"{base_url.rstrip('/')}/{table_name}.parquet")

### `out_ferc714__hourly_estimated_state_demand`

In [None]:
# Load both versions of the same table for comparison:
#   nightly_df - the parquet file published under the nightly S3 prefix
#   local_df   - the value returned by `defs.load_asset_value`
#                (presumably the locally materialized asset; the plots below label it "New")
nightly_df = _get_nightly_df("out_ferc714__hourly_estimated_state_demand")
local_df = defs.load_asset_value("out_ferc714__hourly_estimated_state_demand")

In [None]:
# Quick sanity check: compare overall table dimensions first...
print("Nightly table shape: ", nightly_df.shape)
print("New table shape: ", local_df.shape)

# ...then the number of missing `demand_mwh` values on each side.
print("Nightly demand_mwh nulls: ", nightly_df["demand_mwh"].isna().sum())
print("New demand_mwh nulls: ", local_df["demand_mwh"].isna().sum())

#### Compare total `demand_mwh` distributions

In [None]:
# Side-by-side histograms of `demand_mwh`: nightly on the left, new on the right.
# The panels share a y-axis so the bar heights are directly comparable.
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, tight_layout=True)

axs[0].hist(nightly_df.demand_mwh, bins=100)
axs[0].set_title("Nightly")

axs[1].hist(local_df.demand_mwh, bins=100)
axs[1].set_title("New")

plt.show()

Cut off the tail of the distribution (keep only `demand_mwh` < 5000) to get a closer look at its bulk.

In [None]:
# Same comparison as above, restricted to rows with demand_mwh < 5000 so the
# long tail doesn't squash the bulk of the distribution into a few bins.
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, tight_layout=True)

axs[0].hist(nightly_df.loc[nightly_df.demand_mwh < 5000, "demand_mwh"], bins=100)
axs[0].set_title("Nightly")

axs[1].hist(local_df.loc[local_df.demand_mwh < 5000, "demand_mwh"], bins=100)
axs[1].set_title("New")

plt.show()

Overall the distributions look quite similar, but there are some obvious weird spikes in the new version.

#### Compare total `scaled_demand_mwh` distributions

In [None]:
# Side-by-side histograms of `scaled_demand_mwh`: nightly on the left, new on
# the right, with a shared y-axis.
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, tight_layout=True)

axs[0].hist(nightly_df.scaled_demand_mwh, bins=100)
axs[0].set_title("Nightly")

axs[1].hist(local_df.scaled_demand_mwh, bins=100)
axs[1].set_title("New")

plt.show()

#### Compare demand curves by state

In [None]:
# Which slice of the data to plot: one state (by FIPS code) for one calendar month.
state_id_fips = "01"
year = 2006
month = 3


def _select_state_month(df, state_id_fips, year, month):
    """Return the rows of ``df`` for one state and one calendar month.

    Args:
        df: Frame with ``state_id_fips`` and datetime-typed ``datetime_utc`` columns.
        state_id_fips: State FIPS code to keep, compared for equality.
        year: Calendar year of ``datetime_utc`` to keep.
        month: Calendar month (1-12) of ``datetime_utc`` to keep.

    Returns:
        The filtered subset of ``df``.
    """
    timestamps = df.datetime_utc.dt
    return df[
        (df.state_id_fips == state_id_fips)
        & (timestamps.year == year)
        & (timestamps.month == month)
    ]


# Apply the identical filter to both versions so the curves are comparable.
nightly_selection = _select_state_month(nightly_df, state_id_fips, year, month)
local_selection = _select_state_month(local_df, state_id_fips, year, month)

fig, ax = plt.subplots()
fig.set_size_inches(15, 5)
ax.plot(nightly_selection.datetime_utc, nightly_selection.demand_mwh, label="nightly", lw=0.8)
ax.plot(local_selection.datetime_utc, local_selection.demand_mwh, label="new", lw=0.8)
# Label the figure so it can be read without referring back to the code.
ax.set_title(f"demand_mwh for state FIPS {state_id_fips}, {year}-{month:02d}")
ax.set_xlabel("datetime_utc")
ax.set_ylabel("demand_mwh")
legend = ax.legend()

In the vast majority of cases the demand curves line up very closely. However, I have found a few cases where there is some weirdness, which I'll demonstrate below.

In [None]:
# Which slice of the data to plot: one state (by FIPS code) for one calendar month.
state_id_fips = "01"
year = 2006
month = 4

# Build the same state/year/month filter for each version of the table.
nightly_mask = (
    (nightly_df.state_id_fips == state_id_fips)
    & (nightly_df.datetime_utc.dt.year == year)
    & (nightly_df.datetime_utc.dt.month == month)
)
local_mask = (
    (local_df.state_id_fips == state_id_fips)
    & (local_df.datetime_utc.dt.year == year)
    & (local_df.datetime_utc.dt.month == month)
)
nightly_selection = nightly_df[nightly_mask]
local_selection = local_df[local_mask]

fig, ax = plt.subplots()
fig.set_size_inches(15, 5)
ax.plot(nightly_selection.datetime_utc, nightly_selection.demand_mwh, label="nightly", lw=0.8)
ax.plot(local_selection.datetime_utc, local_selection.demand_mwh, label="new", lw=0.8)
# Label the figure so it can be read without referring back to the code.
ax.set_title(f"demand_mwh for state FIPS {state_id_fips}, {year}-{month:02d}")
ax.set_xlabel("datetime_utc")
ax.set_ylabel("demand_mwh")
legend = ax.legend()

Here we see a really weird spike in the new demand curve before it seemingly gets back on track and follows the old curve again.

### `out_ferc714__summarized_demand`

In [None]:
# Rebind `nightly_df` / `local_df` to the summarized-demand table. From here on
# these names no longer refer to the hourly state demand frames loaded earlier.
nightly_df = _get_nightly_df("out_ferc714__summarized_demand")
local_df = defs.load_asset_value("out_ferc714__summarized_demand")

#### Compare total `demand_annual_mwh` distributions

In [None]:
# Side-by-side histograms of `demand_annual_mwh`: nightly on the left, new on
# the right, with a shared y-axis.
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, tight_layout=True)

axs[0].hist(nightly_df.demand_annual_mwh, bins=100)
axs[0].set_title("Nightly")

axs[1].hist(local_df.demand_annual_mwh, bins=100)
axs[1].set_title("New")

plt.show()

Cut off the tail again (keep only `demand_annual_mwh` < 4e8) to get a closer look at the bulk of the distribution.

In [None]:
# Same comparison as above, restricted to rows with demand_annual_mwh < 4e8 so
# the long tail doesn't dominate the binning.
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, tight_layout=True)

axs[0].hist(nightly_df.loc[nightly_df.demand_annual_mwh < 4e8, "demand_annual_mwh"], bins=100)
axs[0].set_title("Nightly")

axs[1].hist(local_df.loc[local_df.demand_annual_mwh < 4e8, "demand_annual_mwh"], bins=100)
axs[1].set_title("New")

plt.show()

In [None]:
# "New" is the value of `out_ferc714__hourly_planning_area_demand` loaded via `defs`;
# "old" is the nightly parquet of `_out_ferc714__hourly_imputed_demand`.
# NOTE(review): the two sides use different table names - presumably the imputed
# demand moved between tables; confirm these are the intended counterparts.
new_imputed = defs.load_asset_value("out_ferc714__hourly_planning_area_demand")
old_imputed = _get_nightly_df("_out_ferc714__hourly_imputed_demand")

In [None]:
# Print a summary of the new frame to see which columns are available to compare.
new_imputed.info()

In [None]:
# Print the same summary for the old frame, for comparison with the new one.
old_imputed.info()

In [None]:
# Align the new and old frames on (respondent_id_ferc714, datetime_utc).
# The outer join keeps keys that appear on only one side; columns present in
# both frames are disambiguated with `_new` / `_old` suffixes.
both_imputed = new_imputed.set_index(["respondent_id_ferc714", "datetime_utc"]).merge(
    old_imputed.set_index(["respondent_id_ferc714", "datetime_utc"]),
    how="outer",
    left_index=True,
    right_index=True,
    suffixes=("_new", "_old"),
)
both_imputed.info()

In [None]:
# NOTE(review): these imports would ideally live in the notebook's top import
# cell; they are kept here so this cell still runs on its own.
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

import matplotx
matplotlib.style.use(matplotx.styles.onedark)

# Turn the merge keys back into regular columns so `respondent_id_ferc714` can
# drive the point colors. The guard makes this cell safe to re-run: resetting
# an already-reset frame would insert a spurious extra index column.
if "respondent_id_ferc714" not in both_imputed.columns:
    both_imputed = both_imputed.reset_index()

# Assign a discrete color to each `respondent_id_ferc714`
unique_ids = both_imputed["respondent_id_ferc714"].unique()
palette = sns.color_palette("tab20", len(unique_ids))
color_map = {rid: palette[i] for i, rid in enumerate(unique_ids)}
colors = both_imputed["respondent_id_ferc714"].map(color_map)

# Old values (`demand_mwh`) on x, new values (`demand_imputed_pudl_mwh`) on y.
# Tiny, mostly transparent markers keep dense regions readable.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(
    both_imputed["demand_mwh"],
    both_imputed["demand_imputed_pudl_mwh"],
    c=colors,
    s=0.1,
    alpha=0.1,
)

# Set both axes to logarithmic scale
ax.set_xscale("log")
ax.set_yscale("log")

# Add gridlines on both major and minor ticks
ax.grid(True, which="both", linestyle="--", linewidth=0.5)

# Label the axes and title the figure
ax.set_xlabel("Old Imputed FERC-714 Planning Area Demand [MWh]")
ax.set_ylabel("New Imputed FERC-714 Planning Area Demand [MWh]")
ax.set_title("Log-Log Scatter Plot of Old vs New Imputed Demand")

plt.show()