# Setup

In [4]:
import os
from pathlib import Path

import pandas as pd
import pyarrow.parquet as pq


# Load data



In [5]:
# Name of the PUDL table to load. This must equal the stem (the file name
# without its extension) of a file stored somewhere under the PUDL_OUTPUT
# directory. Change the value to load a different table.
ferc714_yearly = "core_ferc714__yearly_planning_area_demand_forecast"

In [6]:
def path_finder(target_asset_name):
    """Return the path of the PUDL output file whose stem is ``target_asset_name``.

    The PUDL output directory is read from the ``PUDL_OUTPUT`` environment
    variable when it is set. Otherwise the first ``export PUDL_OUTPUT=...``
    line found in ``~/.bashrc`` is used. That directory is then searched
    recursively and the first file whose name (without extension) equals
    ``target_asset_name`` is returned.

    Args:
        target_asset_name: File name without its extension, e.g.
            ``"core_ferc714__yearly_planning_area_demand_forecast"``.

    Returns:
        The path of the matching file as a string, or ``""`` when PUDL_OUTPUT
        cannot be determined or no file matches.
    """
    export_prefix = "export PUDL_OUTPUT="

    # Prefer the live environment: it is what the running kernel actually sees
    # and it also works when the user's shell is not bash.
    pudl_output_filepath = os.environ.get("PUDL_OUTPUT")

    if not pudl_output_filepath:
        bashrc_path = Path.home() / ".bashrc"
        # A missing .bashrc is treated as "not configured", not as a crash.
        if bashrc_path.is_file():
            with bashrc_path.open("r") as file:
                for line in file:
                    stripped_line = line.strip()
                    if stripped_line.startswith(export_prefix):
                        # Slice off the prefix instead of splitting on "=" so a
                        # path that itself contains "=" is kept intact; accept
                        # either quoting style.
                        raw_value = stripped_line[len(export_prefix):].strip().strip("\"'")
                        # Expand references such as $HOME that the shell would resolve.
                        pudl_output_filepath = os.path.expandvars(raw_value)
                        break

    if not pudl_output_filepath:
        print("PUDL_OUTPUT not found in the environment or .bashrc")
        return ""

    # Using the PUDL_OUTPUT dir path, find the target file
    pudl_output_path = Path(pudl_output_filepath).expanduser()
    matching_files = (
        path
        for path in pudl_output_path.rglob("*")
        if path.is_file() and path.stem == target_asset_name
    )
    # First match wins; an empty string signals "nothing found".
    target_asset_filepath = str(next(matching_files, ""))

    print(f"Target asset filepath: {target_asset_filepath}")
    return target_asset_filepath


In [7]:
# Resolve where the requested table lives under the PUDL_OUTPUT directory.
ferc714_yearly_path = path_finder(ferc714_yearly)
if not ferc714_yearly_path:
    # path_finder returns "" when it cannot resolve the file; stop here with a
    # readable message instead of letting pyarrow fail on an empty path.
    raise FileNotFoundError(
        f"Could not locate a file named {ferc714_yearly!r} under PUDL_OUTPUT."
    )

# Read the file with pyarrow, then convert it to pandas for the analysis below.
ferc714_yearly_table = pq.read_table(ferc714_yearly_path)
ferc714_yearly_df = ferc714_yearly_table.to_pandas()
ferc714_yearly_df.head()

Target asset filepath: /Users/sam/Documents/pudl-data/pudl_output/parquet/core_ferc714__yearly_planning_area_demand_forecast.parquet


Unnamed: 0,respondent_id_ferc714,report_year,forecast_year,summer_peak_demand_mw,winter_peak_demand_mw,net_demand_mwh
0,2,2006,2007,1108.0,0.0,0.0
1,2,2006,2008,1141.0,0.0,0.0
2,2,2006,2009,1173.0,0.0,0.0
3,2,2006,2010,1261.0,0.0,0.0
4,2,2006,2011,1292.0,0.0,0.0


# Initial checks

## Is the data complete?

In [8]:
# Number of forecast years filed per (respondent, report year), listed in
# report-year order.
counts = (
    ferc714_yearly_df
    .groupby(["respondent_id_ferc714", "report_year"])
    .size()
    .reset_index(name="forecast_year_count")
    .sort_values(by="report_year")
)

print("For this FERC 714 form, respondents were expected to provide 10 years' worth of forecasted demand.")
print("Here we can see that not all respondents provided 10 years' worth each report year:")

# Show only the filings that do not contain exactly ten forecast years.
is_incomplete = counts["forecast_year_count"] != 10
counts[is_incomplete]

For this FERC 714 form, respondents were expected to provide 10 years' worth of forecasted demand.
Here we can see that not all respondents provided 10 years' worth each report year:


Unnamed: 0,respondent_id_ferc714,report_year,forecast_year_count
174,125,2006,9
1088,235,2009,9
1770,321,2010,9
849,211,2013,9
939,219,2013,3
787,201,2018,1


## Do we see any obvious anomalies?

In [9]:
# Flag a forecast year when a metric differs from the previous forecast year of
# the same report by more than this many percent, in either direction.
threshold = 100.0

# One forecast series per (respondent, report year).
report_keys = ["respondent_id_ferc714", "report_year"]
# Metrics to scan; anomalies for the same forecast year are listed in this order.
demand_metrics = ["summer_peak_demand_mw", "winter_peak_demand_mw", "net_demand_mwh"]

# Put every report's forecasts in chronological order so that "previous row in
# the group" means "previous forecast year". The fresh RangeIndex keeps the
# index-aligned operations below unambiguous.
forecasts_by_year = ferc714_yearly_df.sort_values(
    [*report_keys, "forecast_year"], kind="stable"
).reset_index(drop=True)
grouped = forecasts_by_year.groupby(report_keys)

flagged_frames = []
for metric_order, metric in enumerate(demand_metrics):
    current_value = forecasts_by_year[metric]
    # shift() within each group yields the prior forecast year's value and NaN
    # for the first forecast year, so values never leak between reports.
    prior_value = grouped[metric].shift()
    # Missing values are not forward-filled: a change is only computed between
    # two adjacent reported values. A prior value of 0 gives +/-inf, which is
    # flagged; NaN results are never flagged.
    pct_change = (current_value / prior_value - 1) * 100
    is_anomaly = pct_change.abs() > threshold

    flagged_frames.append(
        forecasts_by_year.loc[is_anomaly, [*report_keys, "forecast_year"]].assign(
            metric_type=metric,
            percentage_change=pct_change[is_anomaly],
            value_in_prior_year=prior_value[is_anomaly],
            value_in_this_forecast_year=current_value[is_anomaly],
            metric_order=metric_order,  # temporary sort key, dropped below
        )
    )

# Build the result once instead of growing it row by row, then order it by
# report, forecast year, and metric.
anomalies = (
    pd.concat(flagged_frames)
    .sort_values([*report_keys, "forecast_year", "metric_order"], kind="stable")
    .drop(columns="metric_order")
    .reset_index(drop=True)
    .astype({"respondent_id_ferc714": int, "report_year": int, "forecast_year": int})
)


  anomalies = pd.concat([anomalies, pd.DataFrame({


In [None]:
# Preview the first few flagged year-over-year jumps. A percentage_change of
# "inf" means the prior forecast year's value was 0.
anomalies.head()

Unnamed: 0,respondent_id_ferc714,report_year,forecast_year,metric_type,percentage_change,value_in_prior_year,value_in_this_forecast_year
0,120,2006,2013,net_demand_mwh,514.395325,1928828.0,11850629.0
1,125,2017,2024,winter_peak_demand_mw,944.296631,3384.0,35339.0
2,134,2007,2009,winter_peak_demand_mw,9786.792969,106.0,10480.0
3,159,2006,2008,winter_peak_demand_mw,inf,0.0,2848.0
4,159,2006,2008,net_demand_mwh,inf,0.0,13399136.0
