In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import calendar
import logging
import pathlib
import zipfile

from datetime import timedelta
import datetime

import geopandas
from geopandas import gpd
import numpy as np
import pandas as pd
import requests
from geopandas import GeoDataFrame
from shapely.geometry import GeometryCollection, MultiPolygon, Polygon
from shapely.ops import unary_union
from tqdm import tqdm
import seaborn as sns
import matplotlib as mpl
import sys

logger = logging.getLogger(__name__)

from shapely.geometry import Polygon, MultiPolygon

from pudl.analysis.demand_mapping import (corr_fig, error_heatmap, error_na_fig, compare_datasets,
                                          regional_demand_profiles, vec_error, uncovered_area_mismatch,
                                          layer_intersection, allocate_and_aggregate)

from pudl.output.ferc714 import Respondents
import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline

import scipy
from collections import defaultdict
from IPython.display import clear_output
import pudl

from pudl.helpers import download_zip_url
from pudl.transform.ferc714 import OFFSET_CODES, TZ_CODES
import addfips

import sqlalchemy as sa

In [None]:
## Not set for map visualization
sns.set()
%matplotlib inline
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [None]:
# Route root-logger records at INFO and above to stdout, replacing any
# handlers that were attached to the root logger before this cell ran.
log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
formatter = logging.Formatter(log_format)

handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

# Define notebook parameters

In [None]:
# Default PUDL workspace settings, a SQLAlchemy engine for the PUDL database
# they point at, and the "local" sub-directory of the PUDL data directory
# (the ReEDS input files are read from there further down).
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])
local_data = pathlib.Path(pudl_settings["data_dir"], "local")

# Importing and Loading Relevant Dataframes

Major dataframes and tables being accessed:
- FERC-714 databases used to allocate demand annually at the county level (part of the PUDL database)
- Census databases for county shapefiles [Public]
- 2010 and 2012 ReEDS regions hourly demand data [Private]
- More coming...

## County Shapefiles and Population Data

In [None]:
# Load the 2010 Census county layer and give the ID / population columns
# the names used throughout the rest of the notebook.
county_gdf = pudl.analysis.service_territory.get_census2010_gdf(pudl_settings, "county")
county_gdf = county_gdf.rename(
    columns={"GEOID10": "county_id_fips", "DP0010001": "POPULATION"}
)

# The first two characters of the county FIPS code are used as the state code.
county_gdf["STATE_FIPS_int"] = pd.to_numeric(county_gdf["county_id_fips"].str[:2])

# Remove all islands and non-mainland states and territories.
# NOTE(review): state FIPS 2 and 15 are Alaska and Hawaii, but 44 is Rhode
# Island, which is on the mainland -- confirm that excluding it is intended.
county_gdf = county_gdf.query("STATE_FIPS_int<=56 & STATE_FIPS_int not in (2, 15, 44)")

# Project to US Albers conic equal-area projection.
county_gdf = county_gdf.to_crs("ESRI:102003")

# Drop the helper column, order by FIPS code and keep only the needed columns.
county_gdf = (
    county_gdf.drop("STATE_FIPS_int", axis=1)
    .sort_values("county_id_fips")
    .reset_index(drop=True)[["county_id_fips", "POPULATION", "geometry"]]
)

## County-Planning Area Mapping Dataset & Planning Area Hourly Demand (FERC-714 Data)
- `ba_county_map` contains one row per county–respondent pair, indicating which counties were served by which FERC-714 respondent in each reporting year
- `pa_demand_ferc714_df` contains the hourly demand data for each planning area 
- One year of data is analyzed at a time

In [None]:
%%time
# Build the PUDL output object on top of the database engine.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)
# Hourly planning-area demand; later cells read its `report_date`,
# `respondent_id_ferc714`, `utc_datetime` and `demand_mwh` columns.
pa_demand_ferc714_df = pudl_out.demand_hourly_pa_ferc714()
ferc714_out = pudl.output.ferc714.Respondents(pudl_out)
# Respondent-to-county mapping with geometries; later cells filter it by
# `report_date` and reproject it with `to_crs`.
ba_county_map = ferc714_out.georef_counties()

In [None]:
def annual_ferc_data(pa_demand_ferc714_df, rep_year):
    """Pivot one report year of FERC-714 hourly demand into a wide table.

    Args:
        pa_demand_ferc714_df (pandas.DataFrame): Long-format hourly demand with
            at least the columns ``report_date`` (datetime-like),
            ``respondent_id_ferc714``, ``utc_datetime`` and ``demand_mwh``.
            The input frame is not modified.
        rep_year (int): Report year to keep; compared with the year of
            ``report_date``.

    Returns:
        pandas.DataFrame: One row per respondent. The first column is
        ``respondent_id_ferc714``; every other column is labelled with a
        ``utc_datetime`` value and holds the mean ``demand_mwh`` of that
        respondent for that hour (NaN where the respondent has no value).
    """
    # Focus on a single report year.
    in_report_year = pa_demand_ferc714_df["report_date"].dt.year == rep_year

    # Pivot to one row per respondent and one column per hour.
    return (
        pa_demand_ferc714_df.loc[in_report_year]
        .pivot_table(
            values="demand_mwh",
            index="respondent_id_ferc714",
            columns="utc_datetime",
            # The "mean" alias selects the same aggregation as ``np.mean``
            # while avoiding the FutureWarning that recent pandas versions
            # emit when a NumPy callable is passed.
            aggfunc="mean",
        )
        .reset_index()
    )

## REEDS Data [OPTIONAL]
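- Some of the REEDS data used here is not readily accessible to the public, so this section is optional
- To run it, obtain the hourly timeseries data and the region shapefile and place them under the `nrel` folder of the local data directory (`reeds_load_2012_est_time.csv` and `reeds_shp/`)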

### REEDS Shapefiles

In [None]:
%%time
reeds_gdf_path = local_data / "nrel/reeds_shp"
if not reeds_gdf_path.is_dir():
    raise FileNotFoundError(
        f"ReEDS Balancing Area geometries not found."
        f"Expected them at {reeds_path}"
    )
reeds_gdf = gpd.read_file(reeds_gdf_path)

logger.info("Normalizing inconsistent geometries")
reeds_gdf["geometry"] = reeds_gdf["geometry"].buffer(0)

logger.info("Keeping Mainland US REEDS regions and dissolving to the PCA level")
reeds_gdf = (
    reeds_gdf.assign(pca_num=lambda x: pd.to_numeric(x.pca.replace("^p", value="", regex=True)))
    .query("pca_num<=134")
    .to_crs("ESRI:102003")
    .drop(["Shape_Leng", "Shape_Area", "OBJECTID", "gid", "demreg2", "pca_num"], axis=1)
    .dissolve(by=["pca"], as_index=False)
)

### 2012 REEDS Hourly Data

In [None]:
%%time
logger.info("Loading REEDS 2012 hourly demand data")
reeds_hourly_2012 = (pd.read_csv(local_data / "nrel" / "reeds_load_2012_est_time.csv",
                                 infer_datetime_format=True,
                                 parse_dates=["est_time"]))

logger.info("Converting local hourly demand data to UTC datetime for consistency")
reeds_hourly_2012["utc_datetime"] = reeds_hourly_2012["est_time"] - OFFSET_CODES["EST"]

logger.info("Merging with the shapefile")
reeds_hourly_2012 = (reeds_hourly_2012
                     .drop("est_time", axis=1)
                     .set_index("utc_datetime").T
                     .reset_index()
                     .rename(columns={"index":"pca"})
                     .merge(reeds_gdf))

# 2012 Demand Data Analysis

- FERC-714 hourly demand for the US mainland in 2012 is disaggregated to the county level and then re-aggregated to another set of disjoint geometries (here, the REEDS regions)
- The data is compared to REEDS 2012 hourly demand data

In [None]:
def annual_demand_county_mapping(ba_county_map, county_gdf, rep_year=2010):
    """Extract one report year of the respondent-county mapping.

    Args:
        ba_county_map (geopandas.GeoDataFrame): Respondent-county mapping with
            at least the columns ``report_date`` (datetime-like),
            ``county_id_fips``, ``state``, ``state_id_fips``,
            ``respondent_id_ferc714`` and the four EIA balancing-authority /
            utility ID and name columns dropped below. Not modified.
        county_gdf (pandas.DataFrame): County table with a ``geometry`` column
            (discarded here) and attribute columns; it is merged on whatever
            columns it shares with the mapping.
        rep_year (int): Report year to keep. Defaults to 2010.

    Returns:
        The mapping rows of ``rep_year``, reprojected to ``ESRI:102003``, with
        an added ``respondent_id_ferc714_set`` column (frozenset of all
        respondents mapped to the row's county), the non-geometry columns of
        ``county_gdf`` attached, and the EIA ID / name columns removed. Both
        merges are inner joins, so rows without a match are dropped.
    """
    in_report_year = ba_county_map.report_date.dt.year == rep_year
    ba_county_map_yr = ba_county_map[in_report_year].to_crs("ESRI:102003")

    # One row per county: the set of every respondent mapped to it this year.
    ba_county_id_set = (
        ba_county_map_yr
        .groupby(["county_id_fips", "state", "state_id_fips"])["respondent_id_ferc714"]
        .agg(frozenset)
        .reset_index()
        .rename(columns={"respondent_id_ferc714": "respondent_id_ferc714_set"})
    )

    # `ba_county_id_set` has exactly one row per county key, so merging it once
    # is sufficient; a repeated merge would only re-join on the same keys.
    return (
        ba_county_map_yr
        .merge(ba_county_id_set)
        .merge(county_gdf.drop("geometry", axis=1))
        .drop(
            columns=[
                "balancing_authority_id_eia",
                "balancing_authority_name_eia",
                "utility_id_eia",
                "utility_name_eia",
            ]
        )
    )



In [None]:
%%time

ba_county_map_2012 = annual_demand_county_mapping(ba_county_map, county_gdf, rep_year=2012)

attributes_2012 = dict()
logger.info("Creating attributes dictionary")
for col in list(reeds_gdf.columns) + list(ba_county_map_2012.columns):
    
    if col == "respondent_id_ferc714":
        attributes_2012[col] = "ID"
        
    elif col == "POPULATION":
        attributes_2012[col] = "uniform"
        
    elif col != "geometry":
        attributes_2012[col] = "constant"

logger.info("Disaggregating the county-planning area-REEDS shapefiles")
final_disagg_layer_2012 = layer_intersection(ba_county_map_2012, reeds_gdf, attributes_2012)

logger.info("Extracting 2012 demand data")
pa_demand_ferc714_df_2012 = annual_ferc_data(pa_demand_ferc714_df.copy(), rep_year=2012)

demand_columns_2012 = [col for col in pa_demand_ferc714_df_2012.columns if type(col) != str]

## Allocate FERC714 Data for 2012
- The `allocate_and_aggregate` function takes care of the allocation and aggregation (if required) based on the parameters in the function definition

In [None]:
%%time


# Keep only the intersection pieces whose respondent has 2012 demand data.
final_disagg_layer_2012_sel = final_disagg_layer_2012[final_disagg_layer_2012["respondent_id_ferc714"].isin(
    pa_demand_ferc714_df_2012["respondent_id_ferc714"]
)]

# First allocation run, aggregated to the "pca" level. Called without a
# `geo_layer` argument, it returns two values here; the second one
# (`geo_layer_2012`) is passed back in as `geo_layer=` by the later runs.
reeds_hourly_pop_2012, geo_layer_2012 = allocate_and_aggregate(final_disagg_layer_2012_sel.copy(),
                                                               attributes=attributes_2012,
                                                               timeseries=pa_demand_ferc714_df_2012.copy(),
                                                               aggregators="pca",
                                                               allocatees=demand_columns_2012)

### REEDS Regions
- A few example groups of REEDS PCA regions, one list per state
- We are using Texas for our analysis

In [None]:
# Example groups of REEDS PCA labels; the variable name prefix is the state.
# NOTE(review): "p66" is not in the Texas list -- confirm that is intentional.
TX_PCAs = ["p60", "p61", "p62", "p63", "p64", "p65", "p67"]
CO_PCAs = ["p33", "p34"]
NM_PCAs = ["p31", "p47"]
LA_PCAs = ["p58", "p86"]

## 2012 REEDS Analysis
- Demand is allocated based solely on population raised to an exponent
- i.e. demand may be allocated in proportion to population, population^2, and so on; the allocation for each exponent is compared to the REEDS data, and the exponent with the minimum error is the best fit

In [None]:
%%time
reeds_errs_2012_TX = []
reeds_exps_2012_TX = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1, 1.5, 2, 3]


for i, exp in enumerate(reeds_exps_2012_TX):
    
    print("Iteration " + str(i) + ": Calculation for exponent " + str(exp))
    reeds_hourly_pop_2012 = allocate_and_aggregate(final_disagg_layer_2012_sel.copy(),
                                                   attributes=attributes_2012,
                                                   timeseries=pa_demand_ferc714_df_2012.copy(),
                                                   aggregators="pca", allocatees=demand_columns_2012,
                                                   geo_layer=geo_layer_2012.copy(), alloc_exps=exp)

    reeds_compare_2012_TX = compare_datasets(reeds_hourly_pop_2012.copy(), reeds_hourly_2012.copy(),
                                          demand_columns_2012, TX_PCAs, time_col="utc_datetime", region="pca")
    
    mse_err = np.nanmean(reeds_compare_2012_TX.eval("(measured - predicted) ** 2"))
    reeds_errs_2012_TX.append(mse_err)
    clear_output()


reeds_err_df_2012_TX = pd.DataFrame({
    "exponent": reeds_exps_2012_TX,
    "errors": reeds_errs_2012_TX
})
plt.plot("exponent", "errors", data=reeds_err_df_2012_TX.sort_values("exponent"))

- Notably, the mean squared error is lowest when demand is allocated in direct proportion to population (exponent 1)

In [None]:
# Final allocation with the population exponent fixed at 1, reusing the geo
# layer produced by the first allocation run.
reeds_hourly_pop_2012 = allocate_and_aggregate(final_disagg_layer_2012_sel.copy(),
                                 attributes=attributes_2012,timeseries=pa_demand_ferc714_df_2012.copy(),
                                 aggregators="pca", allocatees=demand_columns_2012,
                                 geo_layer=geo_layer_2012.copy(), alloc_exps=1)

# Compare allocated and REEDS demand for every PCA present in the REEDS data
# (not only Texas); the figure cells below filter this by region.
reeds_compare_2012 = compare_datasets(alloc_demand=reeds_hourly_pop_2012.copy(),
                                      actual_demand=reeds_hourly_2012.copy(),
                                      demand_columns=demand_columns_2012,
                                      select_regions=list(reeds_hourly_2012["pca"].unique()))


# Disabled call; `create_figs` is not among the names imported at the top of
# this notebook.
# create_figs(reeds_compare_2012.copy())

### Parity Plot

In [None]:
# Parity plot for the Texas PCAs, drawn from a copy of the comparison table.
corr_fig(reeds_compare_2012.copy(), TX_PCAs,
         suptitle="Allocated demand for TX in 2012", top=0.94)

### Seasonal Errors and NA values

In [None]:
# Seasonal error / NA figure, restricted to comparison rows of the Texas PCAs.
error_na_fig(reeds_compare_2012[reeds_compare_2012["region"].isin(TX_PCAs)].copy())

### Regional Demand Profiles

In [None]:
%%time
# Demand profiles for the Texas PCAs, with `agg=False`.
regional_demand_profiles(reeds_compare_2012.copy(), TX_PCAs, agg=False)

### Uncovered Areas

In [None]:
%%time
# Compare the selected disaggregated layer against the REEDS region geometries.
uncovered_area_mismatch(final_disagg_layer_2012_sel, reeds_gdf)

### Yearly-Hourly Heatmap
- Used to compare seasonal errors over a period 

In [None]:
%%time
# Heatmap of the "mse" error between allocated and REEDS hourly demand.
# NOTE(review): `leap_exception=True` presumably handles 2012 being a leap
# year -- confirm against `error_heatmap`.
error_heatmap(reeds_hourly_pop_2012.copy(), reeds_hourly_2012.copy(), demand_columns_2012, error_metric="mse",
              leap_exception=True)