In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import calendar
import logging
import pathlib
import zipfile

from datetime import timedelta
import datetime

import geopandas
from geopandas import gpd
import numpy as np
import pandas as pd
import requests
from geopandas import GeoDataFrame
from shapely.geometry import GeometryCollection, MultiPolygon, Polygon
from shapely.ops import unary_union
from tqdm import tqdm
import seaborn as sns
import matplotlib as mpl
import sys

logger = logging.getLogger(__name__)

from shapely.geometry import Polygon, MultiPolygon

from pudl.analysis.demand_mapping import (corr_fig, error_heatmap, error_na_fig, compare_datasets,
                                          regional_demand_profiles, vec_error, uncovered_area_mismatch,
                                          layer_intersection, allocate_and_aggregate)

from pudl.output.ferc714 import Respondents
import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline

import scipy
from collections import defaultdict
from IPython.display import clear_output
import pudl

from pudl.helpers import download_zip_url
from pudl.transform.ferc714 import OFFSET_CODES, TZ_CODES
import addfips

import sqlalchemy as sa

In [None]:
## Not set for map visualization
sns.set()
%matplotlib inline
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [None]:
# Send INFO-and-above records from the root logger to stdout, using a single
# handler (any handlers previously attached to the root logger are replaced).
log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
formatter = logging.Formatter(fmt=log_format)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

# Define notebook parameters

In [None]:
# Default PUDL workspace settings drive both the database engine and the
# location of the locally cached input data used throughout this notebook.
pudl_settings = pudl.workspace.setup.get_defaults()
local_data = pathlib.Path(pudl_settings["data_dir"], "local")

pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

# Importing and Loading Relevant Dataframes

Major dataframes and tables being accessed:
- FERC-714 databases used to allocate demand annually at the county level (part of the PUDL database)
- Census databases for county shapefiles [Public]
- 2010 and 2012 ReEDS regions hourly demand data [Private]
- More coming...

## County Shapefiles and Population Data

In [None]:
%%time
# Download (once) the 2010 Census tract GeoDB, load the tract layer, and
# dissolve tracts into county geometries with summed population.
uscb_census2010_url = "http://www2.census.gov/geo/tiger/TIGER2010DP1/Profile-County_Tract.zip"
uscb_census2010_dir = local_data / "uscb" / "census2010"
uscb_census2010_dir.mkdir(parents=True, exist_ok=True)

uscb_census2010_zipfile = uscb_census2010_dir / "census2010.zip"
uscb_census2010_gdb_dir = uscb_census2010_dir / "census2010.gdb"

if not uscb_census2010_gdb_dir.is_dir():
    logger.info("No Census GeoDB found. Downloading from US Census Bureau.")
    # Download to appropriate location
    download_zip_url(uscb_census2010_url, uscb_census2010_zipfile)
    # Unzip because we can't use zipfile paths with geopandas
    with zipfile.ZipFile(uscb_census2010_zipfile, "r") as zip_ref:
        # Extract next to the zipfile (the original referenced an undefined
        # `uscb_dir`, which raised NameError on a first run).
        zip_ref.extractall(uscb_census2010_dir)
        # Grab the UUID based directory name so we can change it:
        extract_root = uscb_census2010_dir / pathlib.Path(zip_ref.filelist[0].filename).parent
    extract_root.rename(uscb_census2010_gdb_dir)
else:
    logger.info("We've already got the 2010 Census GeoDB.")

logger.info("Extracting the GeoDB into a GeoDataFrame")
census_tract_gdf = gpd.read_file(uscb_census2010_gdb_dir, driver='FileGDB', layer='Tract_2010Census_DP1')

## Creating columns for county and state level aggregation
# GEOID10 starts with the 2-digit state FIPS code followed by the 3-digit
# county code, so the first 5 characters identify the county.
census_tract_gdf["STATE_FIPS"] = census_tract_gdf["GEOID10"].str[:2]
census_tract_gdf["STATE_FIPS_int"] = pd.to_numeric(census_tract_gdf["GEOID10"].str[:2])
census_tract_gdf["STCOFIPS"] = census_tract_gdf["GEOID10"].str[:5]

census_tract_gdf = (
    census_tract_gdf
    # Remove all islands and non-mainland states and territories.
    # FIPS 2 = Alaska, 15 = Hawaii, 44 = Rhode Island; codes above 56 are territories.
    # NOTE(review): Rhode Island (44) is part of the contiguous US -- confirm
    # that excluding it is intentional.
    .query("STATE_FIPS_int<=56 & STATE_FIPS_int not in (2, 15, 44)")
    # The integer code was only needed for the filter above; actually drop it
    # (the original call discarded the result of drop()).
    .drop(columns="STATE_FIPS_int")
    # Project to US Albers conic equal-area projection
    .to_crs("ESRI:102003")
    .rename(columns={
        "DP0010001": "POPULATION",
        "GEOID10": "FIPS"
    })
)

# Area conversion: projected units (assumed to be square metres for this
# CRS) -> square kilometres -> square miles.
SQ_M_PER_SQ_KM = 10 ** 6
KM_PER_MILE = 1.60934
census_tract_gdf["SQMI"] = census_tract_gdf.area / SQ_M_PER_SQ_KM / KM_PER_MILE ** 2

logger.info("Dissolving census tracts to form counties")
county_gdf = (
    census_tract_gdf[["STCOFIPS", "POPULATION", "geometry"]]
    # Union the tract geometries per county and sum the tract populations.
    .dissolve(by="STCOFIPS", aggfunc=np.sum, as_index=False)
    [["STCOFIPS", "POPULATION", "geometry"]]
    .rename(columns={"STCOFIPS": "county_id_fips"})
)

# county_gdf.to_file(str(local_data / "counties.shp"))

## County-Planning Area Mapping Dataset
- `ba_county_map` contains one row per (respondent, county) pair, indicating which counties were served by which utility in each reporting year

In [None]:
%%time
# Build the PUDL output object, wrap it in the FERC 714 respondents helper,
# and ask that helper for the county-level geo-referenced respondent table.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)
ferc714_out = Respondents(pudl_out)
ba_county_map = ferc714_out.georef_counties()

## REEDS Data
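- Some of the REEDS data is not readily accessible for download
- If that is the case, obtain the hourly timeseries CSVs and the shapefile manually and place them under the local data directory in `nrel/` (the cells below expect `nrel/reeds_shp/`, `nrel/reeds_load_2010_est_time.csv` and `nrel/reeds_load_2012_est_time.csv`)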

### REEDS Shapefiles

In [None]:
%%time
# Load the REEDS balancing-area shapes, keep PCAs numbered up to 134,
# reproject them, and dissolve to one geometry per PCA.
reeds_gdf_path = local_data / "nrel/reeds_shp"
if not reeds_gdf_path.is_dir():
    # The original message referenced an undefined `reeds_path`, which turned
    # the intended FileNotFoundError into a NameError.
    raise FileNotFoundError(
        f"ReEDS Balancing Area geometries not found. "
        f"Expected them at {reeds_gdf_path}"
    )
reeds_gdf = gpd.read_file(reeds_gdf_path)
# A zero-width buffer is applied to every geometry before reprojecting and
# dissolving (commonly used to clean up invalid polygons).
reeds_gdf["geometry"] = reeds_gdf["geometry"].apply(lambda x: x.buffer(0))

reeds_gdf = (
    reeds_gdf
    # PCA labels look like "p60"; strip the leading "p" to get a numeric id.
    .assign(pca_num=lambda x: pd.to_numeric(x.pca.replace("^p", value="", regex=True)))
    # NOTE(review): 134 is presumably the highest PCA number inside the
    # study area -- confirm against the REEDS documentation.
    .query("pca_num<=134")
    .to_crs("ESRI:102003")
    # Drop bookkeeping columns so only "pca" and the geometry drive the dissolve.
    .drop(columns=["Shape_Leng", "Shape_Area", "OBJECTID", "gid", "demreg2", "pca_num"])
    .dissolve(by=["pca"], as_index=False)
)

### 2010 REEDS Hourly Data

In [None]:
logger.info("Loading REEDS 2010 hourly demand data")
reeds_hourly_2010 = pd.read_csv(
    str(local_data / "nrel" / "reeds_load_2010_est_time.csv"),
    parse_dates=["est_time"],
    infer_datetime_format=True,
)

logger.info("Converting local hourly demand data to UTC datetime for consistency")
# Reshape from one row per hour / one column per PCA into one row per PCA /
# one column per UTC hour, then attach the PCA geometries.
reeds_hourly_2010 = (
    reeds_hourly_2010
    .assign(utc_datetime=lambda df: df["est_time"] - OFFSET_CODES["EST"])
    .drop(columns="est_time")
    .set_index("utc_datetime")
    .transpose()
    .reset_index()
    .rename(columns={"index": "pca"})
    .merge(reeds_gdf)
)

### 2012 REEDS Hourly Data

In [None]:
# Same load-and-reshape as for 2010: read the EST-stamped hourly loads, shift
# the timestamps by the EST offset, and pivot to one row per PCA.
reeds_hourly_2012 = pd.read_csv(
    str(local_data / "nrel" / "reeds_load_2012_est_time.csv"),
    parse_dates=["est_time"],
    infer_datetime_format=True,
)

reeds_hourly_2012 = (
    reeds_hourly_2012
    .assign(utc_datetime=lambda df: df["est_time"] - OFFSET_CODES["EST"])
    .drop(columns="est_time")
    .set_index("utc_datetime")
    .transpose()
    .reset_index()
    .rename(columns={"index": "pca"})
    .merge(reeds_gdf)
)

## FERC 714 Time Series Data
- The `pa_demand_ferc714_df` variable stores the hourly demand data for each planning area 
- One year of data is analyzed at a time

In [None]:
%%time

# Fetch the FERC 714 archive once, then run the PUDL extract/transform steps
# and join hourly planning-area demand with the respondent table.
ferc714_url = "https://www.ferc.gov/sites/default/files/2020-06/form714-database-June-2020.zip"
ferc714_dir = local_data / "ferc714"
ferc714_dir.mkdir(parents=True, exist_ok=True)
ferc714_save_path = ferc714_dir / "ferc714.zip"

if not ferc714_save_path.exists():
    logger.info("Downloading fresh FERC 714 data.")
    download_zip_url(ferc714_url, ferc714_save_path)
else:
    logger.info("Already have FERC 714 data, not downloading.")

raw_ferc714_dfs = pudl.extract.ferc714.extract(pudl_settings=pudl_settings)
tfr_ferc714_dfs = pudl.transform.ferc714.transform(raw_ferc714_dfs)

# Merge on whatever columns the two tables share.
pa_demand_ferc714_df = tfr_ferc714_dfs["demand_hourly_pa_ferc714"].merge(
    tfr_ferc714_dfs["respondent_id_ferc714"]
)

In [None]:
def annual_ferc_data(pa_demand_ferc714_df, final_disagg_layer, rep_year):
    """
    Build a wide hourly-demand table for a single report year.

    Args:
        pa_demand_ferc714_df (pandas.DataFrame): long-format demand with
            ``report_date`` (datetime), ``respondent_id_ferc714``,
            ``utc_datetime`` and ``demand_mwh`` columns.
        final_disagg_layer (pandas.DataFrame): table whose
            ``respondent_id_ferc714`` column lists the respondents that have
            a geographic mapping.
        rep_year (int): calendar year of ``report_date`` to keep.

    Returns:
        pandas.DataFrame: one row per mapped respondent, with a
        ``respondent_id_ferc714`` column followed by one column per
        ``utc_datetime`` holding the mean ``demand_mwh``; the index is a
        fresh 0..n-1 range.
    """
    # Keep only the rows reported for the requested year.
    in_year = pa_demand_ferc714_df["report_date"].dt.year == rep_year

    # Long -> wide: respondents down the rows, timestamps across the columns.
    wide_demand = (
        pa_demand_ferc714_df[in_year]
        .pivot_table(
            index="respondent_id_ferc714",
            columns="utc_datetime",
            values="demand_mwh",
            aggfunc=np.mean,
        )
        .reset_index()
    )

    # Respondents without any mapping cannot be allocated, so drop them.
    mapped_ids = final_disagg_layer["respondent_id_ferc714"].unique()
    is_mapped = wide_demand["respondent_id_ferc714"].isin(mapped_ids)

    return wide_demand[is_mapped].reset_index(drop=True)

# 2012 Demand Data Analysis

- FERC 714 hourly demand for the US mainland in 2012 is disaggregated to the county level and then re-aggregated to another set of disjoint geometries (here, the REEDS regions)
- The data is compared to REEDS 2012 hourly demand data

In [None]:
def annual_demand_county_mapping(ba_county_map, county_gdf, rep_year=2010):
    """
    Extract a single report year of the respondent-to-county mapping.

    Args:
        ba_county_map: geo-referenced respondent/county table. It must
            provide ``to_crs`` and the columns ``report_date`` (datetime),
            ``county_id_fips``, ``state``, ``state_id_fips``,
            ``respondent_id_ferc714`` plus the four EIA balancing authority /
            utility columns that are dropped below.
        county_gdf: county table with a ``geometry`` column; every other
            column is merged in on the column names shared with the mapping.
        rep_year (int): report year to keep. Defaults to 2010.

    Returns:
        The rows of ``ba_county_map`` whose ``report_date`` falls within
        ``rep_year``, reprojected to ESRI:102003, with an added
        ``respondent_id_ferc714_set`` column (frozenset of all respondents
        mapped to the row's county in that year), the non-geometry columns of
        ``county_gdf``, and without the EIA balancing authority / utility
        columns.
    """
    # Explicit half-open [Jan 1 rep_year, Jan 1 rep_year + 1) window instead
    # of relying on query() coercing integers such as 20120101 into dates.
    year_start = pd.Timestamp(year=int(rep_year), month=1, day=1)
    year_end = pd.Timestamp(year=int(rep_year) + 1, month=1, day=1)
    report_date = ba_county_map["report_date"]
    in_year = (report_date >= year_start) & (report_date < year_end)

    ba_county_map_yr = ba_county_map[in_year].to_crs("ESRI:102003")

    # For every county, collect the set of respondents that map to it.
    ba_county_id_set = (
        ba_county_map_yr
        .groupby(["county_id_fips", "state", "state_id_fips"])["respondent_id_ferc714"]
        .agg(frozenset)
        .reset_index()
        .rename(columns={"respondent_id_ferc714": "respondent_id_ferc714_set"})
    )

    # One merge per lookup table is enough; the original merged the
    # respondent sets a second time, which changed nothing.
    ba_county_map_yr = (
        ba_county_map_yr
        .merge(ba_county_id_set)
        .merge(county_gdf.drop("geometry", axis=1))
        .drop(columns=[
            "balancing_authority_id_eia",
            "balancing_authority_name_eia",
            "utility_id_eia",
            "utility_name_eia",
        ])
    )

    return ba_county_map_yr



In [None]:
%%time

ba_county_map_2012 = annual_demand_county_mapping(ba_county_map, county_gdf, rep_year=2012)

# Label every non-geometry column of both layers for layer_intersection:
# the respondent id is the "ID", population is "uniform", everything else is
# "constant".
# NOTE(review): the exact meaning of these labels is defined by
# pudl.analysis.demand_mapping -- confirm there.
special_attribute_kinds = {"respondent_id_ferc714": "ID", "POPULATION": "uniform"}
attributes_2012 = {
    col: special_attribute_kinds.get(col, "constant")
    for col in [*reeds_gdf.columns, *ba_county_map_2012.columns]
    if col != "geometry"
}

final_disagg_layer_2012 = layer_intersection(ba_county_map_2012, reeds_gdf, attributes_2012)
pa_demand_ferc714_df_2012 = annual_ferc_data(
    pa_demand_ferc714_df.copy(),
    final_disagg_layer=final_disagg_layer_2012.copy(),
    rep_year=2012,
)

# Every column whose label is not a plain string is an hourly demand column.
demand_columns_2012 = [
    col for col in pa_demand_ferc714_df_2012.columns if type(col) is not str
]

## Allocate FERC714 Data for 2012
- The `allocate_and_aggregate` function takes care of the allocation and aggregation (if required) based on the parameters in the function definition

In [None]:
%%time


# Only keep the pieces of the disaggregation layer whose respondent actually
# has a demand time series for 2012.
has_demand_2012 = final_disagg_layer_2012["respondent_id_ferc714"].isin(
    pa_demand_ferc714_df_2012["respondent_id_ferc714"]
)
final_disagg_layer_2012_sel = final_disagg_layer_2012[has_demand_2012]

# This first call returns both the aggregated demand and a geo layer; the
# latter is passed back in as `geo_layer` by the later allocation cells.
reeds_hourly_pop_2012, geo_layer_2012 = allocate_and_aggregate(
    final_disagg_layer_2012_sel.copy(),
    attributes=attributes_2012,
    timeseries=pa_demand_ferc714_df_2012.copy(),
    aggregators="pca",
    allocatees=demand_columns_2012,
)

### REEDS Regions
- A few example groups of REEDS PCA regions, by state, are defined below
- We are using Texas for our analysis

In [None]:
# REEDS PCA identifiers grouped by state (two-letter prefix of each name).
TX_PCAs = ["p60", "p61", "p62", "p63", "p64", "p65", "p67"]
CO_PCAs = ["p33", "p34"]
NM_PCAs = ["p31", "p47"]
LA_PCAs = ["p58", "p86"]

## 2012 REEDS Analysis
- Demand is allocated solely in proportion to population raised to an exponent
- i.e. the allocation weight may be population, population^2, ...; each exponent's allocation is compared against the REEDS data, and the exponent with the lowest mean squared error is taken as the best fit

In [None]:
%%time
# Sweep the population exponent and record, for each value, the mean squared
# error between the allocated demand and the REEDS demand over the TX PCAs.
reeds_exps_2012_TX = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1, 1.5, 2, 3]
reeds_errs_2012_TX = []

for i, exp in enumerate(reeds_exps_2012_TX):

    print(f"Iteration {i}: Calculation for exponent {exp}")
    reeds_hourly_pop_2012 = allocate_and_aggregate(
        final_disagg_layer_2012_sel.copy(),
        attributes=attributes_2012,
        timeseries=pa_demand_ferc714_df_2012.copy(),
        aggregators="pca",
        allocatees=demand_columns_2012,
        geo_layer=geo_layer_2012.copy(),
        alloc_exps=exp,
    )

    reeds_compare_2012_TX = compare_datasets(
        reeds_hourly_pop_2012.copy(),
        reeds_hourly_2012.copy(),
        demand_columns_2012,
        TX_PCAs,
        time_col="utc_datetime",
        region="pca",
    )

    # nanmean skips hours where either series is missing.
    squared_errors = reeds_compare_2012_TX.eval("(actual - alloc) ** 2")
    mse_err = np.nanmean(squared_errors)
    reeds_errs_2012_TX.append(mse_err)
    # Wipe this iteration's output so only the final plot remains.
    clear_output()


reeds_err_df_2012_TX = pd.DataFrame(
    {"exponent": reeds_exps_2012_TX, "errors": reeds_errs_2012_TX}
)
plt.plot("exponent", "errors", data=reeds_err_df_2012_TX.sort_values("exponent"))

- Notably, the mean squared error is lowest when demand is allocated in direct proportion to population (i.e. population raised to the power 1)

In [None]:
# Re-run the allocation with exponent 1 and compare it against the REEDS
# demand for every PCA present in the REEDS table.
reeds_hourly_pop_2012 = allocate_and_aggregate(
    final_disagg_layer_2012_sel.copy(),
    attributes=attributes_2012,
    timeseries=pa_demand_ferc714_df_2012.copy(),
    aggregators="pca",
    allocatees=demand_columns_2012,
    geo_layer=geo_layer_2012.copy(),
    alloc_exps=1,
)

all_pcas_2012 = list(reeds_hourly_2012["pca"].unique())
reeds_compare_2012 = compare_datasets(
    alloc_demand=reeds_hourly_pop_2012.copy(),
    actual_demand=reeds_hourly_2012.copy(),
    demand_columns=demand_columns_2012,
    select_regions=all_pcas_2012,
)


# create_figs(reeds_compare_2012.copy())

### Parity Plot

In [None]:
# Restrict the comparison table to the Texas PCAs before plotting.
tx_rows_2012 = reeds_compare_2012["region"].isin(TX_PCAs)
corr_fig(
    reeds_compare_2012[tx_rows_2012].copy(),
    suptitle="Allocated demand for TX in 2012",
)

### Seasonal Errors and NA values

In [None]:
# Texas-only slice of the comparison table; a copy is handed to the plotter.
tx_compare_2012 = reeds_compare_2012[reeds_compare_2012["region"].isin(TX_PCAs)]
error_na_fig(tx_compare_2012.copy())

### Regional Demand Profiles

In [None]:
# Regional demand profiles restricted to the Texas PCAs; a copy of the
# comparison table is passed in rather than the table itself.
regional_demand_profiles(reeds_compare_2012.copy(), select_regions=TX_PCAs)

### Uncovered Areas

In [None]:
# Wrap copies of both tables as GeoDataFrames (the REEDS table comes out of a
# plain DataFrame merge) before comparing them.
alloc_gdf_2012 = GeoDataFrame(reeds_hourly_pop_2012.copy())
actual_gdf_2012 = GeoDataFrame(reeds_hourly_2012.copy())
uncovered_area_mismatch(alloc_gdf_2012, actual_gdf_2012)

### Yearly-Hourly Heatmap
- Used to compare seasonal errors over a period 

In [None]:
# Error heatmap between the allocated demand and the REEDS demand over the
# 2012 hourly columns, using the "mse" metric.
# NOTE(review): leap_exception=True presumably accounts for 2012 being a leap
# year -- confirm against error_heatmap's implementation.
error_heatmap(reeds_hourly_pop_2012.copy(), reeds_hourly_2012.copy(), demand_columns_2012, error_metric="mse",
              leap_exception=True)