In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to local source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [33]:
import os, sys
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pudl
import pudl.constants as pc
from pudl.analysis.state_demand import (
    load_ferc714_hourly_demand_matrix,
    clean_ferc714_hourly_demand_matrix,
    filter_ferc714_hourly_demand_matrix,
    impute_ferc714_hourly_demand_matrix,
    melt_ferc714_hourly_demand_matrix,
    load_counties,
    load_ferc714_county_assignments,
    load_eia861_state_total_sales,
    predict_state_hourly_demand,
    plot_demand_timeseries,
    plot_demand_scatter,
    load_ventyx_hourly_state_demand,
    lookup_state,
    compare_state_demand,
)

In [3]:
import logging

# Emit log records on stdout as bare messages (no level / timestamp prefix)
# so they appear inline in the notebook output.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger. The handler list is replaced rather than
# appended to, so re-running this cell does not duplicate every message.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

In [4]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()
# Render figures inline in the notebook output.
%matplotlib inline

In [5]:
# Default geometry and resolution for every figure rendered in this notebook.
mpl.rcParams.update({
    'figure.figsize': (10, 4),
    'figure.dpi': 150,
})
# Show up to 100 columns / rows before pandas truncates a displayed frame.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [6]:
# Optional API keys. No cell in this notebook reads them, so a missing
# environment variable yields None instead of aborting the run with KeyError.
API_KEY_EIA = os.environ.get("API_KEY_EIA")
API_KEY_BLS = os.environ.get("API_KEY_BLS")
API_KEY_FRED = os.environ.get("API_KEY_FRED")

# Resolve the default PUDL workspace settings, then open one SQLAlchemy
# engine per database URL listed in them (FERC 1 first, then PUDL).
pudl_settings = pudl.workspace.setup.get_defaults()
ferc1_engine, pudl_engine = [
    sa.create_engine(pudl_settings[db_key])
    for db_key in ('ferc1_db', 'pudl_db')
]
# Output-table accessor used by the load_* helpers in the cells below.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)
# Display the resolved settings so the paths in use are visible.
pudl_settings

{'pudl_in': '/home/zane/code/catalyst/pudl-work',
 'data_dir': '/home/zane/code/catalyst/pudl-work/data',
 'settings_dir': '/home/zane/code/catalyst/pudl-work/settings',
 'pudl_out': '/home/zane/code/catalyst/pudl-work',
 'sqlite_dir': '/home/zane/code/catalyst/pudl-work/sqlite',
 'parquet_dir': '/home/zane/code/catalyst/pudl-work/parquet',
 'datapkg_dir': '/home/zane/code/catalyst/pudl-work/datapkg',
 'notebook_dir': '/home/zane/code/catalyst/pudl-work/notebook',
 'ferc1_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/ferc1.sqlite',
 'pudl_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/pudl.sqlite'}

## Load the FERC 714 data
* Should take ~1 minute.

In [7]:
%%time
# Load the FERC 714 hourly demand matrix (`df1`). The second return value
# (`tz`) is passed to melt_ferc714_hourly_demand_matrix() later on.
df1, tz = load_ferc714_hourly_demand_matrix(pudl_out)

Running the interim FERC 714 ETL process!
Extracting respondent_id_ferc714 from CSV into pandas DataFrame.
Extracting id_certification_ferc714 from CSV into pandas DataFrame.


The data has not yet been validated, and the structure may change.


Extracting gen_plants_ba_ferc714 from CSV into pandas DataFrame.
Extracting demand_monthly_ba_ferc714 from CSV into pandas DataFrame.
Extracting net_energy_load_ba_ferc714 from CSV into pandas DataFrame.
Extracting adjacency_ba_ferc714 from CSV into pandas DataFrame.
Extracting interchange_ba_ferc714 from CSV into pandas DataFrame.
Extracting lambda_hourly_ba_ferc714 from CSV into pandas DataFrame.
Extracting lambda_description_ferc714 from CSV into pandas DataFrame.
Extracting description_pa_ferc714 from CSV into pandas DataFrame.
Extracting demand_forecast_pa_ferc714 from CSV into pandas DataFrame.
Extracting demand_hourly_pa_ferc714 from CSV into pandas DataFrame.
Transforming respondent_id_ferc714.
Transforming id_certification_ferc714.
Transforming gen_plants_ba_ferc714.
Transforming demand_monthly_ba_ferc714.
Transforming net_energy_load_ba_ferc714.
Transforming adjacency_ba_ferc714.
Transforming interchange_ba_ferc714.
Transforming lambda_hourly_ba_ferc714.
Transforming lambda_d

## Clean the FERC 714 hourly demand matrix
* This uses ~32GB of memory.
* Takes ~10 minutes.
* **TODO:** Investigate why this step emits a `RuntimeWarning` about an All-NaN slice.

In [8]:
%%time
# Clean the matrix returned by the load step.
# NOTE(review): the RuntimeWarning fragments in this cell's output point at
# numpy's nanmedian / nanquantile helpers, so the warnings presumably come
# from all-NaN slices of `df1` — confirm before silencing them.
df2 = clean_ferc714_hourly_demand_matrix(df1)

  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,


CPU times: user 3min 50s, sys: 2min 26s, total: 6min 17s
Wall time: 7min 4s


## Remove respondents lacking data

In [9]:
%%time
# Drop sparse data. Per this cell's log output, respondent-years below
# `min_data` are nulled as "short", those below `min_data_fraction` are
# nulled as "bad", and respondents left with no data are dropped.
# NOTE(review): presumably min_data counts non-null hourly values per
# respondent-year and min_data_fraction is the non-null share — confirm.
df3 = filter_ferc714_hourly_demand_matrix(df2, min_data=100, min_data_fraction=0.9)

Nulled short respondent-years (below min_data):
id
201    [2006, 2019]
Name: year, dtype: object
Nulled bad respondent-years (below min_data_fraction):
id
115    [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]                                    
150    [2007]                                                                              
161    [2008, 2009]                                                                        
201    [2006]                                                                              
260    [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
Name: year, dtype: object
Dropped blank respondents: [137, 148, 153, 154, 161, 201, 208, 260, 288, 293, 294]
CPU times: user 1 s, sys: 103 ms, total: 1.11 s
Wall time: 1.24 s


## Impute missing demand values
* Very CPU intensive: the run below used ~50 minutes of CPU time (~17 minutes wall time) and maxes out all 4 of my cores.
* Not very memory intensive.

In [10]:
%%time
# Impute the gaps remaining in the filtered matrix. Per the log output the
# imputation runs separately for each year.
df4 = impute_ferc714_hourly_demand_matrix(df3)

Imputing year 2006
Iteration: 210
Imputing year 2007
Iteration: 201
Imputing year 2008
Iteration: 216
Imputing year 2009
Iteration: 206
Imputing year 2010
Iteration: 202
Imputing year 2011
Iteration: 213
Imputing year 2012
Iteration: 196
Imputing year 2013
Iteration: 210
Imputing year 2014
Iteration: 206
Imputing year 2015
Iteration: 193
Imputing year 2016
Iteration: 198
Imputing year 2017
Iteration: 201
Imputing year 2018
Iteration: 226
Imputing year 2019
Iteration: 201
CPU times: user 34min 55s, sys: 15min 11s, total: 50min 7s
Wall time: 16min 58s


## Melt FERC 714 hourly demand

In [11]:
%%time
# Melt the imputed matrix into `demand`, reusing the `tz` value returned
# alongside the matrix by load_ferc714_hourly_demand_matrix().
demand = melt_ferc714_hourly_demand_matrix(df4, tz)

CPU times: user 4.67 s, sys: 956 ms, total: 5.62 s
Wall time: 5.69 s


In [12]:
%%time
# Load county data. Per the log output this is extracted from the 2010
# Census GeoDB into a GeoDataFrame.
counties = load_counties(pudl_out, pudl_settings)

We've already got the 2010 Census GeoDB.
Extracting the GeoDB into a GeoDataFrame
CPU times: user 2.77 s, sys: 229 ms, total: 3 s
Wall time: 3.2 s


In [13]:
%%time
# Load the FERC 714 county assignments. Per the log output this triggers the
# interim EIA 861 ETL on first use.
assignments = load_ferc714_county_assignments(pudl_out)

Running the interim EIA 861 ETL process!
Extracting eia861 spreadsheet data.


The data has not yet been validated, and the structure may change.


Transforming raw EIA 861 DataFrames for service_territory_eia861 concatenated across all years.
Assigned state FIPS codes for 100.00% of records.
Assigned county FIPS codes for 99.65% of records.
Transforming raw EIA 861 DataFrames for balancing_authority_eia861 concatenated across all years.
Started with 37622 missing BA Codes out of 39086 records (96.25%)
Ended with 12674 missing BA Codes out of 39086 records (32.43%)
Transforming raw EIA 861 DataFrames for sales_eia861 concatenated across all years.
Tidying the EIA 861 Sales table.
Dropped 0 duplicate records from EIA 861 Sales table, out of a total of 336550 records (0.0000% of all records). 
Performing value transformations on EIA 861 Sales table.
Transforming raw EIA 861 DataFrames for advanced_metering_infrastructure_eia861 concatenated across all years.
Tidying the EIA 861 Advanced Metering Infrastructure table.
Transforming raw EIA 861 DataFrames for demand_response_eia861 concatenated across all years.
Dropped 0 duplicate rec

In [14]:
%%time
# Load EIA 861 state total sales; passed to the prediction step below as
# `state_totals`.
state_totals = load_eia861_state_total_sales(pudl_out)

CPU times: user 49.6 ms, sys: 688 µs, total: 50.2 ms
Wall time: 49.2 ms


In [15]:
%%time
# Predict hourly demand per state from the melted FERC 714 demand, the
# county data, the county assignments and the EIA 861 state totals.
# NOTE(review): the effect of mean_overlaps=False is not visible here —
# confirm against the predict_state_hourly_demand() docstring.
prediction = predict_state_hourly_demand(
    demand,
    counties=counties,
    assignments=assignments,
    state_totals=state_totals,
    mean_overlaps=False
)

CPU times: user 8.79 s, sys: 2.13 s, total: 10.9 s
Wall time: 10.6 s


In [16]:
%%time
import pathlib

# Everything lives under <data_dir>/local: the optional Ventyx reference file
# is read from there, and all outputs go into the state-demand subdirectory.
local_dir = pathlib.Path(pudl_settings['data_dir'], 'local')
ventyx_path = local_dir / 'ventyx/state_level_load_2007_2018.csv'

base_dir = local_dir / 'state-demand'
demand_path = base_dir / 'demand.csv'
stats_path = base_dir / 'demand-stats.csv'
timeseries_dir = base_dir / 'timeseries'
scatter_dir = base_dir / 'scatter'

# Create the output directories up front (no error if they already exist).
for output_dir in (base_dir, timeseries_dir, scatter_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

CPU times: user 489 µs, sys: 212 µs, total: 701 µs
Wall time: 773 µs


In [17]:
%%time
# Write predicted hourly state demand
# Datetimes are written at hour resolution (date_format '%Y%m%dT%H', e.g.
# 20190101T00), floats with one decimal place, and the index is omitted.
prediction.to_csv(
    demand_path, index=False, date_format='%Y%m%dT%H', float_format='%.1f'
)

CPU times: user 58.2 s, sys: 267 ms, total: 58.5 s
Wall time: 58.8 s


In [22]:
%%time
# Use the Ventyx state-level load as the reference series when the file is
# present locally; otherwise `reference` is None and comparisons are skipped.
reference = (
    load_ventyx_hourly_state_demand(ventyx_path)
    if ventyx_path.exists()
    else None
)

CPU times: user 44.3 s, sys: 4.49 s, total: 48.8 s
Wall time: 52.6 s


In [34]:
%%time
# Plots and statistics
# For every state in the prediction: save a timeseries plot and, when
# reference demand exists for that state, compute comparison statistics and
# save a scatter plot of predicted against reference demand.
stats = []
for fips in prediction['state_id_fips'].unique():
    state = lookup_state(fips)
    title = f'{state["fips"]}: {state["name"]} ({state["code"]})'
    plot_name = f'{state["fips"]}-{state["name"]}.png'
    # Filter demand by state. A boolean mask compares against the value
    # itself, so it works whether FIPS codes are stored as integers or as
    # strings such as '06'; interpolating the bare value into a query string
    # only parses / matches for numeric codes.
    a = prediction[prediction['state_id_fips'] == fips]
    b = None
    if reference is not None:
        b = reference[reference['state_id_fips'] == fips]
    # Save timeseries plot
    # NOTE(review): window=168 is presumably a one-week rolling window in
    # hours — confirm against plot_demand_timeseries().
    plot_demand_timeseries(
        a, b=b, window=168, title=title, path=timeseries_dir / plot_name
    )
    # Without reference data for this state there is nothing to compare.
    if b is None or b.empty:
        continue
    # Align predicted and reference demand on their shared timestamps
    a = a.set_index('utc_datetime')
    b = b.set_index('utc_datetime')
    index = a.index.intersection(b.index)
    a = a.loc[index].reset_index()
    b = b.loc[index].reset_index()
    # Compute statistics
    stat = compare_state_demand(a, b, scaled=True)
    stat['state_id_fips'] = fips
    stats.append(stat)
    # Save scatter plot
    plot_demand_scatter(a, b=b, title=title, path=scatter_dir / plot_name)

# Write statistics. `stats` is empty when there is no reference data or when
# no state overlaps with it; pd.concat() raises ValueError on an empty list,
# so the file is only written when there is something to write.
if stats:
    pd.concat(stats, ignore_index=True).to_csv(
        stats_path, index=False, float_format='%.1f'
    )


CPU times: user 3min 16s, sys: 2.54 s, total: 3min 18s
Wall time: 3min 29s
