In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# source files on disk are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pudl
import pudl.constants as pc
from pudl.analysis.state_demand import (
    load_ferc714_hourly_demand_matrix,
    clean_ferc714_hourly_demand_matrix,
    filter_ferc714_hourly_demand_matrix,
    impute_ferc714_hourly_demand_matrix,
    melt_ferc714_hourly_demand_matrix,
    load_counties,
    load_ferc714_county_assignments,
    load_eia861_state_total_sales,
    predict_state_hourly_demand,
    plot_demand_timeseries,
    plot_demand_scatter,
    load_ventyx_hourly_state_demand,
    lookup_state,
    compare_state_demand,
)

In [None]:
import logging

# Send root-logger records at INFO and above to stdout, printing only the
# message text. The handler list is assigned outright (not appended to), so
# any handlers previously attached to the root logger are dropped and
# re-running this cell does not stack up duplicate handlers.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()
# Render figures inline in the notebook output.
%matplotlib inline

In [None]:
# Default size and resolution for every figure drawn in this notebook.
mpl.rcParams.update({
    'figure.figsize': (10, 4),
    'figure.dpi': 150,
})
# Show up to 100 columns / rows of a DataFrame before pandas truncates it.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# API credentials are read from the environment. Each lookup raises KeyError
# immediately if the variable is not set.
# NOTE(review): none of the API_KEY_* values (nor `ferc1_engine` below) are
# referenced by the cells shown in this notebook — confirm they are still needed.
API_KEY_EIA = os.environ["API_KEY_EIA"]
API_KEY_BLS = os.environ["API_KEY_BLS"]
API_KEY_FRED = os.environ["API_KEY_FRED"]
# Harvest credentials are currently disabled.
#HARVEST_TOKEN = os.environ["HARVEST_TOKEN"]
#HARVEST_ACCOUNT_ID = os.environ["HARVEST_ACCOUNT_ID"]

# Build SQLAlchemy engines from the locations reported by PudlPaths, then wrap
# the PUDL engine in the PudlTabl output object (`pudl_out`) that the loader
# cells below take as their input.
from pudl.workspace.setup import PudlPaths
ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))
pudl_engine = sa.create_engine(PudlPaths().pudl_db)
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)

## Load the FERC 714 data
* Should take ~1 minute.

In [None]:
%%time
# Load the FERC 714 hourly demand matrix (`df1`) together with a companion
# object (`tz`) that is passed back in when the matrix is melted further down.
# NOTE(review): `tz` presumably carries respondent time zone information — confirm.
df1, tz = load_ferc714_hourly_demand_matrix(pudl_out)

## Clean the FERC 714 hourly demand matrix
* This uses ~32GB of memory.
* Takes ~10 minutes.
* **TODO:** Investigate why this step emits a `RuntimeWarning` about an All-NaN slice.

In [None]:
%%time
# Clean the loaded matrix. The result is bound to a new name (`df2`) rather
# than overwriting `df1`.
df2 = clean_ferc714_hourly_demand_matrix(df1)

## Remove respondents lacking data

In [None]:
%%time
# Drop respondents that lack data, using the thresholds given here.
# NOTE(review): `min_data` / `min_data_fraction` presumably are a minimum count
# and minimum fraction of non-null values — confirm against the function docs.
df3 = filter_ferc714_hourly_demand_matrix(df2, min_data=100, min_data_fraction=0.9)

## Impute missing demand values
* Very CPU intensive: takes ~1 hour and saturates all 4 cores of the author's machine.
* Not very memory intensive.

In [None]:
%%time
# Impute the demand values still missing from the filtered matrix.
df4 = impute_ferc714_hourly_demand_matrix(df3)

## Melt FERC 714 hourly demand

In [None]:
%%time
# Melt the imputed matrix into the `demand` table used by the prediction step,
# passing along the `tz` object returned by the load step.
demand = melt_ferc714_hourly_demand_matrix(df4, tz)

In [None]:
%%time
# Load the county table used as an input to the state demand prediction.
# NOTE(review): `pudl_settings` is not assigned in any cell of this notebook, so
# this line raises NameError on a fresh kernel (Restart & Run All). Either
# define it or drop the argument, whichever matches the installed
# `load_counties` signature.
counties = load_counties(pudl_out, pudl_settings)

In [None]:
%%time
# Load the FERC 714 county assignments, passed to the prediction step as
# `assignments`.
assignments = load_ferc714_county_assignments(pudl_out)

In [None]:
%%time
# Load EIA 861 state total sales, passed to the prediction step as
# `state_totals`.
state_totals = load_eia861_state_total_sales(pudl_out)

In [None]:
%%time
# Combine the melted hourly demand with the county, assignment, and state-total
# inputs loaded above. The result carries a `state_id_fips` column and is what
# gets written to CSV and plotted below.
# NOTE(review): `mean_overlaps=False` presumably selects how demand is combined
# where respondents overlap — confirm against the function docs.
prediction = predict_state_hourly_demand(
    demand,
    counties=counties,
    assignments=assignments,
    state_totals=state_totals,
    mean_overlaps=False
)

In [None]:
%%time
import pathlib
local_dir = PudlPaths().data_dir / 'local'
ventyx_path = local_dir / 'ventyx/state_level_load_2007_2018.csv'
base_dir = local_dir / 'state-demand'
base_dir.mkdir(parents=True, exist_ok=True)
demand_path = base_dir / 'demand.csv'
stats_path = base_dir / 'demand-stats.csv'
timeseries_dir = base_dir / 'timeseries'
timeseries_dir.mkdir(parents=True, exist_ok=True)
scatter_dir = base_dir / 'scatter'
scatter_dir.mkdir(parents=True, exist_ok=True)

In [None]:
%%time
# Write predicted hourly state demand
prediction.to_csv(
    demand_path, index=False, date_format='%Y%m%dT%H', float_format='%.1f'
)

In [None]:
%%time
# Load Ventyx as reference if available
reference = None
if ventyx_path.exists():
    reference = load_ventyx_hourly_state_demand(ventyx_path)

In [None]:
%%time
# Plots and statistics
stats = []
for fips in prediction['state_id_fips'].unique():
    state = lookup_state(fips)
    # Filter demand by state
    a = prediction.query(f'state_id_fips == {fips}')
    b = None
    title = f'{state["fips"]}: {state["name"]} ({state["code"]})'
    plot_name = f'{state["fips"]}-{state["name"]}.png'
    if reference is not None:
        b = reference.query(f'state_id_fips == {fips}')
    # Save timeseries plot
    plot_demand_timeseries(
        a, b=b, window=168, title=title, path=timeseries_dir / plot_name
    )
    if b is None or b.empty:
        continue
    # Align predicted and reference demand
    a = a.set_index('utc_datetime')
    b = b.set_index('utc_datetime')
    index = a.index.intersection(b.index)
    a = a.loc[index].reset_index()
    b = b.loc[index].reset_index()
    # Compute statistics
    stat = compare_state_demand(a, b, scaled=True)
    stat['state_id_fips'] = fips
    stats.append(stat)
    # Save scatter plot
    plot_demand_scatter(a, b=b, title=title, path=scatter_dir / plot_name)

# Write statistics
if reference is not None:
    pd.concat(stats, ignore_index=True).to_csv(
        stats_path, index=False, float_format='%.1f'
    )
