In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes and edits to local source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

# Local libraries
import pudl
import pudl.output.ferc714
import pudl.output.pudltabl
from pudl.workspace.setup import PudlPaths

# Configure Display Parameters

In [3]:
sns.set()
%matplotlib inline
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Use Python Logging facilities
* Using a logger from the beginning makes it easier to move this code into the PUDL package later.
* Creating a logging handler here also lets you see the logging output from PUDL and other underlying packages.

In [4]:
# Build the stdout handler first: bare messages only, no level/name prefix.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger so records from imported packages are shown too.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Assign (rather than append to) the handler list so that re-running this cell
# does not attach a second handler and duplicate every log line.
logger.handlers = [handler]

# Define Functions

# Define Notebook Parameters

In [5]:
# Resolve the workspace paths once and reuse the same PudlPaths instance for
# both engines. PudlPaths is imported with the rest of the notebook's
# dependencies in the imports cell at the top.
pudl_paths = PudlPaths()

# SQLAlchemy engine for the ferc1 SQLite database.
ferc1_engine = sa.create_engine(pudl_paths.sqlite_db_uri("ferc1"))
display(ferc1_engine)

# SQLAlchemy engine for the PUDL SQLite database.
pudl_engine = sa.create_engine(pudl_paths.pudl_db)
display(pudl_engine)

{'pudl_in': '/home/zane/code/catalyst/pudl-work',
 'data_dir': '/home/zane/code/catalyst/pudl-work/data',
 'settings_dir': '/home/zane/code/catalyst/pudl-work/settings',
 'pudl_out': '/home/zane/code/catalyst/pudl-work',
 'sqlite_dir': '/home/zane/code/catalyst/pudl-work/sqlite',
 'parquet_dir': '/home/zane/code/catalyst/pudl-work/parquet',
 'datapkg_dir': '/home/zane/code/catalyst/pudl-work/datapkg',
 'notebook_dir': '/home/zane/code/catalyst/pudl-work/notebook',
 'ferc1_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/ferc1.sqlite',
 'pudl_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/pudl.sqlite'}

Engine(sqlite:////home/zane/code/catalyst/pudl-work/sqlite/ferc1.sqlite)

Engine(sqlite:////home/zane/code/catalyst/pudl-work/sqlite/pudl.sqlite)

# Load Data

In [6]:
# Create the PUDL output object from the PUDL database engine; it is handed to
# the FERC 714 Respondents class in the next cell.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)

In [7]:
%%time
# Build the FERC 714 respondent outputs one step at a time. The statement order
# is deliberately left unchanged: later steps presumably build on results
# computed by earlier ones — TODO confirm against pudl.output.ferc714.
# The recorded run of this cell took 12min 39s of wall time, most of it spent in
# the interim EIA 861 and FERC 714 ETL runs reported in the log output.
ferc714_out = pudl.output.ferc714.Respondents(pudl_out)
annualized = ferc714_out.annualize()  # not referenced again in the visible cells
categorized = ferc714_out.categorize()
summarized = ferc714_out.summarize_demand()
fipsified = ferc714_out.fipsify()
counties_gdf = ferc714_out.georef_counties()

Running the interim EIA 861 ETL process! (~2 minutes)
Extracting eia861 spreadsheet data.


The data has not yet been validated, and the structure may change.


Transforming raw EIA 861 DataFrames for service_territory_eia861 concatenated across all years.
Assigned state FIPS codes for 100.00% of records.
Assigned county FIPS codes for 99.64% of records.
Transforming raw EIA 861 DataFrames for balancing_authority_eia861 concatenated across all years.
Started with 37622 missing BA Codes out of 38882 records (96.76%)
Ended with 12674 missing BA Codes out of 38882 records (32.60%)
Transforming raw EIA 861 DataFrames for sales_eia861 concatenated across all years.
Tidying the EIA 861 Sales table.
Dropped 0 duplicate records from EIA 861 Demand Response table, out of a total of 301045 records (0.0000% of all records). 
Performing value transformations on EIA 861 Sales table.
Transforming raw EIA 861 DataFrames for advanced_metering_infrastructure_eia861 concatenated across all years.
Tidying the EIA 861 Advanced Metering Infrastructure table.
Transforming raw EIA 861 DataFrames for demand_response_eia861 concatenated across all years.
Tidying the E

  mask = arr == x


Running the interim FERC 714 ETL process! (~11 minutes)
Extracting respondent_id_ferc714 from CSV into pandas DataFrame.
Extracting id_certification_ferc714 from CSV into pandas DataFrame.
Extracting gen_plants_ba_ferc714 from CSV into pandas DataFrame.


The data has not yet been validated, and the structure may change.


Extracting demand_monthly_ba_ferc714 from CSV into pandas DataFrame.
Extracting net_energy_load_ba_ferc714 from CSV into pandas DataFrame.
Extracting adjacency_ba_ferc714 from CSV into pandas DataFrame.
Extracting interchange_ba_ferc714 from CSV into pandas DataFrame.
Extracting lambda_hourly_ba_ferc714 from CSV into pandas DataFrame.
Extracting lambda_description_ferc714 from CSV into pandas DataFrame.
Extracting description_pa_ferc714 from CSV into pandas DataFrame.
Extracting demand_forecast_pa_ferc714 from CSV into pandas DataFrame.
Extracting demand_hourly_pa_ferc714 from CSV into pandas DataFrame.
Transforming respondent_id_ferc714.
Transforming id_certification_ferc714.
Transforming gen_plants_ba_ferc714.
Transforming demand_monthly_ba_ferc714.
Transforming net_energy_load_ba_ferc714.
Transforming adjacency_ba_ferc714.
Transforming interchange_ba_ferc714.
Transforming lambda_hourly_ba_ferc714.
Transforming lambda_description_ferc714.
Transforming description_pa_ferc714.
Transfor

  mask = arr == x


We've already got the 2010 Census GeoDB.
Extracting the GeoDB into a GeoDataFrame


  mask = arr == x


CPU times: user 10min 52s, sys: 60 s, total: 11min 52s
Wall time: 12min 39s


In [8]:
# Print the column names, dtypes, and non-null counts of the categorized table.
categorized.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2968 entries, 0 to 2785
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   eia_code                      2954 non-null   Int64         
 1   respondent_type               2870 non-null   category      
 2   respondent_id_ferc714         2968 non-null   Int64         
 3   respondent_name_ferc714       2968 non-null   string        
 4   report_date                   2968 non-null   datetime64[ns]
 5   balancing_authority_id_eia    1806 non-null   Int64         
 6   balancing_authority_code_eia  1176 non-null   category      
 7   balancing_authority_name_eia  1806 non-null   string        
 8   utility_id_eia                1064 non-null   Int64         
 9   utility_name_eia              994 non-null    string        
dtypes: Int64(4), category(2), datetime64[ns](1), string(3)
memory usage: 229.3 KB


In [9]:
# Print the column names, dtypes, and non-null counts of the demand summary table.
summarized.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2968 entries, 0 to 2967
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   report_date                   2968 non-null   datetime64[ns]
 1   respondent_id_ferc714         2968 non-null   Int64         
 2   demand_annual_mwh             2968 non-null   float64       
 3   eia_code                      2954 non-null   Int64         
 4   respondent_type               2870 non-null   category      
 5   respondent_name_ferc714       2968 non-null   string        
 6   balancing_authority_id_eia    1806 non-null   Int64         
 7   balancing_authority_code_eia  1176 non-null   category      
 8   balancing_authority_name_eia  1806 non-null   string        
 9   utility_id_eia                1064 non-null   Int64         
 10  utility_name_eia              994 non-null    string        
dtypes: Int64(4), category(2), date

In [10]:
# Print the column names, dtypes, and non-null counts of the table returned by fipsify().
fipsified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99747 entries, 0 to 2785
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   eia_code                      99733 non-null  Int64         
 1   respondent_type               99649 non-null  category      
 2   respondent_id_ferc714         99747 non-null  Int64         
 3   respondent_name_ferc714       99747 non-null  string        
 4   report_date                   99747 non-null  datetime64[ns]
 5   balancing_authority_id_eia    91893 non-null  Int64         
 6   balancing_authority_code_eia  82262 non-null  category      
 7   balancing_authority_name_eia  91893 non-null  string        
 8   utility_id_eia                7756 non-null   Int64         
 9   utility_name_eia              7339 non-null   string        
 10  state                         98255 non-null  string        
 11  county                       

In [11]:
# Print the column names, dtypes, and non-null counts of the county GeoDataFrame.
counties_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 99747 entries, 0 to 99746
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   county_id_fips                98238 non-null  string        
 1   county_name_census            98221 non-null  object        
 2   geometry                      98221 non-null  geometry      
 3   eia_code                      99733 non-null  Int64         
 4   respondent_type               99649 non-null  category      
 5   respondent_id_ferc714         99747 non-null  Int64         
 6   respondent_name_ferc714       99747 non-null  string        
 7   report_date                   99747 non-null  datetime64[ns]
 8   balancing_authority_id_eia    91893 non-null  Int64         
 9   balancing_authority_code_eia  82262 non-null  category      
 10  balancing_authority_name_eia  91893 non-null  string        
 11  utility_id_eia      

In [12]:
# Intentionally disabled: georef_respondents() takes about 45 minutes to run.
# Uncomment the lines below to run it and inspect the result.
# NOTE(review): DataFrame.info() prints its report and returns None, so wrapping
# it in display() would additionally render "None".
#respondents_gdf = ferc714_out.georef_respondents()
#display(respondents_gdf.info())
#respondents_gdf.sample(10)