In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import sys
import os
import io

import pudl
import pudl.glue.ferc1_eia
from pudl import init
from pudl import constants as pc
import tableschema
import hashlib
import datapackage
import json
import logging
from tableschema import Table
from tableschema import Schema

In [2]:
# Send every record handled by the root logger (DEBUG and above) to stdout,
# printing only the message text with no timestamp or level prefix.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)
logger = logging.getLogger()
# Replace, rather than append to, any handlers already attached so re-running
# this cell does not duplicate output.
logger.handlers = [handler]
logger.setLevel(logging.DEBUG)

In [3]:
from sqlalchemy.engine import reflection
# Connect to the PUDL database (testing=False) and reflect its schema.
pudl_engine = init.connect_db(testing=False)
insp = reflection.Inspector.from_engine(pudl_engine)
# Names of all tables found in the database...
tbls = insp.get_table_names()
# ...and the subset whose name contains "ferc".
ferc_tbls = [tbl_name for tbl_name in tbls if "ferc" in tbl_name]

To generate a package containing only the FERC tables (as a smaller example), use the commented-out FERC options in the cell below instead.

In [4]:
# Package identity and contents: `name` and `title` are placed into the
# package skeleton, and `tbls` is the list of tables handed to the exporter.
name = "pudl-test"
title = "All tables integrated into PUDL."
tbls = tbls  # no-op: keeps the full table list; see the FERC-only alternative below

# FERC-only alternative (uncomment these three lines and comment out the above):
#name = "pudl-ferc1"
#title =  "FERC Form 1 tables integrated into PUDL."
#tbls = ferc_tbls

In [5]:
# Top-level descriptive metadata for this iteration of packaging.
# NOTE(review): the description is FERC-specific even when `name`/`title`
# describe the full PUDL package -- confirm whether that is intended.
pkg_skeleton = dict(
    name=name,
    title=title,
    description=(
        "Tables collected from FERC Form 1, as they appear in our database, "
        "with all dependent tables. Allows the user to instantiate their own "
        "local database."
    ),
    keywords=[
        "ferc",
        "form 1",
        "energy",
        "electricity",
        "utility",
        "fuel",
        "expenses",
        "coal",
        "natural gas",
        "generation",
        "regulation",
    ],
)

This will generate metadata and CSVs for every table in PUDL. They will live in `results/data_pkgs`.

In [6]:
# Export the selected tables, described by the skeleton metadata above.
pkg = pudl.output.export.data_package(tbls, pkg_skeleton)

Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities
Finding dependent tables for fuel_type_aer_eia923
Finding dependent tables for fuel_type_eia923
Finding dependent tables for prime_movers_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for fuel_type_eia923
Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for coalmine_eia923
Finding dependent tables for energy_source_eia923
Finding dependent tables for transport_modes_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities_ent

Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities_entity_eia
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for plants
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for ferc_depreciation_lines
Finding dependent tables for generat

Here we pull the list of dependent tables for the FERC tables, based on the generated metadata.

In [7]:
# Look up the dependent tables of the FERC tables in the package named `name`.
dependent_tbls = pudl.helpers.get_dependent_tables_from_list_pkg(
    ferc_tbls, pkg_name=name)

Finding dependent tables for utilities
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for ferc_accounts
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for ferc_depreciation_lines
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding depende

In [14]:
# SETTINGS is read below for the CSV output directory ('csvdir');
# settings_init supplies the list of FERC Form 1 tables ('ferc1_tables').
from pudl.settings import SETTINGS
settings_init = pudl.settings.settings_init()

In [15]:
# Inputs for the new init function.

# FERC Form 1
ferc1_tables = settings_init['ferc1_tables']
ferc1_years = [2017]

# EIA 923 / EIA 860
eia923_tables = pc.eia923_pudl_tables
eia923_years = [2017]
eia860_tables = pc.eia860_pudl_tables
eia860_years = [2017]

# EPA CEMS (not requested here)
epacems_years = None
epacems_states = None

# Flags and output location
pudl_testing = None
ferc1_testing = None
debug = None
csvdir = SETTINGS['csvdir']
keep_csv = None

In [16]:
# EPA CEMS contributes its single table only when both states and years were
# requested; otherwise it contributes nothing.
if epacems_states and epacems_years:
    # The trailing comma is required: ('x') is just a parenthesised str, and
    # adding a str to the table tuples below raises TypeError. ('x',) is a
    # one-element tuple, matching the empty tuple used in the else branch.
    epacems_tables = ('hourly_emissions_epacems',)  # should this live in constants?
else:
    epacems_tables = ()
# Concatenate the per-source table collections into one sequence of names.
all_tables = (eia860_tables + eia923_tables + epacems_tables)

In [18]:
# NOTE: this is useful for pulling the dependent tables from the metadata.
# TODO: this dependency check still needs to be incorporated into the ETL
# functions. One remaining question is how the different types of table
# names interact with the ETL functions.
all_tables = pudl.helpers.get_dependent_tables_from_list_pkg(
    all_tables, pkg_name=name)

Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for coalmine_eia923
Finding dependent tables for energy_source_eia923
Finding dependent tables for transport_modes_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities_entity_eia
Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for fuel_type_aer_eia923
Finding dependent tables for fuel_type_eia923
Finding dependent tables for prime_movers_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for fuel_type_eia923
Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for generators_entity_eia
Finding dependent tables for 

In [19]:
def _ingest_datasets_table_pkg(ferc1_years,
                               eia860_years,
                               eia923_years,
                               epacems_years,
                               csvdir):
    """
    Create and populate datasets table.

    This table will be used to determine which sources have been ingested into
    the database in later output or anaylsis.
    """
    datasets = pd.DataFrame.from_records([('ferc1', bool(ferc1_years)),
                                          ('eia860', bool(eia860_years)),
                                          ('eia923', bool(eia923_years)),
                                          ('epacems', bool(epacems_years)), ],
                                         columns=['datasource', 'active'])

    outfile = os.path.join(csvdir, "datasets" + '.csv')
    datasets.to_csv(path_or_buf=outfile, index=False)

In [20]:
def _ETL_eia_pkg(eia923_tables, eia923_years, eia860_tables,
                 eia860_years, csvdir):
    if (not eia923_tables or not eia923_years) and (not eia860_tables or not eia860_years):
        logger.info('Not ingesting EIA.')
        return None

    # Extract EIA forms 923, 860
    eia923_raw_dfs = pudl.extract.eia923.extract(eia923_years=eia923_years)
    eia860_raw_dfs = pudl.extract.eia860.extract(eia860_years=eia860_years)
    # Transform EIA forms 923, 860
    eia923_transformed_dfs = \
        pudl.transform.eia923.transform(eia923_raw_dfs,
                                        eia923_tables=eia923_tables)
    eia860_transformed_dfs = \
        pudl.transform.eia860.transform(eia860_raw_dfs,
                                        eia860_tables=eia860_tables)
    # create an eia transformed dfs dictionary
    eia_transformed_dfs = eia860_transformed_dfs.copy()
    eia_transformed_dfs.update(eia923_transformed_dfs.copy())

    entities_dfs, eia_transformed_dfs = \
        pudl.transform.eia.transform(eia_transformed_dfs,
                                     eia923_years=eia923_years,
                                     eia860_years=eia860_years)
    # Compile transformed dfs for loading...
    transformed_dfs = {"Entities": entities_dfs, "EIA": eia_transformed_dfs}
    # Load step

    for data_source, transformed_df in transformed_dfs.items():
        pudl.load.dict_dump(transformed_df,
                            data_source,
                            need_fix_inting=pc.need_fix_inting,
                            csvdir=csvdir)

In [21]:
def _ETL_ferc1_pkg(ferc1_tables, ferc1_years, ferc1_testing,
                   csvdir):
    if not ferc1_years or not ferc1_tables:
        logger.info('Not ingesting FERC1')
        return None

    # Extract FERC form 1
    ferc1_raw_dfs = pudl.extract.ferc1.extract(ferc1_tables=ferc1_tables,
                                               ferc1_years=ferc1_years,
                                               testing=ferc1_testing)
    # Transform FERC form 1
    ferc1_transformed_dfs = pudl.transform.ferc1.transform(
        ferc1_raw_dfs, ferc1_tables=ferc1_tables)
    # Load FERC form 1
    pudl.load.dict_dump(ferc1_transformed_dfs,
                        "FERC 1",
                        need_fix_inting=pc.need_fix_inting,
                        csvdir=csvdir)

In [22]:
def _ETL_glue(eia923_years, eia860_years, ferc1_years, csvdir):
    """
    Build the FERC1/EIA glue tables and dump them.

    Args:
        eia923_years: forwarded to pudl.glue.ferc1_eia.glue.
        eia860_years: forwarded to pudl.glue.ferc1_eia.glue.
        ferc1_years: forwarded to pudl.glue.ferc1_eia.glue.
        csvdir: passed through to pudl.load.dict_dump.
    """
    # Collect the glue tables that relate FERC1 and EIA data...
    glue_tables = pudl.glue.ferc1_eia.glue(
        eia923_years, eia860_years, ferc1_years)

    # ...and hand them to the dumper under the "Glue" label.
    pudl.load.dict_dump(
        glue_tables,
        "Glue",
        need_fix_inting=pc.need_fix_inting,
        csvdir=csvdir,
    )

In [25]:
def _ingest_static_tables_pkg(csvdir):
    """
    Build the static PUDL lookup tables from pudl.constants and dump them.

    Some values in the data are effectively constant but still need to be
    stored for data validation, for use as foreign keys -- e.g. the list of
    valid EIA fuel type codes. They are defined as dictionaries and
    dataframes in the pudl.constants module. This function turns each of
    them into a small dataframe and hands the whole collection to
    pudl.load.dict_dump under the "Static Tables" label.

    Args:
        csvdir: passed through to pudl.load.dict_dump.
    """
    # Code -> description dictionaries in pudl.constants, paired with the
    # name of the column that holds the description. The table name doubles
    # as the attribute name on the constants module.
    code_tables = (
        ('fuel_type_eia923', 'fuel_type'),
        ('prime_movers_eia923', 'prime_mover'),
        ('fuel_type_aer_eia923', 'fuel_type'),
        ('energy_source_eia923', 'source'),
        ('transport_modes_eia923', 'mode'),
    )

    static_dfs = {}
    for table_name, value_col in code_tables:
        codes = getattr(pc, table_name)
        static_dfs[table_name] = pd.DataFrame({
            'abbr': list(codes.keys()),
            value_col: list(codes.values()),
        })

    # FERC electric plant accounts: drop the row number, collapse runs of
    # whitespace in the description to a single space, and shorten names.
    static_dfs['ferc_accounts'] = (
        pc.ferc_electric_plant_accounts
        .drop('row_number', axis=1)
        .replace({'ferc_account_description': r'\s+'}, ' ', regex=True)
        .rename(columns={'ferc_account_id': 'id',
                         'ferc_account_description': 'description'})
    )

    # FERC accumulated depreciation lines: drop the row number, shorten names.
    static_dfs['ferc_depreciation_lines'] = (
        pc.ferc_accumulated_depreciation
        .drop('row_number', axis=1)
        .rename(columns={'line_id': 'id',
                         'ferc_account_description': 'description'})
    )

    # Run the prepared static tables through dict_dump.
    pudl.load.dict_dump(static_dfs,
                        "Static Tables",
                        need_fix_inting=pc.need_fix_inting,
                        csvdir=csvdir)

In [26]:
# Make sure that the tables we're being asked to ingest can actually be
# pulled into both the FERC Form 1 DB, and the PUDL DB...
logger.info("Beginning PUDL DB ETL process.")

# Table-name validation is skipped entirely when `debug` is truthy. A falsy
# table collection (None or empty) means nothing was requested for that
# source, so `or ()` makes the loop a no-op in that case.
if not debug:
    for table in ferc1_tables or ():
        if table not in pc.ferc1_pudl_tables:
            raise AssertionError(
                f"Unrecognized FERC table: {table}."
            )

    for table in eia860_tables or ():
        if table not in pc.eia860_pudl_tables:
            raise AssertionError(
                f"Unrecognized EIA 860 table: {table}"
            )

    for table in eia923_tables or ():
        if table not in pc.eia923_pudl_tables:
            raise AssertionError(
                f"Unrecognized EIA 923 table: {table}"
            )

# Record which data sources are part of this run.
_ingest_datasets_table_pkg(ferc1_years,
                           eia860_years,
                           eia923_years,
                           epacems_years,
                           csvdir)

# Populate all the static tables:
logger.info("Ingesting static PUDL tables...")
_ingest_static_tables_pkg(csvdir)

# Populate tables that relate FERC1 & EIA923 data to each other.
logger.info("Sniffing EIA923/FERC1 glue tables...")
_ETL_glue(eia923_years,
          eia860_years,
          ferc1_years,
          csvdir)

# FERC Form 1 extract / transform / dump.
_ETL_ferc1_pkg(ferc1_tables,
               ferc1_years,
               ferc1_testing,
               csvdir)

# EIA 923 and 860 extract / transform / dump.
_ETL_eia_pkg(eia923_tables,
             eia923_years,
             eia860_tables,
             eia860_years,
             csvdir)

Beginning PUDL DB ETL process.
Ingesting static PUDL tables...
Loading Static Tables fuel_type_eia923 dataframe into CSV
Loading Static Tables prime_movers_eia923 dataframe into CSV
Loading Static Tables fuel_type_aer_eia923 dataframe into CSV
Loading Static Tables energy_source_eia923 dataframe into CSV
Loading Static Tables transport_modes_eia923 dataframe into CSV
Loading Static Tables ferc_accounts dataframe into CSV
Loading Static Tables ferc_depreciation_lines dataframe into CSV
Sniffing EIA923/FERC1 glue tables...
Loading Glue plants dataframe into CSV
Loading Glue utilities dataframe into CSV
Loading Glue utilities_ferc dataframe into CSV
Loading Glue plants_ferc dataframe into CSV
Loading Glue utility_plant_assn dataframe into CSV
Loading Glue utilities_eia dataframe into CSV
Loading Glue plants_eia dataframe into CSV
Converting extracted FERC Form 1 table fuel_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plants_steam_ferc1 into a pandas DataFrame.
Con

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


       Ratio: 1.0  Wrongos: 0.0  Total: 10099   longitude
       Ratio: 1.0  Wrongos: 0.0  Total: 10088   nerc_region
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   plant_name
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   primary_purpose_naics_id
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   sector_id
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   sector_name
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   state
       Ratio: 1.0  Wrongos: 0.0  Total: 9943   street_address
       Ratio: 1.0  Wrongos: 0.0  Total: 10105   zip_code
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   ash_impoundment
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   ash_impoundment_lined
       Ratio: 1.0  Wrongos: 0.0  Total: 362   ash_impoundment_status
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   energy_storage
       Ratio: 1.0  Wrongos: 0.0  Total: 491   ferc_cogen_docket_no
       Ratio: 1.0  Wrongos: 0.0  Total: 4475   water_source
       Ratio: 1.0  Wrongos: 0.0  Total: 1139   ferc_exempt_wholesa

        boiler_fuel_eia923...
    Casting harvested IDs to correct data types
       Ratio: 0.976  Wrongos: 113.0  Total: 4674   prime_mover_code
Average consistency of static boilers values is 97.58%
Inferring complete EIA boiler-generator associations.
Multiple EIA unit codes:plant_id_eia=10725, unit_id_pudl=1, unit_id_eia=['F801' 'F802']
Multiple EIA unit codes:plant_id_eia=55309, unit_id_pudl=1, unit_id_eia=['SMR1' 'SMR2']
Multiple EIA unit codes:plant_id_eia=60786, unit_id_pudl=1, unit_id_eia=[4343 4141]
Loading Entities plants_entity_eia dataframe into CSV
Loading Entities generators_entity_eia dataframe into CSV
Loading Entities utilities_entity_eia dataframe into CSV
Loading Entities boilers_entity_eia dataframe into CSV
Loading EIA ownership_eia860 dataframe into CSV
Loading EIA generators_eia860 dataframe into CSV
Loading EIA plants_eia860 dataframe into CSV
Loading EIA boiler_generator_assn_eia860 dataframe into CSV
Loading EIA utilities_eia860 dataframe into CSV
Loading EIA