In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import sys
import os
import io
import shutil

import pudl
import pudl.glue.ferc1_eia
from pudl import init
from pudl import constants as pc
import tableschema
import hashlib
import datapackage
import json
import logging
from tableschema import Table
from tableschema import Schema
import datetime
import goodtables

In [2]:
from pudl import ETL_pkg
from pudl.ETL_pkg import ETL_pkg, _input_validate
from pudl.ETL_pkg import _ETL_eia_pkg, _ETL_ferc1_pkg, _ETL_glue, _ETL_cems_pkg

In [3]:
from pudl.settings import SETTINGS

In [4]:
# Build a stdout handler that prints only the bare log message ...
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)

# ... and make it the sole handler of the root logger at INFO verbosity.
# Assigning the list (instead of addHandler) keeps the cell re-runnable
# without accumulating duplicate handlers.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

In [5]:
from sqlalchemy.engine import reflection

# Open a PUDL database connection (testing=False is passed through to
# init.connect_db) and wrap it in an inspector for schema reflection.
pudl_engine = init.connect_db(testing=False)
insp = reflection.Inspector.from_engine(pudl_engine)

# All table names known to the inspector ...
tbls = insp.get_table_names()
# ... and the subset whose name contains "ferc".
ferc_tbls = [tbl_name for tbl_name in tbls if "ferc" in tbl_name]

To generate a package for just the FERC tables as an example, use the commented-out FERC options below instead of the defaults.

In [6]:
# Default selection: package every table listed in `tbls`.
name, title = "pudl-test", "All tables integrated into PUDL."
tbls = tbls

# Alternative selection: FERC Form 1 tables only (uncomment all three lines).
#name = "pudl-ferc1"
#title =  "FERC Form 1 tables integrated into PUDL."
#tbls = ferc_tbls

In [7]:
# Optional: uncomment to drop the 'hourly_emissions_epacems' entry from `tbls`.
# NOTE: assuming `tbls` is a list, .remove() mutates it in place and raises
# ValueError when the entry is absent, so this cell cannot be run twice in a row.
#tbls.remove('hourly_emissions_epacems')

In [8]:
# Top-level descriptor fields for this packaging iteration. `name` and `title`
# are taken from the variables set earlier in the notebook; the description
# and keywords are fixed text.
pkg_skeleton = {
    "name": name,
    "title": title,
    # Spelling corrected ("metadat" -> "metadata"): this string is passed on
    # as the package description.
    "description": "A full metadata description of all PUDL tables.",
    "keywords": [
        "ferc",
        "form 1",
        "energy",
        "electricity",
        "utility",
        "fuel",
        "expenses",
        "coal",
        "natural gas",
        "generation",
        "regulation"
    ]
}

This will generate metadata and CSVs for every table in PUDL. They will live in `results/data_pkgs`.

In [9]:
# Build the package for the selected tables. `dry_run=True` is forwarded to
# data_package as-is (presumably it limits what gets written -- TODO confirm).
pkg = pudl.output.export.data_package(
    tbls,
    pkg_skeleton,
    out_dir=SETTINGS['meta_dir'],
    dry_run=True,
)

Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for fuel_type_eia923
Finding dependent tables for prime_movers_eia923
Finding dependent tables for fuel_type_aer_eia923
Finding dependent tables for fuel_type_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for coalmine_eia923
Finding dependent tables for energy_source_eia923
Finding dependent tables for transport_modes_eia923
Finding dependent tables for regions_entity_ipm
Finding dependent tables for regions_entity_ipm
Finding dependent tables for regions_entity_ipm
Finding dependent tables for regions_entity_

Finding dependent tables for fuel_type_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for plants_ferc
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for plants_entity_eia
Finding dependent tables for coalmine_eia923
Finding dependent tables for energy_source_eia923
Finding dependent tables for transport_modes_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for fuel_type_eia923
Finding dependent tables for prime_movers_eia923
Finding dependent tables for fuel_type_aer_eia923
Finding dependent tables for regions_ent

In [7]:
# Output directory as configured in the PUDL settings.
out_dir = SETTINGS['out_dir']
# Read the packaging configuration from its YAML settings file.
settings_init = pudl.settings.settings_init(
    settings_file='settings_init_pudl_package.yml',
)

In [8]:
# Generate the data packages described by `settings_init`; the result is
# indexed by package name further down in the notebook.
metas = pudl.output.export.generate_data_packages(
    settings_init,
    debug=True,
)

no CEMS for now...
glue-test
Loading Glue plants dataframe into CSV
Loading Glue utilities dataframe into CSV
Loading Glue utilities_ferc dataframe into CSV
Loading Glue plants_ferc dataframe into CSV
Loading Glue utility_plant_assn dataframe into CSV
Loading Glue utilities_eia dataframe into CSV
Loading Glue plants_eia dataframe into CSV
Tables are consistent for glue-test package
Data package data validation failed.
ferc1-test
Loading Static FERC Tables ferc_accounts dataframe into CSV
Loading Static FERC Tables ferc_depreciation_lines dataframe into CSV
Converting extracted FERC Form 1 table fuel_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plants_steam_ferc1 into a pandas DataFrame.
Transforming raw FERC Form 1 dataframe for loading into fuel_ferc1
Transforming raw FERC Form 1 dataframe for loading into plants_steam_ferc1
Identifying distinct large FERC plants for ID assignment.
Successfully associated 880 of 882 (99.77%) FERC Form 1 plant records with mult

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Average consistency of static plants values is 100.00%
Harvesting IDs & consistently static attributes for EIA generators
Average consistency of static generators values is 100.00%
Harvesting IDs & consistently static attributes for EIA utilities
Average consistency of static utilities values is 100.00%
Harvesting IDs & consistently static attributes for EIA boilers
Average consistency of static boilers values is 97.58%
Inferring complete EIA boiler-generator associations.
Multiple EIA unit codes:plant_id_eia=10725, unit_id_pudl=1, unit_id_eia=['F801' 'F802']
Multiple EIA unit codes:plant_id_eia=55309, unit_id_pudl=1, unit_id_eia=['SMR1' 'SMR2']
Multiple EIA unit codes:plant_id_eia=60786, unit_id_pudl=1, unit_id_eia=[4343 4141]
Loading Entities plants_entity_eia dataframe into CSV
Loading Entities generators_entity_eia dataframe into CSV
Loading Entities utilities_entity_eia dataframe into CSV
Loading Entities boilers_entity_eia dataframe into CSV
Loading EIA ownership_eia860 dataframe

In [None]:
# TODO: change the many types of IDs
# TODO: add a UUID to the packages

In [55]:
# Pull individual results out of `metas`, which is keyed by package name.
# Judging by the variable names, position 0 holds the package metadata and
# position 1 the validation result -- TODO confirm against
# generate_data_packages.
ferc_meta = metas['ferc1-test'][0]
ferc_vali = metas['ferc1-test'][1]

eia_vali = metas['eia-test'][1]

In [None]:
def _ETL_cems(pudl_engine, epacems_years, csvdir, keep_csv, states):
    """Extract, transform and bulk-load EPA CEMS data, then finalize the table.

    Nothing is ingested when either ``states`` or ``epacems_years`` is empty
    or None. Otherwise the raw dataframes are extracted, transformed, streamed
    into the ``hourly_emissions_epacems`` table through
    ``pudl.load.BulkCopy`` and finally ``pudl.models.epacems.finalize`` is
    called. Durations of the load and finalize phases are logged at INFO.

    Args:
        pudl_engine: Database engine handed to the transform step, the bulk
            loader and the finalize step.
        epacems_years: Years of EPA CEMS data to process; forwarded to the
            extract step.
        csvdir: Forwarded to ``pudl.load.BulkCopy`` as ``csvdir``.
        keep_csv: Forwarded to ``pudl.load.BulkCopy`` as ``keep_csv``.
        states: Sequence of state strings to process. If its first element is
            ``'all'`` (case-insensitive) it is replaced by every key of
            ``pc.cems_states``.

    Returns:
        None.
    """
    # `time` is not among the notebook's imports, so the original raised
    # NameError at the first time.monotonic() call; import it locally so the
    # function is self-contained.
    import time

    def _elapsed_since(start):
        # Render the monotonic-clock time elapsed since `start` as HH:MM:SS.
        return time.strftime(
            "%H:%M:%S", time.gmtime(time.monotonic() - start))

    # If we're not doing CEMS, just stop here to avoid printing messages like
    # "Reading EPA CEMS data...", which could be confusing.
    if not states or not epacems_years:
        logger.info('Not ingesting EPA CEMS.')
        return None
    if states[0].lower() == 'all':
        states = list(pc.cems_states.keys())

    # NOTE: This is a generator for raw dataframes
    epacems_raw_dfs = pudl.extract.epacems.extract(
        epacems_years=epacems_years, states=states)
    # NOTE: This is a generator for transformed dataframes
    epacems_transformed_dfs = pudl.transform.epacems.transform(
        pudl_engine=pudl_engine, epacems_raw_dfs=epacems_raw_dfs)
    logger.info("Loading tables from EPA CEMS into PUDL:")

    # Always record the start time (it is cheap) so it can never be unbound
    # when the log level is checked again below.
    start_time = time.monotonic()
    with pudl.load.BulkCopy(
            table_name="hourly_emissions_epacems",
            engine=pudl_engine,
            csvdir=csvdir,
            keep_csv=keep_csv) as loader:

        for transformed_df_dict in epacems_transformed_dfs:
            # There's currently only one dataframe in this dict at a time,
            # but that could be changed if useful.
            # The keys to the dict are a tuple (year, month, state)
            for transformed_df in transformed_df_dict.values():
                loader.add(transformed_df)
    # The guard only avoids building the message when INFO is disabled.
    if logger.isEnabledFor(logging.INFO):
        logger.info(
            "    Loading    EPA CEMS took {}".format(
                _elapsed_since(start_time)))

    start_time = time.monotonic()
    pudl.models.epacems.finalize(pudl_engine)
    if logger.isEnabledFor(logging.INFO):
        logger.info(
            "    Finalizing EPA CEMS took {}".format(
                _elapsed_since(start_time)))