In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import sys
import os
import io
import shutil

import pudl
import pudl.glue.ferc1_eia
from pudl import init
from pudl import constants as pc
import tableschema
import hashlib
import datapackage
import json
import logging
from tableschema import Table
from tableschema import Schema

In [2]:
from pudl import ETL_pkg
from pudl.ETL_pkg import _ETL_eia_pkg, _ETL_ferc1_pkg, _ETL_glue, _ETL_EPACEMS_pkg
from pudl.ETL_pkg import _input_validation, _input_validate_eia,_input_validate_ferc1, _input_validate_epacems

In [3]:
# Send everything logged through the root logger to stdout so that it appears
# in the notebook's cell output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(stream=sys.stdout)
# Emit only the message text (no timestamp, level, or logger name).
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
# Replace the handler list instead of appending to it, so re-running this cell
# does not stack up handlers and duplicate every log line.
logger.handlers = [handler]

In [4]:
from sqlalchemy.engine import reflection
# Connect to the PUDL database and build an inspector over its schema.
# NOTE(review): testing=False presumably selects the non-test database -- confirm.
pudl_engine = init.connect_db(testing=False)
insp = reflection.Inspector.from_engine(pudl_engine)
# extract the table names
tbls = insp.get_table_names()
# extract only the ferc tables (any table whose name contains "ferc")
ferc_tbls = [s for s in tbls if "ferc" in s]

If you want to generate the package for just the FERC tables as an example, you can use the commented-out FERC options below.

In [5]:
# Package identity and table selection. `name` and `title` are placed into
# pkg_skeleton, and `name` is also passed as pkg_name when the dependent tables
# are looked up.
name = "pudl-test"
title = "All tables integrated into PUDL."
# Self-assignment is a no-op; it marks the line that the FERC-only alternative
# below replaces.
tbls = tbls

# FERC-only alternative: uncomment these three lines instead of the three above.
#name = "pudl-ferc1"
#title =  "FERC Form 1 tables integrated into PUDL."
#tbls = ferc_tbls

In [6]:
# Leave the EPA CEMS hourly table out of the list of tables to package.
# A filtered copy is used instead of tbls.remove(): remove() raises ValueError
# when the table is not in the list (e.g. when this cell is re-run, or when the
# FERC-only table list is selected) and mutates the list in place.
tbls = [tbl for tbl in tbls if tbl != 'hourly_emissions_epacems']

In [7]:
# we need this as the main info regarding this iteration of packaging
# It is passed to pudl.output.export.data_package() together with the list of
# tables. "name" and "title" come from the selection cell above.
# NOTE(review): the description and keywords below are FERC Form 1 specific,
# while the default title describes all PUDL tables -- confirm which is intended.
pkg_skeleton = {
    "name": name,
    "title": title,
    "description": "Tables collected from FERC Form 1, as they appear in our database, with all dependent tables. Allows the user to instantiate their own local database.",
    "keywords": [
        "ferc",
        "form 1",
        "energy",
        "electricity",
        "utility",
        "fuel",
        "expenses",
        "coal",
        "natural gas",
        "generation",
        "regulation"
    ]
}

This will generate metadata and CSVs for every table in PUDL. They will live in `results/data_pkgs`.

In [8]:
# Build the data package for the selected tables using the skeleton defined above.
# NOTE(review): dry_run=True is passed here -- confirm that this still writes the
# metadata that the later cells read back.
pkg = pudl.output.export.data_package(tbls,pkg_skeleton,dry_run=True)

Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for prime_movers_eia923
Finding dependent tables for fuel_type_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for fuel_type_aer_eia923
Finding dependent tables for fuel_type_eia923
Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for generators_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_entity_eia
Finding dependent tables for transport_modes_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for energy_source_eia923
Finding dependent tables for coalmine_eia923
Finding dependent tables for utilities_ent

Finding dependent tables for plants
Finding dependent tables for prime_movers_eia923
Finding dependent tables for fuel_type_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for fuel_type_aer_eia923
Finding dependent tables for plants_entity_eia
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for fuel_type_eia923
Finding dependent tables for utilities_entity_eia
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for

Here we pull the list of dependent tables based on the generated metadata.

In [9]:
# Look up the dependent tables for the FERC tables, using the package named above.
# NOTE(review): presumably this reads the metadata generated in the previous step -- confirm.
dependent_tbls = pudl.helpers.get_dependent_tables_from_list_pkg(ferc_tbls,pkg_name=name)

Finding dependent tables for utilities
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for ferc_accounts
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for ferc_depreciation_lines
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent tables for utilities_ferc
Finding dependent tables for utilities
Finding dependent tables for plants_ferc
Finding dependent tables for plants
Finding dependent table

In [10]:
def _prep_directories(pkg_dir):
    # delete package directories if they exist
    if os.path.exists(pkg_dir):
        shutil.rmtree(pkg_dir)

    # create the main package directory
    os.mkdir(pkg_dir)
    # also create the data directory for the CSVs to live in
    os.mkdir(os.path.join(pkg_dir,'data'))

In [11]:
# One function that generates the CSVs
# One function that generates the metadata
# One function that coordinates the two and validates

In [12]:
def generate_resources(all_tables, mega_pkg_dir):
    """
    Select the resource descriptors for a set of tables from an existing package.

    Reads ``datapackage.json`` from ``mega_pkg_dir`` and keeps the entries of
    its ``resources`` list whose ``name`` is in ``all_tables``, in the order
    they appear in the file.

    Args:
        all_tables (list): names of the tables whose descriptors to keep.
        mega_pkg_dir (str): directory containing the ``datapackage.json`` to
            read the descriptors from.

    Returns:
        dict: a dictionary with the single key ``'resources'`` mapping to the
        list of selected descriptors.
    """
    # here is the metadata file of the package the descriptors are pulled from
    pkg_json = os.path.join(mega_pkg_dir, "datapackage.json")

    # JSON text is UTF-8; don't depend on the platform's default encoding.
    with open(pkg_json, encoding="utf-8") as md:
        metadata_mega = json.load(md)

    resources_list = [
        tbl for tbl in metadata_mega['resources']
        if tbl['name'] in all_tables
    ]
    # TODO: We need to re-hash with hash_csv!!
    return {'resources': resources_list}

In [17]:
def ETL_pkg(out_dir, settings_init):
    """
    Run the per-dataset ETL functions for every package in ``settings_init``.

    The settings are validated first. Then, for each package, its directory
    under ``out_dir`` is recreated from scratch and the ETL function matching
    each dataset key is called with the package directory and that dataset's
    settings.

    Note: defining this function rebinds the name ``ETL_pkg``, which this
    notebook also imports from ``pudl`` as a module.

    Args:
        out_dir (str): directory under which one directory per package is
            created, named after the package.
        settings_init (list): one dictionary per package, each with a 'name'
            and a 'datasets' list of ``{dataset: settings}`` dictionaries.

    Returns:
        dict: package name mapped to the combined list of whatever the ETL
        functions returned for that package (presumably the names of the
        tables loaded -- confirm against the ETL functions).
    """
    # dispatch table: dataset key used in the settings -> ETL function
    etl_dispatch = dict(
        eia=_ETL_eia_pkg,
        ferc1=_ETL_ferc1_pkg,
        epacems=_ETL_EPACEMS_pkg,
        glue=_ETL_glue,
    )

    _input_validation(settings_init)

    # collects the list of tables being loaded, keyed by package name
    tables_by_pkg = {}
    for pkg_settings in settings_init:
        pkg_name = pkg_settings['name']
        print(pkg_name)
        # where this package lives; wiped and recreated before loading
        pkg_dir = os.path.join(out_dir, pkg_name)
        _prep_directories(pkg_dir)
        # gather the tables produced by every dataset in this package
        pkg_tables = []
        for dataset_dct in pkg_settings['datasets']:
            for dataset, dataset_settings in dataset_dct.items():
                etl_function = etl_dispatch[dataset]
                pkg_tables.extend(etl_function(pkg_dir, dataset_settings))
        # Add an assertion that tables = os.listdir(os.path.join(pkg_dir,"data"))
        tables_by_pkg[pkg_name] = pkg_tables
    return tables_by_pkg

In [18]:
from pudl.settings import SETTINGS
# needed input
# out_dir: directory under which ETL_pkg() creates one directory per package
out_dir = SETTINGS['out_dir']
# package definitions loaded from the named settings file
settings_init = pudl.settings.settings_init(settings_file='settings_init_pudl_package.yml')

In [20]:
# Run the ETL for every package in the settings; the result is keyed by package name.
tables_dct = ETL_pkg(out_dir,settings_init)

Validating inputs.
no CEMS for now...
eia-test
Loading Static EIA Tables fuel_type_eia923 dataframe into CSV
Loading Static EIA Tables prime_movers_eia923 dataframe into CSV
Loading Static EIA Tables fuel_type_aer_eia923 dataframe into CSV
Loading Static EIA Tables energy_source_eia923 dataframe into CSV
Loading Static EIA Tables transport_modes_eia923 dataframe into CSV
Extracting EIA 923 spreadsheets for 2017.
Converting EIA 923 generation_fuel spreadsheet tab from 2017 into a pandas DataFrame
Converting EIA 923 stocks spreadsheet tab from 2017 into a pandas DataFrame
Converting EIA 923 boiler_fuel spreadsheet tab from 2017 into a pandas DataFrame
Converting EIA 923 generator spreadsheet tab from 2017 into a pandas DataFrame
Converting EIA 923 fuel_receipts_costs spreadsheet tab from 2017 into a pandas DataFrame
Beginning ETL for EIA 860.
Extracting data from EIA 860 enviro_assn spreadsheet for 2017.
Converting EIA 860 spreadsheet tab boiler_generator_assn to pandas DataFrame for 201

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


       Ratio: 1.0  Wrongos: 0.0  Total: 10099   longitude
       Ratio: 1.0  Wrongos: 0.0  Total: 10088   nerc_region
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   plant_name
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   primary_purpose_naics_id
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   sector_id
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   sector_name
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   state
       Ratio: 1.0  Wrongos: 0.0  Total: 9943   street_address
       Ratio: 1.0  Wrongos: 0.0  Total: 10105   zip_code
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   ash_impoundment
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   ash_impoundment_lined
       Ratio: 1.0  Wrongos: 0.0  Total: 362   ash_impoundment_status
       Ratio: 1.0  Wrongos: 0.0  Total: 10127   energy_storage
       Ratio: 1.0  Wrongos: 0.0  Total: 491   ferc_cogen_docket_no
       Ratio: 1.0  Wrongos: 0.0  Total: 4475   water_source
       Ratio: 1.0  Wrongos: 0.0  Total: 1139   ferc_exempt_wholesa

        boiler_fuel_eia923...
    Casting harvested IDs to correct data types
       Ratio: 0.976  Wrongos: 113.0  Total: 4674   prime_mover_code
Average consistency of static boilers values is 97.58%
Inferring complete EIA boiler-generator associations.
Multiple EIA unit codes:plant_id_eia=10725, unit_id_pudl=1, unit_id_eia=['F801' 'F802']
Multiple EIA unit codes:plant_id_eia=55309, unit_id_pudl=1, unit_id_eia=['SMR1' 'SMR2']
Multiple EIA unit codes:plant_id_eia=60786, unit_id_pudl=1, unit_id_eia=[4343 4141]
Loading Entities plants_entity_eia dataframe into CSV
Loading Entities generators_entity_eia dataframe into CSV
Loading Entities utilities_entity_eia dataframe into CSV
Loading Entities boilers_entity_eia dataframe into CSV
Loading EIA ownership_eia860 dataframe into CSV
Loading EIA generators_eia860 dataframe into CSV
Loading EIA plants_eia860 dataframe into CSV
Loading EIA boiler_generator_assn_eia860 dataframe into CSV
Loading EIA utilities_eia860 dataframe into CSV
Loading EIA

In [22]:
# Pull the resource descriptors for the 'eia-test' package's tables out of the
# datapackage.json stored in the 'pudl-test' directory under out_dir.
eia_resources = generate_resources(tables_dct['eia-test'], os.path.join(out_dir,'pudl-test'))

In [None]:
# I've been cannibalizing this but it's by no means functional yet...
# we need to rerun the parts that can/should only be done while
# we're generating the package and squish
def get_tabular_data_resource_2(table_name, pkg_dir, testing=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL table's CSV file.

    Work in progress. Builds a descriptor for ``data/<table_name>.csv`` inside
    ``pkg_dir`` holding the file's relative path, size in bytes, hash, license
    and data sources, then validates it as a ``datapackage.Resource``. The
    Tabular Data Resource specification can be found here:

    https://frictionlessdata.io/specs/tabular-data-resource/

    Args:
        table_name (str): name of the table; the CSV is expected at
            ``data/<table_name>.csv`` under ``pkg_dir``.
        pkg_dir (str): directory of the data package containing the CSV.
        testing (bool): currently unused; only referenced by the
            commented-out ``get_table`` call.

    Returns:
        dict: the resource descriptor.

    Raises:
        AssertionError: if the descriptor is not a valid resource.
    """
    # table = get_table(tablename, testing=testing)

    # Where the CSV file holding the data is, relative to datapackage.json
    # This is the value that has to be embedded in the data package.
    csv_relpath = os.path.join('data', f'{table_name}.csv')
    # We need to access the file to calculate hash and size too:
    csv_abspath = os.path.join(os.path.abspath(pkg_dir), csv_relpath)

    descriptor = {} #import existing descriptor...
    descriptor['path'] = csv_relpath
    descriptor['bytes'] = os.path.getsize(csv_abspath)
    # NOTE(review): hash_csv is not defined or imported in the notebook cells
    # shown here; it has to be brought into scope before this function can run.
    descriptor['hash'] = hash_csv(csv_abspath)

    # If omitted, licenses are inherited from the containing data package.
    descriptor["licenses"] = [pudl.constants.licenses['cc-by-4.0'], ]

    data_sources = \
        pudl.helpers.data_sources_from_tables([table_name, ])
    # descriptor["sources"] = \
    #    [pudl.constants.data_sources[src] for src in data_sources]
    # Only sources listed in pudl.constants.data_sources are recorded.
    descriptor["sources"] = []
    for src in data_sources:
        if src in pudl.constants.data_sources:
            descriptor["sources"].append({"title": src,
                                          "path": pc.base_data_urls[src]})

    resource = datapackage.Resource(descriptor)
    if not resource.valid:
        raise AssertionError(
            f"""
            Invalid tabular data resource: {resource.name}

            Errors:
            {resource.errors}
            """
        )

    return descriptor