In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import sys
import os
import io
import shutil
import pathlib

import pudl
import pudl.glue.ferc1_eia
from pudl import init
from pudl import constants as pc
import logging
import yaml

In [15]:
import uuid
import itertools
import datetime
import re

In [3]:
# Build the PUDL settings object that later cells pass to the export functions.
pudl_settings = pudl.settings.init()

In [4]:
# Emit bare log messages (no level/timestamp prefix) on stdout so they show
# up directly underneath the cell that produced them.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger. The handler list is replaced wholesale rather
# than appended to, so re-running this cell does not duplicate every message.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

In [None]:
import sqlalchemy

# Connect to the PUDL database (testing=False).
pudl_engine = init.connect_db(testing=False)
# sqlalchemy.inspect() is the supported way to obtain an Inspector for an
# engine; Inspector.from_engine() is deprecated as of SQLAlchemy 1.4.
insp = sqlalchemy.inspect(pudl_engine)
# extract the table names
tbls = insp.get_table_names()
# extract only the ferc tables (any table whose name contains "ferc")
ferc_tbls = [s for s in tbls if "ferc" in s]

If you want to generate the package for just the FERC tables as an example, swap in the commented-out FERC options in the cell below.

In [None]:
# Default selection: package every table found in the PUDL database.
name, title = "pudl-test", "All tables integrated into PUDL."
tbls = tbls

# FERC-only alternative: uncomment the lines below to override the defaults
# above and package just the FERC tables.
#name, title = "pudl-ferc1", "FERC Form 1 tables integrated into PUDL."
#tbls = ferc_tbls

In [None]:
# Top-level descriptors for this iteration of packaging. `name` and `title`
# come from the package-selection cell; the description and keywords are
# fixed text attached to whichever set of tables is being packaged.
pkg_skeleton = {
    "name": name,
    "title": title,
    "description": "A full metadata description of all PUDL tables.",
    "keywords": [
        "ferc",
        "form 1",
        "energy",
        "electricity",
        "utility",
        "fuel",
        "expenses",
        "coal",
        "natural gas",
        "generation",
        "regulation",
    ],
}

This will generate metadata and CSVs for every table in PUDL. They will live in `results/data_pkgs`.

In [None]:
# Export the selected tables as a data package described by pkg_skeleton.
# NOTE(review): confirm in data_package() exactly what dry_run=True skips.
pkg = pudl.output.export.data_package(
    tbls,
    pkg_skeleton,
    pudl_settings,
    dry_run=True,
)

file structure in dp directory:
- Cems_pkg
    - data
        - cems2017.csv.gz
        - cems2016.csv.gz
        - cems2015.csv.gz
        (see "Compression of Resources": http://frictionlessdata.io/specs/patterns/)
    - datapackage.json
        which includes a list of paths in "paths" in the resource
        (see "Data in Multiple Files": https://frictionlessdata.io/specs/data-resource/)


I need to convert the ETL cems process to generate years of cems at a time and write it compressed.
- run either the whole ETL process over one year of cems at a time, or somehow run only the load step over one year of cems data at a time
- special cems loading with compression...

In [93]:
# Re-create the PUDL settings object (same call as the earlier settings cell)
# so the data-package cells below can be run on their own.
pudl_settings = pudl.settings.init()

In [92]:
# Read the package definitions from the default data-package settings file.
pkg_bundle_settings = pudl.settings.grab_package_settings(
    pudl_settings,
    'settings_datapackage_default.yml',
)

In [9]:
# Pick out the entry at index 3 of the bundle for closer inspection.
# NOTE(review): presumably each entry is one package's settings, and index 3
# depends on the order of packages in the settings file -- confirm.
pkg_settings = pkg_bundle_settings[3]

In [89]:
# Run the full data-package generation for every package in the bundle,
# with debug output enabled; the results are kept in `metas`.
metas = pudl.output.export.generate_data_packages(
    pkg_bundle_settings,
    pudl_settings,
    debug=True,
)

validating settings

Loading Glue plants dataframe into CSV
Loading Glue utilities dataframe into CSV
Loading Glue utilities_ferc dataframe into CSV
Loading Glue plants_ferc dataframe into CSV
Loading Glue utility_plant_assn dataframe into CSV
Loading Glue utilities_eia dataframe into CSV
Loading Glue plants_eia dataframe into CSV
adding no partitions for glue
adding no partitions for glue
adding no partitions for glue
adding no partitions for glue
adding no partitions for glue
adding no partitions for glue
adding no partitions for glue
adding no partitions for glue
Tables are consistent for glue-test package
adding no partitions for glue

Not loading FERC1
adding no partitions for ferc1
Tables are consistent for ferc1-test package
Not generating metadata for ferc1-test

Not loading EIA.
adding no partitions for eia
Tables are consistent for eia-test package
Not generating metadata for eia-test

Loading Static EIA Tables fuel_type_eia923 dataframe into CSV
Loading Static EIA Tables pri

In [90]:
# Display the second element stored for the 'epacems_eia860' package. In the
# captured output below this is a validation report with 'valid',
# 'error-count' and per-table 'tables' entries.
metas['epacems_eia860'][1]

{'time': 8.502,
 'valid': True,
 'error-count': 0,
 'table-count': 10,
 'tables': [{'datapackage': '/Users/christinagosnell/code/pudl/results/datapackage/epacems_eia860/datapackage.json',
   'time': 5.917,
   'valid': True,
   'error-count': 0,
   'row-count': 1000,
   'source': '/Users/christinagosnell/code/pudl/results/datapackage/epacems_eia860/data/boiler_fuel_eia923.csv',
   'headers': ['id',
    'plant_id_eia',
    'boiler_id',
    'fuel_type_code',
    'fuel_type_code_pudl',
    'report_date',
    'fuel_consumed_units',
    'fuel_mmbtu_per_unit',
    'sulfur_content_pct',
    'ash_content_pct'],
   'format': 'inline',
   'schema': 'table-schema',
   'errors': []},
  {'datapackage': '/Users/christinagosnell/code/pudl/results/datapackage/epacems_eia860/datapackage.json',
   'time': 4.14,
   'valid': True,
   'error-count': 0,
   'row-count': 1000,
   'source': '/Users/christinagosnell/code/pudl/results/datapackage/epacems_eia860/data/boiler_generator_assn_eia860.csv',
   'headers'

- the main coordinating function is pudl.output.export.generate_data_packages()
    - uses settings:
        - a YAML file (e.g. `settings_init_pudl_package.yml`) and settings.py
    - validates settings using `pudl.ETL_pkg._input_validate`... this spits out a new, validated list of package settings
    - then, for each of the packages defined in the settings, run the ETL_pkg function, which runs a data-source-specific function for each dataset. The ETL_pkg function needs the settings/inputs. In a stand-alone version of this function these can be validated settings or non-validated settings. The settings will get re-validated immediately inside of the data-source-specific ETL function. These ETL functions generate CSVs in the out_dir/(package name)/data folder.
    - one output of the ETL_pkg function is a list of the tables being generated. This list is piped through to a `test_file_consistency` function, which ensures that the ETL tables, the CSVs, and the dependencies from the metadata are consistent with one another.
    - then the `generate_metadata` function is run. at a high level, this generates and validates the `datapackage.json` file and runs `goodtables.validate`.