In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import sys
import os
import io
import shutil
import pathlib
import importlib
import json

import pudl
import pudl.glue.ferc1_eia
from pudl import init
from pudl import constants as pc
import logging
import yaml

In [2]:
import uuid
import itertools
import datetime
import re

In [None]:
# Build `pudl_settings`; later cells pass it to the pudl.output.export helpers.
pudl_settings = pudl.settings.init()

In [3]:
# Send every log record to stdout as a bare message so that progress output
# from the ETL shows up directly underneath the cell that triggered it.
logger = logging.getLogger()

formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Replace (rather than append to) the handler list so that re-running this
# cell does not attach a second handler and duplicate every message.
logger.handlers = [handler]
logger.setLevel(logging.INFO)

In [None]:
import sqlalchemy as sa

# Connect to the PUDL database (testing=False selects the non-testing one).
pudl_engine = init.connect_db(testing=False)
# sa.inspect() is SQLAlchemy's supported way to obtain an Inspector for an
# engine; Inspector.from_engine() is deprecated as of SQLAlchemy 1.4.
insp = sa.inspect(pudl_engine)
# extract the table names
tbls = insp.get_table_names()
# extract only the ferc tables (any table whose name contains "ferc")
ferc_tbls = [table_name for table_name in tbls if "ferc" in table_name]

If you want to generate the package for just FERC as an example, you can use the commented-out FERC options below.

In [None]:
# Active configuration: package every table found in the database.
# `name` and `title` are inserted into the package skeleton in the next cell.
name = "pudl-test"
title = "All tables integrated into PUDL."
# Self-assignment is a deliberate no-op: it keeps this block parallel to the
# commented-out FERC-only alternative below, where `tbls` is narrowed.
tbls = tbls

# FERC-only alternative: comment out the three lines above and uncomment these.
#name = "pudl-ferc1"
#title =  "FERC Form 1 tables integrated into PUDL."
#tbls = ferc_tbls

In [None]:
# we need this as the main info regarding this iteration of packaging
pkg_skeleton = {
    "name": name,
    "title": title,
    "description": "A full metadat description of all PUDL tables.",
    "keywords": [
        "ferc",
        "form 1",
        "energy",
        "electricity",
        "utility",
        "fuel",
        "expenses",
        "coal",
        "natural gas",
        "generation",
        "regulation"
    ]
}

This will generate metadata and CSVs for every table in PUDL. They will live in `results/data_pkgs`.

In [None]:
# Package the selected tables using the skeleton metadata and PUDL settings
# defined above. NOTE(review): the effect of dry_run=True is defined inside
# pudl.output.export.data_package — confirm it before relying on the output.
pkg = pudl.output.export.data_package(
    tbls,
    pkg_skeleton,
    pudl_settings,
    dry_run=True,
)

file structure in dp directory:
- Cems_pkg
    - data
        - cems2017.csv.gz
        - cems2016.csv.gz
        - cems2015.csv.gz
        (see "Compression of Resources": http://frictionlessdata.io/specs/patterns/)
    - datapackage.json
        which includes the list of file paths in the resource's "path" property
        (see "Data in Multiple Files": https://frictionlessdata.io/specs/data-resource/)


I need to convert the ETL cems process to generate years of cems at a time and write it compressed.
- run either the whole ETL process over one year of CEMS at a time, or somehow run the load step over one year of CEMS data at a time
- special cems loading with compression...

In [7]:
# (Re)build `pudl_settings` for the data package cells below.
# NOTE(review): this repeats the pudl.settings.init() call made in an earlier
# cell — confirm whether both are needed.
pudl_settings = pudl.settings.init()

In [8]:
# Load the package settings named by 'settings_datapackage_default.yml';
# the result is fed to generate_data_packages() further down.
pkg_bundle_settings = pudl.settings.grab_package_settings(
    pudl_settings,
    'settings_datapackage_default.yml',
)

In [9]:
# Pull out the entry at index 1 of the bundle — presumably the settings for
# one individual package; verify against the settings file.
# NOTE(review): `pkg_settings` is not used by any later cell visible here.
pkg_settings = pkg_bundle_settings[1]

In [12]:
# Run the export for the whole settings bundle. The result is indexed by
# package name in the cells below (e.g. metas['eia-test']).
metas = pudl.output.export.generate_data_packages(
    pkg_bundle_settings,
    pudl_settings,
    debug=True,
)

Loading Static IPM Tables regions_entity_epaipm dataframe into CSV
Beginning ETL for EPA IPM.
Extracting data from EPA IPM transmission_single_epaipm spreadsheet.
Extracting data from EPA IPM transmission_joint_epaipm spreadsheet.
Extracting data from EPA IPM load_curves_epaipm spreadsheet.
Extracting data from EPA IPM plant_region_map_epaipm spreadsheet.
Extracting data from EPA IPM plant_region_map_epaipm spreadsheet.
Transforming raw EPA IPM DataFrames for transmission_single_epaipm
Transforming raw EPA IPM DataFrames for transmission_joint_epaipm
Transforming raw EPA IPM DataFrames for load_curves_epaipm
Transforming raw EPA IPM DataFrames for plant_region_map_epaipm
Loading EPA IPM transmission_single_epaipm dataframe into CSV
Loading EPA IPM transmission_joint_epaipm dataframe into CSV
Loading EPA IPM load_curves_epaipm dataframe into CSV
Loading EPA IPM plant_region_map_epaipm dataframe into CSV
Tables are consistent for epaipm-test package
Loading Static EIA Tables fuel_type_ei

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Harvesting IDs & consistently static attributes for EIA generators
Average consistency of static generators values is 100.00%
Harvesting IDs & consistently static attributes for EIA utilities
Average consistency of static utilities values is 100.00%
Harvesting IDs & consistently static attributes for EIA boilers
Average consistency of static boilers values is 97.58%
Inferring complete EIA boiler-generator associations.
Multiple EIA unit codes:plant_id_eia=10725, unit_id_pudl=1, unit_id_eia=['F801' 'F802']
Multiple EIA unit codes:plant_id_eia=55309, unit_id_pudl=1, unit_id_eia=['SMR1' 'SMR2']
Multiple EIA unit codes:plant_id_eia=60786, unit_id_pudl=1, unit_id_eia=[4343 4141]
Loading Entities plants_entity_eia dataframe into CSV
Loading Entities generators_entity_eia dataframe into CSV
Loading Entities utilities_entity_eia dataframe into CSV
Loading Entities boilers_entity_eia dataframe into CSV
Loading EIA ownership_eia860 dataframe into CSV
Loading EIA generators_eia860 dataframe into 

  + pd.to_timedelta(df["op_hour"], unit="h", box=False)


    Loading 70,080 records (10 MB) into PUDL.
    Loading    EPA CEMS took 00:00:02
Tables are consistent for epacems_eia860 package


In [13]:
# Display the report stored for the 'eia-test' package (overall validity,
# error count, and a 'tables' list with one entry per table).
metas['eia-test']

{'time': 4.217,
 'valid': True,
 'error-count': 0,
 'table-count': 10,
 'tables': [{'datapackage': '/Users/christinagosnell/code/pudl/results/datapackage/0.1.dev1625+gd85ca22.d20190726/eia-test/datapackage.json',
   'time': 3.218,
   'valid': True,
   'error-count': 0,
   'row-count': 1000,
   'source': '/Users/christinagosnell/code/pudl/results/datapackage/0.1.dev1625+gd85ca22.d20190726/eia-test/data/boiler_fuel_eia923.csv',
   'headers': ['id',
    'plant_id_eia',
    'boiler_id',
    'fuel_type_code',
    'fuel_type_code_pudl',
    'report_date',
    'fuel_consumed_units',
    'fuel_mmbtu_per_unit',
    'sulfur_content_pct',
    'ash_content_pct'],
   'format': 'inline',
   'schema': 'table-schema',
   'errors': []},
  {'datapackage': '/Users/christinagosnell/code/pudl/results/datapackage/0.1.dev1625+gd85ca22.d20190726/eia-test/datapackage.json',
   'time': 2.906,
   'valid': True,
   'error-count': 0,
   'row-count': 1000,
   'source': '/Users/christinagosnell/code/pudl/results/dat

In [None]:
# Show the report for the second table in the 'epaipm-test' package. Each
# per-package report is a dict whose 'tables' key holds the list of per-table
# reports, so index into that list; indexing the dict itself with 1 would
# raise KeyError.
metas['epaipm-test']['tables'][1]

- the main coordinating function is pudl.output.export.generate_data_packages()
    - uses settings:
        - a yml file (ex: `settings_init_pudl_package.yml`) and settings.py
    - validates settings using `pudl.ETL_pkg._input_validate`... this spits out a new, validated list of package settings
    - then, for each of the packages defined in the settings, run the ETL_pkg function, which runs a data-source-specific function for each dataset. The ETL_pkg function needs the settings/inputs. In a stand-alone version of this function these can be validated settings or non-validated settings. The settings will get re-validated immediately inside of the data-source-specific ETL function. These ETL functions generate CSVs in the out_dir/(package name)/data folder.
    - an output of the ETL_pkg function is a list of the tables being generated. This list is piped through to a `test_file_consistency` function, which ensures that the ETL tables, the CSVs, and the dependencies from the metadata are consistent with one another.
    - then the `generate_metadata` function is run. at a high level, this generates and validates the `datapackage.json` file and runs `goodtables.validate`.