In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import sys
import os
import io
import shutil
import pathlib
import importlib
import json

from datapackage import Package
from tableschema import exceptions

import pudl
from pudl import constants as pc
import logging
import yaml

In [None]:
import uuid
import itertools
import datetime
import re
import sqlalchemy as sa
from sqlalchemy.engine import reflection

In [None]:
# Load the default PUDL workspace settings. Later cells read the
# 'settings_dir' and 'pudl_sqlite_url' entries from this object.
pudl_settings = pudl.workspace.setup.get_defaults()

In [None]:
# Build a stdout handler that prints only the bare log message.
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)

# Configure the root logger at INFO level. Assigning the handler list
# (rather than appending) keeps this cell idempotent when it is re-run.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

In [None]:
# Connect to the PUDL database with testing=False.
pudl_engine = pudl.init.connect_db(pudl_settings, testing=False)
# sa.inspect() is the supported way to obtain an Inspector; the
# Inspector.from_engine() constructor is deprecated as of SQLAlchemy 1.4.
insp = sa.inspect(pudl_engine)
# extract the table names
tbls = insp.get_table_names()
# extract only the ferc tables (any table whose name contains "ferc")
ferc_tbls = [tbl for tbl in tbls if "ferc" in tbl]

To generate the package for only the FERC tables as an example, uncomment the FERC options in the cell below (and comment out the defaults above them).

In [None]:
# Default selection: package every table found in the database.
name = "pudl-test"
title = "All tables integrated into PUDL."
tbls = tbls  # no-op self-assignment; keeps the full table list built above

# Alternative selection (FERC tables only): uncomment these three lines.
#name = "pudl-ferc1"
#title =  "FERC Form 1 tables integrated into PUDL."
#tbls = ferc_tbls

In [None]:
# Top-level descriptor fields for this iteration of packaging. `name` and
# `title` come from the selection cell above; the remaining fields are fixed.
pkg_skeleton = {
    "name": name,
    "title": title,
    # Typo fixed ("metadat" -> "metadata"): this text ends up in the
    # generated package metadata.
    "description": "A full metadata description of all PUDL tables.",
    "keywords": [
        "ferc",
        "form 1",
        "energy",
        "electricity",
        "utility",
        "fuel",
        "expenses",
        "coal",
        "natural gas",
        "generation",
        "regulation"
    ]
}

This will generate metadata and CSVs for every table in PUDL. They will live in `results/data_pkgs`.

In [None]:
# Disabled example call: uncomment to invoke data_package() on the selected
# tables and package skeleton with dry_run=True.
#pkg = pudl.output.export.data_package(tbls,
#                                      pkg_skeleton,
#                                      pudl_settings, 
#                                      dry_run=True)

The following cells generate data packages based on the metadata.

In [None]:
# Read the example datapackage settings from the workspace settings directory.
# safe_load is used, so only plain YAML data types are constructed.
with open(
    pathlib.Path(pudl_settings['settings_dir']) / 'settings_datapackage_example.yml'
) as f:
    pkg_bundle_settings = yaml.safe_load(f)

In [None]:
# Name handed to the generate, flatten, and SQLite-conversion calls below.
pkg_name = 'pudl_pkg_test'

In [None]:
# Generate the data packages described by the bundle settings.
# NOTE(review): clobber=True presumably overwrites existing output, and
# debug=True presumably changes what is returned — confirm against pudl.etl_pkg.
metas = pudl.etl_pkg.generate_data_packages(
    pkg_bundle_settings,
    pudl_settings,
    pkg_name,
    debug=True,
    clobber=True,
)

Flatten the data packages and generate a SQLite database.

In [None]:
# Flatten the data packages generated under `pkg_name`; the return value is
# kept in `report` for inspection.
report = pudl.convert.flatten_datapkgs.flatten_pudl_datapackages(
    pudl_settings,
    pkg_name,
)

In [None]:
# Convert the package named `pkg_name` into a SQLite database.
# Left as the cell's last expression so any return value is displayed.
pudl.convert.datapkg_to_sqlite.pkg_to_sqlite_db(
    pudl_settings,
    pkg_name,
)

Check whether or not the resulting SQLite database contains foreign keys.

In [None]:
# Rebind pudl_engine / insp to the database at pudl_sqlite_url. These names
# previously referred to the engine returned by pudl.init.connect_db.
pudl_engine = sa.create_engine(pudl_settings["pudl_sqlite_url"])
# sa.inspect() replaces the Inspector.from_engine() constructor, which is
# deprecated as of SQLAlchemy 1.4.
insp = sa.inspect(pudl_engine)
# Displayed as the cell output; an empty list means no foreign keys were
# reflected for this table.
insp.get_foreign_keys('boiler_fuel_eia923')

- the main coordinating function is `pudl.etl_pkg.generate_data_packages()` (the function called above)
    - uses settings:
        - a yml file (e.g. `settings_init_pudl_package.yml`) and settings.py
    - validates settings using `pudl.ETL_pkg._input_validate`... this spits out a new, validated list of package settings
    - then for each of the packages defined in settings, run the ETL_pkg function which runs a data source specific function for each dataset. The ETL_pkg function needs the settings/inputs. In a stand-alone version of this function these can be validated settings or non-validated settings. The settings will get re-validated immediately inside of the data source specific ETL function. These ETL functions generate CSVs in the out_dir/(package name)/data folder.
    - an output of the ETL_pkg function is a list of the tables being generated. This list is piped through to a `test_file_consistency` function, which ensures that the ETL tables, the CSVs, and the dependencies from the metadata are consistent with one another.
    - then the `generate_metadata` function is run. at a high level, this generates and validates the `datapackage.json` file and runs `goodtables.validate`.