In [None]:
# Enable IPython's autoreload extension in mode 2, so edits made to imported
# modules on disk are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pudl
import pathlib
import yaml
from pudl.etl import *
import logging
import sys
import copy

In [None]:
# Route all log records (DEBUG and above) from the root logger to stdout,
# printing only the message text so the notebook output stays readable.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Overwrite the handler list rather than appending to it, so that re-running
# this cell leaves exactly this one handler installed by the notebook.
logger.handlers = [handler]

# Setup

In [None]:
# Read the workspace defaults, then parse the ETL settings file that lives in
# the workspace's settings directory.
pudl_settings = pudl.workspace.setup.get_defaults()
settings_file_name = 'etl_full.yml'
etl_settings = EtlSettings.from_yaml(
    pathlib.Path(pudl_settings['settings_dir']) / settings_file_name
)

# Drill down from the full ETL settings to the EIA-specific settings object
# that the extract/transform cells consume.
validated_etl_settings = etl_settings.datasets
datasets = validated_etl_settings.get_datasets()
eia_settings = datasets["eia"]

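You can skip the settings step above and set these years/tables yourself here without using the settings files. Just know that they are not validated below, so they could be wrong and cause a failure after some time. Note that the cells below, as written, still read from `eia_settings`, so you will need to adapt them if you set the values manually. It is HIGHLY RECOMMENDED that you use all the years/tables.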

In [None]:
# Pull the per-form tables/years (and the EIA-860M flag) out of the EIA
# settings. In the cells visible in this notebook only `eia860m` is read again
# later; the other names are kept for manual inspection or overriding.
eia860_tables, eia860_years, eia860m = (
    eia_settings.eia860.tables,
    eia_settings.eia860.years,
    eia_settings.eia860.eia860m,
)
eia923_tables, eia923_years = (
    eia_settings.eia923.tables,
    eia_settings.eia923.years,
)

# Datastore handed to every extractor below.
# NOTE(review): constructed with default arguments only -- confirm whether a
# local cache path should be passed so the raw inputs are not re-fetched.
ds = Datastore()

# Run extract step & phase 1 transform step
This is pulled from `pudl.etl._etl_eia()`.

In [None]:

# --- Extract ---------------------------------------------------------------
# Raw dataframes for EIA-923 and EIA-860; each extractor is built on the
# shared datastore `ds` and driven by its own form's settings.
eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(
    settings=eia_settings.eia923,
)
eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(
    settings=eia_settings.eia860,
)

# When the EIA-860M (year-to-date) flag is set, extract that data as well and
# append it onto the raw EIA-860 dataframes.
if eia860m:
    eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(
        settings=eia_settings.eia860,
    )
    eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(
        eia860_raw_dfs=eia860_raw_dfs,
        eia860m_raw_dfs=eia860m_raw_dfs,
    )

# --- Transform -------------------------------------------------------------
# Form-specific transforms for EIA-860 and EIA-923, run before any harvesting.
eia860_transformed_dfs = pudl.transform.eia860.transform(
    eia860_raw_dfs,
    eia860_settings=eia_settings.eia860,
)
eia923_transformed_dfs = pudl.transform.eia923.transform(
    eia923_raw_dfs,
    eia923_settings=eia_settings.eia923,
)

You have to re-run this cell every time you want to re-run the harvesting cell below (because `pudl.transform.eia.harvesting` removes columns from the dfs). This cell enables you to start with a fresh `eia_transformed_dfs` without needing to re-run the 860/923 transforms.

In [None]:
# Combine the EIA-860 and EIA-923 transformed tables into one new dictionary.
# On a name clash the EIA-923 entry wins, and neither source dict is modified.
eia_transformed_dfs = {**eia860_transformed_dfs, **eia923_transformed_dfs}

# Final cleanup: pass every table through the EIA dtype conversion.
eia_transformed_dfs = {
    table_name: convert_cols_dtypes(table_df, data_source="eia")
    for table_name, table_df in eia_transformed_dfs.items()
}

# Run harvest w/ debug=True

In [None]:
# Entity whose harvesting we want to investigate -- generators in this case.
entity = 'generators'

# Start from an empty entity-table dictionary for the harvest to fill in.
entities_dfs = {}

# debug=True makes harvesting return a third value, `col_dfs`, which the cells
# below index by column name. Note that `eia_transformed_dfs` is rebound here,
# so re-create it with the previous cell before running this one again.
entities_dfs, eia_transformed_dfs, col_dfs = pudl.transform.eia.harvesting(
    entity,
    eia_transformed_dfs,
    entities_dfs,
    debug=True,
)

# Use `col_dfs` to explore harvested values

In [None]:
# Grab the debug output that harvesting produced for the `prime_mover_code`
# column (presumably a DataFrame of candidate values -- confirm against
# `pudl.transform.eia.harvesting`).
pmc = col_dfs['prime_mover_code']

In [None]:
# Distinct prime mover codes present in the harvesting debug output
# (assumes `pmc` is a DataFrame with a `prime_mover_code` column).
pmc['prime_mover_code'].unique()