In [None]:
# Enable IPython's autoreload extension for this kernel.
%load_ext autoreload
# Mode 2 picks up edits to imported modules live, so code changes made while
# debugging are visible without restarting the kernel.
%autoreload 2

In [None]:
import os

# Fail fast with an actionable message when DAGSTER_HOME is missing or empty.
# The message fragments are joined by implicit string concatenation, so each
# continuation line must carry its own leading space.
assert os.environ.get("DAGSTER_HOME"), (
    "The DAGSTER_HOME env var is not set so dagster won't be able to find the assets."
    " Set the DAGSTER_HOME env var in this notebook or kill the jupyter server and set"
    " the DAGSTER_HOME env var in your shell and relaunch jupyter."
)

In [None]:
import copy
import logging
import pathlib
import sys

import yaml

import pudl
from pudl.etl import *

In [None]:
# Build a single stdout handler that prints the bare log message only.
formatter = logging.Formatter("%(message)s")
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Make that handler the root logger's only handler and log everything from
# DEBUG upwards.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.DEBUG)

## Load pre harvesting assets
To avoid rerunning the extract and transform steps, this notebook loads the most recent pre-harvesting asset values. **If you are debugging code changes in the harvesting step, make sure you rematerialize the EIA extract and transform assets for the same years prior to running the following cells.** If you are debugging the effect of pre-harvesting asset code changes on the harvesting process, rerun the pre-harvesting assets using dagit, then rerun the following cells.

In [None]:
from dagster import AssetKey

from pudl.etl import defs
from pudl.helpers import convert_cols_dtypes, get_asset_group_keys

# Gather the asset keys of both cleaned EIA asset groups (860 first, then 923).
clean_eia923_asset_names = get_asset_group_keys("clean_eia923", default_assets)
clean_eia860_asset_names = get_asset_group_keys("clean_eia860", default_assets)
clean_eia_asset_names = clean_eia860_asset_names + clean_eia923_asset_names

# Prefix stripped from the asset names to obtain the table names.
CLEAN_PREFIX = "clean_"

# Load each previously materialized asset, assign the appropriate EIA dtypes,
# and key the result by table name. Everything is built in one pass so the
# dict is never left in a half-processed state under its final name.
with defs.get_asset_value_loader() as loader:
    eia_transformed_dfs = {
        (
            # Strip only a leading "clean_"; other occurrences are left intact.
            asset_name[len(CLEAN_PREFIX):]
            if asset_name.startswith(CLEAN_PREFIX)
            else asset_name
        ): convert_cols_dtypes(
            loader.load_asset_value(AssetKey(asset_name)), data_source="eia"
        )
        for asset_name in clean_eia_asset_names
    }

# Run harvest w/ debug=True

In [None]:
%%time
# Entity whose harvesting we want to investigate; in this case the generators.
entity = "generators"
# Start from an empty dict of entity tables that is handed to the harvesting call.
entities_dfs = {}
# With debug=True the call is unpacked into three values; the third, `col_dfs`,
# is indexed by column name further down. Presumably it holds per-column
# harvesting details -- confirm against pudl.transform.eia.harvesting.
# NOTE(review): `eia_transformed_dfs` is both an input and rebound from the
# result, so re-running this cell passes in the previous run's output. Re-run
# the asset-loading cell first if a clean starting state is needed.
entities_dfs, eia_transformed_dfs, col_dfs = pudl.transform.eia.harvesting(
    entity, eia_transformed_dfs, entities_dfs, debug=True
)

# Use `col_dfs` to explore harvested values

In [None]:
# Select the debug output for the `prime_mover_code` column (presumably a
# DataFrame; the next cell reads a `prime_mover_code` attribute from it).
pmc = col_dfs["prime_mover_code"]

In [None]:
# Show the distinct prime mover codes found in the debug output.
pmc["prime_mover_code"].unique()