In [None]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 3: per the IPython autoreload docs, reload all modules before executing
# code and also pick up objects newly added to modules that were imported with
# `from module import ...`.
%autoreload 3

In [None]:
import os

# Fail fast when DAGSTER_HOME is missing or empty: the cells below load
# previously materialized asset values through dagster.
# The message fragments are joined by implicit string concatenation, so each
# fragment must carry its own trailing space to keep the sentences separated.
assert os.environ.get("DAGSTER_HOME"), (
    "The DAGSTER_HOME env var is not set so dagster won't be able to find the assets. "
    "Set the DAGSTER_HOME env var in this notebook or kill the jupyter server and set "
    "the DAGSTER_HOME env var in your shell and relaunch jupyter."
)

In [None]:
import copy
import logging
import pathlib
import sys

import pandas as pd
import yaml

import pudl
from pudl.etl import *

# Load pre-harvesting assets and run harvesting
* To avoid rerunning the extract and transform steps, this notebook loads the most recent pre-harvested asset values.
* **If you are debugging the code changes in the harvesting step, make sure you rematerialize the EIA extract and transform assets for the same years prior to running the following cells.**
* If you are debugging the effects of pre-harvesting code on the harvesting process, rerun the pre-harvesting assets using dagit, then rerun the following cells.
* Note that because the harvesting process mutates the pre-harvesting assets, they need to be pulled fresh each time you re-run the harvesting step.

In [None]:
%%time
from dagster import AssetKey

from pudl.etl import default_assets, defs
from pudl.helpers import get_asset_group_keys
from pudl.settings import EiaSettings
from pudl.transform.eia import EiaEntity, debug_harvesting

# EIA settings built with their defaults; passed to the harvesting step below.
eia_settings = EiaSettings()

# Collect the asset keys belonging to the two "clean" EIA asset groups.
clean_eia923_asset_names = get_asset_group_keys("clean_eia923", default_assets)
clean_eia860_asset_names = get_asset_group_keys("clean_eia860", default_assets)
clean_eia_asset_names = clean_eia860_asset_names + clean_eia923_asset_names

# Start from an empty mapping on every run: harvesting mutates these inputs,
# so values left over from a previous execution must never be reused.
clean_dfs = {}
with defs.get_asset_value_loader() as loader:
    # Map each asset name to its most recently materialized value.
    clean_dfs = dict(
        (asset_name, loader.load_asset_value(AssetKey(asset_name)))
        for asset_name in clean_eia_asset_names
    )

# The EiaEntity Enum defines the valid values of `entity`. To debug a different
# entity, swap the member below for one of:
#   EiaEntity.UTILITIES, EiaEntity.PLANTS, EiaEntity.BOILERS
entity = EiaEntity.GENERATORS

# Run harvesting for the chosen entity against the freshly loaded clean assets.
entity_df, annual_df, col_dfs = debug_harvesting(
    entity,
    clean_dfs,
    eia_settings=eia_settings,
)

# Inspect the harvested results

## The entity (static) table

In [None]:
# Display the first result returned by debug_harvesting (the entity table).
entity_df

## The annual table

In [None]:
# Display the second result returned by debug_harvesting (the annual table).
annual_df

## Use `col_dfs` to explore harvested values

In [None]:
# Pull the entry for one column out of col_dfs, which is keyed by column name.
# NOTE(review): presumably a DataFrame of the values considered while
# harvesting `prime_mover_code` — confirm against debug_harvesting.
pmc = col_dfs["prime_mover_code"]

In [None]:
# List the distinct prime mover codes present in the harvested values.
pmc["prime_mover_code"].unique()