In [None]:
# Load IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 3 reloads all modules before each execution and also picks up objects newly
# added to those modules (per the IPython autoreload docs).
%autoreload 3

In [None]:
import os

# Fail fast with an actionable message when DAGSTER_HOME is missing or empty.
# The message fragments are adjacent string literals, so each one must carry its
# own separating space.
assert os.environ.get("DAGSTER_HOME"), (
    "The DAGSTER_HOME env var is not set so dagster won't be able to find the assets. "
    "Set the DAGSTER_HOME env var in this notebook or kill the jupyter server and set"
    " the DAGSTER_HOME env var in your shell and relaunch jupyter."
)

In [None]:
import copy
import logging
import pathlib
import sys

import pandas as pd
import yaml

import pudl
from pudl.etl import *

In [None]:
# Send every log record (DEBUG and above) from the root logger to stdout as a bare
# message, so log output shows up inline in the notebook cell output.
formatter = logging.Formatter("%(message)s")
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Replace (rather than append to) the handler list so that re-running this cell
# does not stack up duplicate handlers.
logger.handlers = [handler]

In [None]:
from pudl.helpers import convert_cols_dtypes
from pudl.metadata.classes import Package
from pudl.metadata.fields import apply_pudl_dtypes
from pudl.settings import EiaSettings
from pudl.transform.eia import (
    _add_additional_epacems_plants,
    _add_timezone,
    fillna_balancing_authority_codes_via_names,
    fix_balancing_authority_codes_with_state,
    harvesting,
)


def run_harvesting(
    entity: str,
    eia_transformed_dfs: dict[str, pd.DataFrame],
    eia_settings: "EiaSettings | None" = None,
):
    """Replicate harvested_entity asset behavior.

    See pudl.transform.eia.harvested_entity_asset_factory

    Args:
        entity: Name of the entity to harvest. It is passed to ``harvesting`` and
            used to look up the ``{entity}_entity_eia`` and ``{entity}_eia860``
            resources. ``"utilities"`` and ``"plants"`` get extra entity-specific
            handling below.
        eia_transformed_dfs: Pre-harvesting EIA dataframes keyed by table name. A
            new dictionary of dtype-converted frames is built from it, so keys in
            the caller's dictionary are never reassigned by this function.
        eia_settings: Settings object whose ``eia860.eia860m`` value is forwarded
            to ``harvesting``. When omitted, a notebook-level ``eia_settings``
            variable is used if one exists (the previous implicit behavior);
            otherwise a default ``EiaSettings()`` is created.

    Returns:
        The tuple ``(entity_df, annual_df, col_dfs)``: the entity table and annual
        table after dtype application, encoding and schema enforcement, plus the
        third value returned by ``harvesting(..., debug=True)`` unchanged.
    """
    if eia_settings is None:
        # Backward compatibility: earlier versions of this function silently read
        # a notebook-level ``eia_settings`` defined in a later cell. Honor it when
        # present, but no longer fail with a NameError when it is not.
        eia_settings = globals().get("eia_settings")
    if eia_settings is None:
        eia_settings = EiaSettings()

    converted_dfs = {
        name: convert_cols_dtypes(df, data_source="eia")
        for name, df in eia_transformed_dfs.items()
    }

    if entity == "utilities":
        # Remove location columns that are associated with plants, not utilities:
        plant_location_cols = [
            "street_address",
            "city",
            "state",
            "zip_code",
        ]
        # Only existing keys are reassigned, so the dict does not change size
        # while it is being iterated.
        for table, df in converted_dfs.items():
            if "plant_id_eia" in df.columns:
                logger.info(f"Removing {plant_location_cols} from {table} table.")
                converted_dfs[table] = df.drop(
                    columns=plant_location_cols, errors="ignore"
                )

    entity_df, annual_df, col_dfs = harvesting(
        entity,
        converted_dfs,
        debug=True,
        eia860m=eia_settings.eia860.eia860m,
    )

    # Apply standard PUDL data types to the new entity tables:
    pkg = Package.from_resource_ids()
    entity_res = pkg.get_resource(f"{entity}_entity_eia")
    entity_df = apply_pudl_dtypes(entity_df, group="eia").pipe(entity_res.encode)
    annual_res = pkg.get_resource(f"{entity}_eia860")
    annual_df = apply_pudl_dtypes(annual_df, group="eia").pipe(annual_res.encode)

    if entity == "plants":
        # Post-processing specific to the plants entity tables
        entity_df = _add_additional_epacems_plants(entity_df).pipe(_add_timezone)
        annual_df = fillna_balancing_authority_codes_via_names(annual_df).pipe(
            fix_balancing_authority_codes_with_state, plants_entity=entity_df
        )

    entity_df = entity_res.enforce_schema(entity_df)
    annual_df = annual_res.enforce_schema(annual_df)

    return entity_df, annual_df, col_dfs

# Load pre-harvesting assets and run harvesting
* To avoid rerunning the extract and transform steps, this notebook loads the most recent pre-harvesting asset values.
* **If you are debugging the code changes in the harvesting step, make sure you rematerialize the EIA extract and transform assets for the same years prior to running the following cells.**
* If you are debugging the effects of pre-harvesting code on the harvesting process, rerun the pre-harvesting assets using dagit, then rerun the following cells.
* Note that because the harvesting process mutates the pre-harvesting assets, they need to be pulled fresh each time you re-run the harvesting step.

In [None]:
%%time
from dagster import AssetKey

from pudl.etl import defs
from pudl.helpers import convert_cols_dtypes, get_asset_group_keys

# Default EIA settings; run_harvesting reads eia_settings.eia860.eia860m from this
# notebook-level variable.
eia_settings = EiaSettings()

# Names of the assets in the clean EIA-860 and EIA-923 asset groups: these are the
# pre-harvesting inputs.
clean_eia923_asset_names = get_asset_group_keys("clean_eia923", default_assets)
clean_eia860_asset_names = get_asset_group_keys("clean_eia860", default_assets)
clean_eia_asset_names = clean_eia860_asset_names + clean_eia923_asset_names

# Load the most recently materialized value of each pre-harvesting asset. This is
# done in the same cell as the harvesting call so that every run starts from
# freshly loaded inputs.
with defs.get_asset_value_loader() as loader:
    eia_transformed_dfs = {
        asset_name: loader.load_asset_value(AssetKey(asset_name))
        for asset_name in clean_eia_asset_names
    }

# Entity must be one of: "plants", "utilities", "boilers", or "generators"
entity = "generators"
entity_df, annual_df, col_dfs = run_harvesting(entity, eia_transformed_dfs)

# Inspect the harvested results

## The entity (static) table

In [None]:
# Display the entity table returned by run_harvesting for the chosen entity.
entity_df

## The annual table

In [None]:
# Display the annual table returned by run_harvesting for the chosen entity.
annual_df

## Use `col_dfs` to explore harvested values

In [None]:
# col_dfs is indexed by column name; pull out the debug output collected for the
# prime_mover_code column (presumably a dataframe -- confirm against `harvesting`).
pmc = col_dfs["prime_mover_code"]

In [None]:
# List the distinct prime_mover_code values present in the harvesting debug output.
pmc.prime_mover_code.unique()