In [1]:
# Enable IPython's autoreload extension so edits to the imported project
# modules are picked up without restarting the kernel.
# NOTE(review): mode 3 is the most aggressive reload mode in recent IPython
# releases — confirm the installed IPython version supports it.
%load_ext autoreload
%autoreload 3

In [2]:
import os

import pandas as pd
from dagster import AssetKey

from pudl.etl import default_assets, defs
from pudl.helpers import get_asset_group_keys
from pudl.settings import EiaSettings
from pudl.transform.eia import EiaEntity, harvest_entity_tables

# Fail fast with an actionable message: the asset loader used further down
# loads previously materialized assets, which dagster locates via DAGSTER_HOME.
# The three literals below are concatenated by the parser with no separator,
# so each sentence boundary needs its own explicit space.
assert os.environ.get("DAGSTER_HOME"), (
    "The DAGSTER_HOME env var is not set so dagster won't be able to find the assets. "
    "Set the DAGSTER_HOME env var in this notebook or kill the jupyter server and set"
    " the DAGSTER_HOME env var in your shell and relaunch jupyter."
)

# Load pre-harvesting assets and run harvesting
* To avoid rerunning the extract and transform steps, this notebook loads the most recent pre-harvested asset values.
* **If you are debugging the code changes in the harvesting step, make sure you rematerialize the EIA extract and transform assets for the same years prior to running the following cells.**
* If you are debugging the effects of pre-harvesting code on the harvesting process, rerun the pre-harvesting assets using dagit, then rerun the following cells.
* Note that because the harvesting process mutates the pre-harvesting assets, they need to be pulled fresh each time you re-run the harvesting step.

In [3]:
%%time

_core_assets = get_asset_group_keys("_core_eia923", default_assets)
_core_assets += get_asset_group_keys("_core_eia860", default_assets)

clean_dfs = {}
with defs.get_asset_value_loader() as loader:
    clean_dfs = {
        asset: loader.load_asset_value(AssetKey(asset)) for asset in _core_assets
    }

# this Enum defines the valid values of entity
# entity = EiaEntity.UTILITIES
# entity = EiaEntity.PLANTS
# entity = EiaEntity.BOILERS
entity = EiaEntity.GENERATORS
eia_settings = EiaSettings()
entity_df, annual_df, col_dfs = harvest_entity_tables(
    entity, clean_dfs, debug=True, eia_settings=eia_settings
)

2024-01-09 17:59:53 -0800 - dagster - DEBUG - system - Loading file from: /Users/bendnorman/catalyst/dagster-pudl-work/dagster_home/storage/_core_eia923__generation_fuel using PickledObjectFilesystemIOManager...
2024-01-09 17:59:53 -0800 - dagster - DEBUG - system - Loading file from: /Users/bendnorman/catalyst/dagster-pudl-work/dagster_home/storage/_core_eia923__generation_fuel_nuclear using PickledObjectFilesystemIOManager...
2024-01-09 17:59:53 -0800 - dagster - DEBUG - system - Loading file from: /Users/bendnorman/catalyst/dagster-pudl-work/dagster_home/storage/_core_eia923__coalmine using PickledObjectFilesystemIOManager...
2024-01-09 17:59:53 -0800 - dagster - DEBUG - system - Loading file from: /Users/bendnorman/catalyst/dagster-pudl-work/dagster_home/storage/_core_eia923__boiler_fuel using PickledObjectFilesystemIOManager...
2024-01-09 17:59:53 -0800 - dagster - DEBUG - system - Loading file from: /Users/bendnorman/catalyst/dagster-pudl-work/dagster_home/storage/_core_eia923__f

CPU times: user 34.1 s, sys: 919 ms, total: 35 s
Wall time: 36.3 s


# Inspect the harvested results

## The entity (static) table

In [4]:
# First value returned by harvest_entity_tables; a bare name as the last
# expression renders the frame as the cell output.
entity_df

Unnamed: 0,plant_id_eia,generator_id,duct_burners,generator_operating_date,topping_bottoming_code,solid_fuel_gasification,pulverized_coal_tech,fluidized_bed_tech,subcritical_tech,supercritical_tech,ultrasupercritical_tech,stoker_tech,other_combustion_tech,bypass_heat_recovery,rto_iso_lmp_node_id,rto_iso_location_wholesale_reporting_id,associated_combined_heat_power,original_planned_generator_operating_date,operating_switch,previously_canceled
0,66735,783,False,NaT,,,,,,,,,,False,,,False,2023-11-01,,False
1,66730,P4318,False,2021-12-01,X,,,,,,,,,False,,,False,NaT,,
2,66729,P3101,False,2020-11-01,X,,,,,,,,,False,,,False,NaT,,
3,66728,SFBES,False,NaT,,,,,,,,,,False,,,False,2023-05-01,,False
4,66727,GBBES,False,NaT,,,,,,,,,,False,,,False,2023-05-01,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34937,1128,7,False,2012-09-01,X,False,,,,,,,,False,,,False,NaT,,
34938,645,BBST1,False,NaT,,,,,,,,,,False,,,False,2023-01-01,,False
34939,613,ST7,False,NaT,,,,,,,,,,False,,,False,2022-06-01,,False
34940,613,7GT2,False,NaT,,,,,,,,,,False,,,False,2022-06-01,,False


## The annual table

In [5]:
# Second value returned by harvest_entity_tables; unlike the entity table it
# carries a report_date column.
annual_df

Unnamed: 0,plant_id_eia,generator_id,utility_id_eia,report_date,operational_status_code,operational_status,ownership_code,capacity_mw,summer_capacity_mw,summer_capacity_estimate,winter_capacity_mw,winter_capacity_estimate,net_capacity_mwdc,energy_storage_capacity_mwh,prime_mover_code,energy_source_code_1,energy_source_code_2,energy_source_code_3,energy_source_code_4,energy_source_code_5,energy_source_code_6,energy_source_1_transport_1,energy_source_1_transport_2,energy_source_1_transport_3,energy_source_2_transport_1,energy_source_2_transport_2,energy_source_2_transport_3,fuel_type_code_pudl,multiple_fuels,deliver_power_transgrid,distributed_generation,syncronized_transmission_grid,turbines_num,planned_modifications,planned_net_summer_capacity_uprate_mw,planned_net_winter_capacity_uprate_mw,planned_uprate_date,planned_net_summer_capacity_derate_mw,planned_net_winter_capacity_derate_mw,planned_derate_date,planned_new_prime_mover_code,planned_energy_source_code_1,planned_repower_date,other_planned_modifications,other_modifications_date,planned_generator_retirement_date,carbon_capture,startup_source_code_1,startup_source_code_2,startup_source_code_3,startup_source_code_4,technology_description,turbines_inverters_hydrokinetics,time_cold_shutdown_full_load_code,planned_new_capacity_mw,cofire_fuels,switch_oil_gas,nameplate_power_factor,minimum_load_mw,uprate_derate_during_year,uprate_derate_completed_date,current_planned_generator_operating_date,summer_estimated_capability_mw,winter_estimated_capability_mw,generator_retirement_date,owned_by_non_utility,reactive_power_output_mvar,ferc_qualifying_facility,data_maturity
0,66735,783,60025,2023-01-01,V,proposed,,1.5,1.5,,1.5,,,,PV,SUN,,,,,,,,,,,,solar,,,,,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Solar Photovoltaic,,,,,,,,,NaT,NaT,,,NaT,,,,monthly_update
1,66730,P4318,64872,2023-01-01,OP,existing,,1.1,1.1,,1.1,,,,PV,SUN,,,,,,,,,,,,solar,,,,,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Solar Photovoltaic,,,,,,,,,NaT,NaT,,,NaT,,,,monthly_update
2,66729,P3101,64872,2023-01-01,OP,existing,,1.4,1.4,,1.4,,,,PV,SUN,,,,,,,,,,,,solar,,,,,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Solar Photovoltaic,,,,,,,,,NaT,NaT,,,NaT,,,,monthly_update
3,66728,SFBES,64778,2023-01-01,OP,existing,,5.0,5.0,,5.0,,,,BA,MWH,,,,,,,,,,,,other,,,,,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Batteries,,,,,,,,,NaT,NaT,,,NaT,,,,monthly_update
4,66727,GBBES,64778,2023-01-01,OP,existing,,5.0,5.0,,5.0,,,,BA,MWH,,,,,,,,,,,,other,,,,,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Batteries,,,,,,,,,NaT,NaT,,,NaT,,,,monthly_update
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95699,1,WT1,63560,2020-01-01,OA,existing,S,0.5,0.1,,0.1,,,,WT,WND,,,,,,,,,,,,wind,,,,False,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Onshore Wind Turbine,1,,,,,0.89,0.1,False,NaT,NaT,,,NaT,,,,final
95700,1,5,63560,2020-01-01,OA,existing,S,0.7,0.4,,0.3,,,,IC,DFO,,,,,,,,,,,,oil,False,,,False,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Petroleum Liquids,,10M,,,,0.80,0.3,False,NaT,NaT,,,NaT,,,,final
95701,1,3,63560,2020-01-01,OP,existing,S,0.5,0.3,,0.3,,,,IC,DFO,,,,,,,,,,,,oil,False,,,False,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Petroleum Liquids,,10M,,,,0.80,0.3,False,NaT,NaT,,,NaT,,,,final
95702,1,2,63560,2020-01-01,OP,existing,S,0.9,0.3,,0.3,,,,IC,DFO,,,,,,,,,,,,oil,False,,,False,,,,,NaT,,,NaT,,,NaT,,NaT,NaT,,,,,,Petroleum Liquids,,10M,,,,0.80,0.3,False,NaT,NaT,,,NaT,,,,final


## Use `col_dfs` to explore harvested values

In [6]:
# col_dfs is the third value returned by harvest_entity_tables; pull out the
# frame stored under the column being investigated.
# NOTE(review): presumably there is one frame per harvested column — confirm.
pmc = col_dfs["prime_mover_code"]

In [7]:
# Distinct prime mover codes present in the frame selected above.
pmc["prime_mover_code"].unique()

<StringArray>
['CT', 'GT', 'ST', 'IC', 'HY', 'WT', 'CA', 'BT', 'PV', 'CS', 'OT', 'BA', 'FC', 'CP', 'PS', 'CE', 'FW', 'WS', 'CC', 'HA']
Length: 20, dtype: string