From a091e243ebe7c9304f65dbeb00dae05941a43c2f Mon Sep 17 00:00:00 2001 From: bendnorman Date: Mon, 7 Aug 2023 13:01:21 -0800 Subject: [PATCH 1/3] Enable subsetting for multi_assets --- src/pudl/etl/glue_assets.py | 15 +++++---------- src/pudl/extract/eia860.py | 13 ++++++++----- src/pudl/extract/eia923.py | 18 +++++++++++------- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/pudl/etl/glue_assets.py b/src/pudl/etl/glue_assets.py index c40c02915f..7c1ba114b4 100644 --- a/src/pudl/etl/glue_assets.py +++ b/src/pudl/etl/glue_assets.py @@ -9,19 +9,14 @@ logger = pudl.logging_helpers.get_logger(__name__) -# TODO (bendnorman): Currently loading all glue tables. Could potentially allow users -# to load subsets of the glue tables, see: https://docs.dagster.io/concepts/assets/multi-assets#subsetting-multi-assets -# Could split out different types of glue tables into different assets. For example the cross walk table could be a separate asset -# that way dagster doesn't think all glue tables depend on generators_entity_eia, boilers_entity_eia. - - @multi_asset( outs={ - table_name: AssetOut(io_manager_key="pudl_sqlite_io_manager") + table_name: AssetOut(io_manager_key="pudl_sqlite_io_manager", is_required=False) for table_name in Package.get_etl_group_tables("glue") # do not load epacamd_eia glue assets bc they are stand-alone assets below. 
if "epacamd_eia" not in table_name }, + can_subset=True, required_resource_keys={"datastore", "dataset_settings"}, ) def create_glue_tables(context): @@ -46,9 +41,9 @@ def create_glue_tables(context): # Ensure they are sorted so they match up with the asset outs glue_dfs = dict(sorted(glue_dfs.items())) - return ( - Output(output_name=table_name, value=df) for table_name, df in glue_dfs.items() - ) + for table_name, df in glue_dfs.items(): + if table_name in context.selected_output_names: + yield Output(output_name=table_name, value=df) ##################### diff --git a/src/pudl/extract/eia860.py b/src/pudl/extract/eia860.py index 4e3094e600..76261f974e 100644 --- a/src/pudl/extract/eia860.py +++ b/src/pudl/extract/eia860.py @@ -158,7 +158,11 @@ def eia860_raw_dfs() -> dict[str, pd.DataFrame]: # TODO (bendnorman): Figure out type hint for context keyword and mutli_asset return @multi_asset( - outs={table_name: AssetOut() for table_name in sorted(raw_table_names)}, + outs={ + table_name: AssetOut(is_required=False) + for table_name in sorted(raw_table_names) + }, + can_subset=True, required_resource_keys={"datastore", "dataset_settings"}, ) def extract_eia860(context, eia860_raw_dfs): @@ -189,7 +193,6 @@ def extract_eia860(context, eia860_raw_dfs): } eia860_raw_dfs = dict(sorted(eia860_raw_dfs.items())) - return ( - Output(output_name=table_name, value=df) - for table_name, df in eia860_raw_dfs.items() - ) + for table_name, df in eia860_raw_dfs.items(): + if table_name in context.selected_output_names: + yield Output(output_name=table_name, value=df) diff --git a/src/pudl/extract/eia923.py b/src/pudl/extract/eia923.py index 09610b048f..4d9393eb1d 100644 --- a/src/pudl/extract/eia923.py +++ b/src/pudl/extract/eia923.py @@ -94,7 +94,7 @@ def get_dtypes(page, **partition): # TODO (bendnorman): Add this information to the metadata -eia_raw_table_names = ( +raw_table_names = ( "raw_boiler_fuel_eia923", "raw_fuel_receipts_costs_eia923", "raw_generation_fuel_eia923", @@ 
-110,7 +110,11 @@ def get_dtypes(page, **partition): # TODO (bendnorman): Figure out type hint for context keyword and mutli_asset return @multi_asset( - outs={table_name: AssetOut() for table_name in sorted(eia_raw_table_names)}, + outs={ + table_name: AssetOut(is_required=False) + for table_name in sorted(raw_table_names) + }, + can_subset=True, required_resource_keys={"datastore", "dataset_settings"}, ) def extract_eia923(context): @@ -134,12 +138,12 @@ def extract_eia923(context): eia923_raw_dfs = dict(sorted(eia923_raw_dfs.items())) - return ( - Output(output_name=table_name, value=df) - for table_name, df in eia923_raw_dfs.items() + for table_name, df in eia923_raw_dfs.items(): # There's an issue with the EIA-923 archive for 2018 which prevents this table # from being extracted currently. When we update to a new DOI this problem will # probably fix itself. See comments on this issue: # https://github.com/catalyst-cooperative/pudl/issues/2448 - if table_name != "raw_emissions_control_eia923" - ) + if (table_name in context.selected_output_names) and ( + table_name != "raw_emissions_control_eia923" + ): + yield Output(output_name=table_name, value=df) From ee19172a2cbd7dd3bc00a88fbbe79a88310b0495 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Thu, 16 Nov 2023 17:28:48 -0900 Subject: [PATCH 2/3] Simplify materialize_asset.py script --- devtools/materialize_asset.py | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/devtools/materialize_asset.py b/devtools/materialize_asset.py index 63837d472e..63fd610055 100755 --- a/devtools/materialize_asset.py +++ b/devtools/materialize_asset.py @@ -1,13 +1,21 @@ #! /usr/bin/env python -"""Materialize one asset & its upstream deps in-process so you can debug.""" +"""Materialize one asset & its upstream deps in-process so you can debug. 
+ +If you are using the VSCode Debugger, you'll need to specify the asset_id +in the launch.json file: + +{ + ..., + "args": ["{YOUR_ASSET_ID}"], + ..., +} +""" import argparse -import importlib.resources -from dagster import AssetSelection, Definitions, define_asset_job +from dagster import materialize -from pudl import etl -from pudl.settings import EtlSettings +from pudl.etl import default_assets, default_resources, load_dataset_settings_from_file def _parse(): @@ -19,35 +27,23 @@ def _parse(): def main(asset_id): """Entry point. - Defines dagster context like in etl/__init__.py - needs to be kept in sync. + Materialize one asset & its upstream deps in-process so you can debug. - Then creates a job with asset selection. + Args: + asset_id: Name of asset you want to materialize. """ - pkg_source = importlib.resources.files("pudl.package_data.settings").joinpath( - "etl_fast.yml" - ) - with importlib.resources.as_file(pkg_source) as yf: - etl_fast_settings = EtlSettings.from_yaml(yf).datasets - - # TODO (daz/zach): maybe there's a way to do this directly with dagster cli? - defs = Definitions( - assets=etl.default_assets, - resources=etl.default_resources, - jobs=[ - define_asset_job( - name="materialize_one", - selection=AssetSelection.keys(asset_id).upstream(), - config={ - "resources": { - "dataset_settings": { - "config": etl_fast_settings.dict(), - }, - }, - }, - ), - ], + materialize( + default_assets, + selection=f"*{asset_id}", + resources=default_resources, + run_config={ + "resources": { + "dataset_settings": { + "config": load_dataset_settings_from_file("etl_fast") + } + } + }, ) - defs.get_job_def("materialize_one").execute_in_process() if __name__ == "__main__": From eec3db7ab9832970d348d6b2f8b1ffd7b94f101f Mon Sep 17 00:00:00 2001 From: bendnorman Date: Fri, 17 Nov 2023 02:31:54 +0000 Subject: [PATCH 3/3] Update conda-lock.yml and rendered conda environment files. 
--- environments/conda-linux-64.lock.yml | 2 +- environments/conda-lock.yml | 24 ++++++++++++------------ environments/conda-osx-64.lock.yml | 2 +- environments/conda-osx-arm64.lock.yml | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/environments/conda-linux-64.lock.yml b/environments/conda-linux-64.lock.yml index 5d8fbeeb3d..6ae9c5cae4 100644 --- a/environments/conda-linux-64.lock.yml +++ b/environments/conda-linux-64.lock.yml @@ -401,7 +401,7 @@ dependencies: - arrow=1.3.0=pyhd8ed1ab_0 - async-timeout=4.0.3=pyhd8ed1ab_0 - aws-c-s3=0.3.24=h7630044_0 - - botocore=1.32.1=pyhd8ed1ab_0 + - botocore=1.32.2=pyhd8ed1ab_0 - branca=0.7.0=pyhd8ed1ab_1 - croniter=2.0.1=pyhd8ed1ab_0 - cryptography=41.0.5=py311h63ff55d_0 diff --git a/environments/conda-lock.yml b/environments/conda-lock.yml index 25b88b04d3..60a7c54ead 100644 --- a/environments/conda-lock.yml +++ b/environments/conda-lock.yml @@ -5316,7 +5316,7 @@ package: category: main optional: false - name: botocore - version: 1.32.1 + version: 1.32.2 manager: conda platform: linux-64 dependencies: @@ -5324,10 +5324,10 @@ package: python: ">=3.7" python-dateutil: ">=2.1,<3.0.0" urllib3: ">=1.25.4,<1.27" - url: https://conda.anaconda.org/conda-forge/noarch/botocore-1.32.1-pyhd8ed1ab_0.conda + url: https://conda.anaconda.org/conda-forge/noarch/botocore-1.32.2-pyhd8ed1ab_0.conda hash: - md5: ed6c51f21b00b73f27d754083a03734f - sha256: 1bbfa7c5b2b0016779805c664bfba9958f5ce5916f57c1a65d173f15d4dc7471 + md5: 303d0f8f09c41c07b18b9a1112cec29b + sha256: 621ee76f9d1e741039513e94ef3e4d3442f76098d863f50474ec60d823ef11ae category: main optional: false - name: branca @@ -13213,7 +13213,7 @@ package: category: main optional: false - name: botocore - version: 1.32.1 + version: 1.32.2 manager: conda platform: osx-64 dependencies: @@ -13221,10 +13221,10 @@ package: python-dateutil: ">=2.1,<3.0.0" jmespath: ">=0.7.1,<2.0.0" urllib3: ">=1.25.4,<1.27" - url: 
https://conda.anaconda.org/conda-forge/noarch/botocore-1.32.1-pyhd8ed1ab_0.conda + url: https://conda.anaconda.org/conda-forge/noarch/botocore-1.32.2-pyhd8ed1ab_0.conda hash: - md5: ed6c51f21b00b73f27d754083a03734f - sha256: 1bbfa7c5b2b0016779805c664bfba9958f5ce5916f57c1a65d173f15d4dc7471 + md5: 303d0f8f09c41c07b18b9a1112cec29b + sha256: 621ee76f9d1e741039513e94ef3e4d3442f76098d863f50474ec60d823ef11ae category: main optional: false - name: branca @@ -21026,7 +21026,7 @@ package: category: main optional: false - name: botocore - version: 1.32.1 + version: 1.32.2 manager: conda platform: osx-arm64 dependencies: @@ -21034,10 +21034,10 @@ package: python-dateutil: ">=2.1,<3.0.0" jmespath: ">=0.7.1,<2.0.0" urllib3: ">=1.25.4,<1.27" - url: https://conda.anaconda.org/conda-forge/noarch/botocore-1.32.1-pyhd8ed1ab_0.conda + url: https://conda.anaconda.org/conda-forge/noarch/botocore-1.32.2-pyhd8ed1ab_0.conda hash: - md5: ed6c51f21b00b73f27d754083a03734f - sha256: 1bbfa7c5b2b0016779805c664bfba9958f5ce5916f57c1a65d173f15d4dc7471 + md5: 303d0f8f09c41c07b18b9a1112cec29b + sha256: 621ee76f9d1e741039513e94ef3e4d3442f76098d863f50474ec60d823ef11ae category: main optional: false - name: branca diff --git a/environments/conda-osx-64.lock.yml b/environments/conda-osx-64.lock.yml index b737fade32..f78edaa821 100644 --- a/environments/conda-osx-64.lock.yml +++ b/environments/conda-osx-64.lock.yml @@ -381,7 +381,7 @@ dependencies: - arrow=1.3.0=pyhd8ed1ab_0 - async-timeout=4.0.3=pyhd8ed1ab_0 - aws-crt-cpp=0.24.7=ha2eb20f_1 - - botocore=1.32.1=pyhd8ed1ab_0 + - botocore=1.32.2=pyhd8ed1ab_0 - branca=0.7.0=pyhd8ed1ab_1 - croniter=2.0.1=pyhd8ed1ab_0 - cryptography=41.0.5=py311hd51016d_0 diff --git a/environments/conda-osx-arm64.lock.yml b/environments/conda-osx-arm64.lock.yml index 85c3987a69..0ab33d633f 100644 --- a/environments/conda-osx-arm64.lock.yml +++ b/environments/conda-osx-arm64.lock.yml @@ -381,7 +381,7 @@ dependencies: - arrow=1.3.0=pyhd8ed1ab_0 - async-timeout=4.0.3=pyhd8ed1ab_0 
- aws-crt-cpp=0.24.7=h2da6921_1 - - botocore=1.32.1=pyhd8ed1ab_0 + - botocore=1.32.2=pyhd8ed1ab_0 - branca=0.7.0=pyhd8ed1ab_1 - croniter=2.0.1=pyhd8ed1ab_0 - cryptography=41.0.5=py311h71175c2_0