From f9f44e56b5466e1bed5621e2fb0c481a9248dcdb Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:32:18 -0400 Subject: [PATCH] Add a lowest order argument. (#255) --- docs/catalogs/arguments.rst | 25 ++++++++++-- src/hipscat_import/catalog/arguments.py | 10 +++++ src/hipscat_import/catalog/run_import.py | 2 + .../catalog/test_run_round_trip.py | 39 +++++++++++++++++++ 4 files changed, 72 insertions(+), 4 deletions(-) diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst index 481a0ffe..8fd0da0e 100644 --- a/docs/catalogs/arguments.rst +++ b/docs/catalogs/arguments.rst @@ -203,10 +203,27 @@ this parameter, see notebook :doc:`/notebooks/estimate_pixel_threshold` For more discussion of the "Binning" and all other stages, see :doc:`temp_files` -Alternatively, you can use the ``constant_healpix_order`` argument. This will -**ignore** both of the ``pixel_threshold`` and ``highest_healpix_order`` arguments -and the catalog will be partitioned by healpix pixels at the -``constant_healpix_order``. This can be useful for very sparse datasets. +Sparse Datasets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For sparse datasets you might want to force your catalog partitioning to avoid +partitions with very large area on the sky. + +Why? If you have sparse data that you know you will want to cross-match or join +to a catalog that is much denser, you may find yourself trying to match a large +(in terms of area on the sky) pixel to thousands of smaller pixels in the denser +catalog that occupy the same large region in the sky. Using more pixels of higher +order will have some inefficiencies in terms of on-disk storage, but will make it +easier to compute joins and cross-matches to large datasets. 
+ +There are two strategies for tweaking the partitioning: + +* **order range** - use the ``lowest_healpix_order`` argument, in addition + to the ``highest_healpix_order``. +* **constant order** - use the ``constant_healpix_order`` argument. This will + **ignore** the ``pixel_threshold``, ``highest_healpix_order``, and + ``lowest_healpix_order`` arguments and the catalog will be partitioned by + healpix pixels at the ``constant_healpix_order``. Progress Reporting ------------------------------------------------------------------------------- diff --git a/src/hipscat_import/catalog/arguments.py b/src/hipscat_import/catalog/arguments.py index 561be79d..73c4111c 100644 --- a/src/hipscat_import/catalog/arguments.py +++ b/src/hipscat_import/catalog/arguments.py @@ -56,6 +56,10 @@ class ImportArguments(RuntimeArguments): """healpix order to use when mapping. if this is a positive number, this will be the order of all final pixels and we will not combine pixels according to the threshold""" + lowest_healpix_order: int = 0 + """the lowest possible healpix order that we will use for the final + catalog partitioning. setting this higher than 0 will prevent creating + partitions with a large area on the sky.""" highest_healpix_order: int = 7 """healpix order to use when mapping. 
this will not necessarily be the order used in the final catalog, as we may combine @@ -83,11 +87,16 @@ def _check_arguments(self): """Check existence and consistency of argument values""" super()._check_arguments() + if self.lowest_healpix_order == self.highest_healpix_order: + self.constant_healpix_order = self.lowest_healpix_order if self.constant_healpix_order >= 0: check_healpix_order_range(self.constant_healpix_order, "constant_healpix_order") self.mapping_healpix_order = self.constant_healpix_order else: check_healpix_order_range(self.highest_healpix_order, "highest_healpix_order") + check_healpix_order_range( + self.lowest_healpix_order, "lowest_healpix_order", upper_bound=self.highest_healpix_order + ) if not 100 <= self.pixel_threshold <= 1_000_000_000: raise ValueError("pixel_threshold should be between 100 and 1,000,000,000") self.mapping_healpix_order = self.highest_healpix_order @@ -149,6 +158,7 @@ def additional_runtime_provenance_info(self) -> dict: "use_hipscat_index": self.use_hipscat_index, "sort_columns": self.sort_columns, "constant_healpix_order": self.constant_healpix_order, + "lowest_healpix_order": self.lowest_healpix_order, "highest_healpix_order": self.highest_healpix_order, "pixel_threshold": self.pixel_threshold, "mapping_healpix_order": self.mapping_healpix_order, diff --git a/src/hipscat_import/catalog/run_import.py b/src/hipscat_import/catalog/run_import.py index 60d6a303..d1fcb8aa 100644 --- a/src/hipscat_import/catalog/run_import.py +++ b/src/hipscat_import/catalog/run_import.py @@ -135,11 +135,13 @@ def run(args, client): alignment = pixel_math.generate_alignment( raw_histogram, highest_order=args.highest_healpix_order, + lowest_order=args.lowest_healpix_order, threshold=args.pixel_threshold, ) destination_pixel_map = pixel_math.compute_pixel_map( raw_histogram, highest_order=args.highest_healpix_order, + lowest_order=args.lowest_healpix_order, threshold=args.pixel_threshold, ) step_progress.update(1) diff --git 
a/tests/hipscat_import/catalog/test_run_round_trip.py b/tests/hipscat_import/catalog/test_run_round_trip.py index 918c2924..a3fd9b4d 100644 --- a/tests/hipscat_import/catalog/test_run_round_trip.py +++ b/tests/hipscat_import/catalog/test_run_round_trip.py @@ -313,6 +313,45 @@ def test_import_constant_healpix_order( assert np.logical_and(ids >= 700, ids < 832).all() +@pytest.mark.dask +def test_import_lowest_healpix_order( + dask_client, + small_sky_parts_dir, + tmp_path, +): + """Test basic execution. + - tests that all the final tiles are at the lowest healpix order, + and that we don't create tiles where there is no data. + """ + args = ImportArguments( + output_artifact_name="small_sky_object_catalog", + input_path=small_sky_parts_dir, + file_reader="csv", + output_path=tmp_path, + dask_tmp=tmp_path, + lowest_healpix_order=2, + highest_healpix_order=4, + progress_bar=False, + ) + + runner.run(args, dask_client) + + # Check that the catalog metadata file exists + catalog = Catalog.read_from_hipscat(args.catalog_path) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path + # Check that the partition info file exists - all pixels at order 2! + assert all(pixel.order == 2 for pixel in catalog.partition_info.get_healpix_pixels()) + + # Pick a parquet file and make sure it contains as many rows as we expect + output_file = os.path.join(args.catalog_path, "Norder=2", "Dir=0", "Npix=178.parquet") + + data_frame = pd.read_parquet(output_file, engine="pyarrow") + assert len(data_frame) == 14 + ids = data_frame["id"] + assert np.logical_and(ids >= 700, ids < 832).all() + + @pytest.mark.dask def test_import_starr_file( dask_client,