Add a lowest order argument. (#255)
delucchi-cmu committed Mar 12, 2024
1 parent 97562d0 commit f9f44e5
Showing 4 changed files with 72 additions and 4 deletions.
25 changes: 21 additions & 4 deletions docs/catalogs/arguments.rst
@@ -203,10 +203,27 @@ this parameter, see notebook :doc:`/notebooks/estimate_pixel_threshold`

For more discussion of the "Binning" and all other stages, see :doc:`temp_files`

Alternatively, you can use the ``constant_healpix_order`` argument. This will
**ignore** both of the ``pixel_threshold`` and ``highest_healpix_order`` arguments
and the catalog will be partitioned by healpix pixels at the
``constant_healpix_order``. This can be useful for very sparse datasets.
Sparse Datasets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For sparse datasets you might want to force your catalog partitioning to avoid
partitions that cover a very large area on the sky.

Why? If you have sparse data that you know you will want to cross-match or join
to a much denser catalog, you may find yourself matching a single pixel that is
large (in terms of area on the sky) against thousands of smaller pixels in the
denser catalog that occupy the same region of the sky. Using more pixels of
higher order is somewhat less efficient for on-disk storage, but makes joins and
cross-matches to large datasets easier to compute.

There are two strategies for tweaking the partitioning:

* **order range** - use the ``lowest_healpix_order`` argument, in addition
to the ``highest_healpix_order``.
* **constant order** - use the ``constant_healpix_order`` argument. This will
**ignore** the ``pixel_threshold``, ``highest_healpix_order``, and
``lowest_healpix_order`` arguments and the catalog will be partitioned by
healpix pixels at the ``constant_healpix_order``.
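
As a rough sketch, both strategies are expressed through ``ImportArguments``
(the catalog name and paths below are placeholders):

.. code-block:: python

    from hipscat_import.catalog.arguments import ImportArguments

    # Order range: pixels are still combined according to pixel_threshold,
    # but no destination pixel will be coarser than order 2 or finer than order 7.
    args = ImportArguments(
        output_artifact_name="my_sparse_catalog",
        input_path="/path/to/input",
        file_reader="csv",
        output_path="/path/to/output",
        lowest_healpix_order=2,
        highest_healpix_order=7,
    )

    # Constant order: every destination pixel is exactly order 4, and the
    # pixel_threshold and order range arguments are ignored.
    args = ImportArguments(
        output_artifact_name="my_sparse_catalog",
        input_path="/path/to/input",
        file_reader="csv",
        output_path="/path/to/output",
        constant_healpix_order=4,
    )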

Progress Reporting
-------------------------------------------------------------------------------
10 changes: 10 additions & 0 deletions src/hipscat_import/catalog/arguments.py
@@ -56,6 +56,10 @@ class ImportArguments(RuntimeArguments):
"""healpix order to use when mapping. if this is
a positive number, this will be the order of all final pixels and we
will not combine pixels according to the threshold"""
lowest_healpix_order: int = 0
"""the lowest possible healpix order that we will use for the final
catalog partitioning. setting this higher than 0 will prevent creating
partitions with a large area on the sky."""
highest_healpix_order: int = 7
"""healpix order to use when mapping. this will
not necessarily be the order used in the final catalog, as we may combine
@@ -83,11 +87,16 @@ def _check_arguments(self):
"""Check existence and consistency of argument values"""
super()._check_arguments()

if self.lowest_healpix_order == self.highest_healpix_order:
self.constant_healpix_order = self.lowest_healpix_order
if self.constant_healpix_order >= 0:
check_healpix_order_range(self.constant_healpix_order, "constant_healpix_order")
self.mapping_healpix_order = self.constant_healpix_order
else:
check_healpix_order_range(self.highest_healpix_order, "highest_healpix_order")
check_healpix_order_range(
self.lowest_healpix_order, "lowest_healpix_order", upper_bound=self.highest_healpix_order
)
if not 100 <= self.pixel_threshold <= 1_000_000_000:
raise ValueError("pixel_threshold should be between 100 and 1,000,000,000")
self.mapping_healpix_order = self.highest_healpix_order
@@ -149,6 +158,7 @@ def additional_runtime_provenance_info(self) -> dict:
"use_hipscat_index": self.use_hipscat_index,
"sort_columns": self.sort_columns,
"constant_healpix_order": self.constant_healpix_order,
"lowest_healpix_order": self.lowest_healpix_order,
"highest_healpix_order": self.highest_healpix_order,
"pixel_threshold": self.pixel_threshold,
"mapping_healpix_order": self.mapping_healpix_order,
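
One consequence of the check above, sketched below with placeholder paths (a real
run would need an existing input directory): when ``lowest_healpix_order`` equals
``highest_healpix_order``, the arguments collapse to a constant order and
``pixel_threshold`` is effectively ignored.

args = ImportArguments(
    output_artifact_name="my_catalog",     # placeholder
    input_path="/path/to/input",           # placeholder
    file_reader="csv",
    output_path="/path/to/output",         # placeholder
    lowest_healpix_order=5,
    highest_healpix_order=5,
)
# Equal bounds behave like constant_healpix_order=5, so mapping happens at order 5.
assert args.constant_healpix_order == 5
assert args.mapping_healpix_order == 5
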
2 changes: 2 additions & 0 deletions src/hipscat_import/catalog/run_import.py
@@ -135,11 +135,13 @@ def run(args, client):
alignment = pixel_math.generate_alignment(
raw_histogram,
highest_order=args.highest_healpix_order,
lowest_order=args.lowest_healpix_order,
threshold=args.pixel_threshold,
)
destination_pixel_map = pixel_math.compute_pixel_map(
raw_histogram,
highest_order=args.highest_healpix_order,
lowest_order=args.lowest_healpix_order,
threshold=args.pixel_threshold,
)
step_progress.update(1)
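
For context, a rough sketch of how the new ``lowest_order`` keyword flows into the
alignment step; the import path and histogram contents below are assumptions for
illustration only.

import numpy as np
from hipscat import pixel_math  # assumed import path

# Hypothetical mapping histogram: one bin per order-4 pixel (12 * 4**4 bins),
# with a handful of objects concentrated in a small patch of sky.
raw_histogram = np.zeros(12 * 4**4, dtype=np.int64)
raw_histogram[2848:2860] = 10

# With lowest_order=2, sparsely populated pixels are still combined up to the
# threshold, but never into anything coarser than an order-2 pixel.
alignment = pixel_math.generate_alignment(
    raw_histogram,
    highest_order=4,
    lowest_order=2,
    threshold=1_000,
)
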
39 changes: 39 additions & 0 deletions tests/hipscat_import/catalog/test_run_round_trip.py
@@ -313,6 +313,45 @@ def test_import_constant_healpix_order(
assert np.logical_and(ids >= 700, ids < 832).all()


@pytest.mark.dask
def test_import_lowest_healpix_order(
dask_client,
small_sky_parts_dir,
tmp_path,
):
"""Test basic execution.
- tests that all the final tiles are at the lowest healpix order,
and that we don't create tiles where there is no data.
"""
args = ImportArguments(
output_artifact_name="small_sky_object_catalog",
input_path=small_sky_parts_dir,
file_reader="csv",
output_path=tmp_path,
dask_tmp=tmp_path,
lowest_healpix_order=2,
highest_healpix_order=4,
progress_bar=False,
)

runner.run(args, dask_client)

# Check that the catalog metadata file exists
catalog = Catalog.read_from_hipscat(args.catalog_path)
assert catalog.on_disk
assert catalog.catalog_path == args.catalog_path
# Check that the partition info file exists - all pixels at order 2!
assert all(pixel.order == 2 for pixel in catalog.partition_info.get_healpix_pixels())

# Pick a parquet file and make sure it contains as many rows as we expect
output_file = os.path.join(args.catalog_path, "Norder=2", "Dir=0", "Npix=178.parquet")

data_frame = pd.read_parquet(output_file, engine="pyarrow")
assert len(data_frame) == 14
ids = data_frame["id"]
assert np.logical_and(ids >= 700, ids < 832).all()


@pytest.mark.dask
def test_import_starr_file(
dask_client,
