Skip to content

Commit

Permalink
Verification pipeline boilerplate (#126)
Browse files Browse the repository at this point in the history
* Verification pipeline boilerplate

* Input catalog options.

* Contribution docs updates.

* Merge recent changes (#135)

* Use minimum stage name formatting.

* Run copier.

* Add a tad more context for failure email.

* Pin pandas version

* Remove benchmarks for now.

* unpin sphinx versions (#134)

---------

Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>

---------

Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>
  • Loading branch information
delucchi-cmu and maxwest-uw committed Oct 2, 2023
1 parent c6481b9 commit c6f7166
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 0 deletions.
10 changes: 10 additions & 0 deletions docs/guide/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ Most folks use conda for virtual environments. You may want to as well.
Testing
-------------------------------------------------------------------------------

We use ``pytest`` as our preferred unit test runner engine, in keeping with
the LSST DM style guide. We make heavy use of
`pytest fixtures <https://docs.pytest.org/en/7.1.x/explanation/fixtures.html#about-fixtures>`_,
which set up various resources used for unit testing, or provide consistent
paths. These are defined in ``conftest.py`` files. They're powerful and flexible
(and fun in their own way), and we encourage contributors to familiarize themselves with them.

Please add or update unit tests for all changes made to the codebase. You can run
unit tests locally simply with:

Expand All @@ -81,6 +88,9 @@ Create your PR

Please use PR best practices, and get someone to review your code.

The LINCC Frameworks guidelines and philosophy on code reviews can be found on
`our wiki <https://github.com/lincc-frameworks/docs/wiki/Design-and-Code-Review-Policy>`_.

We have a suite of continuous integration tests that run on PR creation. Please
follow the recommendations of the linter.

Expand Down
4 changes: 4 additions & 0 deletions src/hipscat_import/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
import hipscat_import.index.run_index as index_runner
import hipscat_import.margin_cache.margin_cache as margin_runner
import hipscat_import.soap.run_soap as soap_runner
import hipscat_import.verification.run_verification as verification_runner
from hipscat_import.catalog.arguments import ImportArguments
from hipscat_import.index.arguments import IndexArguments
from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments
from hipscat_import.runtime_arguments import RuntimeArguments
from hipscat_import.soap.arguments import SoapArguments
from hipscat_import.verification.arguments import VerificationArguments

# pragma: no cover

Expand Down Expand Up @@ -45,6 +47,8 @@ def pipeline_with_client(args: RuntimeArguments, client: Client):
margin_runner.generate_margin_cache(args, client)
elif isinstance(args, SoapArguments):
soap_runner.run(args, client)
elif isinstance(args, VerificationArguments):
verification_runner.run(args)
else:
raise ValueError("unknown args type")
except Exception as exception: # pylint: disable=broad-exception-caught
Expand Down
Empty file.
44 changes: 44 additions & 0 deletions src/hipscat_import/verification/arguments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Utility to hold all arguments required throughout verification pipeline"""

from dataclasses import dataclass, field
from typing import List, Optional

from hipscat.catalog import Catalog

from hipscat_import.runtime_arguments import RuntimeArguments


@dataclass
class VerificationArguments(RuntimeArguments):
    """Data class for holding verification arguments.

    At least one of ``input_catalog_path`` or ``input_catalog`` must be
    provided; whichever one is missing is filled in from the other while the
    arguments are checked.
    """

    ## Input
    input_catalog_path: str = ""
    """Path to an existing catalog that will be inspected."""
    input_catalog: Optional[Catalog] = None
    """In-memory representation of a catalog. If not provided, it will be loaded
    from the input_catalog_path."""

    ## Verification options
    field_distribution_cols: List[str] = field(default_factory=list)
    """List of fields to get the overall distribution for. e.g. ["ra", "dec"].
    Should be valid columns in the parquet files."""

    def __post_init__(self):
        self._check_arguments()

    def _check_arguments(self):
        """Validate the arguments and derive whichever catalog input was omitted.

        Runs the parent class checks first, then makes sure both
        ``input_catalog`` and ``input_catalog_path`` end up populated.

        Raises:
            ValueError: if neither ``input_catalog_path`` nor ``input_catalog``
                was provided.
        """
        super()._check_arguments()

        # Test the catalog against None rather than by truthiness: an object that
        # was passed in counts as "provided" even if its type evaluates as falsy
        # (for example by defining __len__ or __bool__).
        catalog_missing = self.input_catalog is None

        if not self.input_catalog_path and catalog_missing:
            raise ValueError("input catalog is required (either input_catalog_path or input_catalog)")
        if catalog_missing:
            self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path)
        if not self.input_catalog_path:
            self.input_catalog_path = self.input_catalog.catalog_path

    def additional_runtime_provenance_info(self) -> dict:
        """Return the verification-specific values to record as provenance."""
        return {
            "pipeline": "verification pipeline",
            "input_catalog_path": str(self.input_catalog_path),
            "field_distribution_cols": self.field_distribution_cols,
        }
14 changes: 14 additions & 0 deletions src/hipscat_import/verification/run_verification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Run pass/fail checks and generate verification report of existing hipscat table."""

from hipscat_import.verification.arguments import VerificationArguments


def run(args):
    """Run verification pipeline.

    Args:
        args: the pipeline arguments; expected to be a VerificationArguments.

    Raises:
        TypeError: if ``args`` is missing/falsy, or is not a
            VerificationArguments instance.
        NotImplementedError: always, for well-formed arguments, because the
            pipeline body has not been written yet.
    """
    # Falsy values (None, empty containers, ...) are reported as "missing".
    if not args:
        raise TypeError("args is required and should be type VerificationArguments")

    if isinstance(args, VerificationArguments):
        # implement everything else.
        raise NotImplementedError("Verification not yet implemented.")

    raise TypeError("args must be type VerificationArguments")
25 changes: 25 additions & 0 deletions tests/hipscat_import/verification/test_run_verification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pytest

from hipscat_import.verification.arguments import VerificationArguments
import hipscat_import.verification.run_verification as runner


def test_bad_args():
    """The runner rejects a missing argument and an argument of the wrong type."""
    # First a missing argument, then a plain dict standing in for the arguments object.
    rejected_inputs = (None, {"output_catalog_name": "bad_arg_type"})

    for bad_input in rejected_inputs:
        with pytest.raises(TypeError, match="VerificationArguments"):
            runner.run(bad_input)


def test_no_implementation(tmp_path, small_sky_object_catalog):
    """Well-formed arguments reach the (not yet written) pipeline body, which raises."""
    valid_kwargs = {
        "input_catalog_path": small_sky_object_catalog,
        "output_path": tmp_path,
        "output_catalog_name": "small_sky_object_verification_report",
    }
    args = VerificationArguments(**valid_kwargs)

    with pytest.raises(NotImplementedError, match="not yet implemented"):
        runner.run(args)
80 changes: 80 additions & 0 deletions tests/hipscat_import/verification/test_verification_arguments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Tests of argument validation"""


import pytest
from hipscat.catalog import Catalog

from hipscat_import.verification.arguments import VerificationArguments


def test_none():
    """Constructing the arguments with nothing supplied fails validation."""
    # Every field is left at its default, so a required value is missing.
    with pytest.raises(ValueError):
        VerificationArguments()


def test_empty_required(tmp_path):
    """Output arguments are provided, but no input catalog (path or object) is."""
    output_only_kwargs = {
        "output_path": tmp_path,
        "output_catalog_name": "small_sky_object_verification_report",
    }

    ## The error message should point at the missing input.
    with pytest.raises(ValueError, match="input_catalog_path"):
        VerificationArguments(**output_only_kwargs)


def test_invalid_paths(tmp_path, small_sky_object_catalog):
    """Required arguments are provided, but the input catalog path isn't found."""
    ## Prove that it works with required args
    VerificationArguments(
        input_catalog_path=small_sky_object_catalog,
        output_path=tmp_path,
        output_catalog_name="small_sky_object_verification_report",
    )

    ## Bad input path. The output stays under tmp_path so that the input path is
    ## the only questionable argument and nothing is resolved against the current
    ## working directory. It gets its own report name so it cannot clash with the
    ## output location used by the successful construction above.
    with pytest.raises(FileNotFoundError):
        VerificationArguments(
            input_catalog_path="path",
            output_path=tmp_path,
            output_catalog_name="bad_input_verification_report",
        )


def test_good_paths(tmp_path, small_sky_object_catalog):
    """Valid input and output paths are accepted and reflected on the arguments."""
    args = VerificationArguments(
        input_catalog_path=small_sky_object_catalog,
        output_path=tmp_path,
        output_catalog_name="small_sky_object_verification_report",
    )

    expected_output_root = str(tmp_path)
    assert args.input_catalog_path == small_sky_object_catalog
    assert str(args.output_path) == expected_output_root
    # The working directory for intermediate files lives under the output path.
    assert str(args.tmp_path).startswith(expected_output_root)


def test_catalog_object(tmp_path, small_sky_object_catalog):
    """An in-memory catalog is accepted in place of a path, and the path is derived from it."""
    loaded_catalog = Catalog.read_from_hipscat(catalog_path=small_sky_object_catalog)

    args = VerificationArguments(
        input_catalog=loaded_catalog,
        output_path=tmp_path,
        output_catalog_name="small_sky_object_verification_report",
    )

    expected_output_root = str(tmp_path)
    # No input_catalog_path was passed; it should be filled in from the catalog object.
    assert args.input_catalog_path == small_sky_object_catalog
    assert str(args.output_path) == expected_output_root
    assert str(args.tmp_path).startswith(expected_output_root)


def test_provenance_info(small_sky_object_catalog, tmp_path):
    """Provenance output carries the verification-specific runtime arguments."""
    provenance = VerificationArguments(
        input_catalog_path=small_sky_object_catalog,
        output_path=tmp_path,
        output_catalog_name="small_sky_object_verification_report",
    ).provenance_info()

    recorded_args = provenance["runtime_args"]
    assert "input_catalog_path" in recorded_args

0 comments on commit c6f7166

Please sign in to comment.