Verification pipeline boilerplate (#126)

* Verification pipeline boilerplate
* Input catalog options.
* Contribution docs updates.
* Merge recent changes (#135)
* Use minimum stage name formatting.
* Run copier.
* Add a tad more context for failure email.
* Pin pandas version
* Remove benchmarks for now.
* unpin sphinx versions (#134)

---------

Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>

---------

Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>
astronomy-commons · Oct 2, 2023 · c6f7166 · c6f7166
1 parent c6481b9
commit c6f7166
Show file tree

Hide file tree

Showing 7 changed files with 177 additions and 0 deletions.
diff --git a/docs/guide/contributing.rst b/docs/guide/contributing.rst
@@ -61,6 +61,13 @@ Most folks use conda for virtual environments. You may want to as well.
 Testing
 -------------------------------------------------------------------------------
 
+We use ``pytest`` as our preferred unit test runner, in keeping with the
+LSST DM style guide. We make heavy use of
+`pytest fixtures <https://docs.pytest.org/en/7.1.x/explanation/fixtures.html#about-fixtures>`_,
+which set up various resources used for unit testing or provide consistent
+paths. These are defined in ``conftest.py`` files. They're powerful and flexible
+(and fun in their own way), and we encourage contributors to familiarize themselves with them.
+
 Please add or update unit tests for all changes made to the codebase. You can run
 unit tests locally simply with:
 
@@ -81,6 +88,9 @@ Create your PR
 
 Please use PR best practices, and get someone to review your code.
 
+The LINCC Frameworks guidelines and philosophy on code reviews can be found on 
+`our wiki <https://github.com/lincc-frameworks/docs/wiki/Design-and-Code-Review-Policy>`_.
+
 We have a suite of continuous integration tests that run on PR creation. Please
 follow the recommendations of the linter.
 

diff --git a/src/hipscat_import/pipeline.py b/src/hipscat_import/pipeline.py
@@ -8,11 +8,13 @@
 import hipscat_import.index.run_index as index_runner
 import hipscat_import.margin_cache.margin_cache as margin_runner
 import hipscat_import.soap.run_soap as soap_runner
+import hipscat_import.verification.run_verification as verification_runner
 from hipscat_import.catalog.arguments import ImportArguments
 from hipscat_import.index.arguments import IndexArguments
 from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments
 from hipscat_import.runtime_arguments import RuntimeArguments
 from hipscat_import.soap.arguments import SoapArguments
+from hipscat_import.verification.arguments import VerificationArguments
 
 # pragma: no cover
 
@@ -45,6 +47,8 @@ def pipeline_with_client(args: RuntimeArguments, client: Client):
             margin_runner.generate_margin_cache(args, client)
         elif isinstance(args, SoapArguments):
             soap_runner.run(args, client)
+        elif isinstance(args, VerificationArguments):
+            verification_runner.run(args)
         else:
             raise ValueError("unknown args type")
     except Exception as exception:  # pylint: disable=broad-exception-caught

diff --git a/src/hipscat_import/verification/__init__.py b/src/hipscat_import/verification/__init__.py
diff --git a/src/hipscat_import/verification/arguments.py b/src/hipscat_import/verification/arguments.py
@@ -0,0 +1,44 @@
+"""Utility to hold all arguments required throughout verification pipeline"""
+
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+from hipscat.catalog import Catalog
+
+from hipscat_import.runtime_arguments import RuntimeArguments
+
+
+@dataclass
+class VerificationArguments(RuntimeArguments):
+    """Data class for holding verification arguments"""
+
+    # The catalog to verify may be supplied as a path, as an already-loaded
+    # ``Catalog`` object, or both; ``_check_arguments`` fills in whichever of
+    # the two is missing.
+    ## Input
+    input_catalog_path: str = ""
+    """Path to an existing catalog that will be inspected."""
+    input_catalog: Optional[Catalog] = None
+    """In-memory representation of a catalog. If not provided, it will be loaded
+    from the input_catalog_path."""
+
+    ## Verification options
+    field_distribution_cols: List[str] = field(default_factory=list)
+    """List of fields to get the overall distribution for. e.g. ["ra", "dec"].
+    Should be valid columns in the parquet files."""
+
+    def __post_init__(self):
+        # Validate (and normalize) the arguments as soon as the dataclass is built.
+        self._check_arguments()
+
+    def _check_arguments(self):
+        # Run the parent class's checks before the verification-specific ones.
+        super()._check_arguments()
+        # At least one of the two input forms must be supplied.
+        if not self.input_catalog_path and not self.input_catalog:
+            raise ValueError("input catalog is required (either input_catalog_path or input_catalog)")
+        # Only a path was given: load the catalog from that path.
+        # NOTE(review): this is a truthiness test rather than ``is None`` --
+        # confirm a valid Catalog instance can never evaluate as falsy.
+        if not self.input_catalog:
+            self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path)
+        # Only a catalog object was given: record the path that object reports.
+        if not self.input_catalog_path:
+            self.input_catalog_path = self.input_catalog.catalog_path
+
+    def additional_runtime_provenance_info(self) -> dict:
+        # Verification-specific values for provenance; the path is passed
+        # through str() and the in-memory catalog object is not included.
+        return {
+            "pipeline": "verification pipeline",
+            "input_catalog_path": str(self.input_catalog_path),
+            "field_distribution_cols": self.field_distribution_cols,
+        }
diff --git a/src/hipscat_import/verification/run_verification.py b/src/hipscat_import/verification/run_verification.py
@@ -0,0 +1,14 @@
+"""Run pass/fail checks and generate verification report of existing hipscat table."""
+
+from hipscat_import.verification.arguments import VerificationArguments
+
+
+def run(args):
+    """Run verification pipeline."""
+    # Two guards, both raising TypeError with a message that names
+    # VerificationArguments: first any falsy value (e.g. None or an empty
+    # dict), then any object of the wrong type.
+    if not args:
+        raise TypeError("args is required and should be type VerificationArguments")
+    if not isinstance(args, VerificationArguments):
+        raise TypeError("args must be type VerificationArguments")
+
+    # implement everything else.
+    # Placeholder: arguments that pass both guards always reach this raise.
+    raise NotImplementedError("Verification not yet implemented.")
diff --git a/tests/hipscat_import/verification/test_run_verification.py b/tests/hipscat_import/verification/test_run_verification.py
@@ -0,0 +1,25 @@
+import pytest
+
+from hipscat_import.verification.arguments import VerificationArguments
+import hipscat_import.verification.run_verification as runner
+
+
+def test_bad_args():
+    """Runner should fail with empty or mis-typed arguments"""
+    # None is falsy, so it is rejected by the runner's "args is required" guard.
+    with pytest.raises(TypeError, match="VerificationArguments"):
+        runner.run(None)
+
+    # A non-empty dict is truthy, so it gets past the first guard and is
+    # rejected by the isinstance check instead.
+    args = {"output_catalog_name": "bad_arg_type"}
+    with pytest.raises(TypeError, match="VerificationArguments"):
+        runner.run(args)
+
+
+def test_no_implementation(tmp_path, small_sky_object_catalog):
+    """Womp womp. Test that we don't have a verification pipeline implemented"""
+    # ``small_sky_object_catalog`` is a fixture not defined in this file --
+    # presumably provided by a conftest.py; verify.
+    args = VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+    # Well-formed arguments pass the runner's type guards, so the only
+    # remaining outcome is the placeholder NotImplementedError.
+    with pytest.raises(NotImplementedError, match="not yet implemented"):
+        runner.run(args)
diff --git a/tests/hipscat_import/verification/test_verification_arguments.py b/tests/hipscat_import/verification/test_verification_arguments.py
@@ -0,0 +1,80 @@
+"""Tests of argument validation"""
+
+
+import pytest
+from hipscat.catalog import Catalog
+
+from hipscat_import.verification.arguments import VerificationArguments
+
+
+def test_none():
+    """No arguments provided. Should error for required args."""
+    # NOTE(review): no ``match`` is given, so this does not pin which check
+    # raises -- the parent class's checks run before the input-catalog check.
+    with pytest.raises(ValueError):
+        VerificationArguments()
+
+
+def test_empty_required(tmp_path):
+    """*Most* required arguments are provided."""
+    ## Input path is missing
+    # Neither input_catalog_path nor input_catalog is passed; the expected
+    # error message mentions both names, and ``match`` searches for the first.
+    with pytest.raises(ValueError, match="input_catalog_path"):
+        VerificationArguments(
+            output_path=tmp_path,
+            output_catalog_name="small_sky_object_verification_report",
+        )
+
+
+def test_invalid_paths(tmp_path, small_sky_object_catalog):
+    """Required arguments are provided, but paths aren't found."""
+    ## Prove that it works with required args
+    # Construction alone is the check here: no exception means success.
+    VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+
+    ## Bad input path
+    # NOTE(review): both input_catalog_path and output_path are the bogus
+    # value "path" here, so the FileNotFoundError may come from either the
+    # parent class's checks or the catalog load -- confirm which one is
+    # intended to be exercised.
+    with pytest.raises(FileNotFoundError):
+        VerificationArguments(
+            input_catalog_path="path",
+            output_path="path",
+            output_catalog_name="small_sky_object_verification_report",
+        )
+
+
+def test_good_paths(tmp_path, small_sky_object_catalog):
+    """Required arguments are provided, and paths are found."""
+    tmp_path_str = str(tmp_path)
+    args = VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+    # The supplied input path must be kept unchanged.
+    assert args.input_catalog_path == small_sky_object_catalog
+    assert str(args.output_path) == tmp_path_str
+    # ``args.tmp_path`` is an attribute of the arguments object (not the pytest
+    # fixture of the same name); it is not set in VerificationArguments itself,
+    # so presumably it comes from RuntimeArguments -- verify.
+    assert str(args.tmp_path).startswith(tmp_path_str)
+
+
+def test_catalog_object(tmp_path, small_sky_object_catalog):
+    """Required arguments are provided, and paths are found."""
+    # Unlike test_good_paths, the catalog is supplied as an in-memory object
+    # (``input_catalog``) and no ``input_catalog_path`` is passed.
+    small_sky_catalog_object = Catalog.read_from_hipscat(catalog_path=small_sky_object_catalog)
+    tmp_path_str = str(tmp_path)
+    args = VerificationArguments(
+        input_catalog=small_sky_catalog_object,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+    # The path should have been filled in from the catalog object's
+    # ``catalog_path``; this relies on that value comparing equal to the
+    # fixture value -- confirm the two have compatible types.
+    assert args.input_catalog_path == small_sky_object_catalog
+    assert str(args.output_path) == tmp_path_str
+    assert str(args.tmp_path).startswith(tmp_path_str)
+
+
+def test_provenance_info(small_sky_object_catalog, tmp_path):
+    """Verify that provenance info includes verification-specific fields."""
+    args = VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+
+    # ``provenance_info`` is not defined in VerificationArguments; presumably
+    # it is inherited from RuntimeArguments and places the result of
+    # ``additional_runtime_provenance_info`` under "runtime_args" -- verify.
+    runtime_args = args.provenance_info()["runtime_args"]
+    # Only one of the verification-specific keys is checked here.
+    assert "input_catalog_path" in runtime_args