Merge recent changes (#139)

* unpin sphinx versions (#134) * Verification pipeline boilerplate (#126) * Verification pipeline boilerplate * Input catalog options. * Contribution docs updates. * Merge recent changes (#135) * Use minimum stage name formatting. * Run copier. * Add a tad more context for failure email. * Pin pandas version * Remove benchmarks for now. * unpin sphinx versions (#134) --------- Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com> --------- Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com> * Check for valid catalog directories. (#136) * Check for valid catalog directories. * Hit uncovered file check. --------- Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>
astronomy-commons · Oct 6, 2023 · 24b1f4e · 24b1f4e
1 parent 08e41a1
commit 24b1f4e
Show file tree

Hide file tree

Showing 16 changed files with 255 additions and 10 deletions.
diff --git a/docs/guide/contributing.rst b/docs/guide/contributing.rst
@@ -61,6 +61,13 @@ Most folks use conda for virtual environments. You may want to as well.
 Testing
 -------------------------------------------------------------------------------
 
+We use ``pytest`` as our preferred unit test runner engine, in keeping with
+LSST DM style guide. We make heavy use of 
+`pytest fixtures <https://docs.pytest.org/en/7.1.x/explanation/fixtures.html#about-fixtures>`_, 
+which set up various resources used for unit testing, or provide consistent 
+paths. These are defined in ``conftest.py`` files. They're powerful and flexible 
+(and fun in their own way), and we encourage contributors to familiarize themselves.
+
 Please add or update unit tests for all changes made to the codebase. You can run
 unit tests locally simply with:
 
@@ -81,6 +88,9 @@ Create your PR
 
 Please use PR best practices, and get someone to review your code.
 
+The LINCC Frameworks guidelines and philosophy on code reviews can be found on 
+`our wiki <https://github.com/lincc-frameworks/docs/wiki/Design-and-Code-Review-Policy>`_.
+
 We have a suite of continuous integration tests that run on PR creation. Please
 follow the recommendations of the linter.
 

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,6 +1,6 @@
-sphinx==6.1.3
-sphinx-rtd-theme==1.2.0
-sphinx-autoapi==2.0.1
+sphinx
+sphinx-rtd-theme
+sphinx-autoapi
 nbsphinx
 ipython
 jupytext

diff --git a/pyproject.toml b/pyproject.toml
@@ -36,9 +36,9 @@ dev = [
     "pytest",
     "pytest-cov",
     "pytest-timeout",
-    "sphinx==6.1.3", # Used to automatically generate documentation
-    "sphinx-rtd-theme==1.2.0", # Used to render documentation
-    "sphinx-autoapi==2.0.1", # Used to automatically generate api documentation
+    "sphinx", # Used to automatically generate documentation
+    "sphinx-rtd-theme", # Used to render documentation
+    "sphinx-autoapi", # Used to automatically generate api documentation
     "mypy", # Used for static type checking of files
     # if you add dependencies here while experimenting in a notebook and you
     # want that notebook to render in your documentation, please add the

diff --git a/src/hipscat_import/index/arguments.py b/src/hipscat_import/index/arguments.py
@@ -5,6 +5,7 @@
 
 from hipscat.catalog import Catalog
 from hipscat.catalog.index.index_catalog_info import IndexCatalogInfo
+from hipscat.io.validation import is_valid_catalog
 
 from hipscat_import.runtime_arguments import RuntimeArguments
 
@@ -38,6 +39,8 @@ def _check_arguments(self):
         if not self.include_hipscat_index and not self.include_order_pixel:
             raise ValueError("At least one of include_hipscat_index or include_order_pixel must be True")
 
+        if not is_valid_catalog(self.input_catalog_path):
+            raise ValueError("input_catalog_path not a valid catalog")
         self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path)
 
         if self.compute_partition_size < 100_000:

diff --git a/src/hipscat_import/margin_cache/margin_cache_arguments.py b/src/hipscat_import/margin_cache/margin_cache_arguments.py
@@ -5,7 +5,7 @@
 import numpy as np
 from hipscat.catalog import Catalog
 from hipscat.catalog.margin_cache.margin_cache_catalog_info import MarginCacheCatalogInfo
-from hipscat.io import file_io
+from hipscat.io.validation import is_valid_catalog
 
 from hipscat_import.runtime_arguments import RuntimeArguments
 
@@ -33,8 +33,10 @@ def __post_init__(self):
 
     def _check_arguments(self):
         super()._check_arguments()
-        if not file_io.does_file_or_directory_exist(self.input_catalog_path):
-            raise FileNotFoundError("input_catalog_path not found on local storage")
+        if not self.input_catalog_path:
+            raise ValueError("input_catalog_path is required")
+        if not is_valid_catalog(self.input_catalog_path):
+            raise ValueError("input_catalog_path not a valid catalog")
 
         self.catalog = Catalog.read_from_hipscat(self.input_catalog_path)
 

diff --git a/src/hipscat_import/pipeline.py b/src/hipscat_import/pipeline.py
@@ -8,11 +8,13 @@
 import hipscat_import.index.run_index as index_runner
 import hipscat_import.margin_cache.margin_cache as margin_runner
 import hipscat_import.soap.run_soap as soap_runner
+import hipscat_import.verification.run_verification as verification_runner
 from hipscat_import.catalog.arguments import ImportArguments
 from hipscat_import.index.arguments import IndexArguments
 from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments
 from hipscat_import.runtime_arguments import RuntimeArguments
 from hipscat_import.soap.arguments import SoapArguments
+from hipscat_import.verification.arguments import VerificationArguments
 
 # pragma: no cover
 
@@ -45,6 +47,8 @@ def pipeline_with_client(args: RuntimeArguments, client: Client):
             margin_runner.generate_margin_cache(args, client)
         elif isinstance(args, SoapArguments):
             soap_runner.run(args, client)
+        elif isinstance(args, VerificationArguments):
+            verification_runner.run(args)
         else:
             raise ValueError("unknown args type")
     except Exception as exception:  # pylint: disable=broad-exception-caught

diff --git a/src/hipscat_import/soap/arguments.py b/src/hipscat_import/soap/arguments.py
@@ -2,6 +2,7 @@
 
 from hipscat.catalog.association_catalog.association_catalog import AssociationCatalogInfo
 from hipscat.catalog.catalog_type import CatalogType
+from hipscat.io.validation import is_valid_catalog
 
 from hipscat_import.runtime_arguments import RuntimeArguments
 
@@ -33,11 +34,15 @@ def _check_arguments(self):
             raise ValueError("object_catalog_dir is required")
         if not self.object_id_column:
             raise ValueError("object_id_column is required")
+        if not is_valid_catalog(self.object_catalog_dir):
+            raise ValueError("object_catalog_dir not a valid catalog")
 
         if not self.source_catalog_dir:
             raise ValueError("source_catalog_dir is required")
         if not self.source_object_id_column:
             raise ValueError("source_object_id_column is required")
+        if not is_valid_catalog(self.source_catalog_dir):
+            raise ValueError("source_catalog_dir not a valid catalog")
 
         if self.compute_partition_size < 100_000:
             raise ValueError("compute_partition_size must be at least 100_000")

diff --git a/src/hipscat_import/verification/__init__.py b/src/hipscat_import/verification/__init__.py
diff --git a/src/hipscat_import/verification/arguments.py b/src/hipscat_import/verification/arguments.py
@@ -0,0 +1,44 @@
+"""Utility to hold all arguments required throughout verification pipeline"""
+
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+from hipscat.catalog import Catalog
+
+from hipscat_import.runtime_arguments import RuntimeArguments
+
+
+@dataclass
+class VerificationArguments(RuntimeArguments):
+    """Data class for holding verification arguments"""
+
+    ## Input
+    input_catalog_path: str = ""
+    """Path to an existing catalog that will be inspected."""
+    input_catalog: Optional[Catalog] = None
+    """In-memory representation of a catalog. If not provided, it will be loaded
+    from the input_catalog_path."""
+
+    ## Verification options
+    field_distribution_cols: List[str] = field(default_factory=list)
+    """List of fields to get the overall distribution for. e.g. ["ra", "dec"].
+    Should be valid columns in the parquet files."""
+
+    def __post_init__(self):
+        self._check_arguments()
+
+    def _check_arguments(self):
+        super()._check_arguments()
+        if not self.input_catalog_path and not self.input_catalog:
+            raise ValueError("input catalog is required (either input_catalog_path or input_catalog)")
+        if not self.input_catalog:
+            self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path)
+        if not self.input_catalog_path:
+            self.input_catalog_path = self.input_catalog.catalog_path
+
+    def additional_runtime_provenance_info(self) -> dict:
+        return {
+            "pipeline": "verification pipeline",
+            "input_catalog_path": str(self.input_catalog_path),
+            "field_distribution_cols": self.field_distribution_cols,
+        }
diff --git a/src/hipscat_import/verification/run_verification.py b/src/hipscat_import/verification/run_verification.py
@@ -0,0 +1,14 @@
+"""Run pass/fail checks and generate verification report of existing hipscat table."""
+
+from hipscat_import.verification.arguments import VerificationArguments
+
+
+def run(args):
+    """Run verification pipeline."""
+    if not args:
+        raise TypeError("args is required and should be type VerificationArguments")
+    if not isinstance(args, VerificationArguments):
+        raise TypeError("args must be type VerificationArguments")
+
+    # implement everything else.
+    raise NotImplementedError("Verification not yet implemented.")
diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py
@@ -33,6 +33,14 @@ def test_unknown_file_type():
         get_file_reader("unknown")
 
 
+def test_file_exists(small_sky_dir):
+    """File reader factory method should fail for missing files or directories"""
+    with pytest.raises(FileNotFoundError, match="File not found"):
+        next(CsvReader().read("foo_not_really_a_path"))
+    with pytest.raises(FileNotFoundError, match="Directory found at path"):
+        next(CsvReader().read(small_sky_dir))
+
+
 def test_csv_reader(small_sky_single_file):
     """Verify we can read the csv file into a single data frame."""
     total_chunks = 0

diff --git a/tests/hipscat_import/index/test_index_argument.py b/tests/hipscat_import/index/test_index_argument.py
@@ -22,6 +22,17 @@ def test_empty_required(tmp_path, small_sky_object_catalog):
             output_catalog_name="small_sky_object_index",
         )
 
+    ## Input path is bad
+    with pytest.raises(ValueError, match="input_catalog_path"):
+        IndexArguments(
+            input_catalog_path="/foo",
+            indexing_column="id",
+            output_path=tmp_path,
+            output_catalog_name="small_sky_object_index",
+            overwrite=True,
+        )
+
+    ## Indexing column is required.
     with pytest.raises(ValueError, match="indexing_column "):
         IndexArguments(
             input_catalog_path=small_sky_object_catalog,

diff --git a/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py b/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py
@@ -11,13 +11,23 @@
 def test_empty_required(tmp_path):
     """*Most* required arguments are provided."""
     ## Input catalog path is missing
-    with pytest.raises(FileNotFoundError, match="input_catalog_path"):
+    with pytest.raises(ValueError, match="input_catalog_path"):
         MarginCacheArguments(
             margin_threshold=5.0,
             output_path=tmp_path,
             output_catalog_name="catalog_cache",
         )
 
+    ## Input catalog path is bad
+    with pytest.raises(ValueError, match="input_catalog_path"):
+        MarginCacheArguments(
+            input_catalog_path="/foo",
+            margin_threshold=5.0,
+            output_path=tmp_path,
+            output_catalog_name="catalog_cache",
+            overwrite=True,
+        )
+
 
 def test_margin_order_dynamic(small_sky_source_catalog, tmp_path):
     """Ensure we can dynamically set the margin_order"""

diff --git a/tests/hipscat_import/soap/test_soap_arguments.py b/tests/hipscat_import/soap/test_soap_arguments.py
@@ -46,6 +46,35 @@ def test_empty_required(tmp_path, small_sky_object_catalog, small_sky_source_cat
             )
 
 
+def test_catalog_paths(tmp_path, small_sky_object_catalog, small_sky_source_catalog):
+    """*Most* required arguments are provided."""
+    ## Object catalog path is bad.
+    with pytest.raises(ValueError, match="object_catalog_dir"):
+        SoapArguments(
+            object_catalog_dir="/foo",
+            object_id_column="id",
+            source_catalog_dir=small_sky_source_catalog,
+            source_object_id_column="object_id",
+            output_catalog_name="small_sky_association",
+            output_path=tmp_path,
+            progress_bar=False,
+            overwrite=True,
+        )
+
+    ## Source catalog path is bad.
+    with pytest.raises(ValueError, match="source_catalog_dir"):
+        SoapArguments(
+            object_catalog_dir=small_sky_object_catalog,
+            object_id_column="id",
+            source_catalog_dir="/foo",
+            source_object_id_column="object_id",
+            output_catalog_name="small_sky_association",
+            output_path=tmp_path,
+            progress_bar=False,
+            overwrite=True,
+        )
+
+
 def test_compute_partition_size(tmp_path, small_sky_object_catalog, small_sky_source_catalog):
     """Test validation of compute_partition_size."""
     with pytest.raises(ValueError, match="compute_partition_size"):

diff --git a/tests/hipscat_import/verification/test_run_verification.py b/tests/hipscat_import/verification/test_run_verification.py
@@ -0,0 +1,25 @@
+import pytest
+
+from hipscat_import.verification.arguments import VerificationArguments
+import hipscat_import.verification.run_verification as runner
+
+
+def test_bad_args():
+    """Runner should fail with empty or mis-typed arguments"""
+    with pytest.raises(TypeError, match="VerificationArguments"):
+        runner.run(None)
+
+    args = {"output_catalog_name": "bad_arg_type"}
+    with pytest.raises(TypeError, match="VerificationArguments"):
+        runner.run(args)
+
+
+def test_no_implementation(tmp_path, small_sky_object_catalog):
+    """Womp womp. Test that we don't have a verification pipeline implemented"""
+    args = VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+    with pytest.raises(NotImplementedError, match="not yet implemented"):
+        runner.run(args)
diff --git a/tests/hipscat_import/verification/test_verification_arguments.py b/tests/hipscat_import/verification/test_verification_arguments.py
@@ -0,0 +1,80 @@
+"""Tests of argument validation"""
+
+
+import pytest
+from hipscat.catalog import Catalog
+
+from hipscat_import.verification.arguments import VerificationArguments
+
+
+def test_none():
+    """No arguments provided. Should error for required args."""
+    with pytest.raises(ValueError):
+        VerificationArguments()
+
+
+def test_empty_required(tmp_path):
+    """*Most* required arguments are provided."""
+    ## Input path is missing
+    with pytest.raises(ValueError, match="input_catalog_path"):
+        VerificationArguments(
+            output_path=tmp_path,
+            output_catalog_name="small_sky_object_verification_report",
+        )
+
+
+def test_invalid_paths(tmp_path, small_sky_object_catalog):
+    """Required arguments are provided, but paths aren't found."""
+    ## Prove that it works with required args
+    VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+
+    ## Bad input path
+    with pytest.raises(FileNotFoundError):
+        VerificationArguments(
+            input_catalog_path="path",
+            output_path="path",
+            output_catalog_name="small_sky_object_verification_report",
+        )
+
+
+def test_good_paths(tmp_path, small_sky_object_catalog):
+    """Required arguments are provided, and paths are found."""
+    tmp_path_str = str(tmp_path)
+    args = VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+    assert args.input_catalog_path == small_sky_object_catalog
+    assert str(args.output_path) == tmp_path_str
+    assert str(args.tmp_path).startswith(tmp_path_str)
+
+
+def test_catalog_object(tmp_path, small_sky_object_catalog):
+    """Required arguments are provided, and paths are found."""
+    small_sky_catalog_object = Catalog.read_from_hipscat(catalog_path=small_sky_object_catalog)
+    tmp_path_str = str(tmp_path)
+    args = VerificationArguments(
+        input_catalog=small_sky_catalog_object,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+    assert args.input_catalog_path == small_sky_object_catalog
+    assert str(args.output_path) == tmp_path_str
+    assert str(args.tmp_path).startswith(tmp_path_str)
+
+
+def test_provenance_info(small_sky_object_catalog, tmp_path):
+    """Verify that provenance info includes verification-specific fields."""
+    args = VerificationArguments(
+        input_catalog_path=small_sky_object_catalog,
+        output_path=tmp_path,
+        output_catalog_name="small_sky_object_verification_report",
+    )
+
+    runtime_args = args.provenance_info()["runtime_args"]
+    assert "input_catalog_path" in runtime_args