Skip to content

Commit

Permalink
Merge recent changes (#139)
Browse files Browse the repository at this point in the history
* unpin sphinx versions (#134)

* Verification pipeline boilerplate (#126)

* Verification pipeline boilerplate

* Input catalog options.

* Contribution docs updates.

* Merge recent changes (#135)

* Use minimum stage name formatting.

* Run copier.

* Add a tad more context for failure email.

* Pin pandas version

* Remove benchmarks for now.

* unpin sphinx versions (#134)

---------

Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>

---------

Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>

* Check for valid catalog directories. (#136)

* Check for valid catalog directories.

* Hit uncovered file check.

---------

Co-authored-by: Max West <110124344+maxwest-uw@users.noreply.github.com>
  • Loading branch information
delucchi-cmu and maxwest-uw committed Oct 6, 2023
1 parent 08e41a1 commit 24b1f4e
Show file tree
Hide file tree
Showing 16 changed files with 255 additions and 10 deletions.
10 changes: 10 additions & 0 deletions docs/guide/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ Most folks use conda for virtual environments. You may want to as well.
Testing
-------------------------------------------------------------------------------

We use ``pytest`` as our preferred unit test runner engine, in keeping with
LSST DM style guide. We make heavy use of
`pytest fixtures <https://docs.pytest.org/en/7.1.x/explanation/fixtures.html#about-fixtures>`_,
which set up various resources used for unit testing, or provide consistent
paths. These are defined in ``conftest.py`` files. They're powerful and flexible
(and fun in their own way), and we encourage contributors to familiarize themselves.

Please add or update unit tests for all changes made to the codebase. You can run
unit tests locally simply with:

Expand All @@ -81,6 +88,9 @@ Create your PR

Please use PR best practices, and get someone to review your code.

The LINCC Frameworks guidelines and philosophy on code reviews can be found on
`our wiki <https://github.com/lincc-frameworks/docs/wiki/Design-and-Code-Review-Policy>`_.

We have a suite of continuous integration tests that run on PR creation. Please
follow the recommendations of the linter.

Expand Down
6 changes: 3 additions & 3 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
sphinx==6.1.3
sphinx-rtd-theme==1.2.0
sphinx-autoapi==2.0.1
sphinx
sphinx-rtd-theme
sphinx-autoapi
nbsphinx
ipython
jupytext
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ dev = [
"pytest",
"pytest-cov",
"pytest-timeout",
"sphinx==6.1.3", # Used to automatically generate documentation
"sphinx-rtd-theme==1.2.0", # Used to render documentation
"sphinx-autoapi==2.0.1", # Used to automatically generate api documentation
"sphinx", # Used to automatically generate documentation
"sphinx-rtd-theme", # Used to render documentation
"sphinx-autoapi", # Used to automatically generate api documentation
"mypy", # Used for static type checking of files
# if you add dependencies here while experimenting in a notebook and you
# want that notebook to render in your documentation, please add the
Expand Down
3 changes: 3 additions & 0 deletions src/hipscat_import/index/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from hipscat.catalog import Catalog
from hipscat.catalog.index.index_catalog_info import IndexCatalogInfo
from hipscat.io.validation import is_valid_catalog

from hipscat_import.runtime_arguments import RuntimeArguments

Expand Down Expand Up @@ -38,6 +39,8 @@ def _check_arguments(self):
if not self.include_hipscat_index and not self.include_order_pixel:
raise ValueError("At least one of include_hipscat_index or include_order_pixel must be True")

if not is_valid_catalog(self.input_catalog_path):
raise ValueError("input_catalog_path not a valid catalog")
self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path)

if self.compute_partition_size < 100_000:
Expand Down
8 changes: 5 additions & 3 deletions src/hipscat_import/margin_cache/margin_cache_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
from hipscat.catalog import Catalog
from hipscat.catalog.margin_cache.margin_cache_catalog_info import MarginCacheCatalogInfo
from hipscat.io import file_io
from hipscat.io.validation import is_valid_catalog

from hipscat_import.runtime_arguments import RuntimeArguments

Expand Down Expand Up @@ -33,8 +33,10 @@ def __post_init__(self):

def _check_arguments(self):
super()._check_arguments()
if not file_io.does_file_or_directory_exist(self.input_catalog_path):
raise FileNotFoundError("input_catalog_path not found on local storage")
if not self.input_catalog_path:
raise ValueError("input_catalog_path is required")
if not is_valid_catalog(self.input_catalog_path):
raise ValueError("input_catalog_path not a valid catalog")

self.catalog = Catalog.read_from_hipscat(self.input_catalog_path)

Expand Down
4 changes: 4 additions & 0 deletions src/hipscat_import/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
import hipscat_import.index.run_index as index_runner
import hipscat_import.margin_cache.margin_cache as margin_runner
import hipscat_import.soap.run_soap as soap_runner
import hipscat_import.verification.run_verification as verification_runner
from hipscat_import.catalog.arguments import ImportArguments
from hipscat_import.index.arguments import IndexArguments
from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments
from hipscat_import.runtime_arguments import RuntimeArguments
from hipscat_import.soap.arguments import SoapArguments
from hipscat_import.verification.arguments import VerificationArguments

# pragma: no cover

Expand Down Expand Up @@ -45,6 +47,8 @@ def pipeline_with_client(args: RuntimeArguments, client: Client):
margin_runner.generate_margin_cache(args, client)
elif isinstance(args, SoapArguments):
soap_runner.run(args, client)
elif isinstance(args, VerificationArguments):
verification_runner.run(args)
else:
raise ValueError("unknown args type")
except Exception as exception: # pylint: disable=broad-exception-caught
Expand Down
5 changes: 5 additions & 0 deletions src/hipscat_import/soap/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from hipscat.catalog.association_catalog.association_catalog import AssociationCatalogInfo
from hipscat.catalog.catalog_type import CatalogType
from hipscat.io.validation import is_valid_catalog

from hipscat_import.runtime_arguments import RuntimeArguments

Expand Down Expand Up @@ -33,11 +34,15 @@ def _check_arguments(self):
raise ValueError("object_catalog_dir is required")
if not self.object_id_column:
raise ValueError("object_id_column is required")
if not is_valid_catalog(self.object_catalog_dir):
raise ValueError("object_catalog_dir not a valid catalog")

if not self.source_catalog_dir:
raise ValueError("source_catalog_dir is required")
if not self.source_object_id_column:
raise ValueError("source_object_id_column is required")
if not is_valid_catalog(self.source_catalog_dir):
raise ValueError("source_catalog_dir not a valid catalog")

if self.compute_partition_size < 100_000:
raise ValueError("compute_partition_size must be at least 100_000")
Expand Down
Empty file.
44 changes: 44 additions & 0 deletions src/hipscat_import/verification/arguments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Utility to hold all arguments required throughout verification pipeline"""

from dataclasses import dataclass, field
from typing import List, Optional

from hipscat.catalog import Catalog

from hipscat_import.runtime_arguments import RuntimeArguments


@dataclass
class VerificationArguments(RuntimeArguments):
"""Data class for holding verification arguments"""

## Input
input_catalog_path: str = ""
"""Path to an existing catalog that will be inspected."""
input_catalog: Optional[Catalog] = None
"""In-memory representation of a catalog. If not provided, it will be loaded
from the input_catalog_path."""

## Verification options
field_distribution_cols: List[str] = field(default_factory=list)
"""List of fields to get the overall distribution for. e.g. ["ra", "dec"].
Should be valid columns in the parquet files."""

def __post_init__(self):
self._check_arguments()

def _check_arguments(self):
super()._check_arguments()
if not self.input_catalog_path and not self.input_catalog:
raise ValueError("input catalog is required (either input_catalog_path or input_catalog)")
if not self.input_catalog:
self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path)
if not self.input_catalog_path:
self.input_catalog_path = self.input_catalog.catalog_path

def additional_runtime_provenance_info(self) -> dict:
return {
"pipeline": "verification pipeline",
"input_catalog_path": str(self.input_catalog_path),
"field_distribution_cols": self.field_distribution_cols,
}
14 changes: 14 additions & 0 deletions src/hipscat_import/verification/run_verification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Run pass/fail checks and generate verification report of existing hipscat table."""

from hipscat_import.verification.arguments import VerificationArguments


def run(args):
"""Run verification pipeline."""
if not args:
raise TypeError("args is required and should be type VerificationArguments")
if not isinstance(args, VerificationArguments):
raise TypeError("args must be type VerificationArguments")

# implement everything else.
raise NotImplementedError("Verification not yet implemented.")
8 changes: 8 additions & 0 deletions tests/hipscat_import/catalog/test_file_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ def test_unknown_file_type():
get_file_reader("unknown")


def test_file_exists(small_sky_dir):
"""File reader factory method should fail for missing files or directories"""
with pytest.raises(FileNotFoundError, match="File not found"):
next(CsvReader().read("foo_not_really_a_path"))
with pytest.raises(FileNotFoundError, match="Directory found at path"):
next(CsvReader().read(small_sky_dir))


def test_csv_reader(small_sky_single_file):
"""Verify we can read the csv file into a single data frame."""
total_chunks = 0
Expand Down
11 changes: 11 additions & 0 deletions tests/hipscat_import/index/test_index_argument.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@ def test_empty_required(tmp_path, small_sky_object_catalog):
output_catalog_name="small_sky_object_index",
)

## Input path is bad
with pytest.raises(ValueError, match="input_catalog_path"):
IndexArguments(
input_catalog_path="/foo",
indexing_column="id",
output_path=tmp_path,
output_catalog_name="small_sky_object_index",
overwrite=True,
)

## Indexing column is required.
with pytest.raises(ValueError, match="indexing_column "):
IndexArguments(
input_catalog_path=small_sky_object_catalog,
Expand Down
12 changes: 11 additions & 1 deletion tests/hipscat_import/margin_cache/test_arguments_margin_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,23 @@
def test_empty_required(tmp_path):
"""*Most* required arguments are provided."""
## Input catalog path is missing
with pytest.raises(FileNotFoundError, match="input_catalog_path"):
with pytest.raises(ValueError, match="input_catalog_path"):
MarginCacheArguments(
margin_threshold=5.0,
output_path=tmp_path,
output_catalog_name="catalog_cache",
)

## Input catalog path is bad
with pytest.raises(ValueError, match="input_catalog_path"):
MarginCacheArguments(
input_catalog_path="/foo",
margin_threshold=5.0,
output_path=tmp_path,
output_catalog_name="catalog_cache",
overwrite=True,
)


def test_margin_order_dynamic(small_sky_source_catalog, tmp_path):
"""Ensure we can dynamically set the margin_order"""
Expand Down
29 changes: 29 additions & 0 deletions tests/hipscat_import/soap/test_soap_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,35 @@ def test_empty_required(tmp_path, small_sky_object_catalog, small_sky_source_cat
)


def test_catalog_paths(tmp_path, small_sky_object_catalog, small_sky_source_catalog):
"""*Most* required arguments are provided."""
## Object catalog path is bad.
with pytest.raises(ValueError, match="object_catalog_dir"):
SoapArguments(
object_catalog_dir="/foo",
object_id_column="id",
source_catalog_dir=small_sky_source_catalog,
source_object_id_column="object_id",
output_catalog_name="small_sky_association",
output_path=tmp_path,
progress_bar=False,
overwrite=True,
)

## Source catalog path is bad.
with pytest.raises(ValueError, match="source_catalog_dir"):
SoapArguments(
object_catalog_dir=small_sky_object_catalog,
object_id_column="id",
source_catalog_dir="/foo",
source_object_id_column="object_id",
output_catalog_name="small_sky_association",
output_path=tmp_path,
progress_bar=False,
overwrite=True,
)


def test_compute_partition_size(tmp_path, small_sky_object_catalog, small_sky_source_catalog):
"""Test validation of compute_partition_size."""
with pytest.raises(ValueError, match="compute_partition_size"):
Expand Down
25 changes: 25 additions & 0 deletions tests/hipscat_import/verification/test_run_verification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pytest

from hipscat_import.verification.arguments import VerificationArguments
import hipscat_import.verification.run_verification as runner


def test_bad_args():
"""Runner should fail with empty or mis-typed arguments"""
with pytest.raises(TypeError, match="VerificationArguments"):
runner.run(None)

args = {"output_catalog_name": "bad_arg_type"}
with pytest.raises(TypeError, match="VerificationArguments"):
runner.run(args)


def test_no_implementation(tmp_path, small_sky_object_catalog):
"""Womp womp. Test that we don't have a verification pipeline implemented"""
args = VerificationArguments(
input_catalog_path=small_sky_object_catalog,
output_path=tmp_path,
output_catalog_name="small_sky_object_verification_report",
)
with pytest.raises(NotImplementedError, match="not yet implemented"):
runner.run(args)
80 changes: 80 additions & 0 deletions tests/hipscat_import/verification/test_verification_arguments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Tests of argument validation"""


import pytest
from hipscat.catalog import Catalog

from hipscat_import.verification.arguments import VerificationArguments


def test_none():
"""No arguments provided. Should error for required args."""
with pytest.raises(ValueError):
VerificationArguments()


def test_empty_required(tmp_path):
"""*Most* required arguments are provided."""
## Input path is missing
with pytest.raises(ValueError, match="input_catalog_path"):
VerificationArguments(
output_path=tmp_path,
output_catalog_name="small_sky_object_verification_report",
)


def test_invalid_paths(tmp_path, small_sky_object_catalog):
"""Required arguments are provided, but paths aren't found."""
## Prove that it works with required args
VerificationArguments(
input_catalog_path=small_sky_object_catalog,
output_path=tmp_path,
output_catalog_name="small_sky_object_verification_report",
)

## Bad input path
with pytest.raises(FileNotFoundError):
VerificationArguments(
input_catalog_path="path",
output_path="path",
output_catalog_name="small_sky_object_verification_report",
)


def test_good_paths(tmp_path, small_sky_object_catalog):
"""Required arguments are provided, and paths are found."""
tmp_path_str = str(tmp_path)
args = VerificationArguments(
input_catalog_path=small_sky_object_catalog,
output_path=tmp_path,
output_catalog_name="small_sky_object_verification_report",
)
assert args.input_catalog_path == small_sky_object_catalog
assert str(args.output_path) == tmp_path_str
assert str(args.tmp_path).startswith(tmp_path_str)


def test_catalog_object(tmp_path, small_sky_object_catalog):
"""Required arguments are provided, and paths are found."""
small_sky_catalog_object = Catalog.read_from_hipscat(catalog_path=small_sky_object_catalog)
tmp_path_str = str(tmp_path)
args = VerificationArguments(
input_catalog=small_sky_catalog_object,
output_path=tmp_path,
output_catalog_name="small_sky_object_verification_report",
)
assert args.input_catalog_path == small_sky_object_catalog
assert str(args.output_path) == tmp_path_str
assert str(args.tmp_path).startswith(tmp_path_str)


def test_provenance_info(small_sky_object_catalog, tmp_path):
"""Verify that provenance info includes verification-specific fields."""
args = VerificationArguments(
input_catalog_path=small_sky_object_catalog,
output_path=tmp_path,
output_catalog_name="small_sky_object_verification_report",
)

runtime_args = args.provenance_info()["runtime_args"]
assert "input_catalog_path" in runtime_args

0 comments on commit 24b1f4e

Please sign in to comment.