Skip to content

Commit

Permalink
Remove input_format argument. (#234)
Browse files Browse the repository at this point in the history
* Remove input_format argument.

* Fix notebook and improve coverage.

* Make file_reader required (default None)

* Fix notebook.
  • Loading branch information
delucchi-cmu committed Feb 22, 2024
1 parent 022e7cf commit 3df522f
Show file tree
Hide file tree
Showing 16 changed files with 64 additions and 69 deletions.
7 changes: 3 additions & 4 deletions docs/catalogs/arguments.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ A minimal arguments block will look something like:
ra_column="ObjectRA",
dec_column="ObjectDec",
input_path="./my_data",
input_format="csv",
file_reader="csv",
output_artifact_name="test_cat",
output_path="./output",
)
Expand Down Expand Up @@ -102,8 +102,8 @@ Which files?

There are a few ways to specify the files to read:

* ``input_path`` + ``input_format``:
will search for files ending with the format string in the indicated directory.
* ``input_path``:
will search for files the indicated directory.
* ``input_file_list``:
a list of fully-specified paths you want to read.

Expand Down Expand Up @@ -162,7 +162,6 @@ You can find the full API documentation for
...
## Locates files like "/directory/to/files/**starr"
input_path="/directory/to/files/",
input_format="starr",
## NB - you need the parens here!
file_reader=StarrReader(),
Expand Down
1 change: 0 additions & 1 deletion docs/catalogs/public/allwise.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ Example import
args = ImportArguments(
output_artifact_name="allwise",
input_path="/path/to/allwise/",
input_format="csv.bz2",
file_reader=CsvReader(
header=None,
separator="|",
Expand Down
1 change: 0 additions & 1 deletion docs/catalogs/public/neowise.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ Example import
args = ImportArguments(
output_artifact_name="neowise_1",
input_path="/path/to/neowiser_year8/",
input_format="csv.bz2",
file_reader=CsvReader(
header=None,
separator="|",
Expand Down
2 changes: 0 additions & 2 deletions docs/catalogs/public/panstarrs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ Example import of objects (otmo)
args = ImportArguments(
output_artifact_name="ps1_otmo",
input_file_list=in_file_paths,
input_format="csv",
file_reader=CsvReader(
header=None,
index_col=False,
Expand Down Expand Up @@ -73,7 +72,6 @@ Example import of detections
args = ImportArguments(
output_artifact_name="ps1_detection",
input_file_list=in_file_paths,
input_format="csv",
file_reader=CsvReader(
header=None,
index_col=False,
Expand Down
2 changes: 1 addition & 1 deletion docs/catalogs/public/sdss.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ Example import
args = ImportArguments(
output_artifact_name="sdss_dr16q",
input_path="/data/sdss/parquet/",
input_format="parquet",
file_reader="parquet",
ra_column="RA",
dec_column="DEC",
sort_columns="ID",
Expand Down
1 change: 0 additions & 1 deletion docs/catalogs/public/tic.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ Example import
args = ImportArguments(
output_artifact_name="tic_1",
input_path="/path/to/tic/",
input_format="csv.gz",
file_reader=CsvReader(
header=None,
column_names=type_frame["name"].values.tolist(),
Expand Down
2 changes: 1 addition & 1 deletion docs/catalogs/public/zubercal.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ Challenges with this data set
input_file_list=files,
## NB - you need the parens here!
file_reader=ZubercalParquetReader(),
input_format="parquet",
file_reader="parquet",
catalog_type="source",
ra_column="objra",
dec_column="objdec",
Expand Down
13 changes: 9 additions & 4 deletions docs/notebooks/unequal_schema.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,11 @@
"\n",
"args = ImportArguments(\n",
" output_artifact_name=\"mixed_csv_bad\",\n",
" input_path=mixed_schema_csv_dir,\n",
" input_format=\"csv\",\n",
" input_file_list=[\n",
" os.path.join(mixed_schema_csv_dir, \"input_01.csv\"),\n",
" os.path.join(mixed_schema_csv_dir, \"input_02.csv\"),\n",
" ],\n",
" file_reader=\"csv\",\n",
" output_path=tmp_path.name,\n",
" highest_healpix_order=1,\n",
")\n",
Expand Down Expand Up @@ -129,8 +132,10 @@
"tmp_path = tempfile.TemporaryDirectory()\n",
"args = ImportArguments(\n",
" output_artifact_name=\"mixed_csv_good\",\n",
" input_path=mixed_schema_csv_dir,\n",
" input_format=\"csv\",\n",
" input_file_list=[\n",
" os.path.join(mixed_schema_csv_dir, \"input_01.csv\"),\n",
" os.path.join(mixed_schema_csv_dir, \"input_02.csv\"),\n",
" ],\n",
" output_path=tmp_path.name,\n",
" highest_healpix_order=1,\n",
" file_reader=get_file_reader(\"csv\", schema_file=mixed_schema_csv_parquet),\n",
Expand Down
25 changes: 11 additions & 14 deletions src/hipscat_import/catalog/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,8 @@ class ImportArguments(RuntimeArguments):
"""level of catalog data, object (things in the sky) or source (detections)"""
input_path: FilePointer | None = None
"""path to search for the input data"""
input_format: str = ""
"""specifier of the input data format. this will be used to find an appropriate
InputReader type, and may be used to find input files, via a match like
``<input_path>/*<input_format>`` """
input_file_list: List[FilePointer] = field(default_factory=list)
"""can be used instead of `input_format` to import only specified files"""
"""can be used instead of input_path to import only specified files"""
input_paths: List[FilePointer] = field(default_factory=list)
"""resolved list of all files that will be used in the importer"""
input_storage_options: Union[Dict[Any, Any], None] = None
Expand Down Expand Up @@ -74,7 +70,7 @@ class ImportArguments(RuntimeArguments):
debug_stats_only: bool = False
"""do not perform a map reduce and don't create a new
catalog. generate the partition info"""
file_reader: InputReader | None = None
file_reader: InputReader | str | None = None
"""instance of input reader that specifies arguments necessary for reading
from your input files"""
resume_plan: ResumePlan | None = None
Expand All @@ -87,9 +83,6 @@ def _check_arguments(self):
"""Check existence and consistency of argument values"""
super()._check_arguments()

if not self.input_format:
raise ValueError("input_format is required")

if self.constant_healpix_order >= 0:
check_healpix_order_range(self.constant_healpix_order, "constant_healpix_order")
self.mapping_healpix_order = self.constant_healpix_order
Expand All @@ -104,13 +97,15 @@ def _check_arguments(self):

if (not self.input_path and not self.input_file_list) or (self.input_path and self.input_file_list):
raise ValueError("exactly one of input_path or input_file_list is required")
if not self.file_reader:
self.file_reader = get_file_reader(self.input_format)
if self.file_reader is None:
raise ValueError("file_reader is required")
if isinstance(self.file_reader, str):
self.file_reader = get_file_reader(self.file_reader)

# Basic checks complete - make more checks and create directories where necessary
self.input_paths = find_input_paths(
self.input_path,
f"*{self.input_format}",
"**/**.*",
self.input_file_list,
storage_options=self.input_storage_options,
)
Expand All @@ -134,13 +129,15 @@ def to_catalog_info(self, total_rows) -> CatalogInfo:
return CatalogInfo(**info)

def additional_runtime_provenance_info(self) -> dict:
file_reader_info = {"type": self.file_reader}
if isinstance(self.file_reader, InputReader):
file_reader_info = self.file_reader.provenance_info()
return {
"catalog_name": self.output_artifact_name,
"epoch": self.epoch,
"catalog_type": self.catalog_type,
"input_path": str(self.input_path),
"input_paths": self.input_paths,
"input_format": self.input_format,
"input_file_list": self.input_file_list,
"ra_column": self.ra_column,
"dec_column": self.dec_column,
Expand All @@ -151,7 +148,7 @@ def additional_runtime_provenance_info(self) -> dict:
"pixel_threshold": self.pixel_threshold,
"mapping_healpix_order": self.mapping_healpix_order,
"debug_stats_only": self.debug_stats_only,
"file_reader_info": self.file_reader.provenance_info() if self.file_reader is not None else {},
"file_reader_info": file_reader_info,
}


Expand Down
2 changes: 1 addition & 1 deletion tests/.pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ ignore-imports=yes
ignore-signatures=yes

# Minimum lines number of a similarity.
min-similarity-lines=10
min-similarity-lines=20


[SPELLING]
Expand Down
39 changes: 20 additions & 19 deletions tests/hipscat_import/catalog/test_argument_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,20 @@ def test_none():

def test_empty_required(small_sky_parts_dir, tmp_path):
"""*Most* required arguments are provided."""
## Input format is missing
with pytest.raises(ValueError, match="input_format"):
## File reader is missing
with pytest.raises(ValueError, match="file_reader"):
ImportArguments(
output_artifact_name="catalog",
input_format=None,
file_reader=None,
input_path=small_sky_parts_dir,
output_path=tmp_path,
)

## Input path is missing
with pytest.raises(ValueError, match="input_file"):
ImportArguments(
output_artifact_name="catalog",
file_reader="csv",
input_path="",
input_format="csv",
output_path=tmp_path,
overwrite=True,
)
Expand All @@ -41,8 +40,8 @@ def test_invalid_paths(blank_data_dir, tmp_path):
ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
file_reader="csv",
output_path=tmp_path,
input_format="csv",
progress_bar=False,
)

Expand All @@ -51,19 +50,21 @@ def test_invalid_paths(blank_data_dir, tmp_path):
ImportArguments(
output_artifact_name="catalog",
input_path="path",
file_reader="csv",
output_path=tmp_path,
overwrite=True,
input_format="csv",
)


def test_missing_paths(tmp_path):
## Input path has no files
with pytest.raises(FileNotFoundError):
ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
file_reader="csv",
input_path=tmp_path,
output_path=tmp_path,
overwrite=True,
input_format="parquet",
)


Expand All @@ -73,7 +74,7 @@ def test_good_paths(blank_data_dir, blank_data_file, tmp_path):
args = ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path_str,
tmp_dir=tmp_path_str,
progress_bar=False,
Expand All @@ -88,7 +89,7 @@ def test_multiple_files_in_path(small_sky_parts_dir, tmp_path):
args = ImportArguments(
output_artifact_name="catalog",
input_path=small_sky_parts_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
progress_bar=False,
)
Expand All @@ -101,7 +102,7 @@ def test_single_debug_file(formats_headers_csv, tmp_path):
args = ImportArguments(
output_artifact_name="catalog",
input_file_list=[formats_headers_csv],
input_format="csv",
file_reader="csv",
output_path=tmp_path,
progress_bar=False,
)
Expand All @@ -115,7 +116,7 @@ def test_healpix_args(blank_data_dir, tmp_path):
ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
highest_healpix_order=30,
overwrite=True,
Expand All @@ -124,7 +125,7 @@ def test_healpix_args(blank_data_dir, tmp_path):
ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
pixel_threshold=3,
overwrite=True,
Expand All @@ -133,7 +134,7 @@ def test_healpix_args(blank_data_dir, tmp_path):
ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
constant_healpix_order=30,
overwrite=True,
Expand All @@ -147,7 +148,7 @@ def test_catalog_type(blank_data_dir, tmp_path):
output_artifact_name="catalog",
catalog_type=None,
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
)

Expand All @@ -156,7 +157,7 @@ def test_catalog_type(blank_data_dir, tmp_path):
output_artifact_name="catalog",
catalog_type="association",
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
)

Expand All @@ -166,7 +167,7 @@ def test_to_catalog_info(blank_data_dir, tmp_path):
args = ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
tmp_dir=tmp_path,
progress_bar=False,
Expand All @@ -181,7 +182,7 @@ def test_provenance_info(blank_data_dir, tmp_path):
args = ImportArguments(
output_artifact_name="catalog",
input_path=blank_data_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
tmp_dir=tmp_path,
progress_bar=False,
Expand Down
7 changes: 3 additions & 4 deletions tests/hipscat_import/catalog/test_run_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_resume_dask_runner(
args = ImportArguments(
output_artifact_name="resume_catalog",
input_path=small_sky_parts_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
dask_tmp=tmp_path,
tmp_dir=tmp_path,
Expand Down Expand Up @@ -112,7 +112,7 @@ def test_resume_dask_runner(
args = ImportArguments(
output_artifact_name="resume",
input_path=small_sky_parts_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
dask_tmp=tmp_path,
tmp_dir=tmp_path,
Expand Down Expand Up @@ -145,7 +145,6 @@ def test_dask_runner(
args = ImportArguments(
output_artifact_name="small_sky_object_catalog",
input_path=small_sky_parts_dir,
input_format="csv",
file_reader=CsvReader(
type_map={
"ra": np.float32,
Expand Down Expand Up @@ -220,7 +219,7 @@ def test_dask_runner_stats_only(dask_client, small_sky_parts_dir, tmp_path):
args = ImportArguments(
output_artifact_name="small_sky",
input_path=small_sky_parts_dir,
input_format="csv",
file_reader="csv",
output_path=tmp_path,
dask_tmp=tmp_path,
highest_healpix_order=1,
Expand Down
Loading

0 comments on commit 3df522f

Please sign in to comment.