# Unit test data

This directory contains very small toy data sets that are used
for unit tests.

## Object catalog: small_sky

This "object catalog" is 131 randomly generated radec values. 

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are integers from 700-831.

In [None]:
import hats_import.pipeline as runner
from hats_import.catalog.arguments import ImportArguments
from hats_import.index.arguments import IndexArguments
from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments
from hats_import.soap import SoapArguments
import tempfile
from dask.distributed import Client

# Scratch directory handed to the Dask client below. The TemporaryDirectory
# handle is kept in `tmp_path` so it can be cleaned up explicitly at the end
# of the notebook.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# One worker with one thread, spilling into the scratch directory above.
client = Client(
    local_directory=tmp_dir,
    n_workers=1,
    threads_per_worker=1,
)

### small_sky_order1

This catalog has the same data points as the other small sky catalogs,
but the import is forced to spread these data points over partitions at order 1
instead of order 0.

This means there are 4 leaf partition files, instead of just 1, and so can
be useful for confirming reads/writes over multiple leaf partition files.

NB: Setting `constant_healpix_order` coerces the import pipeline to create
leaf partitions at order 1.

In [None]:
# Import the raw small_sky CSV, forcing leaf partitions at HEALPix order 1.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # Input
        input_file_list=["raw/small_sky/small_sky.csv"],
        file_reader="csv",
        # Partitioning: a constant order coerces all leaves to order 1.
        constant_healpix_order=1,
        # Output
        output_path=".",
        output_artifact_name="small_sky_order1",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_order1_no_pandas_meta

Copies small_sky_order1 but removes the pandas metadata from the parquet files.

In [None]:
from pathlib import Path
import hats
import pyarrow.parquet as pq

# Build a copy of small_sky_order1 whose leaf parquet files carry no schema
# metadata, then regenerate the catalog-level metadata files for the copy.
out_catalog_name = "small_sky_order1_no_pandas_meta"

sso1 = hats.read_hats("small_sky_order1")
pixels = sso1.get_healpix_pixels()
paths = hats.io.paths.pixel_catalog_files(sso1.catalog_base_dir, pixels)
out_paths = hats.io.paths.pixel_catalog_files(out_catalog_name, pixels)

for path, out_path in zip(paths, out_paths):
    # replace_schema_metadata() is called without arguments, so the table is
    # rewritten without the metadata it was read with.
    table = pq.read_table(path, partitioning=None).replace_schema_metadata()

    # Make sure the destination directory tree exists before writing.
    output_file = Path(out_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(table, out_path)

# Catalog-level files: parquet metadata, properties, and partition info.
hats.io.write_parquet_metadata(out_catalog_name)

renamed_info = sso1.catalog_info.copy_and_update(catalog_name=out_catalog_name)
renamed_info.to_properties_file(out_catalog_name)

partition_info_pointer = hats.io.paths.get_partition_info_pointer(out_catalog_name)
sso1.partition_info.write_to_file(partition_info_pointer)

### small_sky

This "object catalog" is 131 randomly generated radec values. 

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are integers from 700-831.

This catalog was generated with the following snippet:

In [None]:
# Import the raw small_sky CSV into the "small_sky" object catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # Input
        input_file_list=["raw/small_sky/small_sky.csv"],
        file_reader="csv",
        # Partitioning
        highest_healpix_order=5,
        # Output
        output_path=".",
        output_artifact_name="small_sky",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_order1_id_index

In [None]:
# Build an index over the "id" column of the small_sky_order1 catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = IndexArguments(
        # Input catalog and the column to index
        input_catalog_path="./small_sky_order1",
        indexing_column="id",
        # Index options
        include_healpix_29=False,
        compute_partition_size=200_000,
        # Output
        output_path=".",
        output_artifact_name="small_sky_order1_id_index",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

## Source catalog: small_sky_source

This "source catalog" contains 131 detections for each of the 131 objects
in the "small_sky" catalog. Each detection has a random magnitude, MJD, and
band (selected from ugrizy). The full script that generated the values
can be found [here](https://github.com/delucchi-cmu/hipscripts/blob/main/twiddling/small_sky_source.py).

### small_sky_order1_source

In [None]:
# Import the raw source CSV as a source catalog with leaves at a constant
# HEALPix order of 1.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # Input
        input_file_list=["raw/small_sky_source/small_sky_source.csv"],
        file_reader="csv",
        # The source CSV uses its own position column names.
        ra_column="source_ra",
        dec_column="source_dec",
        catalog_type="source",
        # Partitioning
        constant_healpix_order=1,
        # Output
        output_path=".",
        output_artifact_name="small_sky_order1_source",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_source

In [None]:
# Import the raw source CSV as the "small_sky_source" source catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # Input
        input_file_list=["raw/small_sky_source/small_sky_source.csv"],
        file_reader="csv",
        # The source CSV uses its own position column names.
        ra_column="source_ra",
        dec_column="source_dec",
        catalog_type="source",
        # Partitioning
        highest_healpix_order=2,
        pixel_threshold=3000,
        # Output
        output_path=".",
        output_artifact_name="small_sky_source",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_source_margin

In [None]:
# Generate the margin cache for the small_sky_source catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = MarginCacheArguments(
        # Input catalog
        input_catalog_path="small_sky_source",
        # Margin settings
        margin_order=8,
        margin_threshold=180,
        # Output
        output_path=".",
        output_artifact_name="small_sky_source_margin",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_order1_source_margin

In [None]:
# Generate the margin cache for the small_sky_order1_source catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = MarginCacheArguments(
        # Input catalog
        input_catalog_path="small_sky_order1_source",
        # Margin settings
        margin_order=4,
        margin_threshold=7200,
        # Output
        output_path=".",
        output_artifact_name="small_sky_order1_source_margin",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_order3_source_margin

This one is similar to the previous margin catalogs but it is generated from a source catalog of order 3.

In [None]:
# Step 1: import the raw source CSV with leaves at a constant HEALPix order
# of 3. This is the catalog the margin cache below is built from.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # Input
        input_file_list=["raw/small_sky_source/small_sky_source.csv"],
        file_reader="csv",
        ra_column="source_ra",
        dec_column="source_dec",
        catalog_type="source",
        # Partitioning
        constant_healpix_order=3,
        # Output
        output_path=".",
        output_artifact_name="small_sky_order3_source",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

# Step 2: generate the margin cache for the order-3 source catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = MarginCacheArguments(
        # Input catalog
        input_catalog_path="small_sky_order3_source",
        # Margin settings
        margin_order=7,
        margin_threshold=300,
        # Output
        output_path=".",
        output_artifact_name="small_sky_order3_source_margin",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

## Connections between tables

### small_sky_to_o1source

In [None]:
# Associate each small_sky object with its detections in
# small_sky_order1_source, writing leaf files for the association.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = SoapArguments(
        # Object side of the association
        object_catalog_dir="small_sky",
        object_id_column="id",
        # Source side of the association
        source_catalog_dir="small_sky_order1_source",
        source_object_id_column="object_id",
        source_id_column="source_id",
        # Output
        output_path=".",
        output_artifact_name="small_sky_to_o1source",
        write_leaf_files=True,
        # Use the temporary directory created above for intermediate files,
        # as the other pipeline cells in this notebook do. Previously the
        # directory was created but never handed to the pipeline.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_to_o1source_soft

In [None]:
# Associate each small_sky object with its detections in
# small_sky_order1_source, with write_leaf_files turned off.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = SoapArguments(
        # Object side of the association
        object_catalog_dir="small_sky",
        object_id_column="id",
        # Source side of the association
        source_catalog_dir="small_sky_order1_source",
        source_object_id_column="object_id",
        source_id_column="source_id",
        # Output
        output_path=".",
        output_artifact_name="small_sky_to_o1source_soft",
        write_leaf_files=False,
        # Use the temporary directory created above for intermediate files,
        # as the other pipeline cells in this notebook do. Previously the
        # directory was created but never handed to the pipeline.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

## Perturbed object catalog

In order to test validity of cross match, we create a new version of the "small sky" catalog where each radec is slightly perturbed.

### small_sky_xmatch

The initial perturbation is stored as a CSV, and we can re-import from this raw data set.

In [None]:
# Import the perturbed positions CSV as the "small_sky_xmatch" catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # Input
        input_file_list=["raw/xmatch/small_sky_xmatch.csv"],
        file_reader="csv",
        # Partitioning
        highest_healpix_order=4,
        pixel_threshold=100,
        # Output
        output_path=".",
        output_artifact_name="small_sky_xmatch",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_to_xmatch

Association table between the original "small sky" object catalog, and the perturbed "small sky xmatch" catalog.

Used to test joining THROUGH the association catalog.

In [None]:
# Associate small_sky objects with their perturbed counterparts in
# small_sky_xmatch (joined on "id"), writing leaf files for the association.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = SoapArguments(
        # Object side of the association
        object_catalog_dir="small_sky",
        object_id_column="id",
        # "Source" side: the perturbed catalog, keyed by the same id column
        source_catalog_dir="small_sky_xmatch",
        source_object_id_column="id",
        source_id_column="id",
        # Output
        output_path=".",
        write_leaf_files=True,
        output_artifact_name="small_sky_to_xmatch",
        # Use the temporary directory created above for intermediate files,
        # as the other pipeline cells in this notebook do. Previously the
        # directory was created but never handed to the pipeline.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_to_xmatch_soft

Similar to the above catalog, but does not generate leaf files.

In [None]:
# Associate small_sky objects with their perturbed counterparts in
# small_sky_xmatch (joined on "id"), with write_leaf_files turned off.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = SoapArguments(
        # Object side of the association
        object_catalog_dir="small_sky",
        object_id_column="id",
        # "Source" side: the perturbed catalog, keyed by the same id column
        source_catalog_dir="small_sky_xmatch",
        source_object_id_column="id",
        source_id_column="id",
        # Output
        output_path=".",
        write_leaf_files=False,
        output_artifact_name="small_sky_to_xmatch_soft",
        # Use the temporary directory created above for intermediate files,
        # as the other pipeline cells in this notebook do. Previously the
        # directory was created but never handed to the pipeline.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_xmatch_margin

Create a margin catalog from the perturbed data points.

In [None]:
# Generate the margin cache for the perturbed small_sky_xmatch catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = MarginCacheArguments(
        # Input catalog
        input_catalog_path="small_sky_xmatch",
        # Margin settings
        margin_order=4,
        margin_threshold=7200,
        # Output
        output_path=".",
        output_artifact_name="small_sky_xmatch_margin",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

### small_sky_left_xmatch

This adds a new point that's outside of the (0,11) pixel of the small sky catalog. Otherwise, the points are the same.

In [None]:
# Import the "left xmatch" CSV as the "small_sky_left_xmatch" catalog.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # Input
        input_file_list=["raw/xmatch/small_sky_left_xmatch.csv"],
        file_reader="csv",
        # Partitioning
        highest_healpix_order=5,
        pixel_threshold=100,
        # Output
        output_path=".",
        output_artifact_name="small_sky_left_xmatch",
        # Intermediate files go to a directory that is removed on exit.
        tmp_dir=pipeline_tmp,
    )
    runner.pipeline_with_client(args, client)

# Generate Expected Results Files

In [None]:
import hats
import pandas as pd
from astropy.coordinates import SkyCoord
import astropy.units as u

## Small Sky Source Cone Search

In [None]:
# Open the order-1 source catalog; the cone-search cells below read its
# leaf files.
ss_source = hats.read_hats("small_sky_order1_source")

In [None]:
# Cone definition used for the expected-results files: the centre is passed
# to SkyCoord in degrees, and the radius is compared against separations in
# degrees.
ra, dec = -35, -55
radius_degrees = 2

In [None]:
# Collect the leaf parquet file of every pixel in the source catalog.
paths = []
for pixel in ss_source.get_healpix_pixels():
    paths.append(hats.io.pixel_catalog_file(ss_source.catalog_base_dir, pixel))

# Read all leaves into one DataFrame.
leaf_frames = [pd.read_parquet(leaf) for leaf in paths]
ss_source_df = pd.concat(leaf_frames)

# Sky positions of every source row, and of the cone centre.
source_ra = ss_source_df["source_ra"].to_numpy() * u.deg
source_dec = ss_source_df["source_dec"].to_numpy() * u.deg
coords = SkyCoord(source_ra, source_dec, frame="icrs")
center_coord = SkyCoord(ra * u.deg, dec * u.deg, frame="icrs")

# Keep the rows whose separation from the centre is below the radius and
# store them as the expected cone-search result.
inside_cone = coords.separation(center_coord).deg < radius_degrees
cone_search_output = ss_source_df.iloc[inside_cone]
cone_search_output.to_csv("raw/cone_search_expected/catalog.csv")

### Small Sky Source Margin Cone Search

In [None]:
# Open the margin catalog of the order-1 source catalog; the next cell reads
# its leaf files.
ss_source_margin = hats.read_hats("small_sky_order1_source_margin")

In [None]:
# Collect the leaf parquet file of every pixel in the margin catalog.
paths = []
for pixel in ss_source_margin.get_healpix_pixels():
    paths.append(hats.io.pixel_catalog_file(ss_source_margin.catalog_base_dir, pixel))

# Read all leaves into one DataFrame.
leaf_frames = [pd.read_parquet(leaf) for leaf in paths]
ss_source_margin_df = pd.concat(leaf_frames)

# Sky positions of every margin row, and of the cone centre.
margin_ra = ss_source_margin_df["source_ra"].to_numpy() * u.deg
margin_dec = ss_source_margin_df["source_dec"].to_numpy() * u.deg
coords = SkyCoord(margin_ra, margin_dec, frame="icrs")
center_coord = SkyCoord(ra * u.deg, dec * u.deg, frame="icrs")

# Keep the rows whose separation from the centre is below the radius and
# store them as the expected margin cone-search result.
inside_cone = coords.separation(center_coord).deg < radius_degrees
cone_search_output = ss_source_margin_df.iloc[inside_cone]
cone_search_output.to_csv("raw/cone_search_expected/margin.csv")

In [None]:
# Tear down in reverse order of creation: the Dask client was started with
# its local_directory inside tmp_path, so shut the client down first and only
# then delete the directory it was using.
client.close()
tmp_path.cleanup()