# Unit test data

This directory contains very small toy data sets that are used
for unit tests.

## Object catalog: small_sky

This "object catalog" is 131 randomly generated radec values. 

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are the integers 700 through 830 (131 values).

In [None]:
import os
import tempfile
from pathlib import Path

import astropy.units as u
import hats
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from astropy.coordinates import SkyCoord
from dask.distributed import Client
from hats.io.file_io import get_upath, remove_directory
from hats.io.paths import DATASET_DIR
from hats.pixel_math.spatial_index import healpix_to_spatial_index
from hats_import import (
    ImportArguments,
    pipeline_with_client,
    IndexArguments,
    CollectionArguments,
    MarginCacheArguments,
)
from lsdb.io import to_hats
import lsdb

# Scratch directory handed to Dask as its worker-local storage.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# One worker with one thread; all pipelines in this notebook reuse this client.
dask_options = {"n_workers": 1, "threads_per_worker": 1, "local_directory": tmp_dir}
client = Client(**dask_options)

### small_sky_order1

This catalog has the same data points as the other small sky catalogs,
but the data points are forced to spread over partitions at order 1 instead
of order 0.

This means there are 4 leaf partition files instead of just 1, so the catalog is
useful for confirming reads/writes over multiple leaf partition files.

NB: Setting `constant_healpix_order` coerces the import pipeline to create
leaf partitions at order 1.

In [None]:
# Drop any previous copy of the collection before regenerating it.
remove_directory("./small_sky_order1_collection")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Collection-level settings.
    args = CollectionArguments(
        output_artifact_name="small_sky_order1_collection",
        output_path=".",
        tmp_dir=pipeline_tmp,
        addl_hats_properties={"obs_regime": "Optical", "default_index": "id"},
    )
    # Main catalog, pinned to healpix order 1.
    args = args.catalog(
        input_file_list=["raw/small_sky/small_sky.csv"],
        file_reader="csv",
        output_artifact_name="small_sky_order1",
        constant_healpix_order=1,
    )
    # Two margin caches; the first one is flagged as the default.
    args = args.add_margin(
        margin_threshold=3600, output_artifact_name="small_sky_order1_margin_1deg", is_default=True
    )
    args = args.add_margin(margin_threshold=7200, output_artifact_name="small_sky_order1_margin_2deg")
    # Index over the "id" column.
    args = args.add_index(
        indexing_column="id",
        output_artifact_name="small_sky_order1_id_index",
        include_healpix_29=False,
        compute_partition_size=200_000,
    )

    pipeline_with_client(args, client)

### small_sky_order1_no_pandas_meta

Copies small_sky_order1 but removes the pandas metadata from the parquet files.

In [None]:
out_catalog_name = "small_sky_order1_no_pandas_meta"

# The order-1 catalog is generated inside the collection directory
# (./small_sky_order1_collection), so read it from there rather than from a
# top-level "small_sky_order1" directory.
sso1 = hats.read_hats("small_sky_order1_collection/small_sky_order1")
for pixel in sso1.get_healpix_pixels():
    path = hats.io.paths.pixel_catalog_file(sso1.catalog_base_dir, pixel)
    out_path = hats.io.paths.pixel_catalog_file(out_catalog_name, pixel)
    table = pq.read_table(path, partitioning=None)
    # Called without arguments, this drops the schema-level metadata,
    # which is where the pandas metadata is stored.
    table = table.replace_schema_metadata()
    output_file = Path(out_path)
    output_file.parent.mkdir(exist_ok=True, parents=True)
    pq.write_table(table, out_path)

# Rebuild the catalog-level files so the copy is a complete, readable catalog.
hats.io.write_parquet_metadata(out_catalog_name)
sso1.catalog_info.copy_and_update(catalog_name=out_catalog_name).to_properties_file(out_catalog_name)
sso1.partition_info.write_to_file(hats.io.paths.get_partition_info_pointer(out_catalog_name))

### small_sky_order1_default_columns

Copies small_sky_order1 but adds a list of default columns to the properties file

In [None]:
out_catalog_name = "small_sky_order1_default_columns"
out_catalog_path = get_upath(out_catalog_name)
out_dataset_path = out_catalog_path / DATASET_DIR

sso1 = hats.read_hats("small_sky_order1_collection/small_sky_order1")
sso1_dataset_path = get_upath("small_sky_order1_collection/small_sky_order1") / DATASET_DIR

# Reuse the original parquet files through a symlink instead of copying them;
# only the properties and partition info are written fresh for this catalog.
out_catalog_path.mkdir(exist_ok=True)
if not out_dataset_path.exists():
    os.symlink(f"../{sso1_dataset_path}", out_dataset_path)

properties = sso1.catalog_info.copy_and_update(
    catalog_name=out_catalog_name, default_columns=["ra", "dec", "id"]
)
properties.to_properties_file(out_catalog_path)

partition_info_file = hats.io.paths.get_partition_info_pointer(out_catalog_path)
sso1.partition_info.write_to_file(partition_info_file)

### small_sky

This "object catalog" is 131 randomly generated radec values. 

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are the integers 700 through 830 (131 values).

This catalog was generated with the following snippet:

In [None]:
# Regenerate from scratch: remove the old catalog, then import the raw CSV.
remove_directory("./small_sky")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    import_settings = {
        "input_file_list": ["raw/small_sky/small_sky.csv"],
        "file_reader": "csv",
        "output_artifact_name": "small_sky",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "highest_healpix_order": 5,
    }
    args = ImportArguments(**import_settings)
    pipeline_with_client(args, client)

### small_sky_npix_alt_suffix

Copies small_sky but changes the parquet file suffix.

In [None]:
import shutil
import hats

# hats/lsdb does not constrain the suffix,
# but the suffix should make the file recognizable as parquet for compatibility with other libraries.
npix_suffix = ".parq"  # could also include the compression, e.g., ".snappy.parquet"

out_catalog_name = "small_sky_npix_alt_suffix"
sso = hats.read_hats("small_sky")
out_catalog_info = sso.catalog_info.copy_and_update(catalog_name=out_catalog_name, npix_suffix=npix_suffix)

# Copy each leaf file to the matching pixel location in the new catalog, using the new suffix.
for pixel in sso.get_healpix_pixels():
    path = hats.io.paths.pixel_catalog_file(sso.catalog_base_dir, pixel)
    out_path = hats.io.paths.pixel_catalog_file(out_catalog_name, pixel, npix_suffix=npix_suffix)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(path, out_path)

# Write the catalog-level files for the copy.
hats.io.write_parquet_metadata(out_catalog_name)
out_catalog_info.to_properties_file(out_catalog_name)
partition_info_file = hats.io.paths.get_partition_info_pointer(out_catalog_name)
sso.partition_info.write_to_file(partition_info_file)

### small_sky_npix_as_dir

Copies small_sky but makes Npix a directory.

In [None]:
import shutil
import hats

# A trailing slash makes each Npix entry a directory rather than a single file.
npix_suffix = "/"

out_catalog_name = "small_sky_npix_as_dir"
sso = hats.read_hats("small_sky")
out_catalog_info = sso.catalog_info.copy_and_update(catalog_name=out_catalog_name, npix_suffix=npix_suffix)

for pixel in sso.get_healpix_pixels():
    path = hats.io.paths.pixel_catalog_file(sso.catalog_base_dir, pixel)
    out_dir = hats.io.paths.pixel_catalog_file(out_catalog_name, pixel, npix_suffix=npix_suffix)
    out_dir.mkdir(parents=True, exist_ok=True)
    # hats/lsdb will only care about `out_dir`. They will be agnostic to filenames, given `npix_suffix = "/"`.
    shutil.copy(path, out_dir / "part0.parquet")

# Write the catalog-level files for the copy.
hats.io.write_parquet_metadata(out_catalog_name)
out_catalog_info.to_properties_file(out_catalog_name)
partition_info_file = hats.io.paths.get_partition_info_pointer(out_catalog_name)
sso.partition_info.write_to_file(partition_info_file)

## Source catalog: small_sky_source

This "source catalog" is 131 detections at each of the 131 objects
in the "small_sky" catalog. These have a random magnitude, MJD, and 
band (selected from ugrizy). The full script that generated the values
can be found [here](https://github.com/delucchi-cmu/hipscripts/blob/main/twiddling/small_sky_source.py)

### small_sky_order1_source

In [None]:
# Drop any previous copy of the collection before regenerating it.
remove_directory("./small_sky_order1_source_collection")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Collection-level settings.
    args = CollectionArguments(
        output_artifact_name="small_sky_order1_source_collection",
        output_path=".",
        tmp_dir=pipeline_tmp,
        addl_hats_properties={"obs_regime": "Optical", "default_index": "object_id"},
    )
    # Source catalog, pinned to healpix order 1 and positioned by the source coordinates.
    args = args.catalog(
        input_file_list=["raw/small_sky_source/small_sky_source.csv"],
        file_reader="csv",
        ra_column="source_ra",
        dec_column="source_dec",
        catalog_type="source",
        output_artifact_name="small_sky_order1_source",
        constant_healpix_order=1,
    )
    # A single margin cache, flagged as the default.
    args = args.add_margin(
        margin_threshold=7200, output_artifact_name="small_sky_order1_source_margin", is_default=True
    )
    # One index per lookup column: "object_id" and "band".
    args = args.add_index(
        indexing_column="object_id",
        output_artifact_name="small_sky_order1_source_object_id_index",
        include_healpix_29=False,
        compute_partition_size=200_000,
    )
    args = args.add_index(
        indexing_column="band",
        output_artifact_name="small_sky_order1_source_band_index",
        include_healpix_29=False,
        compute_partition_size=200_000,
    )

    pipeline_with_client(args, client)

### small_sky_source

In [None]:
# Regenerate from scratch: remove the old catalog, then import the raw CSV.
remove_directory("./small_sky_source")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    import_settings = {
        "input_file_list": ["raw/small_sky_source/small_sky_source.csv"],
        "file_reader": "csv",
        # Partition by the per-detection coordinates.
        "ra_column": "source_ra",
        "dec_column": "source_dec",
        "catalog_type": "source",
        "output_artifact_name": "small_sky_source",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "highest_healpix_order": 2,
        "pixel_threshold": 3000,
    }
    args = ImportArguments(**import_settings)
    pipeline_with_client(args, client)

### small_sky_source_margin

In [None]:
# Rebuild the margin cache for the small_sky_source catalog.
remove_directory("./small_sky_source_margin")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    margin_settings = {
        "input_catalog_path": "small_sky_source",
        "output_artifact_name": "small_sky_source_margin",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "margin_threshold": 180,
        "margin_order": 8,
    }
    args = MarginCacheArguments(**margin_settings)
    pipeline_with_client(args, client)

### small_sky_order3_source_margin

This one is similar to the previous margin catalogs but it is generated from a source catalog of order 3.

In [None]:
# Step 1: import the source CSV with every partition pinned to healpix order 3.
remove_directory("./small_sky_order3_source")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    import_settings = {
        "input_file_list": ["raw/small_sky_source/small_sky_source.csv"],
        "file_reader": "csv",
        "ra_column": "source_ra",
        "dec_column": "source_dec",
        "catalog_type": "source",
        "output_artifact_name": "small_sky_order3_source",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "constant_healpix_order": 3,
    }
    args = ImportArguments(**import_settings)
    pipeline_with_client(args, client)

# Step 2: build the margin cache on top of the catalog created in step 1.
remove_directory("./small_sky_order3_source_margin")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    margin_settings = {
        "input_catalog_path": "small_sky_order3_source",
        "output_artifact_name": "small_sky_order3_source_margin",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "margin_threshold": 300,
        "margin_order": 7,
    }
    args = MarginCacheArguments(**margin_settings)
    pipeline_with_client(args, client)

In [1]:
import pandas as pd

In [2]:
# Load the raw source CSV (the same file the source-catalog imports read) for inspection.
sss = pd.read_csv("raw/small_sky_source/small_sky_source.csv")
# Trailing expression: shows the frame as the cell output.
sss

Unnamed: 0,source_id,source_ra,source_dec,mjd,mag,band,object_id,object_ra,object_dec
0,70000,301.927172,-59.438686,58363.286356,17.104989,z,810,301.5,-59.5
1,70001,305.920293,-59.907292,58363.380619,17.586861,i,716,305.5,-60.5
2,70002,303.823213,-43.372721,58363.422318,20.557382,u,717,303.5,-43.5
3,70003,337.807437,-38.361372,58363.586547,18.673041,z,745,337.5,-38.5
4,70004,320.566223,-53.451949,58363.654246,18.840165,g,760,320.5,-53.5
...,...,...,...,...,...,...,...,...,...
17156,87156,335.733501,-68.952632,59562.482971,15.090752,i,826,335.5,-69.5
17157,87157,320.540038,-69.270556,59562.587765,18.990507,r,792,320.5,-69.5
17158,87158,327.535831,-51.379757,59562.638550,16.450693,g,762,327.5,-51.5
17159,87159,303.553419,-37.935144,59562.754894,20.616895,y,755,303.5,-38.5


In [3]:
# Display the detections ordered by object and then by MJD; the result is not assigned, so `sss` is unchanged.
sss.sort_values(by=['object_id', 'mjd'])

Unnamed: 0,source_id,source_ra,source_dec,mjd,mag,band,object_id,object_ra,object_dec
207,70207,282.515013,-57.993802,58378.426426,18.625267,z,700,282.5,-58.5
250,70250,282.878847,-58.424703,58381.220413,20.964687,i,700,282.5,-58.5
427,70427,282.771294,-57.985642,58394.908226,15.026069,g,700,282.5,-58.5
469,70469,282.838942,-58.498943,58397.833886,15.377244,g,700,282.5,-58.5
540,70540,282.941666,-58.293303,58401.868034,17.113278,y,700,282.5,-58.5
...,...,...,...,...,...,...,...,...,...
16641,86641,306.934526,-49.940970,59527.335471,19.645697,i,830,306.5,-50.5
16707,86707,306.811547,-49.986029,59531.913668,20.205668,z,830,306.5,-50.5
16889,86889,307.028380,-50.219512,59544.452198,20.804142,r,830,306.5,-50.5
16910,86910,306.904954,-50.052089,59546.300590,20.536627,y,830,306.5,-50.5


## Nested catalogs

### small_sky_order1_nested_sources

In [None]:
# Regenerate the nested catalog from scratch.
remove_directory("small_sky_order1_nested_sources")

# The order-1 object and source catalogs (and the source margin) are generated
# inside their collection directories, so open them from there rather than from
# top-level directories of the same name.
small_sky_order1_catalog = lsdb.open_catalog("small_sky_order1_collection/small_sky_order1")
small_sky_order1_source_with_margin = lsdb.open_catalog(
    "small_sky_order1_source_collection/small_sky_order1_source",
    margin_cache="small_sky_order1_source_collection/small_sky_order1_source_margin",
)

# Attach each object's detections as a nested "sources" column, matching id to object_id.
small_sky_order1_nested = small_sky_order1_catalog.join_nested(
    small_sky_order1_source_with_margin, left_on="id", right_on="object_id", nested_column_name="sources"
)
to_hats(
    small_sky_order1_nested,
    base_catalog_path="small_sky_order1_nested_sources",
    catalog_name="small_sky_order1_nested_sources",
    histogram_order=5,
)

### small_sky_order1_nested_sources_margin

In [None]:
# Rebuild the margin cache for the nested-sources catalog.
remove_directory("small_sky_order1_nested_sources_margin")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    margin_settings = {
        "input_catalog_path": "small_sky_order1_nested_sources",
        "output_artifact_name": "small_sky_order1_nested_sources_margin",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "margin_threshold": 7200,
        "margin_order": 4,
    }
    args = MarginCacheArguments(**margin_settings)
    pipeline_with_client(args, client)

## Connections between tables

### small_sky_to_o1source

In [None]:
# Settings forwarded unchanged to `lsdb.io.to_association` for the generated association table.
association_kwargs = {
    "primary_catalog_dir": "small_sky",
    "primary_column_association": "id_small_sky",
    "primary_id_column": "id",
    "join_catalog_dir": "small_sky_order1_source",
    "join_column_association": "object_id_small_sky_order1_source",
    "join_id_column": "source_id",
}

remove_directory("small_sky_to_o1source")
small_sky_catalog = lsdb.open_catalog("small_sky")

# The order-1 source catalog and its margin are generated inside
# ./small_sky_order1_source_collection, so open them from there rather than
# from top-level directories of the same name.
small_sky_order1_source_with_margin = lsdb.open_catalog(
    "small_sky_order1_source_collection/small_sky_order1_source",
    margin_cache="small_sky_order1_source_collection/small_sky_order1_source_margin",
)

# Join objects to their detections, then persist the pairing as an association table.
small_sky_order1source = small_sky_catalog.join(
    small_sky_order1_source_with_margin, left_on="id", right_on="object_id"
)
lsdb.io.to_association(
    small_sky_order1source,
    base_catalog_path="small_sky_to_o1source",
    catalog_name="small_sky_to_o1source",
    **association_kwargs,
)

## Perturbed object catalog

In order to test validity of cross match, we create a new version of the "small sky" catalog where each radec is slightly perturbed.

### small_sky_xmatch

The initial perturbation is stored as a CSV, and we can re-import from this raw data set.

In [None]:
# Regenerate from scratch: remove the old catalog, then import the perturbed CSV.
remove_directory("./small_sky_xmatch")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    import_settings = {
        "input_file_list": ["raw/xmatch/small_sky_xmatch.csv"],
        "file_reader": "csv",
        "output_artifact_name": "small_sky_xmatch",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "pixel_threshold": 100,
        "highest_healpix_order": 4,
    }
    args = ImportArguments(**import_settings)
    pipeline_with_client(args, client)

### small_sky_xmatch_margin

Create a margin catalog from the perturbed data points.

In [None]:
# Rebuild the margin cache for the perturbed catalog.
remove_directory("./small_sky_xmatch_margin")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    margin_settings = {
        "input_catalog_path": "small_sky_xmatch",
        "output_artifact_name": "small_sky_xmatch_margin",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "margin_threshold": 7200,
        "margin_order": 4,
    }
    args = MarginCacheArguments(**margin_settings)
    pipeline_with_client(args, client)

### small_sky_left_xmatch

This adds a new point that's outside of the (0,11) pixel of the small sky catalog. Otherwise, the points are the same.

In [None]:
# Regenerate from scratch: remove the old catalog, then import the CSV.
remove_directory("./small_sky_left_xmatch")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    import_settings = {
        "input_file_list": ["raw/xmatch/small_sky_left_xmatch.csv"],
        "file_reader": "csv",
        "output_artifact_name": "small_sky_left_xmatch",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
        "pixel_threshold": 100,
        "highest_healpix_order": 2,
    }
    args = ImportArguments(**import_settings)
    pipeline_with_client(args, client)

## Generate Expected Results Files

### Small Sky Source Cone Search

In [None]:
# The order-1 source catalog is generated inside ./small_sky_order1_source_collection,
# so read it from there rather than from a top-level directory of the same name.
ss_source = hats.read_hats("small_sky_order1_source_collection/small_sky_order1_source")

In [None]:
# Cone centre (ra, dec) and radius; all three values are in degrees.
ra, dec, radius_degrees = -35, -55, 2

In [None]:
# Read every leaf partition of the source catalog into one frame.
base_dir = ss_source.catalog_base_dir
pixel_files = [hats.io.pixel_catalog_file(base_dir, pixel) for pixel in ss_source.get_healpix_pixels()]
ss_source_df = pd.concat([pd.read_parquet(pixel_file) for pixel_file in pixel_files])

# Brute-force cone search: angular separation of every row from the cone centre.
source_ra = ss_source_df["source_ra"].to_numpy() * u.deg
source_dec = ss_source_df["source_dec"].to_numpy() * u.deg
coords = SkyCoord(source_ra, source_dec, frame="icrs")
center_coord = SkyCoord(ra * u.deg, dec * u.deg, frame="icrs")

inside_cone = coords.separation(center_coord).deg < radius_degrees
cone_search_output = ss_source_df.iloc[inside_cone]
cone_search_output.to_csv("raw/cone_search_expected/catalog.csv", index=False)

### Small Sky Source Margin Cone Search

In [None]:
# The order-1 source margin is generated inside ./small_sky_order1_source_collection,
# so read it from there rather than from a top-level directory of the same name.
ss_source_margin = hats.read_hats("small_sky_order1_source_collection/small_sky_order1_source_margin")

In [None]:
# Read every leaf partition of the margin catalog into one frame.
margin_base_dir = ss_source_margin.catalog_base_dir
margin_files = [
    hats.io.pixel_catalog_file(margin_base_dir, pixel) for pixel in ss_source_margin.get_healpix_pixels()
]
ss_source_margin_df = pd.concat([pd.read_parquet(margin_file) for margin_file in margin_files])

# Same brute-force cone search as for the main catalog, applied to the margin rows.
margin_ra = ss_source_margin_df["source_ra"].to_numpy() * u.deg
margin_dec = ss_source_margin_df["source_dec"].to_numpy() * u.deg
coords = SkyCoord(margin_ra, margin_dec, frame="icrs")
center_coord = SkyCoord(ra * u.deg, dec * u.deg, frame="icrs")

inside_cone = coords.separation(center_coord).deg < radius_degrees
cone_search_output = ss_source_margin_df.iloc[inside_cone]
cone_search_output.to_csv("raw/cone_search_expected/margin.csv", index=False)

## Square map

Create a trivial map-type catalog. This just contains a `star_count` per order 0
healpix tile. The value is the square of the healpix index.

In [None]:
# Pixel numbers 0..11, used at healpix order 0.
target_pixels = np.arange(12)

# Spatial index value for each of those order-0 pixels.
healpix_29 = healpix_to_spatial_index(0, target_pixels)

# star_count is simply the pixel number squared.
square_vals = np.square(target_pixels)
value_frame = pd.DataFrame({"_healpix_29": healpix_29, "star_count": square_vals})

In [None]:
# Regenerate from scratch: remove the old map catalog first.
remove_directory("./square_map")
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # The importer reads from files, so stage the in-memory frame as a CSV first.
    csv_file = Path(pipeline_tmp) / "square_map.csv"
    value_frame.to_csv(csv_file, index=False)

    import_settings = {
        "input_file_list": [csv_file],
        "file_reader": "csv",
        "catalog_type": "map",
        # Rows are located by their `_healpix_29` value, so no ra/dec columns are given.
        "use_healpix_29": True,
        "ra_column": None,
        "dec_column": None,
        "constant_healpix_order": 0,  ## forces the moc to order 0.
        "output_artifact_name": "square_map",
        "output_path": ".",
        "tmp_dir": pipeline_tmp,
    }
    args = ImportArguments(**import_settings)

    pipeline_with_client(args, client)

In [None]:
# Close the Dask client before deleting the temporary directory: the client was
# created with that directory as its `local_directory`, so the directory has to
# outlive it.
client.close()
tmp_path.cleanup()