# CLOUD unit test data

There are two types of data used in unit tests in this repo: local and cloud. This notebook covers only the CLOUD version of the test data, and shows how to re-generate it.

This also works to initialize data in a new cloud provider, instead of simply copying an existing data set.

## Object catalog: small sky

This is the same "object catalog" with 131 randomly generated radec values inside the order0-pixel11 healpix pixel that is used in HATS and LSDB unit test suites.

In [None]:
import os
import tempfile
from upath import UPath
import shutil

import hats
from hats_import import ImportArguments, pipeline_with_client, CollectionArguments
from dask.distributed import Client
from hats.io.file_io import remove_directory

# Scratch directory handed to the Dask client below. It is removed explicitly by
# `tmp_path.cleanup()` in the last cell of the notebook.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# Credentials read from the environment; a value is None when its variable is unset.
# NOTE(review): presumably Azure Blob (abfs) credentials, judging by the variable names;
# no cell visible in this notebook passes `storage_options` anywhere -- confirm it is still needed.
# The dict is deliberately NOT echoed as a bare expression: displaying it would put the
# account key into the saved notebook output.
storage_options = {
    "account_key": os.environ.get("ABFS_LINCCDATA_ACCOUNT_KEY"),
    "account_name": os.environ.get("ABFS_LINCCDATA_ACCOUNT_NAME"),
}


# Directory under which every catalog in this notebook is written.
output_path = UPath("../cloud/data")

# One worker with one thread; worker scratch files go under `tmp_dir`.
client = Client(n_workers=1, threads_per_worker=1, local_directory=tmp_dir)

### small_sky

This catalog was generated with the following snippet:

In [None]:
# Drop any previously generated copy of this catalog. `remove_directory` is only called
# when the directory exists, so the cell also runs against a destination that has never
# been populated.
small_sky_dir = output_path / "small_sky"
if small_sky_dir.exists():
    remove_directory(small_sky_dir)

# Import the CSV files found under "small_sky_parts" into a catalog named "small_sky"
# below `output_path`. Pipeline intermediates go to a throwaway directory that is deleted
# when the `with` block exits.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        input_path="small_sky_parts",
        highest_healpix_order=1,
        file_reader="csv",
        output_path=output_path,
        output_artifact_name="small_sky",
        tmp_dir=pipeline_tmp,
    )
    pipeline_with_client(args, client)

### small_sky_order1

This catalog has the same data points as the other small sky catalogs, but it is forced to spread these data points over partitions at order 1, instead of order 0.

This means there are 4 leaf partition files, instead of just 1, and so can be useful for confirming reads/writes over multiple leaf partition files.

NB: Setting `constant_healpix_order` coerces the import pipeline to create leaf partitions at order 1.

This catalog was generated with the following snippet:

In [None]:
# Drop any previously generated copy of this collection. `remove_directory` is only called
# when the directory exists, so the cell also runs against a destination that has never
# been populated.
small_sky_order1_dir = output_path / "small_sky_order1"
if small_sky_order1_dir.exists():
    remove_directory(small_sky_order1_dir)

# Build a collection named "small_sky_order1" made of three artifacts:
#   * the catalog itself, imported from the CSV parts at a constant healpix order of 1,
#   * a margin catalog ("small_sky_order1_margin"), flagged as the default margin,
#   * an index on the "id" column ("small_sky_object_index").
# Pipeline intermediates go to a throwaway directory that is deleted when the `with`
# block exits.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = (
        CollectionArguments(
            output_path=output_path,
            output_artifact_name="small_sky_order1",
            tmp_dir=pipeline_tmp,
        )
        .catalog(
            input_path="small_sky_parts",
            file_reader="csv",
            constant_healpix_order=1,
            output_artifact_name="small_sky_order1",
        )
        .add_margin(
            output_artifact_name="small_sky_order1_margin",
            margin_threshold=7200,
            is_default=True,
        )
        .add_index(
            indexing_column="id",
            output_artifact_name="small_sky_object_index",
        )
    )
    pipeline_with_client(args, client)

### small_sky_xmatch


In [None]:
# Drop any previously generated copy of this catalog. `remove_directory` is only called
# when the directory exists, so the cell also runs against a destination that has never
# been populated.
small_sky_xmatch_dir = output_path / "small_sky_xmatch"
if small_sky_xmatch_dir.exists():
    remove_directory(small_sky_xmatch_dir)

# Import the single raw cross-match CSV into a catalog named "small_sky_xmatch" at a
# constant healpix order of 1. Pipeline intermediates go to a throwaway directory that is
# deleted when the `with` block exits.
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        input_file_list=["xmatch/xmatch_catalog_raw.csv"],
        file_reader="csv",
        constant_healpix_order=1,
        output_path=output_path,
        output_artifact_name="small_sky_xmatch",
        pixel_threshold=100,
        tmp_dir=pipeline_tmp,
    )
    pipeline_with_client(args, client)

### small_sky_npix_as_dir

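Copies `small_sky`, but stores each `Npix` partition as a directory (holding one parquet file, `part0.parquet`) rather than as a single parquet file.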

In [None]:
# A suffix of "/" makes each Npix entry a directory rather than a single file.
npix_suffix = "/"
out_catalog_name = "small_sky_npix_as_dir"
out_catalog_path = output_path / out_catalog_name

# Source catalog, plus a copy of its metadata carrying the new name and suffix.
sso = hats.read_hats(output_path / "small_sky")
out_catalog_info = sso.catalog_info.copy_and_update(catalog_name=out_catalog_name, npix_suffix=npix_suffix)

# Copy every partition file of the source catalog into its own Npix directory.
for pixel in sso.get_healpix_pixels():
    source_file = hats.io.paths.pixel_catalog_file(sso.catalog_base_dir, pixel)
    out_dir = hats.io.paths.pixel_catalog_file(out_catalog_path, pixel, npix_suffix=npix_suffix)
    out_dir.mkdir(parents=True, exist_ok=True)
    # hats/lsdb will only care about `out_dir`. They will be agnostic to filenames, given `npix_suffix = "/"`.
    shutil.copy(source_file, out_dir / "part0.parquet")

# Catalog-level files: parquet metadata, properties, and the partition listing.
hats.io.write_parquet_metadata(out_catalog_path)
out_catalog_info.to_properties_file(out_catalog_path)
partition_info_pointer = hats.io.paths.get_partition_info_pointer(out_catalog_path)
sso.partition_info.write_to_file(partition_info_pointer)

In [None]:
# Close the Dask client before deleting the scratch directory: the client was created with
# `local_directory=tmp_dir`, so the directory should not be removed while it is still in use.
client.close()
tmp_path.cleanup()