# Importing catalogs to HiPSCat format

This notebook presents two ways of importing catalogs into HiPSCat format. The first uses the __lsdb.from_dataframe()__ method, which is convenient for loading smaller catalogs from a single dataframe, while the second uses the __hipscat import pipeline__.

In [None]:
import lsdb
import os
import pandas as pd

In [None]:
# Name of the catalog to import; the input paths and output names below are derived from it
catalog_name = "small_sky_order1"

In [None]:
# Input paths: the source CSV sits in a directory named after the catalog,
# inside the relative tests/data directory
test_data_dir = os.path.join("../../tests", "data")
catalog_dir = os.path.join(test_data_dir, catalog_name)
catalog_csv_path = os.path.join(catalog_dir, catalog_name + ".csv")

In [None]:
# Output catalog names, one per import method (each is also used as an output directory)
catalog_from_dataframe = catalog_name + "-from_dataframe"
catalog_from_importer = catalog_name + "-from_importer"

## Using lsdb.from_dataframe()

In [None]:
%%time

# Read simple catalog from its CSV file
catalog = lsdb.from_dataframe(
    pd.read_csv(catalog_csv_path),
    catalog_name=catalog_from_dataframe,
    catalog_type="object",
    highest_order=5,
    threshold=100
)

# Save it to disk in HiPSCat format
catalog.to_hipscat(catalog_from_dataframe)

## Using the import pipeline

In [None]:
# Install hipscat-import
!pip install hipscat-import --quiet

In [None]:
from dask.distributed import Client
from hipscat_import.catalog.arguments import ImportArguments
from hipscat_import.pipeline import pipeline_with_client

In [None]:
# Create the directory named after the importer catalog if it does not exist yet;
# exist_ok=True keeps this cell safe to re-run
os.makedirs(catalog_from_importer, exist_ok=True)

In [None]:
# Arguments for the hipscat-import pipeline. The partitioning settings
# (highest order 5, threshold 100) use the same values as the
# lsdb.from_dataframe() call so that the two results can be compared.
args = ImportArguments(
    sort_columns="id",
    ra_column="ra",
    dec_column="dec",
    highest_healpix_order=5,
    pixel_threshold=100,
    input_path=catalog_dir,
    # Derived from catalog_name rather than hard-coded, so this cell keeps
    # working if the catalog name is changed.
    # NOTE(review): presumably selects the catalog's CSV file inside
    # input_path — confirm against the hipscat-import documentation.
    input_format=f"{catalog_name}.csv",
    output_artifact_name=catalog_from_importer,
    output_path=".",
    dask_tmp=".",
    overwrite=True,
)

In [None]:
%%time
with Client(local_directory=args.dask_tmp, n_workers=4) as client:
    pipeline_with_client(args, client)

### Load both catalogs and check that they are equivalent

In [None]:
# Load the catalog that was written by the lsdb.from_dataframe() section
from_dataframe_catalog = lsdb.read_hipscat(catalog_from_dataframe)
# Display the catalog's private `_ddf` attribute
# NOTE(review): presumably the underlying Dask DataFrame — confirm against lsdb
from_dataframe_catalog._ddf

In [None]:
# Load the catalog that was written by the hipscat import pipeline
from_importer_catalog = lsdb.read_hipscat(catalog_from_importer)
# Display the catalog's private `_ddf` attribute
# NOTE(review): presumably the underlying Dask DataFrame — confirm against lsdb
from_importer_catalog._ddf

In [None]:
# Both catalogs must be partitioned into exactly the same HEALPix pixels
assert from_dataframe_catalog.get_healpix_pixels() == from_importer_catalog.get_healpix_pixels()
# Once sorted by index, both catalogs must hold the same data
# (dtype differences are ignored)
pd.testing.assert_frame_equal(
    from_dataframe_catalog.compute().sort_index(),
    from_importer_catalog.compute().sort_index(),
    check_dtype=False,
)