### Sample import for Gaia
This notebook downloads the first 10 files of Gaia DR3 and imports them as a hipscat catalog.

In [1]:
import numpy as np
import pandas as pd
import healpy as hp
import requests
import httplib2
import json
import os
from bs4 import BeautifulSoup, SoupStrainer
from hipscat.io import write_metadata

def get_cat_urls(url='http://cdn.gea.esac.esa.int/Gaia/gdr3/gaia_source/', fmt='.csv.gz'):
    """
    Fetch the index page at ``url`` and return the absolute URLs of every
    linked file whose name ends with ``fmt``.

    :param url: the index page listing the Gaia data files; relative links on
        the page are resolved against it
    :param fmt: file-name suffix a link must end with to be returned
    :return: list of absolute file URLs, in the order they appear on the page
    :raises SystemExit: if the request cannot be completed or the server
        answers with an HTTP error status (>= 400)
    """
    # Function-scope import so this block does not depend on the import cell.
    from urllib.parse import urljoin

    try:
        http = httplib2.Http()
        resp_headers, content = http.request(url)
    except (httplib2.HttpLib2Error, OSError) as err:
        # httplib2 never raises requests' HTTPError; catch what it really
        # raises (its own error hierarchy plus socket-level OSError).
        raise SystemExit(err)

    # httplib2 does not raise on 4xx/5xx responses; without this check an
    # error page would be parsed and silently produce an empty list.
    if resp_headers.status >= 400:
        raise SystemExit(f"Request to {url} failed with HTTP status {resp_headers.status}")

    soup = BeautifulSoup(content, parse_only=SoupStrainer('a'), features="html.parser")

    # find_all(..., href=True) yields only <a> tags that carry an href, so
    # non-tag nodes (e.g. a doctype) never reach the attribute lookup.
    return [
        urljoin(url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith(fmt)
    ]

def cache_sample_data(output_dir, sample_data, skiprows=None):
    """
    Convert each CSV in ``sample_data`` to a parquet file under
    ``<output_dir>/cache`` and return that cache directory.

    A source whose parquet file already exists in the cache is skipped, so
    re-running does not read it again.

    :param output_dir: directory under which the ``cache`` folder is created
    :param sample_data: iterable of CSV paths/URLs readable by ``pd.read_csv``
    :param skiprows: passed through unchanged to ``pd.read_csv``
    :return: path of the cache directory
    """
    cache_dir = os.path.join(output_dir, "cache")
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    for source in sample_data:
        # Name the parquet file after everything before the first '.' of the
        # source's base name (e.g. "name.csv.gz" -> "name.parquet").
        stem = os.path.basename(source).split('.')[0]
        target = os.path.join(cache_dir, stem + '.parquet')
        if os.path.exists(target):
            continue
        pd.read_csv(source, skiprows=skiprows).to_parquet(target)

    return cache_dir

def write_parquet_metadata(cache_dir):
    """
    Run hipscat's ``write_metadata.write_parquet_metadata`` on ``cache_dir``
    and return the path ``<cache_dir>/_metadata``.

    :param cache_dir: directory holding the cached parquet files
    :return: ``os.path.join(cache_dir, "_metadata")`` -- presumably the file
        the hipscat helper writes; confirm against the hipscat documentation
    """
    write_metadata.write_parquet_metadata(cache_dir)
    metadata_file = os.path.join(cache_dir, "_metadata")
    return metadata_file



In [2]:
import hipscat_import.pipeline as runner
from hipscat_import.catalog.arguments import ImportArguments

# All notebook artifacts (parquet cache and the final catalog) go under ./output.
output_dir = os.path.join(os.getcwd(), "output")
os.makedirs(output_dir, exist_ok=True)

# Cache only the first 10 listed files to keep the sample small.
# skiprows drops the first 1000 lines of every CSV before parsing
# (presumably the commented file header -- TODO confirm against the Gaia files).
cache_dir = cache_sample_data(output_dir, get_cat_urls()[:10], skiprows=np.arange(0, 1000))
schema_file = write_parquet_metadata(cache_dir)

args = ImportArguments(
    output_catalog_name="sample_gaia",
    input_path=cache_dir,
    input_format="parquet",
    ra_column="ra",
    dec_column="dec",
    id_column="source_id",
    output_path=output_dir,
    use_schema_file=schema_file,  # the comma was missing here, making the cell a SyntaxError
    manual_dtypes={'libname_gspphot':'unicode'},
    pixel_threshold=500_000,
    overwrite=True,
)
runner.pipeline(args)

Mapping  : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.18s/it]
Binning  : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:45<00:00, 45.30s/it]
Splitting: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:47<00:00,  4.80s/it]
Reducing : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:55<00:00,  2.43s/it]
Finishing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.27it/s]


In [4]:
# Report where the catalog ends up: output_dir joined with the catalog name
# used for the import.
final_path = os.path.join(output_dir, "sample_gaia")
print(f'hipscat path to "sample_gaia": {final_path}')

hipscat path to "sample_gaia": /Users/crisp/git-clones/hipscat-import/nb/output/sample_gaia
