In [1]:
import boto3
import io
import urllib
import s3fs
import json
from pathlib import Path
import attr
import numpy
import tiledb
import tiledb.cloud
from tiledb.cloud.compute import DelayedArrayUDF, Delayed
import pandas
import geopandas
import fiona
from fiona.session import AWSSession
import pystac
from scipy.stats import skew, kurtosis
import uuid
from numba import jit
import gc

In [2]:
import pystac
from pystac.extensions.projection import ProjectionExtension
from pystac.extensions.pointcloud import (
PointcloudExtension,
SchemaType,
PhenomenologyType,
Schema,
Statistic,
)

In [3]:
from shapely.geometry import Polygon

In [4]:
from reap_gsf import reap, data_model
from bathy_datasets import rhealpix, storage, geometry, asb_spreadsheet,stac_metadata

In [5]:
session = boto3.Session()
creds = session.get_credentials()

In [6]:
fs = s3fs.S3FileSystem(key=creds.access_key, secret=creds.secret_key, use_listings_cache=False)

In [7]:
#uid = uuid.uuid4()
uid = uuid.UUID("419ae2f5-48c1-4d81-bd1f-18bf80e658cd")

In [8]:
survey_uri = "s3://ausseabed-pl019-provided-data/JamesCookUniversity/0331_NETasmania_Bergersen_10G/Products/gsf/"
outdir_uri = "s3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/"
asb_metadata_uri = "s3://ausseabed-pl019-provided-data/JamesCookUniversity/0331_NETasmania_Bergersen_10G/spreadsheet-metadata.json"
survey_info_uri = "s3://ausseabed-pl019-provided-data/JamesCookUniversity/0331_NETasmania_Bergersen_10G/schema-info.json"

In [9]:
base_prefix = "ga_ausseabed"
array_name = f"{base_prefix}_{uid}_bathymetry"
array_uri = f"{outdir_uri}{array_name}.tiledb"
tiledb_array_uri = f"tiledb://sixy6e/{array_name}"
soundings_cell_density_uri = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12.geojson"
coverage_uri = f"{outdir_uri}{base_prefix}_{uid}_coverage.geojson"
stac_md_uri = f"{outdir_uri}{base_prefix}_{uid}_stac-metadata.geojson"

In [10]:
soundings_cell_density_uri_15 = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-15.geojson"
soundings_cell_density_array_uri_15 = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-15.tiledb"
soundings_cell_density_sparse_array_uri_15 = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-15-sparse-array.tiledb"

In [11]:
res12_out_uri_sparse = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12-sparse-array.tiledb"
res12_out_uri_dense = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12-dense-array.tiledb"

In [12]:
tmpdir = f"{outdir_uri}{base_prefix}_{uid}_tmp/"

In [13]:
tmpdir

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/'

In [14]:
array_uri

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_bathymetry.tiledb'

In [15]:
def get_sonar_metadata(json_uri):
    """
    Stop-gap helper that pulls sonar metadata out of a sample GSF file.

    The JSON document at ``json_uri`` is read through the module-level ``fs``
    and its ``gsf_uri`` entry is handed to the ``sixy6e/retrieve_stream`` task,
    whose output feeds ``sixy6e/decode_gsf`` (first 10 pings only). Record 0 of
    ``finfo[3]`` and of ``finfo[6]`` is then read from the retrieved stream.

    :param json_uri:
        URI of the JSON metadata document; must contain a ``gsf_uri`` key.

    :return:
        The object read from ``finfo[3]``, with every field of the
        ``finfo[6]`` record (converted via ``attr.asdict``) copied into it.
    """
    with fs.open(json_uri) as src:
        gsf_info = json.loads(src.read())

    stream_task = Delayed("sixy6e/retrieve_stream", name="retrieve")(
        gsf_info["gsf_uri"], creds.access_key, creds.secret_key
    )
    dataframe_task = Delayed("sixy6e/decode_gsf", name="decode", image_name="3.7-geo")(
        stream_task, slice(10)
    )

    # only the file info is of interest here; the decoded pings are discarded
    _, finfo = dataframe_task.compute()

    # element 0 of the retrieve task's result is the stream itself
    sonar_metadata = finfo[3].record(0).read(stream_task.result()[0])
    history_record = finfo[6].record(0).read(stream_task.result()[0])

    for field, value in attr.asdict(history_record).items():
        sonar_metadata[field] = value

    return sonar_metadata


def reduce_region_codes_timestamps(results):
    """
    Reduce step of the map-reduce ingestion.

    Every item of ``results`` is indexed as ``item[0]`` (a DataFrame with
    ``region_code`` and ``count`` columns) and ``item[1]`` (a pair holding a
    start and an end timestamp).

    :param results:
        Sequence of ``(region_code_counts, (start, end))`` items.

    :return:
        ``(cell_count, start_end_timestamp)`` where ``cell_count`` holds the
        summed count per ``region_code`` and ``start_end_timestamp`` is
        ``[earliest start, latest end]`` as ``datetime`` objects.
    """
    count_frames = [item[0] for item in results]
    spans = [item[1] for item in results]

    # sum the per-item counts for every region code
    combined = pandas.concat(count_frames)
    cell_count = combined.groupby(["region_code"])["count"].sum().reset_index(name="count")

    # overall extent is the earliest start and the latest end
    timestamps_df = pandas.DataFrame(
        {
            "start_datetime": [span[0] for span in spans],
            "end_datetime": [span[1] for span in spans],
        }
    )
    earliest = timestamps_df["start_datetime"].min().to_pydatetime()
    latest = timestamps_df["end_datetime"].max().to_pydatetime()

    return cell_count, [earliest, latest]


def reduce_region_codes(region_codes):
    """
    Reduce step for the region_code frequency counts.

    :param region_codes:
        Sequence of DataFrames, each with ``region_code`` and ``count`` columns.

    :return:
        A DataFrame with one row per ``region_code`` and the summed ``count``.
    """
    stacked = pandas.concat(region_codes)
    summed = stacked.groupby(["region_code"])["count"].sum()
    return summed.reset_index(name="count")


def reduce_timestamps(timestamps):
    """
    Reduce step for the start/end timestamps.

    :param timestamps:
        Sequence of pairs; element 0 of each pair is a start timestamp and
        element 1 is an end timestamp.

    :return:
        ``[earliest start, latest end]`` as ``datetime`` objects.
    """
    starts = []
    ends = []
    for pair in timestamps:
        starts.append(pair[0])
        ends.append(pair[1])

    timestamps_df = pandas.DataFrame({"start_datetime": starts, "end_datetime": ends})

    earliest = timestamps_df["start_datetime"].min().to_pydatetime()
    latest = timestamps_df["end_datetime"].max().to_pydatetime()

    return [earliest, latest]


def gather_stats(results):
    """
    Merge the per-task stats mappings into one dict.

    :param results:
        Iterable of mappings produced by the stats tasks.

    :return:
        A single ``dict``; when a key occurs in more than one mapping the
        value from the later mapping is kept.
    """
    return {key: item[key] for item in results for key in item}

In [16]:
def retrieve_stream(uri, access_key, skey):
    """
    Download an S3 object into an in-memory stream.

    Not testing the creation of the stream object at this point.
    But for testing, the download needs to occur only once.

    :param uri:
        The ``s3://bucket/key`` URI of the object to retrieve.

    :param access_key:
        AWS access key id.

    :param skey:
        AWS secret access key.

    :return:
        A tuple ``(stream, length)``: an ``io.BytesIO`` holding the whole
        object, and the object's size in bytes.
    """
    # a bare ``import urllib`` does not guarantee the ``parse`` submodule is loaded
    from urllib.parse import urlparse

    session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=skey)
    dev_resource = session.resource("s3")

    parsed = urlparse(uri)
    # drop the leading "/" of the URL path to get the object key
    obj = dev_resource.Object(bucket_name=parsed.netloc, key=parsed.path[1:])

    # take the length from the GET response itself; reading ``obj.content_length``
    # afterwards would issue a separate HEAD request
    response = obj.get()
    stream = io.BytesIO(response["Body"].read())

    return stream, response["ContentLength"]


def append_ping_dataframe(dataframe, array_uri, access_key, skey):
    """
    Append the ping dataframe read from a GSF file to a sparse TileDB array.

    :param dataframe:
        The ping data to write.

    :param array_uri:
        URI of the TileDB array being appended to.

    :param access_key:
        AWS access key id used for the TileDB S3 VFS.

    :param skey:
        AWS secret access key used for the TileDB S3 VFS.
    """
    s3_credentials = {
        "vfs.s3.aws_access_key_id": access_key,
        "vfs.s3.aws_secret_access_key": skey,
    }
    ctx = tiledb.Ctx(config=tiledb.Config(s3_credentials))

    tiledb.dataframe_.from_pandas(
        array_uri,
        dataframe,
        mode="append",
        sparse=True,
        ctx=ctx,
    )


def ingest_gsf_slice(
    file_record,
    stream,
    access_key,
    skey,
    array_uri,
    idx=slice(None),
    cell_frequency=False,
):
    """
    Ingest one slice of pings from a GSF stream into a TileDB array.

    General steps:
    Extract the ping data.
    Calculate the rHEALPIX code (resolution 15) and store it as ``region_code``.
    Optionally summarise the rHEALPIX codes (frequency count) and
    get the timestamps of the first and last pings.
    Write the ping data to the TileDB array.

    :param file_record:
        The ping file record passed to ``SwathBathymetryPing.from_records``.

    :param stream:
        The stream the ping records are read from.

    :param access_key:
        AWS access key id.

    :param skey:
        AWS secret access key.

    :param array_uri:
        URI of the TileDB array that receives the pings.

    :param idx:
        Slice selecting which pings to read; defaults to all of them.

    :param cell_frequency:
        When True, also compute the per-``region_code`` counts and the
        min/max ping timestamps.

    :return:
        ``(cell_count, start_end_time)``; both are ``None`` unless
        ``cell_frequency`` is True.
    """
    swath_pings = data_model.SwathBathymetryPing.from_records(file_record, stream, idx)
    pings = swath_pings.ping_dataframe
    pings["region_code"] = rhealpix.rhealpix_code(pings.X, pings.Y, 15)

    cell_count = None
    start_end_time = None

    if cell_frequency:
        # frequency of dggs cells
        per_code = pings.groupby(["region_code"])["region_code"].count()
        cell_count = per_code.to_frame("count").reset_index()

        first_ping = pings.timestamp.min().to_pydatetime()
        last_ping = pings.timestamp.max().to_pydatetime()
        start_end_time = [first_ping, last_ping]

    # write to tiledb array
    append_ping_dataframe(pings, array_uri, access_key, skey)

    return cell_count, start_end_time


def ingest_gsf_slices(gsf_uri, access_key, skey, array_uri, slices, cell_frequency=False):
    """
    Ingest a list of ping slices from a given GSF file.

    The file is downloaded once, then every slice in ``slices`` is ingested
    in turn via ``ingest_gsf_slice``.

    :param gsf_uri:
        URI of the GSF file.

    :param access_key:
        AWS access key id.

    :param skey:
        AWS secret access key.

    :param array_uri:
        URI of the TileDB array that receives the pings.

    :param slices:
        Iterable of ``slice`` objects selecting the pings to ingest.

    :param cell_frequency:
        When True, aggregate the per-slice cell counts and timestamps.

    :return:
        ``(cell_count, start_end_timestamp)``; both are ``None`` unless
        ``cell_frequency`` is True.
    """
    stream, stream_length = retrieve_stream(gsf_uri, access_key, skey)
    ping_file_record = reap.file_info(stream, stream_length)[1]

    slice_results = [
        ingest_gsf_slice(
            ping_file_record, stream, access_key, skey, array_uri, idx, cell_frequency
        )
        for idx in slices
    ]

    if not cell_frequency:
        return None, None

    cell_counts = [result[0] for result in slice_results]
    start_end_timestamps = [result[1] for result in slice_results]

    # aggregate the ping slices and calculate the cell counts
    cell_count = (
        pandas.concat(cell_counts)
        .groupby(["region_code"])["count"]
        .sum()
        .to_frame("count")
        .reset_index()
    )

    # aggregate the min and max timestamps, then find the min max timestamps
    timestamps_df = pandas.DataFrame(
        {
            "start_datetime": [span[0] for span in start_end_timestamps],
            "end_datetime": [span[1] for span in start_end_timestamps],
        }
    )
    start_end_timestamp = [
        timestamps_df["start_datetime"].min().to_pydatetime(),
        timestamps_df["end_datetime"].max().to_pydatetime(),
    ]

    return cell_count, start_end_timestamp

In [17]:
def dummy_reducer(results):
    """Placeholder reduce step: report how many results were gathered."""
    n_results = len(results)
    return n_results

In [18]:
def scatter(iterable, n):
    """
    Evenly scatters an iterable into `n` blocks.
    Sourced from:
    http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts

    :param iterable:
        A sized, sliceable sequence; preferably a 1D list or array.

    :param n:
        An integer indicating how many blocks to create.

    :return:
        A `list` consisting of `n` blocks of roughly equal size, each
        containing elements from `iterable`. The first ``len(iterable) % n``
        blocks hold one extra element.
    """
    size, extra = divmod(len(iterable), n)

    blocks = []
    for block_no in range(n):
        start = block_no * size + min(block_no, extra)
        stop = (block_no + 1) * size + min(block_no + 1, extra)
        blocks.append(iterable[start:stop])

    return blocks

In [19]:
def ingest_gsfs_singular(files, processing_node_limit, ping_slice_step, slices_per_node, array_uris, cell_freq_uris, local=False):
    """
    Prototype ingester. Multi-file method. Each GSF will have a corresponding TileDB array.

    Builds (but does not execute) a task graph. For every file the pings are
    split into slices of ``ping_slice_step``, grouped ``slices_per_node`` at a
    time, and one ingest task is created per group. Tasks are assigned
    round-robin to ``processing_node_limit`` slots; tasks sharing a slot are
    chained with ``depends_on`` so each slot runs its tasks one after another.
    Once all of a file's ingest tasks are done a cell-frequency task runs,
    followed by a start/end timestamp task. Merging the per-file arrays into
    one is not done here.

    :param files:
        GSF pathnames; a sibling ``.json`` metadata document is read for each
        via the module-level ``fs``.

    :param processing_node_limit:
        Number of round-robin slots that ingest tasks are chained onto.

    :param ping_slice_step:
        Number of pings per slice.

    :param slices_per_node:
        Number of slices handled by a single ingest task.

    :param array_uris:
        Output TileDB array URI for each file (same order as ``files``).

    :param cell_freq_uris:
        Output cell-frequency URI for each file (same order as ``files``).

    :param local:
        When True the ingest tasks run the in-notebook ``ingest_gsf_slices``
        locally instead of the ``sixy6e/ingest_gsf_slices`` task.

    :return:
        A local ``Delayed`` node applying ``reduce_timestamps`` to the
        per-file timestamp tasks; call ``.compute()`` on it to run the graph.
    """

    node_counter = 0
    tasks_dict = {n: [] for n in range(processing_node_limit)}
    files_dict = {fname: [] for fname in files}
    timestamps_tasks = []

    for enumi, pathname in enumerate(files):
        metadata_pathname = pathname.replace(".gsf", ".json")
        base_name = Path(pathname).stem
        with fs.open(metadata_pathname) as src:
            gsf_metadata = json.loads(src.read())

        ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]

        # the stop of the final slice may run past ping_count
        slices = [slice(start, start+ping_slice_step) for start in numpy.arange(0, ping_count, ping_slice_step)]
        slice_chunks = [slices[i:i+slices_per_node] for i in range(0, len(slices), slices_per_node)]

        array_uri = array_uris[enumi]
        cell_freq_uri = cell_freq_uris[enumi]

        for slice_chunk in slice_chunks:
            start_idx = slice_chunk[0].start
            end_idx = slice_chunk[-1].stop
            task_name = f"{base_name}-{start_idx}-{end_idx}-{node_counter}"

            if local:
                task = Delayed(ingest_gsf_slices, name=task_name, local=True)(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk, cell_frequency=False)
            else:
                task = Delayed("sixy6e/ingest_gsf_slices", name=task_name, image_name="3.7-geo", timeout=1800)(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk, cell_frequency=False)

            # serialise the tasks that share a slot
            if len(tasks_dict[node_counter]):
                task.depends_on(tasks_dict[node_counter][-1])

            tasks_dict[node_counter].append(task)
            files_dict[pathname].append(task)

            # round-robin over the slots
            node_counter += 1
            if node_counter == processing_node_limit:
                node_counter = 0

        # the cell frequency is calculated only after the whole file has been ingested
        cell_freq_task = Delayed("sixy6e/cell_frequency", name=f"{base_name}-cell-frequency")(array_uri, cell_freq_uri, creds.access_key, creds.secret_key)
        for dep in files_dict[pathname]:
            cell_freq_task.depends_on(dep)

        timestamp_task = Delayed("sixy6e/start_end_timestamps", name=f"{base_name}-start-end-timestamps")(array_uri, creds.access_key, creds.secret_key)
        timestamp_task.depends_on(cell_freq_task)

        timestamps_tasks.append(timestamp_task)

    reduce_task = Delayed(reduce_timestamps, "reduce-timestamps", local=True)(timestamps_tasks)

    return reduce_task

In [20]:
def ingest_gsfs_singular_no_cellfreq(files, processing_node_limit, ping_slice_step, slices_per_node, array_uris, cell_freq_uris, local=False):
    """
    Prototype ingester. Multi-file method. Each GSF will have a corresponding TileDB array.

    Variant that only builds the ingest tasks; no cell-frequency or timestamp
    tasks are created. For every file the pings are split into slices of
    ``ping_slice_step``, grouped ``slices_per_node`` at a time, and one ingest
    task is created per group. Tasks are assigned round-robin to
    ``processing_node_limit`` slots; tasks sharing a slot are chained with
    ``depends_on`` so each slot runs its tasks one after another.

    :param files:
        GSF pathnames; a sibling ``.json`` metadata document is read for each
        via the module-level ``fs``.

    :param processing_node_limit:
        Number of round-robin slots that ingest tasks are chained onto.

    :param ping_slice_step:
        Number of pings per slice.

    :param slices_per_node:
        Number of slices handled by a single ingest task.

    :param array_uris:
        Output TileDB array URI for each file (same order as ``files``).

    :param cell_freq_uris:
        Unused; kept so the signature matches ``ingest_gsfs_singular``.

    :param local:
        When True the ingest tasks run the in-notebook ``ingest_gsf_slices``
        locally instead of the ``sixy6e/ingest_gsf_slices`` task.

    :return:
        A local ``Delayed`` node applying the module-level ``dummy_reducer``
        to all ingest tasks; call ``.compute()`` on it to run the graph.
    """

    node_counter = 0
    tasks = []
    tasks_dict = {n: [] for n in range(processing_node_limit)}

    for enumi, pathname in enumerate(files):
        metadata_pathname = pathname.replace(".gsf", ".json")
        base_name = Path(pathname).stem
        with fs.open(metadata_pathname) as src:
            gsf_metadata = json.loads(src.read())

        ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]

        # the stop of the final slice may run past ping_count
        slices = [slice(start, start+ping_slice_step) for start in numpy.arange(0, ping_count, ping_slice_step)]
        slice_chunks = [slices[i:i+slices_per_node] for i in range(0, len(slices), slices_per_node)]

        array_uri = array_uris[enumi]

        for slice_chunk in slice_chunks:
            start_idx = slice_chunk[0].start
            end_idx = slice_chunk[-1].stop
            task_name = f"{base_name}-{start_idx}-{end_idx}-{node_counter}"

            if local:
                task = Delayed(ingest_gsf_slices, name=task_name, local=True)(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk, cell_frequency=False)
            else:
                task = Delayed("sixy6e/ingest_gsf_slices", name=task_name, image_name="3.7-geo", timeout=1800)(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk, cell_frequency=False)

            # serialise the tasks that share a slot
            if len(tasks_dict[node_counter]):
                task.depends_on(tasks_dict[node_counter][-1])

            tasks.append(task)
            tasks_dict[node_counter].append(task)

            # round-robin over the slots
            node_counter += 1
            if node_counter == processing_node_limit:
                node_counter = 0

    reduce_task = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(tasks)

    return reduce_task

In [21]:
def load_and_concat(array_uris, ctx, out_uri=None):
    """
    Read every array in ``array_uris`` in full, concatenate them and sum the
    ``count`` column per ``region_code``.

    :param array_uris:
        URIs of the TileDB arrays to read.

    :param ctx:
        The TileDB context used for reading (and writing).

    :param out_uri:
        When given, the summary is written there with ``write_chunked``
        and nothing is returned.

    :return:
        The summarised DataFrame when ``out_uri`` is None, otherwise ``None``.
    """
    frames = []
    for uri in array_uris:
        with tiledb.open(uri, ctx=ctx) as ds:
            frames.append(ds.df[:])

    concatenated = pandas.concat(frames, copy=False)
    per_code = concatenated.groupby(["region_code"])["count"].sum()
    summarised = per_code.to_frame("count").reset_index()

    if out_uri is not None:
        write_chunked(summarised, out_uri, ctx, chunks=1000000)
        return None

    return summarised

In [22]:
@jit(nopython=True)
def strtoint(s):
    """Return the code point of the character ``s`` minus 48 (48 is ``ord("0")``)."""
    code_point = ord(s)
    return code_point - 48

In [23]:
@jit(nopython=True)
def _unpack_code(region_codes: numpy.ndarray, ncodes, res):
    """
    Split each region code into one value per character position.

    Character 0 of every code is kept as a string; characters ``1..res-1``
    are each converted with ``strtoint`` and stored in the ``uint8`` array
    keyed ``"R<position>"``.

    :param region_codes:
        Array of codes; each element is converted with ``str`` before being
        indexed. NOTE(review): presumably every code has at least ``res``
        characters and positions 1 onwards are ASCII digits -- confirm
        against the caller.

    :param ncodes:
        Number of codes to process (used to size the outputs and as the
        loop bound).

    :param res:
        Number of leading characters of each code to unpack. The output dict
        only has keys ``R1`` to ``R15``, so a value above 16 would index a
        key that does not exist.

    :return:
        ``(r0, unpacked)`` where ``r0`` is a ``<U1`` array holding character 0
        of each code and ``unpacked`` maps ``"R1"`` .. ``"R15"`` to ``uint8``
        arrays of length ``ncodes``. Keys for positions ``>= res`` are left
        as zeros.
    """
    # resolutions[j] is the dict key ("R<j>") for character position j
    resolutions = [str(f"R{i}") for i in range(res)]
    # all 15 keys are always created, regardless of `res`
    unpacked = {
        "R1": numpy.zeros(ncodes, dtype="uint8"),
        "R2": numpy.zeros(ncodes, dtype="uint8"),
        "R3": numpy.zeros(ncodes, dtype="uint8"),
        "R4": numpy.zeros(ncodes, dtype="uint8"),
        "R5": numpy.zeros(ncodes, dtype="uint8"),
        "R6": numpy.zeros(ncodes, dtype="uint8"),
        "R7": numpy.zeros(ncodes, dtype="uint8"),
        "R8": numpy.zeros(ncodes, dtype="uint8"),
        "R9": numpy.zeros(ncodes, dtype="uint8"),
        "R10": numpy.zeros(ncodes, dtype="uint8"),
        "R11": numpy.zeros(ncodes, dtype="uint8"),
        "R12": numpy.zeros(ncodes, dtype="uint8"),
        "R13": numpy.zeros(ncodes, dtype="uint8"),
        "R14": numpy.zeros(ncodes, dtype="uint8"),
        "R15": numpy.zeros(ncodes, dtype="uint8"),
    }
    # position 0 is stored as a single character rather than a number
    r0 = numpy.zeros(ncodes, dtype="<U1")
    for i in range(ncodes):
        code = str(region_codes[i])
        r0[i] = code[0]
        # positions 1..res-1 are converted character by character
        for j in range(1, res):
            unpacked[resolutions[j]][i] = strtoint(code[j])
    return r0, unpacked

In [24]:
def unpack_code(region_codes: numpy.ndarray, dataframe=True):
    """
    Unpack region codes into one column per character position.

    The length of the first code sets how many characters are unpacked and
    the fixed-width unicode dtype every code is cast to.

    :param region_codes:
        Array of region code strings.

    :param dataframe:
        When True (default) return a ``pandas.DataFrame``; otherwise return
        the underlying dict of arrays.

    :return:
        The ``R0`` array plus every array returned by ``_unpack_code``,
        as a DataFrame or a dict keyed by column name.
    """
    res = len(region_codes[0])
    fixed_width_codes = region_codes.astype(f"<U{res}")

    r0, unpacked = _unpack_code(fixed_width_codes, fixed_width_codes.shape[0], res)

    # R0 goes first, followed by the keys in the order _unpack_code yields them
    unpacked_dict = {"R0": r0, **{key: unpacked[key] for key in unpacked}}

    return pandas.DataFrame(unpacked_dict) if dataframe else unpacked_dict

In [25]:
def reduce_resoltion(df, resolution=12, chunks=10000):
    """
    Coarsen region codes to ``resolution`` and re-sum the counts.

    Each ``region_code`` is truncated to its first ``resolution + 1``
    characters. The frame is processed ``chunks`` rows at a time, and the
    running summary is re-grouped after every chunk.

    :param df:
        DataFrame with ``region_code`` (string) and ``count`` columns.

    :param resolution:
        Target resolution; the truncated codes keep ``resolution + 1``
        characters.

    :param chunks:
        Number of rows processed per iteration.

    :return:
        DataFrame with one row per truncated ``region_code`` and the
        summed ``count``.
    """
    width = resolution + 1

    def truncate(chunk):
        # keep only the leading characters of every code; counts are carried over
        return pandas.DataFrame(
            {
                "region_code": chunk.region_code.str[0:width],
                "count": chunk["count"].values,
            }
        )

    def total(frame):
        return frame.groupby(["region_code"])["count"].sum().to_frame("count").reset_index()

    bounds = [(start, start + chunks) for start in numpy.arange(0, df.shape[0], chunks)]

    first_start, first_stop = bounds[0]
    running = total(truncate(df[first_start:first_stop]))

    for start, stop in bounds[1:]:
        merged = pandas.concat([running, truncate(df[start:stop])], copy=False)
        running = total(merged)

    return running

In [26]:
def write_chunked(df, out_uri, ctx, chunks=10000):
    """
    Write ``df`` to a dense TileDB array ``chunks`` rows at a time.

    The first write is issued without a ``mode`` argument; every later write
    uses ``mode="append"``. ``row_start_idx`` is set to the number of rows
    already written.

    :param df:
        DataFrame with ``region_code`` and ``count`` columns.

    :param out_uri:
        URI of the output TileDB array.

    :param ctx:
        The TileDB context used for writing.

    :param chunks:
        Number of rows written per call.
    """
    kwargs = dict(
        sparse=False,
        column_types={"region_code": str, "count": numpy.uint64},
        ctx=ctx,
    )

    rows_written = 0
    for start in numpy.arange(0, df.shape[0], chunks):
        subset = df[start:start + chunks]
        kwargs["row_start_idx"] = rows_written
        tiledb.dataframe_.from_pandas(out_uri, subset, **kwargs)
        rows_written += len(subset)
        # everything after the first write is an append
        kwargs["mode"] = "append"

In [27]:
def write_sparse_rhealpix_chunked(df, out_uri, ctx, chunks=10000):
    """
    Append ``df`` to a sparse TileDB array ``chunks`` rows at a time.
    Requires the output array to have already been created.

    For every chunk the region codes are unpacked with ``unpack_code`` and
    the original ``region_code`` and ``count`` values are added as columns
    before writing.

    :param df:
        DataFrame with ``region_code`` and ``count`` columns.

    :param out_uri:
        URI of the existing output TileDB array.

    :param ctx:
        The TileDB context used for writing.

    :param chunks:
        Number of rows written per call.
    """
    for start in numpy.arange(0, df.shape[0], chunks):
        subset = df[start:start + chunks]
        codes = subset.region_code.values

        unpacked_df = unpack_code(codes)
        unpacked_df["region_code"] = codes
        unpacked_df["count"] = subset["count"].values

        tiledb.dataframe_.from_pandas(
            out_uri, unpacked_df, mode="append", sparse=True, ctx=ctx
        )

In [28]:
def filter_empty_files(files):
    """
    Filter out GSF's containing no Pings so we don't attempt to process them.

    The ping count is taken from each file's sibling ``.json`` metadata
    document, read through the module-level ``fs``.

    :param files:
        Iterable of GSF pathnames.

    :return:
        ``(non_empty_files, empty_files)`` as two lists of pathnames.
    """
    non_empty_files, empty_files = [], []

    for pathname in files:
        with fs.open(pathname.replace(".gsf", ".json")) as src:
            gsf_metadata = json.loads(src.read())

        ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]

        destination = empty_files if ping_count == 0 else non_empty_files
        destination.append(pathname)

    return non_empty_files, empty_files

In [29]:
def filter_large_files(files, size_limit_mb):
    """
    Filter out GSF's that are larger than size_limit_mb so that they're processed locally.

    The size is taken from the ``size`` entry of each file's sibling ``.json``
    metadata document, read through the module-level ``fs``, and divided by
    1024 twice before being compared with the limit.

    :param files:
        Iterable of GSF pathnames.

    :param size_limit_mb:
        Files strictly larger than this are classed as large.

    :return:
        ``(manageable_files, large_files)`` as two lists of pathnames.
    """
    manageable_files, large_files = [], []

    for pathname in files:
        with fs.open(pathname.replace(".gsf", ".json")) as src:
            gsf_metadata = json.loads(src.read())

        # large files are separated out here so they can still be processed, just locally
        size_mb = gsf_metadata["size"] / 1024 / 1024
        destination = large_files if size_mb > size_limit_mb else manageable_files
        destination.append(pathname)

    return manageable_files, large_files

In [30]:
with fs.open(survey_info_uri) as src:
    survey_info = json.loads(src.read())

In [31]:
#required_attributes = survey_info["schemas"][0]
# this is temporary. better to have it defined internally. or programmatically derived as a union of all schemas from all pings
required_attributes = [
    "Z",
    "across_track",
    "along_track",
    "beam_angle",
    "beam_angle_forward",
    "beam_flags",
    "beam_number",
    "centre_beam",
    "course",
    "depth_corrector",
    "gps_tide_corrector",
    "heading",
    "heave",
    "height",
    "horizontal_error",
    "ping_flags",
    "pitch",
    "roll",
    "sector_number",
    "separation",
    "speed",
    "tide_corrector",
    "timestamp",
    "travel_time",
    "vertical_error",
    "region_code",
]

In [32]:
config = tiledb.Config(
        {"vfs.s3.aws_access_key_id": creds.access_key, "vfs.s3.aws_secret_access_key": creds.secret_key}
    )
config_dict = config.dict()
ctx = tiledb.Ctx(config=config)

In [66]:
storage.create_mbes_array(array_uri, required_attributes, ctx)

In [33]:
files = fs.glob(survey_uri + "**.gsf")
len(files)

395

In [34]:
# filter out empty files

In [35]:
non_empty_files, empty_files = filter_empty_files(files)

In [36]:
len(non_empty_files), len(empty_files)

(395, 0)

In [37]:
# filter large files

In [38]:
size_limit_mb = 500
manageable_files, large_files = filter_large_files(non_empty_files, size_limit_mb)

In [39]:
len(manageable_files), len(large_files)

(395, 0)

In [40]:
tmp_bathy_uris = [tmpdir + "bathymetry/" + Path(fname).with_suffix(".tiledb").name for fname in manageable_files]
tmp_freq_uris = [tmpdir + "cell-frequency/" + Path(fname).stem + ".tiledb" for fname in manageable_files]
tmp_bathy_large_uris = [tmpdir + "bathymetry/" + Path(fname).with_suffix(".tiledb").name for fname in large_files]
tmp_freq_large_uris = [tmpdir + "cell-frequency/" + Path(fname).stem + ".tiledb" for fname in large_files]

In [39]:
manageable_files[0]

'ausseabed-pl019-provided-data/JamesCookUniversity/0331_NETasmania_Bergersen_10G/Products/gsf/2011-139/0000_20110519_002043_ChallengerPreTests.gsf'

In [40]:
tmp_bathy_uris[0]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/bathymetry/0000_20110519_002043_ChallengerPreTests.tiledb'

In [41]:
tmp_freq_uris[0]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency/0000_20110519_002043_ChallengerPreTests.tiledb'

In [43]:
# create the arrays to hold the bathymetry (required for multi-node and multi-proc ingestion)
for fname_uri in tmp_bathy_uris:
    storage.create_mbes_array(fname_uri, required_attributes, ctx)

In [None]:
for fname_uri in tmp_bathy_large_uris:
    storage.create_mbes_array(fname_uri, required_attributes, ctx)

In [40]:
sonar_metadata = get_sonar_metadata(manageable_files[0].replace(".gsf", ".json"))

In [45]:
len(manageable_files)

395

In [66]:
n_partitions = 10
files_blocks = scatter(manageable_files, n_partitions)
tmp_bathy_blocks = scatter(tmp_bathy_uris, n_partitions)
tmp_freq_blocks = scatter(tmp_freq_uris, n_partitions)
len(files_blocks[0])

79

In [96]:
processing_node_limit = 40
ping_slice_step = 2000
slices_per_node = 4
local_tasks_limit = 1
local_ping_slice_step = 2000
local_slices_per_task = 4

In [None]:
# GSF processing

In [59]:
reduce_task0 = ingest_gsfs_singular(files_blocks[0], processing_node_limit, ping_slice_step, slices_per_node, tmp_bathy_blocks[0], tmp_freq_blocks[0])

In [60]:
start_end_timestamps0 = reduce_task0.compute()

TileDBCloudError: Error message: received an error from the container: the container running the UDF exited (killed; exit code 137) (likely exceeded the memory limit)

Docker logs:
 - Code: 5000

In [69]:
reduce_task1 = ingest_gsfs_singular(files_blocks[1], processing_node_limit, ping_slice_step, slices_per_node, tmp_bathy_blocks[1], tmp_freq_blocks[1])

In [70]:
start_end_timestamps1 = reduce_task1.compute()

TileDBCloudError: Error message: received an error from the container: the container running the UDF exited (killed; exit code 137) (likely exceeded the memory limit)

Docker logs:
 - Code: 5000

In [99]:
reduce_task2 = ingest_gsfs_singular(files_blocks[2], processing_node_limit, ping_slice_step, slices_per_node, tmp_bathy_blocks[2], tmp_freq_blocks[2])

In [100]:
start_end_timestamps2 = reduce_task2.compute()

TileDBCloudError: Error message: context deadline exceeded

Docker logs:
 - Code: 5000

In [102]:
reduce_task3 = ingest_gsfs_singular(files_blocks[3], processing_node_limit, ping_slice_step, slices_per_node, tmp_bathy_blocks[3], tmp_freq_blocks[3])

In [103]:
start_end_timestamps3 = reduce_task3.compute()

In [104]:
start_end_timestamps3

[datetime.datetime(2011, 5, 26, 21, 52, 2, 383000),
 datetime.datetime(2011, 5, 27, 16, 56, 18, 501000)]

In [105]:
reduce_task4 = ingest_gsfs_singular(files_blocks[4], processing_node_limit, ping_slice_step, slices_per_node, tmp_bathy_blocks[4], tmp_freq_blocks[4])

In [106]:
start_end_timestamps4 = reduce_task4.compute()

TileDBCloudError: Error message: received an error from the container: the container running the UDF exited (killed; exit code 137) (likely exceeded the memory limit)

Docker logs:
 - Code: 5000

In [146]:
# cleanup and prep for resubmit of any tasks that failed (at this stage assuming the only tasks that would fail are gsf ingestion)
unfinished_files = []
unfinished_freq_uris = []
unfinished_bathy_uris = []

In [147]:
block_idxs = range(n_partitions)

for idx in block_idxs:
    for i, uri_fname in enumerate(tmp_freq_blocks[idx]):
        if fs.exists(uri_fname):
            continue

        unfinished_files.append(files_blocks[idx][i])
        unfinished_freq_uris.append(uri_fname)
        unfinished_bathy_uris.append(tmp_bathy_blocks[idx][i])

        if fs.exists(tmp_bathy_blocks[idx][i]):
            fs.rm(tmp_bathy_blocks[idx][i], recursive=True)

        storage.create_mbes_array(tmp_bathy_blocks[idx][i], required_attributes, ctx)

In [148]:
len(unfinished_files)

11

In [110]:
len(scatter(unfinished_files, 5)[0])

29

In [149]:
unfinished_partitions = 1
unfinished_files_blocks = scatter(unfinished_files, unfinished_partitions)
unfinished_bathy_blocks = scatter(unfinished_bathy_uris, unfinished_partitions)
unfinished_freq_blocks = scatter(unfinished_freq_uris, unfinished_partitions)
len(unfinished_files_blocks[0])

11

In [117]:
len(unfinished_blocks[1])

29

In [135]:
processing_node_limit = 30
ping_slice_step = 2000
slices_per_node = 4
local_tasks_limit = 1
local_ping_slice_step = 2000
local_slices_per_task = 4

In [143]:
task_idx = 0

In [144]:
reduce_task_unfinished = ingest_gsfs_singular(unfinished_files_blocks[task_idx], processing_node_limit, ping_slice_step, slices_per_node, unfinished_bathy_blocks[task_idx], unfinished_freq_blocks[task_idx])

In [145]:
start_end_timestamps_unfinished = reduce_task_unfinished.compute()

TileDBCloudError: Error message: received an error from the container: the container running the UDF exited (killed; exit code 137) (likely exceeded the memory limit)

Docker logs:
 - Code: 5000

In [None]:
# process the leftovers locally

In [150]:
local_tasks_limit = 8
local_ping_slice_step = 3000
local_slices_per_task = 10

In [151]:
task_idx = 0

In [152]:
reduce_task_unfinished = ingest_gsfs_singular(unfinished_files_blocks[task_idx], local_tasks_limit, local_ping_slice_step, local_slices_per_task, unfinished_bathy_blocks[task_idx], unfinished_freq_blocks[task_idx], local=True)

In [153]:
start_end_timestamps_unfinished = reduce_task_unfinished.compute()

In [None]:
# keep the below error

In [93]:
start_end_timestamps_unfinished = reduce_task_unfinished.compute()

TileDBCloudError: Cannot connect to the Docker daemon at tcp://localhost:2375. Is the docker daemon running? - Code: 5000

In [None]:
# timestamps

In [54]:
intermediate_bathy_arrays = fs.glob("s3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/bathymetry/**.tiledb")

In [55]:
len(intermediate_bathy_arrays)

395

In [56]:
timestamps_blocks = scatter(intermediate_bathy_arrays, 10)
len(timestamps_blocks[0])

40

In [57]:
timestamps_collect = []

In [58]:
for timestamps_block in timestamps_blocks:
    timestamps_tasks = []
    for fname in timestamps_block:
        base_name = Path(fname).stem
        uri_fname = f"s3://{fname}"
        timestamp_task = Delayed("sixy6e/start_end_timestamps", name=f"{base_name}-start-end-timestamps")(uri_fname, creds.access_key, creds.secret_key)
        timestamps_tasks.append(timestamp_task)


    reduce_task = Delayed(reduce_timestamps, "reduce-timestamps", local=True)(timestamps_tasks)
    timestamps_collect.append(reduce_task.compute())

In [59]:
final_start_end_timestamps = reduce_timestamps(timestamps_collect)

In [60]:
final_start_end_timestamps

[datetime.datetime(2011, 5, 19, 0, 20, 43, 487000),
 datetime.datetime(2011, 5, 28, 12, 11, 26, 924000)]

In [None]:
# many bathy tiledb's to one tiledb

In [111]:
array_uri

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_bathymetry.tiledb'

In [67]:
def combine_tiledbs(array_uris, out_array_uri, access_key, skey):
    """
    Append the contents of several TileDB arrays to one output array.

    Every array listed in `array_uris` is opened in turn, read in full into
    a dataframe and handed to `append_ping_dataframe`, which receives the
    dataframe, `out_array_uri` and the same credentials.

    `access_key` and `skey` are the AWS access key id and secret access key
    used to build the TileDB context for reading.
    """
    s3_credentials = {
        "vfs.s3.aws_access_key_id": access_key,
        "vfs.s3.aws_secret_access_key": skey,
    }
    ctx = tiledb.Ctx(config=tiledb.Config(s3_credentials))

    for src_uri in array_uris:
        # the source array is closed again before the append starts
        with tiledb.open(src_uri, ctx=ctx) as src:
            pings = src.df[:]

        append_ping_dataframe(pings, out_array_uri, access_key, skey)

In [44]:
intermediate_bathy_array_uris = [f"s3://{fname}" for fname in intermediate_bathy_arrays]

In [65]:
intermediate_bathy_array_uris[0]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/bathymetry/0000_20110519_002043_ChallengerPreTests.tiledb'

In [69]:
bathy_blocks = scatter(intermediate_bathy_array_uris, 12)

In [70]:
len(bathy_blocks[0])

33

In [71]:
ingest_tasks = []
for block_id, bblock in enumerate(bathy_blocks):
    task = Delayed(combine_tiledbs, name=f"ingesting-block-{block_id}", local=True)(bblock, array_uri, creds.access_key, creds.secret_key)
    ingest_tasks.append(task)

dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(ingest_tasks)

In [72]:
ingested_result = dummy_reducer.compute()

In [None]:
# NOTE: the cell frequency arrays need to be reprocessed; we've confirmed that all bathy arrays have good region codes
# intermediate_bathy_array_uris

In [None]:
# combine and reduce the cell frequency arrays

In [84]:
intermediate_freq_arrays = fs.glob("s3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency/**.tiledb")

In [85]:
len(intermediate_freq_arrays)

395

In [86]:
intermediate_freq_array_uris = [f"s3://{fname}" for fname in intermediate_freq_arrays]

In [87]:
intermediate_freq_array_uris[0]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency/0000_20110519_002043_ChallengerPreTests.tiledb'

In [88]:
intermediate_freq_array_uris[1]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency/0001_20110519_002702_ChallengerPreTests.tiledb'

In [89]:
partitions = 10

In [90]:
freq_blocks = scatter(intermediate_freq_array_uris, partitions)

In [91]:
len(freq_blocks[0])

40

In [93]:
sub_partitions = 4

In [39]:
cell_freq_blocks_tmp_dir = "s3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency-blocks-pass1/"

In [94]:
tasks = []
tasks_dict = {i: [] for i in range(partitions)}
for i, freq_block in enumerate(freq_blocks):
    sub_freq_blocks = scatter(freq_block, sub_partitions)
    for j, sub_block in enumerate(sub_freq_blocks):
        task_name = f"combine-cell-freq_block-{i}_sub-block-{j}"
        out_uri = f"{cell_freq_blocks_tmp_dir}cell-frequency-block-{i}-sub-block-{j}.tiledb"
        task = Delayed(load_and_concat, name=task_name, local=True)(sub_block, ctx, out_uri)
        tasks.append(task)
        if len(tasks_dict[i]):
            task.depends_on(tasks_dict[i][-1])
        tasks_dict[i].append(task)

dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(tasks)

In [95]:
freq_pass1_result = dummy_reducer.compute()

In [40]:
pass1_cell_freqs = fs.glob(cell_freq_blocks_tmp_dir + "**.tiledb")
len(pass1_cell_freqs)

40

In [41]:
pass1_cell_freqs[0]

'ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency-blocks-pass1/cell-frequency-block-0-sub-block-0.tiledb'

In [42]:
pass1_cell_freq_uris = [f"s3://{fname}" for fname in pass1_cell_freqs]

In [43]:
pass1_cell_freq_uris[0]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency-blocks-pass1/cell-frequency-block-0-sub-block-0.tiledb'

In [100]:
domains = []
for p1_uri in pass1_cell_freq_uris:
    with tiledb.open(p1_uri, ctx=ctx) as ds:
        domains.append(ds.nonempty_domain())

In [101]:
domains

[((0, 1007070),),
 ((0, 1414477),),
 ((0, 288136),),
 ((0, 9882425),),
 ((0, 9371543),),
 ((0, 8362559),),
 ((0, 1481274),),
 ((0, 10923194),),
 ((0, 16495232),),
 ((0, 12037945),),
 ((0, 13729277),),
 ((0, 9155998),),
 ((0, 9311652),),
 ((0, 8182732),),
 ((0, 5172546),),
 ((0, 4891603),),
 ((0, 4907285),),
 ((0, 5867419),),
 ((0, 6417760),),
 ((0, 5364690),),
 ((0, 4688849),),
 ((0, 5000046),),
 ((0, 6229752),),
 ((0, 4691809),),
 ((0, 5810146),),
 ((0, 7952419),),
 ((0, 6335173),),
 ((0, 4689733),),
 ((0, 5021095),),
 ((0, 4810180),),
 ((0, 4950232),),
 ((0, 3655926),),
 ((0, 1409),),
 ((0, 198264),),
 ((0, 5936527),),
 ((0, 5285647),),
 ((0, 4936003),),
 ((0, 4972591),),
 ((0, 6119660),),
 ((0, 5416258),)]

In [44]:
cell_freq_blocks_tmp_dir = "s3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency-blocks-pass2/"

In [45]:
partitions = 4
sub_partitions = 1

In [46]:
freq_blocks = scatter(pass1_cell_freq_uris, partitions)

In [47]:
len(freq_blocks[0])

10

In [48]:
tasks = []
tasks_dict = {i: [] for i in range(partitions)}
for i, freq_block in enumerate(freq_blocks):
    sub_freq_blocks = scatter(freq_block, sub_partitions)
    for j, sub_block in enumerate(sub_freq_blocks):
        task_name = f"combine-cell-freq_block-{i}_sub-block-{j}"
        out_uri = f"{cell_freq_blocks_tmp_dir}cell-frequency-block-{i}-sub-block-{j}.tiledb"
        task = Delayed(load_and_concat, name=task_name, local=True)(sub_block, ctx, out_uri)
        tasks.append(task)
        if len(tasks_dict[i]):
            task.depends_on(tasks_dict[i][-1])
        tasks_dict[i].append(task)

dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(tasks)

In [49]:
freq_pass2_result = dummy_reducer.compute()

In [50]:
pass2_cell_freqs = fs.glob(cell_freq_blocks_tmp_dir + "**.tiledb")
len(pass2_cell_freqs)

4

In [51]:
pass2_cell_freq_uris = [f"s3://{fname}" for fname in pass2_cell_freqs]

In [52]:
domains = []
for p2_uri in pass2_cell_freq_uris:
    with tiledb.open(p2_uri, ctx=ctx) as ds:
        domains.append(ds.nonempty_domain())

In [53]:
domains

[((0, 70893436),), ((0, 68432198),), ((0, 54224618),), ((0, 41470052),)]

In [54]:
cell_freq_blocks_tmp_dir = "s3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency-blocks-pass3/"

In [55]:
partitions = 1
sub_partitions = 2

In [56]:
freq_blocks = scatter(pass2_cell_freq_uris, partitions)

In [57]:
len(freq_blocks[0])

4

In [58]:
len(scatter(freq_blocks[0], sub_partitions)[-1])

2

In [59]:
tasks = []
tasks_dict = {i: [] for i in range(partitions)}
for i, freq_block in enumerate(freq_blocks):
    sub_freq_blocks = scatter(freq_block, sub_partitions)
    for j, sub_block in enumerate(sub_freq_blocks):
        task_name = f"combine-cell-freq_block-{i}_sub-block-{j}"
        out_uri = f"{cell_freq_blocks_tmp_dir}cell-frequency-block-{i}-sub-block-{j}.tiledb"
        task = Delayed(load_and_concat, name=task_name, local=True)(sub_block, ctx, out_uri)
        tasks.append(task)
        if len(tasks_dict[i]):
            task.depends_on(tasks_dict[i][-1])
        tasks_dict[i].append(task)

dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(tasks)

In [60]:
freq_pass3_result = dummy_reducer.compute()

In [61]:
pass3_cell_freqs = fs.glob(cell_freq_blocks_tmp_dir + "**.tiledb")
len(pass3_cell_freqs)

2

In [62]:
pass3_cell_freq_uris = [f"s3://{fname}" for fname in pass3_cell_freqs]

In [63]:
domains = []
for p3_uri in pass3_cell_freq_uris:
    with tiledb.open(p3_uri, ctx=ctx) as ds:
        domains.append(ds.nonempty_domain())

In [64]:
domains

[((0, 139285718),), ((0, 95694671),)]

In [66]:
soundings_cell_density_array_uri_15

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_soundings-cell-density-resolution-15.tiledb'

In [67]:
load_and_concat(pass3_cell_freq_uris, ctx, soundings_cell_density_array_uri_15)

In [None]:
# sparse rhealpix version of cell frequency

In [68]:
# Schema of the sparse cell frequency array: 16 dimensions R0..R15, where R0
# is an ascii dimension and R1..R15 are uint8 dimensions over 0-8, plus the
# "region_code" and "count" attributes. Everything is Zstd compressed (level 16).
index_filters = tiledb.FilterList([tiledb.ZstdFilter(level=16)])

dim0 = tiledb.Dim("R0", tile=None, dtype="ascii", filters=index_filters)
dims = [dim0]
dims.extend(
    tiledb.Dim(f"R{i}", domain=(0, 8), dtype="uint8", tile=1, filters=index_filters)
    for i in range(1, 16)
)

cell_freq_attributes = [
    tiledb.Attr("region_code", dtype=str, filters=[tiledb.ZstdFilter(level=16)]),
    tiledb.Attr("count", dtype="uint64", filters=[tiledb.ZstdFilter(level=16)]),
]

domain = tiledb.Domain(*dims)
schema = tiledb.ArraySchema(
    domain=domain,
    sparse=True,
    attrs=cell_freq_attributes,
    cell_order="row-major",
    tile_order="row-major",
    capacity=100_000,
    allows_duplicates=True,
)

In [69]:
with tiledb.scope_ctx(ctx):
    tiledb.Array.create(soundings_cell_density_sparse_array_uri_15, schema)

In [70]:
with tiledb.open(soundings_cell_density_array_uri_15, ctx=ctx) as ds:
    df = ds.df[:]
    
write_sparse_rhealpix_chunked(df, soundings_cell_density_sparse_array_uri_15, ctx, chunks=1000000)

del df

In [71]:
gc.collect()

310

In [121]:
len(intermediate_bathy_array_uris)

395

In [77]:
intermediate_bathy_array_uris[0]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/bathymetry/0000_20110519_002043_ChallengerPreTests.tiledb'

In [78]:
intermediate_cell_freq_array_uris = [tmpdir + "cell-frequency/" + Path(fname).stem + ".tiledb" for fname in intermediate_bathy_array_uris]

In [79]:
intermediate_cell_freq_array_uris[0]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency/0000_20110519_002043_ChallengerPreTests.tiledb'

In [80]:
intermediate_cell_freq_array_uris[1]

's3://ausseabed-pl019-ingested-data/L2/0331_NETasmania_Bergersen_10G/ga_ausseabed_419ae2f5-48c1-4d81-bd1f-18bf80e658cd_tmp/cell-frequency/0001_20110519_002702_ChallengerPreTests.tiledb'

In [82]:
reprocess_cell_freq_tasks = []
max_nodes = 12
count = 0
task_nodes = {i: [] for i in range(max_nodes)}
for i, cell_uri in enumerate(intermediate_bathy_array_uris):
    base_name = Path(cell_uri).stem
    cell_freq_uri = intermediate_cell_freq_array_uris[i]
    task = Delayed(cell_frequency, name=f"{base_name}-cell-frequency", local=True)(cell_uri, cell_freq_uri, creds.access_key, creds.secret_key)
    reprocess_cell_freq_tasks.append(task)
    if len(task_nodes[count]):
        task.depends_on(task_nodes[count][-1])
    task_nodes[count].append(task)
dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(reprocess_cell_freq_tasks)

In [83]:
dummy_reducer.compute()

<tiledb.cloud.compute.delayed.Delayed at 0x7f4c36f3a510>

In [81]:
def cell_frequency(array_uri, cell_frequency_uri, access_key, skey):
    """
    Calculate the frequency distribution of each region code (cell count).
    Result is written to a tiledb array.

    Only the "region_code" attribute of the array at `array_uri` is read
    (coordinates are not fetched). The rows are grouped by region code and
    the per-group count is written to `cell_frequency_uri` as a dense array
    with the columns "region_code" (str) and "count" (uint64).

    `access_key` and `skey` are the AWS access key id and secret access key
    used to build the TileDB context for both the read and the write.
    Nothing is returned.
    """
    config = tiledb.Config(
        {"vfs.s3.aws_access_key_id": access_key, "vfs.s3.aws_secret_access_key": skey}
    )
    ctx = tiledb.Ctx(config=config)
    kwargs = {
        "sparse": False,
        "column_types": {
            "region_code": str,
            "count": numpy.uint64,
        },
        "ctx": ctx,
    }

    with tiledb.open(array_uri, ctx=ctx) as ds:
        query = ds.query(attrs=["region_code"], coords=False)
        df = query.df[:]

    # one row per distinct region code, ordered by region code
    frequency_df = (
        df.groupby(["region_code"])["region_code"]
        .agg("count")
        .to_frame("count")
        .reset_index()
    )

    # use the public entry point rather than reaching into tiledb.dataframe_
    tiledb.from_pandas(cell_frequency_uri, frequency_df, **kwargs)

In [102]:
local_tasks_limit = 12
local_ping_slice_step = 3000
local_slices_per_task = 2

In [103]:
redo_ingest = ingest_gsfs_singular_no_cellfreq(reprocess_files, local_tasks_limit, local_ping_slice_step, local_slices_per_task, tmp_bathy_uris, tmp_freq_uris, local=True)

In [104]:
redo_ingest.visualize()

Visualize(value='{"nodes": ["d03ea208-ecc9-4e38-a482-41dcdfbdbbb3", "781b72a9-36f3-42b1-bec7-483dce0544df"], "…

In [105]:
start_end_timestamps_redo = redo_ingest.compute()

In [185]:
1

1

In [186]:
reprocess_cell_freq_tasks = []
max_nodes = 12
count = 0
task_nodes = {i: [] for i in range(max_nodes)}
for i, cell_uri in enumerate(tmp_bathy_uris):
    base_name = Path(cell_uri).stem
    cell_freq_uri = tmp_freq_uris[i]
    task = Delayed(cell_frequency, name=f"{base_name}-cell-frequency", local=True)(cell_uri, cell_freq_uri, creds.access_key, creds.secret_key)
    reprocess_cell_freq_tasks.append(task)
    if len(task_nodes[count]):
        task.depends_on(task_nodes[count][-1])
    task_nodes[count].append(task)
dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(reprocess_cell_freq_tasks)

In [187]:
dummy_reducer.compute()

<tiledb.cloud.compute.delayed.Delayed at 0x7fa35dad8250>

In [None]:
# create the resolution 12 soundings cell density products

In [72]:
with tiledb.open(soundings_cell_density_array_uri_15, ctx=ctx) as ds:
    df = ds.df[:]

res12_df = reduce_resoltion(df, resolution=12, chunks=1000000)

del df

In [73]:
res12_df

Unnamed: 0,region_code,count
0,R788471273825,1726
1,R788471273827,442
2,R788471273828,4518
3,R788471273851,369
4,R788471273852,5062
...,...,...
961280,S300752824066,265
961281,S300752824067,115
961282,S300752824300,212
961283,S300752824301,4


In [74]:
res12_df["count"].sum()

501018176

In [75]:
with tiledb.scope_ctx(ctx):
    tiledb.Array.create(res12_out_uri_sparse, schema)

In [76]:
write_sparse_rhealpix_chunked(res12_df, res12_out_uri_sparse, ctx, chunks=100000)

In [77]:
write_chunked(res12_df, res12_out_uri_dense, ctx, 100000)

In [41]:
# had to install a patch and restart the kernel

In [42]:
with tiledb.open(res12_out_uri_dense, ctx=ctx) as ds:
    res12_df = ds.df[:]

In [43]:
res12_df["geometry"] = rhealpix.rhealpix_geo_boundary(res12_df.region_code.values)

In [82]:
@jit(nopython=True)
def test_str_set():
    """
    Probe whether numba's nopython mode can compile a membership test
    against a set of strings.

    In the recorded run of this notebook the call raised a NumbaValueError
    ("Use of reference counted items in 'set()' is unsupported").
    """
    # building a set from str items is the construct under test
    data = set(["R", "Q", "P", "O"])
    print("T" in data)

In [84]:
test_str_set()

NumbaValueError: Failed in nopython mode pipeline (step: native lowering)
[1m[1mUse of reference counted items in 'set()' is unsupported, offending type is: 'unicode_type'.[0m
[0m[1mDuring: lowering "data = call $2load_global.0($12build_list.5, func=$2load_global.0, args=[Var($12build_list.5, 831108089.py:3)], kws=(), vararg=None, target=None)" at /tmp/ipykernel_8278/831108089.py (3)[0m

In [85]:
@jit(nopython=True)
def test_str_list():
    """
    Probe whether numba's nopython mode can compile a membership test
    against a list of strings.

    In the recorded run of this notebook the call compiled and printed False.
    """
    # same items as the set-based probe, held in a list instead
    data = ["R", "Q", "P", "O"]
    print("T" in data)

In [86]:
test_str_list()

False


In [44]:
gdf = geopandas.GeoDataFrame(res12_df, crs="epsg:4326")

In [45]:
gdf

Unnamed: 0,region_code,count,geometry
0,R788471273825,1726,"POLYGON ((148.37203 -40.31026, 148.37220 -40.3..."
1,R788471273827,442,"POLYGON ((148.37186 -40.31045, 148.37203 -40.3..."
2,R788471273828,4518,"POLYGON ((148.37203 -40.31045, 148.37220 -40.3..."
3,R788471273851,369,"POLYGON ((148.37186 -40.31064, 148.37203 -40.3..."
4,R788471273852,5062,"POLYGON ((148.37203 -40.31064, 148.37220 -40.3..."
...,...,...,...
961280,S300752824066,265,"POLYGON ((147.95618 -44.44680, 147.95636 -44.4..."
961281,S300752824067,115,"POLYGON ((147.95624 -44.44700, 147.95641 -44.4..."
961282,S300752824300,212,"POLYGON ((147.95601 -44.44680, 147.95618 -44.4..."
961283,S300752824301,4,"POLYGON ((147.95606 -44.44700, 147.95624 -44.4..."


In [46]:
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    gdf.to_file(soundings_cell_density_uri, driver="GeoJSONSeq", coordinate_precision=11)

In [47]:
dissolved = geopandas.GeoDataFrame(geometry.dissolve(gdf), crs="epsg:4326")

  aout[:] = out


In [48]:
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    dissolved.to_file(coverage_uri, driver="GeoJSONSeq", coordinate_precision=11)

In [49]:
res12_df["count"].max()

31504958

In [50]:
res12_df["count"].min()

1

In [51]:
dggs = rhealpix.RhealpixDGGS.from_ellipsoid()

In [52]:
dggs.cell_width(12)

18.8309796906348

In [53]:
# Covered area: number of resolution-12 cells times the area of one cell,
# divided by 10000 (presumably m² -> ha, assuming cell_width is in metres;
# TODO confirm).
n_cells = len(gdf)
area_ha = n_cells * dggs.cell_width(12) ** 2 / 10000
sonar_metadata["area_ha"] = area_ha
area_ha

34087.723271273644

In [61]:
# Truncate every region code to its first 10 characters and sum the counts
# of all cells that share the truncated code.
gdf2 = (
    geopandas.GeoDataFrame(
        {
            "region_code": gdf.region_code.str[0:10],
            "count": gdf["count"],
        }
    )
    .groupby(["region_code"])["count"]
    .agg("sum")
    .to_frame("count")
    .reset_index()
)

In [62]:
gdf2["count"].max()

62659587

In [63]:
gdf2["count"].min()

1

In [64]:
gdf2

Unnamed: 0,region_code,count
0,R788471273,14814
1,R788471274,642591
2,R788471277,722077
3,R788471278,1483464
4,R788471286,216544
...,...,...
3782,S300752812,6871
3783,S300752820,18759
3784,S300752821,78196
3785,S300752823,14940


In [65]:
# One pair of slices per coarse cell, built from the four-element bounds of
# its boundary geometry: (first .. third element, second .. fourth element).
# NOTE(review): presumably (minx, miny, maxx, maxy) as returned by shapely —
# confirm against rhealpix_geo_boundary.
slices = [
    (slice(x_min, x_max), slice(y_min, y_max))
    for x_min, y_min, x_max, y_max in (
        geom.bounds
        for geom in rhealpix.rhealpix_geo_boundary(gdf2.region_code.values, round_coords=False)
    )
]

In [69]:
n_partitions = 12
n_sub_partitions = 3
blocks = scatter(slices, n_partitions)

In [70]:
len(blocks), len(blocks[0])

(12, 316)

In [71]:
len(scatter(blocks[0], n_sub_partitions)[0])

106

In [72]:
# Statistics are gathered for X and Y first, followed by every required
# attribute except "timestamp" and "region_code".
excluded_attrs = ("timestamp", "region_code")
stats_attrs = ["X", "Y"] + [at for at in required_attributes if at not in excluded_attrs]

In [73]:
# Build the statistics task graph:
#   * one incremental-statistics task per (block, sub-block, attribute),
#   * one reduce task per attribute over all of that attribute's tasks,
#   * a final local task gathering every per-attribute result.
tasks_dict = {stat: [] for stat in stats_attrs}
reduce_tasks = []

for i, block in enumerate(blocks):
    sub_blocks = scatter(block, n_sub_partitions)

    for si, sub_block in enumerate(sub_blocks):
        for attribute in stats_attrs:
            # X and Y are additionally passed through the UDF's `schema`
            # keyword. A dedicated name is used so the notebook-level `schema`
            # (the tiledb.ArraySchema built earlier) is not overwritten.
            attr_schema = attribute if attribute in ["X", "Y"] else None

            task_name = f"block-{i}-sub_block-{si}-{attribute}"
            task = Delayed("sixy6e/basic_statistics_incremental", name=task_name)(array_uri, config_dict, attribute, schema=attr_schema, idxs=sub_block, summarise=False)

            # Chain every task of an attribute onto the previous one. The
            # earlier `> 1` test left the second task without a dependency on
            # the first, unlike the other task graphs in this notebook.
            if tasks_dict[attribute]:
                task.depends_on(tasks_dict[attribute][-1])

            tasks_dict[attribute].append(task)

for attribute in stats_attrs:
    task_name = f"reduce-attibute-{attribute}"
    reducer_task = Delayed("sixy6e/basic_statistics_reduce", name=task_name)(tasks_dict[attribute], attribute)
    reduce_tasks.append(reducer_task)

collect_stats_task = Delayed(gather_stats, local=True, name="gather-stats")(reduce_tasks)

In [74]:
stats_results = collect_stats_task.compute()

In [75]:
crs_info = {
    "horizontal_datum": "epsg:4326",
    "vertical_datum": "epsg:5714",
}

In [76]:
with tiledb.open(array_uri, "w", ctx=ctx) as ds:
    ds.meta["crs_info"] = json.dumps(crs_info)
    ds.meta["basic_statistics"] = json.dumps(stats_results, cls=stac_metadata.Encoder)

In [77]:
# Read the spreadsheet metadata JSON document from S3.
with fs.open(asb_metadata_uri) as metadata_file:
    asb_metadata = json.load(metadata_file)

In [78]:
dataset_metadata = stac_metadata.prepare(
    uid,
    sonar_metadata,
    stats_results,
    asb_metadata,
    array_uri,
    coverage_uri,
    soundings_cell_density_uri,
    creds.access_key,
    creds.secret_key,
    final_start_end_timestamps,
    outdir_uri,
    stac_md_uri,
)

In [79]:
stats_results.keys()

dict_keys(['X', 'Y', 'Z', 'across_track', 'along_track', 'beam_angle', 'beam_angle_forward', 'beam_flags', 'beam_number', 'centre_beam', 'course', 'depth_corrector', 'gps_tide_corrector', 'heading', 'heave', 'height', 'horizontal_error', 'ping_flags', 'pitch', 'roll', 'sector_number', 'separation', 'speed', 'tide_corrector', 'travel_time', 'vertical_error'])

In [80]:
stats_results["Z"]

{'minimum': -3.143967600279552e+16,
 'maximum': 349.9800109863281,
 'count': 532307589,
 'total': 19340187835.096706,
 'mean': 36.332729862877656,
 'variance': 1005.9687584293672,
 'stddev': 31.717010553161646,
 'skewness': 1.7290816342976785,
 'kurtosis': 4.192311369979897,
 'name': 'Z'}