In [1]:
import boto3
import io
import urllib
import s3fs
import json
from pathlib import Path
import attr
import numpy
import tiledb
import tiledb.cloud
from tiledb.cloud.compute import DelayedArrayUDF, Delayed
import pandas
import geopandas
import fiona
from fiona.session import AWSSession
import pystac
from scipy.stats import skew, kurtosis
import uuid
from numba import jit
import gc

In [2]:
import pystac
from pystac.extensions.projection import ProjectionExtension
from pystac.extensions.pointcloud import (
PointcloudExtension,
SchemaType,
PhenomenologyType,
Schema,
Statistic,
)

In [3]:
from reap_gsf import reap, data_model
from bathy_datasets import rhealpix, storage, geometry, asb_spreadsheet,stac_metadata

In [4]:
session = boto3.Session()
creds = session.get_credentials()

In [5]:
fs = s3fs.S3FileSystem(key=creds.access_key, secret=creds.secret_key, use_listings_cache=False)

In [6]:
uid = uuid.uuid4()

In [7]:
survey_uri = "s3://ausseabed-pl019-provided-data/CSIRO/IN2018_T02/GSF/em710/"
outdir_uri = "s3://ausseabed-pl019-ingested-data/L2/IN2018_T02-em710/"
asb_metadata_uri = "s3://ausseabed-pl019-provided-data/CSIRO/IN2018_T02/GSF/em710/IN2018_T02_EM710.json"
survey_info_uri = "s3://ausseabed-pl019-provided-data/CSIRO/IN2018_T02/GSF/em710/schema-info.json"

In [8]:
base_prefix = "ga_ausseabed"
array_name = f"{base_prefix}_{uid}_bathymetry"
array_uri = f"{outdir_uri}{array_name}.tiledb"
tiledb_array_uri = f"tiledb://sixy6e/{array_name}"
soundings_cell_density_uri = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12.geojson"
coverage_uri = f"{outdir_uri}{base_prefix}_{uid}_coverage.geojson"
stac_md_uri = f"{outdir_uri}{base_prefix}_{uid}_stac-metadata.geojson"

In [9]:
soundings_cell_density_uri_15 = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-15.geojson"
soundings_cell_density_array_uri_15 = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-15.tiledb"
soundings_cell_density_sparse_array_uri_15 = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-15-sparse-array.tiledb"

In [10]:
res12_out_uri_sparse = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12-sparse-array.tiledb"
res12_out_uri_dense = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12-dense-array.tiledb"

In [11]:
tmpdir = f"{outdir_uri}{base_prefix}_{uid}_tmp/"

In [12]:
tmpdir

's3://ausseabed-pl019-ingested-data/L2/IN2015_T01-em122/ga_ausseabed_1fa2a13d-4985-4a5e-a175-a0de5305aedb_tmp/'

In [13]:
array_uri

's3://ausseabed-pl019-ingested-data/L2/IN2015_T01-em122/ga_ausseabed_1fa2a13d-4985-4a5e-a175-a0de5305aedb_bathymetry.tiledb'

In [14]:
def get_sonar_metadata(json_uri):
    """
    Temporary helper for pulling sonar metadata out of a sample GSF file.

    The JSON document at ``json_uri`` supplies the ``gsf_uri`` of the GSF
    file. The registered ``sixy6e/retrieve_stream`` and ``sixy6e/decode_gsf``
    UDFs are run (the decode is given ``slice(10)``), and the first record of
    file-info entries 3 and 6 is read from the retrieved stream. The fields
    of the entry-6 record are copied onto the entry-3 record, which is
    returned.

    Relies on the notebook-level ``fs`` and ``creds`` objects.
    """
    with fs.open(json_uri) as sidecar:
        sidecar_md = json.load(sidecar)

    retrieve = Delayed("sixy6e/retrieve_stream", name="retrieve")(
        sidecar_md["gsf_uri"], creds.access_key, creds.secret_key
    )
    decode = Delayed("sixy6e/decode_gsf", name="decode", image_name="3.7-geo")(
        retrieve, slice(10)
    )

    # the decoded dataframe is not needed here, only the file info
    _, file_info = decode.compute()

    sonar_metadata = file_info[3].record(0).read(retrieve.result()[0])
    history_record = file_info[6].record(0).read(retrieve.result()[0])

    for field, field_value in attr.asdict(history_record).items():
        sonar_metadata[field] = field_value

    return sonar_metadata


def reduce_region_codes_timestamps(results):
    """
    The reduce part of the map-reduce construct for results that carry both
    region_code counts and start/end timestamps.

    :param results:
        A list of 2-item results. Item 0 is a dataframe with ``region_code``
        and ``count`` columns; item 1 is a (start, end) timestamp pair.

    :return:
        A tuple of (dataframe of summed counts per region_code,
        [earliest start, latest end] as python datetimes).
    """
    count_frames = []
    starts = []
    ends = []
    for result in results:
        count_frames.append(result[0])
        starts.append(result[1][0])
        ends.append(result[1][1])

    # sum the partial counts of every region_code
    combined = pandas.concat(count_frames)
    summed = combined.groupby(["region_code"])["count"].sum()
    cell_count = summed.to_frame("count").reset_index()

    # overall extent = earliest start and latest end
    extents = pandas.DataFrame({"start_datetime": starts, "end_datetime": ends})
    start_end_timestamp = [
        extents["start_datetime"].min().to_pydatetime(),
        extents["end_datetime"].max().to_pydatetime(),
    ]

    return cell_count, start_end_timestamp


def reduce_region_codes(region_codes):
    """
    The reduce part of the map-reduce construct for the region_code counts.

    :param region_codes:
        A list of dataframes, each with ``region_code`` and ``count`` columns.

    :return:
        A dataframe holding one row per region_code with the summed count.
    """
    combined = pandas.concat(region_codes)
    summed = combined.groupby(["region_code"])["count"].sum()
    return summed.to_frame("count").reset_index()


def reduce_timestamps(timestamps):
    """
    The reduce part of the map-reduce construct for the start/end timestamps.

    :param timestamps:
        A list of (start, end) timestamp pairs.

    :return:
        A 2-item list of python datetimes: the earliest start and the
        latest end across all pairs.
    """
    starts = [pair[0] for pair in timestamps]
    ends = [pair[1] for pair in timestamps]
    extents = pandas.DataFrame({"start_datetime": starts, "end_datetime": ends})

    earliest = extents["start_datetime"].min().to_pydatetime()
    latest = extents["end_datetime"].max().to_pydatetime()

    return [earliest, latest]


def gather_stats(results):
    """
    Merge the dicts returned by all the stats tasks into a single dict.

    When the same key appears in more than one result, the value from the
    later result wins.
    """
    merged = {}
    for partial in results:
        merged.update(partial)
    return merged

In [15]:
def retrieve_stream(uri, access_key, skey):
    """
    Download an S3 object into memory and return it as a seekable stream.

    Not testing the creation of the stream object at this point.
    But for testing, we also need to keep the download to occur only
    once.

    :param uri:
        The ``s3://bucket/key`` URI of the object to download.

    :param access_key:
        AWS access key id.

    :param skey:
        AWS secret access key.

    :return:
        A tuple of (``io.BytesIO`` holding the full object, size in bytes).
    """
    # a bare `import urllib` does not guarantee the `parse` submodule is loaded
    import urllib.parse

    session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=skey)
    s3_resource = session.resource("s3")
    parsed = urllib.parse.urlparse(uri)

    # strip the leading "/" from the path to get the object key
    obj = s3_resource.Object(bucket_name=parsed.netloc, key=parsed.path[1:])
    payload = obj.get()["Body"].read()

    # the whole object was read, so its size is the payload length; this
    # avoids going back to the resource for `content_length`
    return io.BytesIO(payload), len(payload)


def append_ping_dataframe(dataframe, array_uri, access_key, skey):
    """
    Append the ping dataframe read from a GSF file to a sparse TileDB array.

    A TileDB context carrying the supplied S3 credentials is built for the
    write, and the dataframe is written in "append" mode.
    """
    s3_credentials = {
        "vfs.s3.aws_access_key_id": access_key,
        "vfs.s3.aws_secret_access_key": skey,
    }
    ctx = tiledb.Ctx(config=tiledb.Config(s3_credentials))

    tiledb.dataframe_.from_pandas(
        array_uri,
        dataframe,
        mode="append",
        sparse=True,
        ctx=ctx,
    )


def ingest_gsf_slice(
    file_record,
    stream,
    access_key,
    skey,
    array_uri,
    idx=slice(None),
    cell_frequency=False,
):
    """
    Ingest one slice of pings from a GSF stream into a TileDB array.

    General steps:
    Extract the ping data selected by ``idx``.
    Calculate the rHEALPIX code (``rhealpix_code`` is called with 15).
    Optionally (``cell_frequency=True``) summarise the rHEALPIX codes as a
    frequency count and get the timestamps of the first and last pings.
    Append the ping data to the TileDB array at ``array_uri``.

    :return:
        A tuple of (cell_count, start_end_time). Both items are ``None``
        when ``cell_frequency`` is False.
    """
    pings = data_model.SwathBathymetryPing.from_records(file_record, stream, idx)
    ping_df = pings.ping_dataframe
    ping_df["region_code"] = rhealpix.rhealpix_code(ping_df.X, ping_df.Y, 15)

    cell_count = None
    start_end_time = None

    if cell_frequency:
        # frequency of dggs cells
        per_code = ping_df.groupby(["region_code"])["region_code"].agg("count")
        cell_count = per_code.to_frame("count").reset_index()

        start_end_time = [
            ping_df.timestamp.min().to_pydatetime(),
            ping_df.timestamp.max().to_pydatetime(),
        ]

    # write to tiledb array
    append_ping_dataframe(ping_df, array_uri, access_key, skey)

    return cell_count, start_end_time


def ingest_gsf_slices(gsf_uri, access_key, skey, array_uri, slices, cell_frequency=False):
    """
    Ingest a list of ping slices from a given GSF file.

    The GSF file is downloaded once and every slice is ingested from that
    same in-memory stream.

    :return:
        A tuple of (cell_count, start_end_timestamp) aggregated over all
        slices. Both items are ``None`` when ``cell_frequency`` is False.
    """
    stream, stream_length = retrieve_stream(gsf_uri, access_key, skey)
    ping_file_record = reap.file_info(stream, stream_length)[1]

    per_slice = [
        ingest_gsf_slice(
            ping_file_record, stream, access_key, skey, array_uri, idx, cell_frequency
        )
        for idx in slices
    ]

    if not cell_frequency:
        return None, None

    # aggregate the ping slices and calculate the cell counts
    combined = pandas.concat([result[0] for result in per_slice])
    summed = combined.groupby(["region_code"])["count"].agg("sum")
    cell_count = summed.to_frame("count").reset_index()

    # aggregate the min and max timestamps, then find the min max timestamps
    extents = pandas.DataFrame(
        {
            "start_datetime": [result[1][0] for result in per_slice],
            "end_datetime": [result[1][1] for result in per_slice],
        }
    )
    start_end_timestamp = [
        extents.start_datetime.min().to_pydatetime(),
        extents.end_datetime.max().to_pydatetime(),
    ]

    return cell_count, start_end_timestamp

In [16]:
def dummy_reducer(results):
    """Placeholder reduce step: report how many results were collected."""
    n_results = len(results)
    return n_results

def scatter(iterable, n):
    """
    Evenly scatters an iterable into `n` blocks.
    Sourced from:
    http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts

    :param iterable:
        A sized, sliceable sequence; preferably a 1D list or array.

    :param n:
        An integer indicating how many blocks to create.

    :return:
        A `list` consisting of `n` blocks of roughly equal size, each
        containing elements from `iterable`. The first
        ``len(iterable) % n`` blocks hold one extra element.
    """
    base_size, remainder = divmod(len(iterable), n)

    blocks = []
    start = 0
    for block_id in range(n):
        # the leading `remainder` blocks absorb the leftover elements
        stop = start + base_size + (1 if block_id < remainder else 0)
        blocks.append(iterable[start:stop])
        start = stop

    return blocks

In [None]:
def cell_frequency(array_uri, cell_frequency_uri, access_key, skey):
    """
    Calculate the frequency distribution of each region code (cell count).

    The ``region_code`` attribute of the array at ``array_uri`` is read in
    full, the occurrences of every code are counted, and the result is
    written as a dense TileDB array to ``cell_frequency_uri``.
    """
    s3_credentials = {
        "vfs.s3.aws_access_key_id": access_key,
        "vfs.s3.aws_secret_access_key": skey,
    }
    ctx = tiledb.Ctx(config=tiledb.Config(s3_credentials))

    with tiledb.open(array_uri, ctx=ctx) as ds:
        codes_df = ds.query(attrs=["region_code"], coords=False).df[:]

    occurrences = codes_df.groupby(["region_code"])["region_code"].agg("count")
    frequency_df = occurrences.to_frame("count").reset_index()

    tiledb.dataframe_.from_pandas(
        cell_frequency_uri,
        frequency_df,
        sparse=False,
        column_types={
            "region_code": str,
            "count": numpy.uint64,
        },
        ctx=ctx,
    )


def start_end_timestamps(array_uri, access_key, skey):
    """
    Find the min/max of the timestamp attribute of a TileDB array.

    :return:
        A 2-item list of python datetimes: [earliest, latest].
    """
    s3_credentials = {
        "vfs.s3.aws_access_key_id": access_key,
        "vfs.s3.aws_secret_access_key": skey,
    }
    ctx = tiledb.Ctx(config=tiledb.Config(s3_credentials))

    with tiledb.open(array_uri, ctx=ctx) as ds:
        timestamps = ds.query(attrs=["timestamp"], coords=False).df[:].timestamp

    return [
        timestamps.min().to_pydatetime(),
        timestamps.max().to_pydatetime(),
    ]

In [17]:
def ingest_gsfs_singular(files, processing_node_limit, ping_slice_step, slices_per_node, array_uris, cell_freq_uris, local=False):
    """
    Prototype ingester. Multi-file method. Each GSF will have a corresponding TileDB array.
    Merging the per-file TileDB arrays into one is not done here.

    Builds a task graph (nothing is executed in this function):
    * per GSF file, the pings are cut into slices of ``ping_slice_step``
      pings, and the slices are grouped ``slices_per_node`` at a time into
      ingest tasks;
    * ingest tasks are dealt round-robin over ``processing_node_limit``
      chains, each task depending on the previous task of its chain;
    * per file, a cell-frequency task depends on all of that file's ingest
      tasks, and a start/end timestamps task depends on the cell-frequency
      task;
    * a final local task reduces all the timestamps tasks.

    :param files:
        GSF pathnames. A JSON metadata file with the same name (".gsf"
        replaced by ".json") is read for each of them via the global ``fs``.

    :param processing_node_limit:
        Number of round-robin task chains.

    :param ping_slice_step:
        Number of pings per slice.

    :param slices_per_node:
        Number of slices handled by a single ingest task.

    :param array_uris:
        Output TileDB array URIs, indexed in step with ``files``.

    :param cell_freq_uris:
        Output cell-frequency array URIs, indexed in step with ``files``.

    :param local:
        If True, the tasks wrap the functions defined in this notebook and
        are created with ``local=True``; otherwise they reference the
        registered "sixy6e/..." UDFs.

    :return:
        The (not yet computed) ``Delayed`` reduce task wrapping
        ``reduce_timestamps``.
    """

    node_counter = 0
    # NOTE(review): `tasks` and `cell_frequency_tasks` are only appended to
    # and never read within this function.
    tasks = []
    # the ingest tasks assigned to each chain, in creation order
    tasks_dict = {n: [] for n in range(processing_node_limit)}
    # the ingest tasks belonging to each file
    files_dict = {fname: [] for fname in files}
    cell_frequency_tasks = []
    timestamps_tasks = []

    for enumi, pathname in enumerate(files):
        metadata_pathname = pathname.replace(".gsf", ".json")
        base_name = Path(pathname).stem
        with fs.open(metadata_pathname) as src:
            gsf_metadata = json.loads(src.read())

        ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]

        # the last slice may run past ping_count
        slices = [slice(start, start+ping_slice_step) for start in numpy.arange(0, ping_count, ping_slice_step)]
        slice_chunks = [slices[i:i+slices_per_node] for i in range(0, len(slices), slices_per_node)]

        array_uri = array_uris[enumi]
        cell_freq_uri = cell_freq_uris[enumi]

        for slice_chunk in slice_chunks:
            start_idx = slice_chunk[0].start
            end_idx = slice_chunk[-1].stop
            task_name = f"{base_name}-{start_idx}-{end_idx}-{node_counter}"
            
            if local:
                task = Delayed(ingest_gsf_slices, name=task_name, local=True)(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk, cell_frequency=False)
            else:
                task = Delayed("sixy6e/ingest_gsf_slices", name=task_name, image_name="3.7-geo", timeout=1800)(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk, cell_frequency=False)

            # chain onto the previous task of this chain so that the tasks
            # of one chain are ordered one after another
            if len(tasks_dict[node_counter]):
                task.depends_on(tasks_dict[node_counter][-1])

            tasks.append(task)
            tasks_dict[node_counter].append(task)
            node_counter += 1

            files_dict[pathname].append(task)

            # wrap around to the first chain
            if node_counter == processing_node_limit:
                node_counter = 0

        if local:
            cell_freq_task = Delayed(cell_frequency, name=f"{base_name}-cell-frequency", local=True)(array_uri, cell_freq_uri, creds.access_key, creds.secret_key)
            timestamp_task = Delayed(start_end_timestamps, name=f"{base_name}-start-end-timestamps", local=True)(array_uri, creds.access_key, creds.secret_key)
        else:
            cell_freq_task = Delayed("sixy6e/cell_frequency", name=f"{base_name}-cell-frequency")(array_uri, cell_freq_uri, creds.access_key, creds.secret_key)
            timestamp_task = Delayed("sixy6e/start_end_timestamps", name=f"{base_name}-start-end-timestamps")(array_uri, creds.access_key, creds.secret_key)
        
        # the cell frequency reads the file's array, so it must wait for
        # every ingest task of this file
        for dep in files_dict[pathname]:
            cell_freq_task.depends_on(dep)


        #timestamp_task = Delayed("sixy6e/start_end_timestamps", name=f"{base_name}-start-end-timestamps")(array_uri, creds.access_key, creds.secret_key)
        timestamp_task.depends_on(cell_freq_task)

        cell_frequency_tasks.append(cell_freq_task)
        timestamps_tasks.append(timestamp_task)


    # NOTE(review): "reduce-timestamps" is passed positionally here, unlike
    # the `name=` keyword used for every other task -- confirm that Delayed
    # treats it as the task name.
    reduce_task = Delayed(reduce_timestamps, "reduce-timestamps", local=True)(timestamps_tasks)
    
    return reduce_task

In [18]:
def load_and_concat(array_uris, ctx, out_uri=None):
    """
    Load several cell-frequency arrays and sum their counts per region_code.

    :param array_uris:
        URIs of TileDB arrays holding ``region_code`` and ``count`` columns.

    :param ctx:
        The TileDB context used for reading (and writing).

    :param out_uri:
        If given, the summarised dataframe is written there in chunks of
        1000000 rows and nothing is returned.

    :return:
        The summarised dataframe when ``out_uri`` is None, otherwise None.
    """
    frames = []
    for uri in array_uris:
        with tiledb.open(uri, ctx=ctx) as ds:
            frames.append(ds.df[:])

    combined = pandas.concat(frames, copy=False)
    summed = combined.groupby(["region_code"])["count"].sum()
    summarised = summed.to_frame("count").reset_index()

    if out_uri is not None:
        write_chunked(summarised, out_uri, ctx, chunks=1000000)
        return None

    return summarised

In [19]:
@jit(nopython=True)
def strtoint(s):
    """Convert a single digit character to its integer value."""
    # 48 is the code point of "0", so "0".."9" map to 0..9
    return ord(s) - 48

@jit(nopython=True)
def _unpack_code(region_codes: numpy.ndarray, ncodes, res):
    """
    Split each region code into its leading character and per-position digits.

    :param region_codes:
        Array of region code strings.

    :param ncodes:
        Number of codes to process (the first ``ncodes`` entries are read).

    :param res:
        Number of characters read from each code: character 0 goes into the
        first returned array, characters 1..res-1 go into "R1".."R{res-1}".

    :return:
        A tuple of (array with the first character of every code,
        dict of "R1".."R15" -> uint8 array). Columns at or beyond ``res``
        are left as zeros.
    """
    # column names indexed by character position: "R0", "R1", ...
    resolutions = [str(f"R{i}") for i in range(res)]
    # fixed set of 15 digit columns, whatever `res` is
    unpacked = {
        "R1": numpy.zeros(ncodes, dtype="uint8"),
        "R2": numpy.zeros(ncodes, dtype="uint8"),
        "R3": numpy.zeros(ncodes, dtype="uint8"),
        "R4": numpy.zeros(ncodes, dtype="uint8"),
        "R5": numpy.zeros(ncodes, dtype="uint8"),
        "R6": numpy.zeros(ncodes, dtype="uint8"),
        "R7": numpy.zeros(ncodes, dtype="uint8"),
        "R8": numpy.zeros(ncodes, dtype="uint8"),
        "R9": numpy.zeros(ncodes, dtype="uint8"),
        "R10": numpy.zeros(ncodes, dtype="uint8"),
        "R11": numpy.zeros(ncodes, dtype="uint8"),
        "R12": numpy.zeros(ncodes, dtype="uint8"),
        "R13": numpy.zeros(ncodes, dtype="uint8"),
        "R14": numpy.zeros(ncodes, dtype="uint8"),
        "R15": numpy.zeros(ncodes, dtype="uint8"),
    }
    # the first character of each code is kept as a string
    r0 = numpy.zeros(ncodes, dtype="<U1")
    for i in range(ncodes):
        code = str(region_codes[i])
        r0[i] = code[0]
        # the remaining characters are stored as integers
        for j in range(1, res):
            unpacked[resolutions[j]][i] = strtoint(code[j])
    return r0, unpacked

In [20]:
def unpack_code(region_codes: numpy.ndarray, dataframe=True):
    """
    Unpack region codes into one column per character position.

    The length of the first code decides how many characters are unpacked
    from every code.

    :param region_codes:
        Array of region code strings.

    :param dataframe:
        If True return a ``pandas.DataFrame``, otherwise a dict of arrays.

    :return:
        Columns "R0" (first character) followed by the digit columns
        produced by ``_unpack_code``.
    """
    code_length = len(region_codes[0])
    fixed_width = region_codes.astype(f"<U{code_length}")
    r0, digits = _unpack_code(fixed_width, fixed_width.shape[0], code_length)

    columns = {"R0": r0}
    for name in digits:
        columns[name] = digits[name]

    return pandas.DataFrame(columns) if dataframe else columns

In [21]:
def reduce_resoltion(df, resolution=12, chunks=10000):
    """
    Aggregate region_code counts at a coarser resolution.

    Every region code is truncated to its first ``resolution + 1``
    characters and the counts of codes sharing the same truncated code are
    summed. The dataframe is processed ``chunks`` rows at a time, folding
    each chunk into a running total.

    :param df:
        Dataframe with ``region_code`` (string) and ``count`` columns.

    :param resolution:
        The target resolution.

    :param chunks:
        Number of rows processed per step.

    :return:
        Dataframe with one row per truncated region_code and its summed count.
    """
    code_length = resolution + 1

    def _truncate(frame):
        return pandas.DataFrame(
            {
                "region_code": frame.region_code.str[0:code_length],
                "count": frame["count"].values,
            }
        )

    def _sum_counts(frame):
        summed = frame.groupby(["region_code"])["count"].agg("sum")
        return summed.to_frame("count").reset_index()

    offsets = numpy.arange(0, df.shape[0], chunks)

    # seed the running total with the first chunk
    first = offsets[0]
    running = _sum_counts(_truncate(df[first:first + chunks]))

    for offset in offsets[1:]:
        piece = _truncate(df[offset:offset + chunks])
        running = _sum_counts(pandas.concat([running, piece], copy=False))

    return running

In [22]:
def write_chunked(df, out_uri, ctx, chunks=10000):
    """
    Write a region_code/count dataframe to a dense TileDB array in chunks.

    The first chunk is written without a mode (so with the writer's
    default); every following chunk is written in "append" mode, starting
    at the row index where the previous chunk ended.
    """
    write_kwargs = {
        "sparse": False,
        "column_types": {"region_code": str, "count": numpy.uint64},
        "ctx": ctx,
    }

    rows_written = 0
    for start in range(0, df.shape[0], chunks):
        subset = df[start:start + chunks]
        write_kwargs["row_start_idx"] = rows_written
        tiledb.dataframe_.from_pandas(out_uri, subset, **write_kwargs)

        # anything after the first write extends the array
        write_kwargs["mode"] = "append"
        rows_written += len(subset)


def write_sparse_rhealpix_chunked(df, out_uri, ctx, chunks=10000):
    """
    Append a region_code/count dataframe to a sparse TileDB array in chunks.

    Each chunk's region codes are unpacked into per-position columns, and
    the original ``region_code`` and ``count`` values are added alongside.

    Requires the output array to have already been created.
    """
    for start in range(0, df.shape[0], chunks):
        subset = df[start:start + chunks]
        codes = subset.region_code.values

        unpacked = unpack_code(codes)
        unpacked["region_code"] = codes
        unpacked["count"] = subset["count"].values

        tiledb.dataframe_.from_pandas(
            out_uri, unpacked, mode="append", sparse=True, ctx=ctx
        )

In [23]:
def filter_empty_files(files):
    """
    Filter out GSF's containing no Pings so we don't attempt to process them.

    The ping count is read from each file's JSON metadata (same pathname
    with ".gsf" replaced by ".json"), opened through the global ``fs``.

    :return:
        A tuple of (files with pings, files without pings).
    """
    non_empty_files = []
    empty_files = []

    for pathname in files:
        with fs.open(pathname.replace(".gsf", ".json")) as src:
            gsf_metadata = json.load(src)

        ping_record = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]
        target = empty_files if ping_record["record_count"] == 0 else non_empty_files
        target.append(pathname)

    return non_empty_files, empty_files


def filter_large_files(files, size_limit_mb):
    """
    Filter out GSF's that are larger than size_limit_mb so that they're processed locally.

    The size is read from each file's JSON metadata (same pathname with
    ".gsf" replaced by ".json"), opened through the global ``fs``.

    :return:
        A tuple of (files within the limit, files over the limit).
    """
    manageable_files = []
    large_files = []

    for pathname in files:
        with fs.open(pathname.replace(".gsf", ".json")) as src:
            gsf_metadata = json.load(src)

        # filter large files before hand, so we can still pass them through here but process locally
        size_mb = gsf_metadata["size"] / 1024 / 1024
        target = large_files if size_mb > size_limit_mb else manageable_files
        target.append(pathname)

    return manageable_files, large_files

In [42]:
def combine_tiledbs(array_uris, out_array_uri, access_key, skey):
    """
    Append the full contents of several TileDB arrays to one output array.

    Each source array is read entirely into a dataframe before being
    appended to ``out_array_uri``.
    """
    s3_credentials = {
        "vfs.s3.aws_access_key_id": access_key,
        "vfs.s3.aws_secret_access_key": skey,
    }
    ctx = tiledb.Ctx(config=tiledb.Config(s3_credentials))

    for source_uri in array_uris:
        with tiledb.open(source_uri, ctx=ctx) as ds:
            source_df = ds.df[:]

        append_ping_dataframe(source_df, out_array_uri, access_key, skey)

In [24]:
with fs.open(survey_info_uri) as src:
    survey_info = json.loads(src.read())

In [25]:
#required_attributes = survey_info["schemas"][0]
# This is temporary. It would be better to define the list internally, or to derive it programmatically as the union of the schemas of all pings.
required_attributes = [
    "Z",
    "across_track",
    "along_track",
    "beam_angle",
    "beam_angle_forward",
    "beam_flags",
    "beam_number",
    "centre_beam",
    "course",
    "depth_corrector",
    "gps_tide_corrector",
    "heading",
    "heave",
    "height",
    "horizontal_error",
    "ping_flags",
    "pitch",
    "roll",
    "sector_number",
    "separation",
    "speed",
    "tide_corrector",
    "timestamp",
    "travel_time",
    "vertical_error",
    "region_code",
]

In [26]:
config = tiledb.Config(
        {"vfs.s3.aws_access_key_id": creds.access_key, "vfs.s3.aws_secret_access_key": creds.secret_key}
    )
config_dict = config.dict()
ctx = tiledb.Ctx(config=config)

In [27]:
storage.create_mbes_array(array_uri, required_attributes, ctx)

In [28]:
files = fs.glob(survey_uri + "**.gsf")
len(files)

64

In [29]:
# filter out empty files
non_empty_files, empty_files = filter_empty_files(files)
len(non_empty_files), len(empty_files)

(64, 0)

In [30]:
# filter large files
size_limit_mb = 500
manageable_files, large_files = filter_large_files(non_empty_files, size_limit_mb)
len(manageable_files), len(large_files)

(64, 0)

In [31]:
intermediate_bathy_uris = [tmpdir + "bathymetry/" + Path(fname).with_suffix(".tiledb").name for fname in manageable_files]
intermediate_freq_uris = [tmpdir + "cell-frequency/" + Path(fname).stem + ".tiledb" for fname in manageable_files]
intermediate_bathy_large_uris = [tmpdir + "bathymetry/" + Path(fname).with_suffix(".tiledb").name for fname in large_files]
intermediate_freq_large_uris = [tmpdir + "cell-frequency/" + Path(fname).stem + ".tiledb" for fname in large_files]

In [32]:
# create the arrays to hold the bathymetry (required for multi-node and multi-proc ingestion)
for fname_uri in intermediate_bathy_uris:
    storage.create_mbes_array(fname_uri, required_attributes, ctx)

In [33]:
for fname_uri in intermediate_bathy_large_uris:
    storage.create_mbes_array(fname_uri, required_attributes, ctx)

In [34]:
sonar_metadata = get_sonar_metadata(manageable_files[0].replace(".gsf", ".json"))

In [36]:
n_partitions = 1
files_blocks = scatter(manageable_files, n_partitions)
tmp_bathy_blocks = scatter(intermediate_bathy_uris, n_partitions)
tmp_freq_blocks = scatter(intermediate_freq_uris, n_partitions)
len(files_blocks[0])

64

In [37]:
processing_node_limit = 40
ping_slice_step = 2000
slices_per_node = 4
local_tasks_limit = 1
local_ping_slice_step = 2000
local_slices_per_task = 4

In [38]:
# GSF processing

In [39]:
reduce_task0 = ingest_gsfs_singular(files_blocks[0], processing_node_limit, ping_slice_step, slices_per_node, tmp_bathy_blocks[0], tmp_freq_blocks[0])

In [40]:
reduce_task0.visualize()

Visualize(value='{"nodes": ["f630c1b0-2303-4b35-bb26-e1ad208f37d5", "d962d54d-02fb-4cc9-bb5a-dd9e3b45d5ce", "9…

In [41]:
start_end_timestamps0 = reduce_task0.compute()

In [43]:
start_end_timestamps0

[datetime.datetime(2015, 5, 10, 10, 5, 27, 51000),
 datetime.datetime(2015, 5, 13, 0, 5, 24, 400000)]

In [None]:
# Split the intermediate arrays into finished and unfinished. A file counts
# as finished once its cell-frequency array exists. For every unfinished
# file the partially written bathymetry array is removed and re-created
# empty so that it can be ingested again.
unfinished_bathy_uris = []
finished_uris = []
unfinished_freq_uris = []
unfinished_gsf_files = []
for i, fname_uri in enumerate(intermediate_bathy_uris):
    if fs.exists(intermediate_freq_uris[i]):
        finished_uris.append(fname_uri)
        continue
    unfinished_bathy_uris.append(fname_uri)
    unfinished_freq_uris.append(intermediate_freq_uris[i])
    # intermediate_bathy_uris was built from manageable_files in the same
    # order, so index i gives the source GSF of this array. This replaces a
    # path hard-coded to a different survey than survey_uri.
    unfinished_gsf_files.append(manageable_files[i])
    fs.rm(fname_uri, recursive=True)
    storage.create_mbes_array(fname_uri, required_attributes, ctx)

In [None]:
len(finished_uris)

In [None]:
unfinished_gsf_files[0]

In [None]:
unfinished_bathy_uris[0]

In [None]:
local_tasks_limit = 10
local_ping_slice_step = 2000
local_slices_per_task = 2

In [None]:
reduce_task_unfinished = ingest_gsfs_singular(unfinished_gsf_files, local_tasks_limit, local_ping_slice_step, local_slices_per_task, unfinished_bathy_uris, unfinished_freq_uris, local=True)

In [None]:
start_end_timestamps_unfinished = reduce_task_unfinished.compute()

In [None]:
timestamps_blocks = scatter(intermediate_bathy_uris, 10)
(len(timestamps_blocks[0]), len(timestamps_blocks[-1]))

In [None]:
timestamps_collect = []

In [None]:
# For each block of intermediate arrays: find the start/end timestamps of
# every array with the registered UDF, reduce them locally to one
# [start, end] pair, and collect that pair. Each block is computed before
# the next one is built.
for timestamps_block in timestamps_blocks:
    timestamps_tasks = []
    for uri_fname in timestamps_block:
        base_name = Path(uri_fname).stem
        #uri_fname = f"s3://{fname}"
        timestamp_task = Delayed("sixy6e/start_end_timestamps", name=f"{base_name}-start-end-timestamps")(uri_fname, creds.access_key, creds.secret_key)
        timestamps_tasks.append(timestamp_task)


    # NOTE(review): "reduce-timestamps" is passed positionally, unlike the
    # `name=` keyword used above -- confirm Delayed treats it as the name.
    reduce_task = Delayed(reduce_timestamps, "reduce-timestamps", local=True)(timestamps_tasks)
    timestamps_collect.append(reduce_task.compute())

In [None]:
final_start_end_timestamps = reduce_timestamps(timestamps_collect)

In [None]:
final_start_end_timestamps

In [45]:
bathy_blocks = scatter(intermediate_bathy_uris, 8)
len(bathy_blocks[0])

8

In [46]:
ingest_tasks = []
for block_id, bblock in enumerate(bathy_blocks):
    task = Delayed(combine_tiledbs, name=f"ingesting-block-{block_id}", local=True)(bblock, array_uri, creds.access_key, creds.secret_key)
    ingest_tasks.append(task)

dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(ingest_tasks)

In [47]:
ingested_result = dummy_reducer.compute()

In [49]:
partitions = 5
sub_partitions = 2

In [50]:
freq_blocks = scatter(intermediate_freq_uris, partitions)
len(freq_blocks[0])

13

In [51]:
cell_freq_blocks_tmp_dir = tmpdir + "cell-frequency-blocks-pass1/"

In [52]:
tasks = []
tasks_dict = {i: [] for i in range(partitions)}
for i, freq_block in enumerate(freq_blocks):
    sub_freq_blocks = scatter(freq_block, sub_partitions)
    for j, sub_block in enumerate(sub_freq_blocks):
        task_name = f"combine-cell-freq_block-{i}_sub-block-{j}"
        out_uri = f"{cell_freq_blocks_tmp_dir}cell-frequency-block-{i}-sub-block-{j}.tiledb"
        task = Delayed(load_and_concat, name=task_name, local=True)(sub_block, ctx, out_uri)
        tasks.append(task)
        if len(tasks_dict[i]):
            task.depends_on(tasks_dict[i][-1])
        tasks_dict[i].append(task)

dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(tasks)

In [53]:
dummy_reducer.visualize()

Visualize(value='{"nodes": ["9aa6794b-218b-48fb-b76a-d1175adc3158", "a4f7c298-4216-4b82-b6a3-378cd17f01ba", "2…

In [54]:
freq_pass1_result = dummy_reducer.compute()

In [55]:
pass1_cell_freqs = fs.glob(cell_freq_blocks_tmp_dir + "**.tiledb")
len(pass1_cell_freqs)

10

In [56]:
pass1_cell_freq_uris = [f"s3://{fname}" for fname in pass1_cell_freqs]

In [57]:
domains = []
for p1_uri in pass1_cell_freq_uris:
    with tiledb.open(p1_uri, ctx=ctx) as ds:
        domains.append(ds.nonempty_domain())
domains

[((0, 5777019),),
 ((0, 2232462),),
 ((0, 642704),),
 ((0, 376392),),
 ((0, 2428873),),
 ((0, 1204686),),
 ((0, 402589),),
 ((0, 549998),),
 ((0, 951183),),
 ((0, 2078189),)]

In [58]:
partitions = 2
sub_partitions = 1

In [59]:
cell_freq_blocks_tmp_dir = tmpdir + "cell-frequency-blocks-pass2/"

In [60]:
freq_blocks = scatter(pass1_cell_freq_uris, partitions)

In [61]:
len(freq_blocks)

2

In [62]:
tasks = []
tasks_dict = {i: [] for i in range(partitions)}
for i, freq_block in enumerate(freq_blocks):
    sub_freq_blocks = scatter(freq_block, sub_partitions)
    for j, sub_block in enumerate(sub_freq_blocks):
        task_name = f"combine-cell-freq_block-{i}_sub-block-{j}"
        out_uri = f"{cell_freq_blocks_tmp_dir}cell-frequency-block-{i}-sub-block-{j}.tiledb"
        task = Delayed(load_and_concat, name=task_name, local=True)(sub_block, ctx, out_uri)
        tasks.append(task)
        if len(tasks_dict[i]):
            task.depends_on(tasks_dict[i][-1])
        tasks_dict[i].append(task)

dummy_reducer = Delayed(dummy_reducer, name="dummy-reducer-ingester", local=True)(tasks)

In [63]:
freq_pass2_result = dummy_reducer.compute()

In [64]:
pass2_cell_freqs = fs.glob(cell_freq_blocks_tmp_dir + "**.tiledb")
len(pass2_cell_freqs)

2

In [67]:
pass2_cell_freq_uris = [f"s3://{fname}" for fname in pass2_cell_freqs]
len(pass2_cell_freq_uris)

2

In [66]:
domains = []
for p2_uri in pass2_cell_freq_uris:
    with tiledb.open(p2_uri, ctx=ctx) as ds:
        domains.append(ds.nonempty_domain())
domains

[((0, 11457454),), ((0, 5186649),)]

In [69]:
load_and_concat(pass2_cell_freq_uris, ctx, soundings_cell_density_array_uri_15)

In [None]:
# sparse rhealpix version of cell frequency

In [70]:
# Schema of the sparse cell-frequency arrays: 16 dimensions R0..R15, one per
# character position of a region code, with the full region_code string and
# its count stored as attributes.
index_filters = tiledb.FilterList([tiledb.ZstdFilter(level=16)])
# R1..R15 are single-digit dimensions with domain 0-8
dims = [tiledb.Dim(f"R{i}", domain=(0, 8), dtype="uint8", tile=1, filters=index_filters) for i in range(1, 16)]
# R0 is a string (ascii) dimension and goes first
dim0 = tiledb.Dim("R0", tile=None, dtype="ascii", filters=index_filters)
dims.insert(0, dim0)
cell_freq_attributes = [tiledb.Attr("region_code", dtype=str, filters=[tiledb.ZstdFilter(level=16)]), tiledb.Attr("count", dtype="uint64", filters=[tiledb.ZstdFilter(level=16)])]
domain = tiledb.Domain(*dims)
# this schema object is re-used for more than one array creation further down
schema = tiledb.ArraySchema(domain=domain, sparse=True, attrs=cell_freq_attributes, cell_order="row-major", tile_order="row-major", capacity=100_000,allows_duplicates=True)

In [71]:
with tiledb.scope_ctx(ctx):
    tiledb.Array.create(soundings_cell_density_sparse_array_uri_15, schema)

In [72]:
with tiledb.open(soundings_cell_density_array_uri_15, ctx=ctx) as ds:
    df = ds.df[:]
    
write_sparse_rhealpix_chunked(df, soundings_cell_density_sparse_array_uri_15, ctx, chunks=1000000)

del df
gc.collect()

37177

In [73]:
with tiledb.open(soundings_cell_density_array_uri_15, ctx=ctx) as ds:
    df = ds.df[:]

res12_df = reduce_resoltion(df, resolution=12, chunks=1000000)

del df
gc.collect()

127

In [74]:
res12_df["count"].sum()

16677792

In [75]:
with tiledb.scope_ctx(ctx):
    tiledb.Array.create(res12_out_uri_sparse, schema)

In [76]:
write_sparse_rhealpix_chunked(res12_df, res12_out_uri_sparse, ctx, chunks=100000)

In [77]:
write_chunked(res12_df, res12_out_uri_dense, ctx, 100000)

In [78]:
res12_df["geometry"] = rhealpix.rhealpix_geo_boundary(res12_df.region_code.values)

In [79]:
gdf = geopandas.GeoDataFrame(res12_df, crs="epsg:4326")

In [80]:
gdf

Unnamed: 0,region_code,count,geometry
0,R785882555282,1,"POLYGON ((149.99983 -37.90878, 150.00000 -37.9..."
1,R785882555285,2,"POLYGON ((149.99983 -37.90897, 150.00000 -37.9..."
2,R785882555288,1,"POLYGON ((149.99983 -37.90915, 150.00000 -37.9..."
3,R785882555522,5,"POLYGON ((149.99983 -37.90933, 150.00000 -37.9..."
4,R785882555525,4,"POLYGON ((149.99983 -37.90951, 150.00000 -37.9..."
...,...,...,...
4032631,S422426446233,11,"POLYGON ((-90.00281 -75.90987, -90.00225 -75.9..."
4032632,S422426446236,10,"POLYGON ((-90.00225 -75.91004, -90.00168 -75.9..."
4032633,S422426446260,10,"POLYGON ((-90.00112 -75.91040, -90.00112 -75.9..."
4032634,S422426446263,8,"POLYGON ((-90.00112 -75.91040, -90.00056 -75.9..."


In [81]:
# Write the resolution-12 cell-density GeoDataFrame as GeoJSONSeq, using the
# boto3 credentials obtained at the top of the notebook for the fiona session.
with fiona.Env(
    session=AWSSession(
        aws_access_key_id=creds.access_key,
        aws_secret_access_key=creds.secret_key,
    )
):
    gdf.to_file(
        soundings_cell_density_uri,
        driver="GeoJSONSeq",
        coordinate_precision=11,
    )

In [None]:
# Build the coverage GeoDataFrame (CRS epsg:4326) from `geometry.dissolve(gdf)`.
# Presumably this merges the per-cell polygons into a footprint -- confirm
# against `bathy_datasets.geometry.dissolve`.
dissolved = geopandas.GeoDataFrame(geometry.dissolve(gdf), crs="epsg:4326")

In [None]:
# Write the dissolved coverage GeoDataFrame as GeoJSONSeq, using the boto3
# credentials obtained at the top of the notebook for the fiona session.
with fiona.Env(
    session=AWSSession(
        aws_access_key_id=creds.access_key,
        aws_secret_access_key=creds.secret_key,
    )
):
    dissolved.to_file(
        coverage_uri,
        driver="GeoJSONSeq",
        coordinate_precision=11,
    )

In [None]:
# Largest and smallest per-cell "count" at resolution 12, shown as (max, min).
(res12_df["count"].max(), res12_df["count"].min())

In [None]:
# Build the rHEALPix DGGS helper with `from_ellipsoid()` defaults and display
# the cell width it reports for resolution 12.
dggs = rhealpix.RhealpixDGGS.from_ellipsoid()
dggs.cell_width(12)

In [None]:
# Covered area: number of resolution-12 cells times the squared cell width,
# divided by 10,000 (presumably metres squared to hectares -- TODO confirm the
# unit returned by `cell_width`). The value is also recorded in `sonar_metadata`.
area_ha = len(gdf) * dggs.cell_width(12) ** 2 / 10_000
sonar_metadata["area_ha"] = area_ha
area_ha

In [None]:
# Coarsen the cells: keep the first 10 characters of every region code and sum
# the counts of all rows that share the truncated code.
gdf2 = (
    geopandas.GeoDataFrame(
        {
            "region_code": gdf.region_code.str[0:10],
            "count": gdf["count"],
        }
    )
    .groupby(["region_code"])["count"]
    .agg("sum")
    .to_frame("count")
    .reset_index()
)

In [None]:
# Largest and smallest summed "count" among the coarsened cells, shown as (max, min).
(gdf2["count"].max(), gdf2["count"].min())

In [None]:
# One (first-axis slice, second-axis slice) pair per coarsened cell, taken from
# the bounds of the cell's boundary geometry (computed without coordinate rounding).
slices = [
    (slice(bbox[0], bbox[-2]), slice(bbox[1], bbox[-1]))
    for bbox in (
        cell.bounds
        for cell in rhealpix.rhealpix_geo_boundary(
            gdf2.region_code.values, round_coords=False
        )
    )
]

In [None]:
# Partitioning used for the statistics task graph: `slices` is distributed over
# `n_partitions` blocks here; each block is later split again into
# `n_sub_partitions` sub-blocks.
n_partitions = 12
n_sub_partitions = 3
blocks = scatter(slices, n_partitions)

In [None]:
# Number of blocks, and the number of slice pairs in the first block.
len(blocks), len(blocks[0])

In [None]:
# Size of the first sub-block obtained by splitting the first block.
len(scatter(blocks[0], n_sub_partitions)[0])

In [None]:
# Names to compute statistics for: "X" and "Y" first, followed by every
# required attribute other than "timestamp" and "region_code".
stats_attrs = ["X", "Y"]
stats_attrs.extend(
    name for name in required_attributes if name not in ("timestamp", "region_code")
)

In [None]:
# Build the TileDB Cloud task graph that computes the per-attribute statistics:
#   1. one "basic_statistics_incremental" task per (block, sub-block, attribute);
#   2. one "basic_statistics_reduce" task per attribute, fed by all of that
#      attribute's incremental tasks;
#   3. a local "gather-stats" task that collects every reducer's result.
stats_results = []  # reassigned once the graph is computed
tasks_dict = {stat: [] for stat in stats_attrs}
reduce_tasks = []

for i, block in enumerate(blocks):
    sub_blocks = scatter(block, n_sub_partitions)

    for si, sub_block in enumerate(sub_blocks):
        for attribute in stats_attrs:
            # "X" and "Y" are passed as the UDF's `schema` argument; all other
            # attributes pass None. A dedicated local name is used so the
            # notebook-level `schema` (the TileDB ArraySchema) is not overwritten.
            attribute_schema = attribute if attribute in ["X", "Y"] else None

            task_name = f"block-{i}-sub_block-{si}-{attribute}"
            task = Delayed("sixy6e/basic_statistics_incremental", name=task_name)(
                array_uri,
                config_dict,
                attribute,
                schema=attribute_schema,
                idxs=sub_block,
                summarise=False,
            )

            # Chain each task onto the previous task for the same attribute.
            # The check must be "any previous task exists": the former `> 1`
            # test left the second task unchained from the first.
            if tasks_dict[attribute]:
                task.depends_on(tasks_dict[attribute][-1])

            tasks_dict[attribute].append(task)

for attribute in stats_attrs:
    task_name = f"reduce-attibute-{attribute}"
    reducer_task = Delayed("sixy6e/basic_statistics_reduce", name=task_name)(
        tasks_dict[attribute], attribute
    )
    reduce_tasks.append(reducer_task)

collect_stats_task = Delayed(gather_stats, local=True, name="gather-stats")(reduce_tasks)

In [None]:
# Execute the statistics task graph and keep the gathered result.
stats_results = collect_stats_task.compute()

In [None]:
# Coordinate reference information stored in the array metadata.
crs_info = {
    "horizontal_datum": "epsg:4326",
    "vertical_datum": "epsg:5714",  # TODO: confirm the vertical datum for this survey
}

In [None]:
# Open the bathymetry array in write mode and attach two JSON-serialised
# metadata entries: the CRS information and the computed statistics (the
# latter encoded with `stac_metadata.Encoder`).
with tiledb.open(array_uri, "w", ctx=ctx) as ds:
    ds.meta["crs_info"] = json.dumps(crs_info)
    ds.meta["basic_statistics"] = json.dumps(stats_results, cls=stac_metadata.Encoder)

In [None]:
# Load the survey's AusSeabed metadata JSON document from S3.
with fs.open(asb_metadata_uri) as src:
    asb_metadata = json.load(src)

In [None]:
# Assemble the dataset's STAC metadata via `stac_metadata.prepare`, passing the
# dataset id, sonar/statistics/survey metadata, the URIs of the produced
# artefacts, the AWS credentials, the start/end timestamps and the output
# locations. NOTE(review): presumably this also writes the document to
# `stac_md_uri` -- confirm against `bathy_datasets.stac_metadata.prepare`.
dataset_metadata = stac_metadata.prepare(
    uid,
    sonar_metadata,
    stats_results,
    asb_metadata,
    array_uri,
    coverage_uri,
    soundings_cell_density_uri,
    creds.access_key,
    creds.secret_key,
    final_start_end_timestamps,
    outdir_uri,
    stac_md_uri,
)

In [None]:
# Register the bathymetry array with TileDB Cloud under the "sixy6e" namespace,
# using the survey abstract as its description and the named stored
# credentials for access to the underlying storage.
tiledb.cloud.register_array(
    uri=array_uri,
    namespace="sixy6e", # Optional, you may register it under your username, or one of your organizations
    array_name=array_name,
    description=asb_metadata["survey_general"]["abstract"],  # Optional
    access_credentials_name="AusSeabedGMRT-PL019"
)