In [1]:
import boto3
import io
import urllib
import s3fs
import json
from pathlib import Path
import attr
import numpy
import tiledb
import tiledb.cloud
from tiledb.cloud.compute import DelayedArrayUDF, Delayed
import pandas
import geopandas
import fiona
from fiona.session import AWSSession
import pystac
from scipy.stats import skew, kurtosis
import uuid

In [2]:
import pystac
from pystac.extensions.projection import ProjectionExtension
from pystac.extensions.pointcloud import (
    PointcloudExtension,
    SchemaType,
    PhenomenologyType,
    Schema,
    Statistic,
)

In [3]:
from reap_gsf import reap, data_model
from bathy_datasets import rhealpix, storage, geometry, asb_spreadsheet, stac_metadata

In [4]:
# uid = uuid.uuid4()
uid = uuid.UUID("e5831d06-507b-4a45-a3d3-4658bc7e25f0")

In [5]:
survey_uri = "s3://ausseabed-pl019-provided-data/DeakinUniversity/Bunurong_MNP/"
outdir_uri = "s3://ausseabed-pl019-ingested-data/L2/Bunurong_MNP/"
asb_metadata_uri = "s3://ausseabed-pl019-provided-data/DeakinUniversity/Bunurong_MNP/metadata/spreadsheet-metadata.json"
survey_info_uri = "s3://ausseabed-pl019-provided-data/DeakinUniversity/Bunurong_MNP/schema-info.json"

In [6]:
base_prefix = "ga_ausseabed"
array_name = f"{base_prefix}_{uid}_bathymetry"
array_uri = f"{outdir_uri}{array_name}.tiledb"
tiledb_array_uri = f"tiledb://sixy6e/{array_name}"
soundings_cell_density_uri = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12.geojson"
coverage_uri = f"{outdir_uri}{base_prefix}_{uid}_coverage.geojson"
stac_md_uri = f"{outdir_uri}{base_prefix}_{uid}_stac-metadata.geojson"

In [7]:
soundings_cell_density_uri_15 = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-15.geojson"

In [4]:
session = boto3.Session()
creds = session.get_credentials()

In [9]:
fs = s3fs.S3FileSystem(key=creds.access_key, secret=creds.secret_key, use_listings_cache=False)

In [10]:
files = fs.glob(survey_uri + "**.gsf")
len(files)

180

In [11]:
def get_sonar_metadata(json_uri):
    """
    Temporary helper that pulls sonar metadata from a sample GSF file.

    The JSON document at ``json_uri`` is read through the notebook-level
    ``fs`` filesystem; its ``"gsf_uri"`` entry names the GSF file to decode.
    Two registered TileDB Cloud UDFs are chained: one retrieves the file as
    a stream, the other decodes the first 10 pings (``slice(10)``).

    :param json_uri:
        URI of the JSON metadata document that accompanies a GSF file.

    :return:
        The record read from ``finfo[3]``, extended with every field of the
        record read from ``finfo[6]``.
    """
    with fs.open(json_uri) as src:
        gsf_info = json.load(src)

    retrieve = Delayed("sixy6e/retrieve_stream", name="retrieve")(
        gsf_info["gsf_uri"], creds.access_key, creds.secret_key
    )
    decode = Delayed("sixy6e/decode_gsf", name="decode", image_name="3.7-geo")(
        retrieve, slice(10)
    )

    # the decoded dataframe is not needed here; only the file info is used
    _, finfo = decode.compute()

    # NOTE(review): entries 3 and 6 of the file info are presumably the
    # processing-parameters and history record types -- confirm against reap_gsf.
    sonar_metadata = finfo[3].record(0).read(retrieve.result()[0])
    history_record = finfo[6].record(0).read(retrieve.result()[0])

    for field_name, field_value in attr.asdict(history_record).items():
        sonar_metadata[field_name] = field_value

    return sonar_metadata


def reduce_region_codes(results):
    """
    Reduce step of the map-reduce used for the region_code counts.

    :param results:
        A sequence whose items hold, at index 0, a dataframe with
        ``region_code`` and ``count`` columns and, at index 1, a
        ``[start, end]`` pair of timestamps.

    :return:
        A tuple ``(cell_count, start_end_timestamp)``: the counts summed
        per region_code as a dataframe, and a two item list holding the
        earliest start and the latest end as ``datetime`` objects.
    """
    count_frames = [item[0] for item in results]
    time_spans = [item[1] for item in results]

    # stack every partial frequency table, then total the counts per cell
    stacked = pandas.concat(count_frames)
    summed = stacked.groupby(["region_code"])["count"].agg("sum")
    cell_count = summed.to_frame("count").reset_index()

    timestamps_df = pandas.DataFrame(
        {
            "start_datetime": [span[0] for span in time_spans],
            "end_datetime": [span[1] for span in time_spans],
        }
    )
    earliest = timestamps_df["start_datetime"].min()
    latest = timestamps_df["end_datetime"].max()

    return cell_count, [earliest.to_pydatetime(), latest.to_pydatetime()]


def gather_stats(results):
    """
    Merge the results of all the stats tasks into a single dict.

    :param results:
        An iterable of dicts.

    :return:
        One dict holding every key of every item; when a key occurs in
        more than one item, the value from the last item wins.
    """
    return {key: item[key] for item in results for key in item}

In [12]:
def retrieve_stream(uri, access_key, skey):
    """
    Download an S3 object into memory.

    The whole object is read once so that later processing can work from
    the in-memory copy instead of downloading it again.

    :param uri:
        The object's URI; the network location is used as the bucket name
        and the path (minus its leading "/") as the key.

    :param access_key:
        AWS access key id.

    :param skey:
        AWS secret access key.

    :return:
        A tuple ``(stream, length)`` where ``stream`` is an ``io.BytesIO``
        holding the object's content and ``length`` is the object's
        ``content_length`` attribute.
    """
    s3 = boto3.Session(
        aws_access_key_id=access_key, aws_secret_access_key=skey
    ).resource("s3")

    location = urllib.parse.urlparse(uri)
    s3_object = s3.Object(bucket_name=location.netloc, key=location.path[1:])

    payload = s3_object.get()["Body"].read()
    return io.BytesIO(payload), s3_object.content_length


def append_ping_dataframe(dataframe, array_uri, access_key, skey):
    """
    Append the ping dataframe read from a GSF file to a TileDB array.

    :param dataframe:
        The dataframe to append.

    :param array_uri:
        URI of the TileDB array being appended to.

    :param access_key:
        AWS access key id, passed to TileDB's S3 virtual filesystem.

    :param skey:
        AWS secret access key, passed to TileDB's S3 virtual filesystem.
    """
    s3_credentials = {
        "vfs.s3.aws_access_key_id": access_key,
        "vfs.s3.aws_secret_access_key": skey,
    }
    ctx = tiledb.Ctx(config=tiledb.Config(s3_credentials))

    tiledb.dataframe_.from_pandas(
        array_uri, dataframe, mode="append", sparse=True, ctx=ctx
    )


def ingest_gsf_slice(
    file_record, stream, access_key, skey, array_uri, idx=slice(None)
):
    """
    Ingest one slice of pings from a GSF stream.

    General steps:
    * Extract the ping data.
    * Calculate the rHEALPIX code of every sounding.
    * Summarise the rHEALPIX codes (frequency count).
    * Get the timestamps of the first and last pings.
    * Write the ping data to a TileDB array.

    :param file_record:
        The ping file record handed to ``SwathBathymetryPing.from_records``.

    :param stream:
        The stream the pings are read from.

    :param access_key:
        AWS access key id used when writing to the array.

    :param skey:
        AWS secret access key used when writing to the array.

    :param array_uri:
        URI of the TileDB array the pings are appended to.

    :param idx:
        The slice of pings to read; defaults to all of them.

    :return:
        A tuple ``(cell_count, start_end_time)``: a dataframe with
        ``region_code`` and ``count`` columns, and a two item list holding
        the minimum and maximum ping timestamps as ``datetime`` objects.
        Because each slice yields its own partial table, the tables from
        several slices are meant to be concatenated and summed afterwards.
    """
    swath = data_model.SwathBathymetryPing.from_records(file_record, stream, idx)
    pings = swath.ping_dataframe

    # third argument is the resolution passed to the rHEALPIX encoder
    pings["region_code"] = rhealpix.rhealpix_code(pings.X, pings.Y, 15)

    # frequency of dggs cells
    per_cell = pings.groupby(["region_code"])["region_code"].agg("count")
    cell_count = per_cell.to_frame("count").reset_index()

    first_ping = pings.timestamp.min().to_pydatetime()
    last_ping = pings.timestamp.max().to_pydatetime()

    # write to tiledb array
    append_ping_dataframe(pings, array_uri, access_key, skey)

    return cell_count, [first_ping, last_ping]


def ingest_gsf_slices(gsf_uri, access_key, skey, array_uri, slices):
    """
    Ingest a list of ping slices from a given GSF file.

    The file is downloaded once and every slice is read from that single
    in-memory stream.

    :param gsf_uri:
        URI of the GSF file.

    :param access_key:
        AWS access key id.

    :param skey:
        AWS secret access key.

    :param array_uri:
        URI of the TileDB array the pings are appended to.

    :param slices:
        An iterable of ping slices to ingest, processed in order.

    :return:
        A tuple ``(cell_count, start_end_timestamp)``: the region_code
        counts summed over all slices, and a two item list holding the
        earliest start and latest end timestamps as ``datetime`` objects.
    """
    stream, stream_length = retrieve_stream(gsf_uri, access_key, skey)
    ping_file_record = reap.file_info(stream, stream_length)[1]

    slice_results = [
        ingest_gsf_slice(ping_file_record, stream, access_key, skey, array_uri, idx)
        for idx in slices
    ]
    cell_counts = [result[0] for result in slice_results]
    time_spans = [result[1] for result in slice_results]

    # aggregate the ping slices and calculate the cell counts
    summed = pandas.concat(cell_counts).groupby(["region_code"])["count"].agg("sum")
    cell_count = summed.to_frame("count").reset_index()

    # collect the per-slice min and max timestamps, then take the overall min/max
    timestamps_df = pandas.DataFrame(
        {
            "start_datetime": [span[0] for span in time_spans],
            "end_datetime": [span[1] for span in time_spans],
        }
    )
    earliest = timestamps_df["start_datetime"].min().to_pydatetime()
    latest = timestamps_df["end_datetime"].max().to_pydatetime()

    return cell_count, [earliest, latest]

In [13]:
def basic_statistics(data, attribute):
    """
    Calculate basic statistics for one attribute.

    NaN values are ignored by every statistic except ``count``, which is
    the length of the first axis and therefore includes NaN entries.

    :param data:
        A mapping (e.g. dict or dataframe) from which ``data[attribute]``
        yields the array of values.

    :param attribute:
        Name of the attribute to summarise.

    :return:
        ``{attribute: stats}`` where ``stats`` holds, in this order:
        minimum, maximum, count, total (sum), mean, variance, stddev,
        skewness and kurtosis. Variance and standard deviation use
        ``ddof=1``; skewness and kurtosis use the scipy defaults.
    """
    values = data[attribute]

    stats = {}
    stats["minimum"] = numpy.nanmin(values)
    stats["maximum"] = numpy.nanmax(values)
    stats["count"] = values.shape[0]
    stats["total"] = numpy.nansum(values)
    stats["mean"] = numpy.nanmean(values)
    stats["variance"] = numpy.nanvar(values, ddof=1)  # unbiased sample
    stats["stddev"] = numpy.nanstd(values, ddof=1)  # unbiased sample
    stats["skewness"] = skew(values, nan_policy="omit")
    stats["kurtosis"] = kurtosis(values, nan_policy="omit")

    return {attribute: stats}

In [14]:
with fs.open(survey_info_uri) as src:
    survey_info = json.loads(src.read())

In [15]:
# Alternative source for the attribute list (currently disabled):
# required_attributes = survey_info["schemas"][0]
# TODO: this hard-coded list is temporary. It would be better to define it
# internally, or to derive it programmatically as the union of the schemas
# of all pings.
required_attributes = [
    "Z",
    "across_track",
    "along_track",
    "beam_angle",
    "beam_angle_forward",
    "beam_flags",
    "beam_number",
    "centre_beam",
    "course",
    "depth_corrector",
    "gps_tide_corrector",
    "heading",
    "heave",
    "height",
    "horizontal_error",
    "ping_flags",
    "pitch",
    "roll",
    "sector_number",
    "separation",
    "speed",
    "tide_corrector",
    "timestamp",
    "travel_time",
    "vertical_error",
    "region_code",
]

In [16]:
# required_attributes.append("region_code")

In [17]:
sonar_metadata = get_sonar_metadata(files[0].replace(".gsf", ".json"))

In [None]:
with fs.open(files[0].replace(".gsf", ".json")) as src:
        md = json.loads(src.read())
stream_task = Delayed("sixy6e/retrieve_stream", name="retrieve")(md["gsf_uri"], creds.access_key, creds.secret_key)
dataframe_task = Delayed("sixy6e/decode_gsf", name="decode", image_name="3.7-geo")(stream_task, slice(10))
df, finfo = dataframe_task.compute()

In [None]:
df.columns

In [18]:
config = tiledb.Config(
        {"vfs.s3.aws_access_key_id": creds.access_key, "vfs.s3.aws_secret_access_key": creds.secret_key}
    )
ctx = tiledb.Ctx(config=config)

In [None]:
storage.create_mbes_array(array_uri, required_attributes, ctx)

In [None]:
array_uri

In [None]:
size_limit_mb = 500
processing_node_limit = 30
ping_slice_step = 1000
slices_per_node = 5
local_tasks_limit = 4
local_ping_slice_step = 5000
local_slices_per_task = 4

In [None]:
# Build the TileDB Cloud task graph that ingests every regular-sized GSF file.
# Tasks are dealt round-robin into `processing_node_limit` chains; within a
# chain each task depends on the previous one, so at most
# `processing_node_limit` ingest tasks can run at the same time.
node_counter = 0
skipped_files = []
large_files = []
tasks = []
tasks_dict = {n: [] for n in range(processing_node_limit)}

for pathname in files:
    # each GSF file has a JSON metadata document alongside it
    metadata_pathname = pathname.replace(".gsf", ".json")
    base_name = Path(pathname).stem
    with fs.open(metadata_pathname) as src:
        gsf_metadata = json.loads(src.read())

    # files over the size limit are set aside in `large_files` and not ingested here
    # (the comparison is in MiB, assuming "size" is in bytes -- TODO confirm)
    if (gsf_metadata["size"] / 1024 / 1024) > size_limit_mb:
        large_files.append(pathname)
        continue

    # files without any swath bathymetry pings are recorded and skipped
    ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]
    if ping_count == 0:
        skipped_files.append(pathname)
        continue

    # split the pings into slices of `ping_slice_step`, then group the slices
    # into chunks of `slices_per_node`; one task ingests one chunk
    slices = [slice(start, start+ping_slice_step) for start in numpy.arange(0, ping_count, ping_slice_step)]
    slice_chunks = [slices[i:i+slices_per_node] for i in range(0, len(slices), slices_per_node)]

    for slice_chunk in slice_chunks:
        # NOTE: both indices come from the FIRST slice of the chunk, so the
        # task name does not show the full ping range covered by the chunk
        start_idx = slice_chunk[0].start
        end_idx = slice_chunk[0].stop
        task_name = f"{base_name}-{start_idx}-{end_idx}-{node_counter}"
        task = Delayed("sixy6e/ingest_gsf_slices", name=task_name, image_name="3.7-geo")(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk)
        # presumably seconds -- TODO confirm against the tiledb.cloud docs
        task.set_timeout(1800)

        # chain onto the previous task assigned to the same slot (if any)
        if len(tasks_dict[node_counter]):
            task.depends_on(tasks_dict[node_counter][-1])

        tasks.append(task)
        tasks_dict[node_counter].append(task)
        node_counter += 1

        # wrap around to the first slot
        if node_counter == processing_node_limit:
            node_counter = 0

# the reduce step runs locally and combines the results of every ingest task
reduce_task = Delayed(reduce_region_codes, "reduce-region_codes-timestamps", local=True)(tasks)

In [None]:
reduce_task.visualize()

In [None]:
len(tasks)

In [None]:
cell_count_df, start_end_timestamps = reduce_task.compute()

In [None]:
start_end_timestamps

In [None]:
cell_count_df

In [None]:
with fs.open("s3://ausseabed-pl019-provided-data/temp-work/bunurong-1st-stage-region_codes.json", "w") as src:
    cell_count_df.to_json(src)

In [None]:
# Rebuild the list of oversized files from scratch. The task-building cell
# also appends to `large_files`, so appending here without a reset would
# record every large file twice (and re-running this cell would add more).
large_files = []

for pathname in files:
    # each GSF file has a JSON metadata document alongside it
    metadata_pathname = pathname.replace(".gsf", ".json")
    with fs.open(metadata_pathname) as src:
        gsf_metadata = json.loads(src.read())

    # the comparison is in MiB, assuming "size" is in bytes -- TODO confirm
    if (gsf_metadata["size"] / 1024 / 1024) > size_limit_mb:
        large_files.append(pathname)

In [None]:
len(large_files)

In [None]:
large_files

In [None]:
# Collect the ping count of every large file.
pings = []
for lrg_file in large_files:
    # read the metadata of the file being iterated over (the loop variable),
    # not whatever `pathname` was left holding by an earlier loop
    metadata_pathname = lrg_file.replace(".gsf", ".json")
    with fs.open(metadata_pathname) as src:
        gsf_metadata = json.loads(src.read())

    ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]
    pings.append(ping_count)

In [None]:
pings

In [None]:
local_tasks_limit = 1
local_ping_slice_step = 1000
local_slices_per_task = 4

In [None]:
# Build the task graph that ingests the large GSF files on the local machine.
# Tasks are dealt round-robin into `local_tasks_limit` chains; within a chain
# each task depends on the previous one, which caps the number of ingest
# tasks running at the same time.
local_counter = 0
skipped_files = []
tasks2 = []
local_tasks_dict = {n: [] for n in range(local_tasks_limit)}

for lrg_file in large_files:
    # use the loop variable: the stale `pathname` left over from an earlier
    # loop would make every iteration ingest the same (wrong) file
    metadata_pathname = lrg_file.replace(".gsf", ".json")
    base_name = Path(lrg_file).stem
    with fs.open(metadata_pathname) as src:
        gsf_metadata = json.loads(src.read())

    # files without any swath bathymetry pings are recorded and skipped
    ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]
    if ping_count == 0:
        skipped_files.append(lrg_file)
        continue

    # split the pings into slices of `local_ping_slice_step`, then group the
    # slices into chunks of `local_slices_per_task`; one task ingests one chunk
    slices = [slice(start, start+local_ping_slice_step) for start in numpy.arange(0, ping_count, local_ping_slice_step)]
    slice_chunks = [slices[i:i+local_slices_per_task] for i in range(0, len(slices), local_slices_per_task)]

    for slice_chunk in slice_chunks:
        # NOTE: both indices come from the FIRST slice of the chunk, so the
        # task name does not show the full ping range covered by the chunk
        start_idx = slice_chunk[0].start
        end_idx = slice_chunk[0].stop
        task_name = f"{base_name}-{start_idx}-{end_idx}-local-{local_counter}"
        task = Delayed(ingest_gsf_slices, name=task_name, local=True)(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk)

        # chain onto the previous task assigned to the same slot (if any)
        if len(local_tasks_dict[local_counter]):
            task.depends_on(local_tasks_dict[local_counter][-1])

        tasks2.append(task)
        local_tasks_dict[local_counter].append(task)
        local_counter += 1

        # wrap around to the first slot
        if local_counter == local_tasks_limit:
            local_counter = 0

# the reduce step combines the results of every local ingest task
reduce_task2 = Delayed(reduce_region_codes, "reduce-region_codes-timestamps", local=True)(tasks2)

In [None]:
len(tasks2)

In [None]:
reduce_task2.visualize()

In [None]:
cell_count_df2, start_end_timestamps2 = reduce_task2.compute()

In [None]:
local_non_local_results = [
    [cell_count_df, start_end_timestamps],
    [cell_count_df2, start_end_timestamps2],
]

In [None]:
final_cell_count_df, final_start_end_timestamps = reduce_region_codes(local_non_local_results)

In [None]:
final_cell_count_df

In [None]:
final_start_end_timestamps

In [None]:
tasks = None
tasks2 = None
reduce_task = None
reduce_task2 = None
tasks_dict = None
local_tasks_dict = None
local_non_local_results = None
cell_count_df = None
start_end_timestamps = None
cell_count_df2 = None
start_end_timestamps2 = None

In [None]:
final_cell_count_df["geometry"] = rhealpix.rhealpix_geo_boundary(final_cell_count_df.region_code.values)

In [None]:
gdf15 = geopandas.GeoDataFrame(final_cell_count_df, crs="epsg:4326")

In [None]:
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    gdf15.to_file(soundings_cell_density_uri_15, driver="GeoJSONSeq", coordinate_precision=11)

In [None]:
gdf15 = None

In [None]:
# Roll the per-cell counts up to a coarser grid: truncate every region code
# to its first 13 characters and sum the counts of the cells that share the
# resulting prefix.
# NOTE(review): 13 characters is presumably the leading "R" plus 12 digits,
# i.e. rHEALPix resolution 12 -- confirm.
resolution12_df = pandas.DataFrame(
    {
        "region_code": final_cell_count_df.region_code.str[0:13],
        "count": final_cell_count_df["count"],
    }
).groupby(
    ["region_code"]
)["count"].agg("sum").to_frame("count").reset_index()

In [None]:
resolution12_df

In [None]:
resolution12_df["geometry"] = rhealpix.rhealpix_geo_boundary(resolution12_df.region_code.values)

In [19]:
#gdf = geopandas.GeoDataFrame(resolution12_df, crs="epsg:4326")
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    gdf = geopandas.read_file(soundings_cell_density_uri)

  for feature in features_lst:


In [None]:
gdf

In [None]:
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    gdf.to_file(soundings_cell_density_uri, driver="GeoJSONSeq", coordinate_precision=11)

In [None]:
dissolved = geopandas.GeoDataFrame(geometry.dissolve(gdf), crs="epsg:4326")

In [None]:
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    dissolved.to_file(coverage_uri, driver="GeoJSONSeq", coordinate_precision=11)

In [20]:
dggs = rhealpix.RhealpixDGGS.from_ellipsoid()

In [21]:
49393180 * dggs.cell_width(15) **2 / 10000

2402.6211133415763

In [22]:
area_ha = gdf.shape[0] * dggs.cell_width(12) **2 / 10000
sonar_metadata["area_ha"] = area_ha
area_ha

2584.189739145068

In [23]:
with fs.open(asb_metadata_uri) as src:
    asb_metadata = json.loads(src.read())

In [24]:
asb_metadata["survey_general"]["abstract"]

'The Bunurong Marine National Park bathymetry survey was acquired by Deakin University Marine Mapping lab onboard the M/V Yolla over 5 days in 2017 (14/06-16/06, 21/09-22/09) using a Kongsberg EM2040c. This survey was part of a Parks Victoria project to better understand the habitats and associated biodiversity of Bunurong MNP.'

In [None]:
tiledb.cloud.register_array(
    uri=array_uri,
    namespace="sixy6e", # Optional, you may register it under your username, or one of your organizations
    array_name=array_name,
    description=asb_metadata["survey_general"]["abstract"],  # Optional 
    access_credentials_name="AusSeabedGMRT-PL019"
)

In [64]:
with tiledb.open(array_uri, ctx=ctx) as ds:
    schema = ds.schema
    domain = ds.domain
    non_empty_domain = ds.nonempty_domain()

In [None]:
non_empty_domain

In [65]:
idx = (slice(*non_empty_domain[0]), slice(*non_empty_domain[1]))
idx

(slice(145.62982936389727, 145.67557202558876, None),
 slice(-38.726892940903674, -38.66776783703864, None))

In [None]:
non_empty_domain[0]

In [None]:
ds = tiledb.open(array_uri, config=config_dict)

In [None]:
ds.close()

In [None]:
query = ds.query(attrs=["Z"], coords=False, return_incomplete=True).multi_index[idx]

In [None]:
query.estimated_result_sizes()

In [None]:
config_dict = ctx.config().dict()

In [None]:
stats_report.visualize()

In [None]:
# NOTE(review): `stats_report` is not assigned in any cell visible in this
# notebook; the statistics graph built further down is named
# `collect_stats_task` and its result is stored as `stats_results`. On a fresh
# kernel this cell raises NameError, while later cells read
# `stats_results_dict` -- confirm which name is intended.
stats_results_dict = stats_report.compute()

In [None]:
non_empty_domain

In [61]:
# CRS information that is serialised to JSON and stored in the array's
# metadata under the "crs_info" key.
# NOTE(review): the key "vertical_data" looks like a typo for "vertical_datum",
# and epsg:4326 is a horizontal (geographic) CRS rather than a vertical datum.
# Both are left unchanged because readers of the stored metadata may rely on
# them -- confirm before changing.
crs_info = {
    "horizontal_datum": "epsg:4326",
    "vertical_data": "epsg:4326",
}

In [93]:
with tiledb.open(array_uri, "w", ctx=ctx) as ds:
    ds.meta["crs_info"] = json.dumps(crs_info)
    ds.meta["basic_statistics"] = json.dumps(stats_results_dict, cls=stac_metadata.Encoder)

In [94]:
dataset_metadata = stac_metadata.prepare(
    uid,
    sonar_metadata,
    stats_results_dict,
    asb_metadata,
    array_uri,
    coverage_uri,
    soundings_cell_density_uri,
    creds.access_key,
    creds.secret_key,
    final_start_end_timestamps,
    outdir_uri,
    stac_md_uri,
)

In [112]:
query = ds.query(return_incomplete=True).df[idx]

In [113]:
query.estimated_result_sizes()

{'X': EstimatedResultSize(offsets_bytes=0, data_bytes=4838553600),
 'Y': EstimatedResultSize(offsets_bytes=0, data_bytes=4838553600),
 'Z': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'timestamp': EstimatedResultSize(offsets_bytes=0, data_bytes=4838553600),
 'across_track': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'along_track': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'travel_time': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'beam_angle': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'beam_angle_forward': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'vertical_error': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'horizontal_error': EstimatedResultSize(offsets_bytes=0, data_bytes=2419276800),
 'sector_number': EstimatedResultSize(offsets_bytes=0, data_bytes=604819200),
 'beam_flags': EstimatedResultSize(offsets_bytes=0, data_bytes=604819200),
 'ping_flags': Es

In [96]:
4838553600 / 1024 / 1024 / 1024

4.506254196166992

In [59]:
def reduce(results_list, attribute):
    """
    The reducer for the incremental basic statistics routine.

    Tiles/blocks/chunks of data can be distributed across a bunch of
    workers. This routine takes all of those partial results and reduces
    them into the basic statistics of the whole.
    See for method:
    https://math.stackexchange.com/questions/1765042/moving-window-computation-of-skewness-and-kurtosis

    :param results_list:
        An iterable of dicts; ``res[attribute]`` must map statistic names
        (minimum, maximum, count, total, mu2_prime, mu3_prime, mu4_prime)
        to the partial value of one block. The ``mu*_prime`` entries are
        summed and divided by the overall count, i.e. they are treated as
        per-block sums of powers.

    :param attribute:
        Name of the attribute being reduced.

    :return:
        ``{attribute: stats}`` where ``stats`` holds minimum, maximum,
        count, total, mean, variance, stddev, skewness and kurtosis.
    """
    stat_names = (
        "minimum",
        "maximum",
        "count",
        "total",
        "mu2_prime",
        "mu3_prime",
        "mu4_prime",
    )
    gather = {name: [] for name in stat_names}

    for res in results_list:
        partial = res[attribute]
        for stat in partial:
            gather[stat].append(partial[stat])

    minv = numpy.min(gather["minimum"])
    maxv = numpy.max(gather["maximum"])
    total = numpy.sum(gather["total"])
    count = numpy.sum(gather["count"])

    # raw moments about zero
    xbar = total / count
    mu2_prime = numpy.sum(gather["mu2_prime"]) / count
    mu3_prime = numpy.sum(gather["mu3_prime"]) / count
    mu4_prime = numpy.sum(gather["mu4_prime"]) / count

    # central moments derived from the raw moments
    mu2 = mu2_prime - xbar**2
    mu3 = mu3_prime - 3 * xbar * mu2_prime + 2 * xbar**3
    mu4 = mu4_prime - 4 * xbar * mu3_prime + 6 * xbar**2 * mu2_prime - 3 * xbar**4

    # rescale the population variance to the unbiased sample variance
    sigma2 = mu2 * count / (count - 1)

    summary = dict(
        minimum=minv,
        maximum=maxv,
        count=count,
        total=total,
        mean=xbar,
        variance=sigma2,
        stddev=numpy.sqrt(sigma2),
        skewness=mu3 / numpy.sqrt(mu2**3),
        kurtosis=(mu4 / mu2**2) - 3,
    )

    return {attribute: summary}


def scatter(iterable, n):
    """
    Evenly scatters an iterable into `n` blocks.
    Sourced from:
    http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts

    :param iterable:
        A sized, sliceable sequence; preferably a 1D list or array.

    :param n:
        An integer indicating how many blocks to create.

    :return:
        A `list` consisting of `n` blocks of roughly equal size, each
        containing elements from `iterable`. When the length is not a
        multiple of `n`, the leading blocks receive one extra element.
    """
    base_size, extras = divmod(len(iterable), n)

    blocks = []
    start = 0
    for position in range(n):
        # the first `extras` blocks absorb the remainder, one element each
        stop = start + base_size + (1 if position < extras else 0)
        blocks.append(iterable[start:stop])
        start = stop

    return blocks


def collect_stats_results(results):
    """
    Combine the per-attribute statistics dicts into a single dict.

    The items of the ``results`` argument are merged. Reading a notebook
    global here instead would yield an empty result whenever that global
    is still the empty list it is initialised to when the task graph is
    built.

    :param results:
        An iterable of dicts, e.g. one ``{attribute: stats}`` dict per
        reduce task.

    :return:
        One dict holding every key of every item; when a key occurs in
        more than one item, the value from the last item wins.
    """
    combined = {}
    for item in results:
        for key in item:
            combined[key] = item[key]
    return combined

In [27]:
config_dict = config.dict()

In [20]:
with fiona.Env(
    session=AWSSession(
        aws_access_key_id=creds.access_key,
        aws_secret_access_key=creds.secret_key)
    ):
    gdf = geopandas.read_file(soundings_cell_density_uri)

  for feature in features_lst:


In [103]:
gdf

Unnamed: 0,region_code,count,geometry
0,R787230158768,3845,"POLYGON ((145.63515 -38.67348, 145.63515 -38.6..."
1,R787230158776,11988,"POLYGON ((145.63532 -38.67348, 145.63532 -38.6..."
2,R787230158777,15217,"POLYGON ((145.63549 -38.67348, 145.63549 -38.6..."
3,R787230158778,16656,"POLYGON ((145.63566 -38.67348, 145.63566 -38.6..."
4,R787230158786,22741,"POLYGON ((145.63583 -38.67348, 145.63583 -38.6..."
...,...,...,...
72870,R787230585361,73,"POLYGON ((145.67461 -38.72612, 145.67461 -38.7..."
72871,R787230585362,8,"POLYGON ((145.67478 -38.72612, 145.67478 -38.7..."
72872,R787230585363,93,"POLYGON ((145.67444 -38.72630, 145.67444 -38.7..."
72873,R787230585364,31,"POLYGON ((145.67461 -38.72630, 145.67461 -38.7..."


In [30]:
gdf2 = geopandas.GeoDataFrame({"region_code": gdf.region_code.str[0:11], "count": gdf["count"]}).groupby(["region_code"])["count"].agg("sum").to_frame("count").reset_index()

In [31]:
gdf2

Unnamed: 0,region_code,count
0,R7872301587,105588
1,R7872301588,11860
2,R7872301814,4978
3,R7872301815,3029746
4,R7872301817,411670
...,...,...
968,R7872305846,1677
969,R7872305847,1205
970,R7872305848,547
971,R7872305850,22796


In [32]:
gdf2["geometry"] = rhealpix.rhealpix_geo_boundary(gdf2.region_code.values, round_coords=False)

In [33]:
# One (x-range, y-range) pair of slices per cell, taken from the bounding box
# of the cell's geometry: bounds[0]..bounds[-2] along the first axis and
# bounds[1]..bounds[-1] along the second.
slices = [
    (
        slice(cell_geometry.bounds[0], cell_geometry.bounds[-2]),
        slice(cell_geometry.bounds[1], cell_geometry.bounds[-1]),
    )
    for cell_geometry in gdf2["geometry"]
]

In [51]:
n_partitions = 4
n_sub_partitions = 2
blocks = scatter(slices, n_partitions)
attribute = "Z"

In [52]:
len(blocks), len(blocks[0])

(4, 244)

In [53]:
len(scatter(blocks[0], n_sub_partitions)[1])

122

In [37]:
# Statistics are calculated for X and Y followed by every required attribute
# except "timestamp" and "region_code".
stats_attrs = ["X", "Y"] + [
    name for name in required_attributes if name not in ("timestamp", "region_code")
]

In [54]:
# Build the task graph for the incremental basic statistics: one map task per
# (block, sub-block, attribute), one reduce task per attribute, and a final
# local task that gathers the per-attribute results into a single dict.
stats_results = []
tasks_dict = {stat: [] for stat in stats_attrs}
reduce_tasks = []

for i, block in enumerate(blocks):
    sub_blocks = scatter(block, n_sub_partitions)

    for si, sub_block in enumerate(sub_blocks):
        for attribute in stats_attrs:
            # X and Y (presumably the array dimensions -- confirm) are passed
            # by name through `schema`; every other attribute passes None
            schema = attribute if attribute in ["X", "Y"] else None

            task_name = f"block-{i}-sub_block-{si}-{attribute}"
            task = Delayed("sixy6e/basic_statistics_incremental", name=task_name)(array_uri, config_dict, attribute, schema=schema, idxs=sub_block, summarise=False)

            # Chain onto the previous task of the same attribute, if there is
            # one. Testing for "more than one" would leave the first task of
            # every attribute outside the chain, unlike the ingest task graphs
            # which chain from the first task onwards.
            if tasks_dict[attribute]:
                task.depends_on(tasks_dict[attribute][-1])

            tasks_dict[attribute].append(task)

for attribute in stats_attrs:
    task_name = f"reduce-attibute-{attribute}"
    reducer_task = Delayed("sixy6e/basic_statistics_reduce", name=task_name)(tasks_dict[attribute], attribute)
    reduce_tasks.append(reducer_task)

collect_stats_task = Delayed(collect_stats_results, local=True, name="gather-stats")(reduce_tasks)

In [56]:
stats_results = collect_stats_task.compute()

In [67]:
ds = tiledb.open(array_uri, ctx=ctx)

In [68]:
query = ds.query(attrs=["timestamp"], coords=False)

In [69]:
timestamps = query.df[idx]

In [74]:
final_start_end_timestamps = [timestamps.timestamp.min().to_pydatetime(), timestamps.timestamp.max().to_pydatetime()]

In [75]:
final_start_end_timestamps

[datetime.datetime(2017, 6, 14, 0, 29, 56, 287000),
 datetime.datetime(2017, 9, 22, 4, 14, 23, 77000)]

In [95]:
dataset_metadata

{'type': 'Feature',
 'stac_version': '1.0.0',
 'id': 'e5831d06-507b-4a45-a3d3-4658bc7e25f0',
 'properties': {'sonar:processed_datetime': datetime.datetime(2017, 6, 14, 0, 29, 56, 261529, tzinfo=datetime.timezone.utc),
  'sonar:platform_type': 'surface_ship',
  'sonar:full_raw_data': False,
  'sonar:roll_compensated': True,
  'sonar:pitch_compensated': True,
  'sonar:heave_compensated': True,
  'sonar:tide_compensated': True,
  'sonar:number_of_receivers': 1,
  'sonar:number_of_transmitters': 1,
  'sonar:depth_calculation': 'corrected',
  'sonar:ray_tracing': True,
  'sonar:msb_applied_to_attitude': False,
  'sonar:heave_removed_from_gps_tc': False,
  'sonar:utc_offset': 'unknown',
  'sonar:roll_reference': 'unknown',
  'sonar:draft_to_apply': 0.0,
  'sonar:pitch_to_apply': 0.0,
  'sonar:roll_to_apply': 0.0,
  'sonar:gyro_to_apply': 0.0,
  'sonar:position_offset_to_apply': [0.0, 0.0, 0.0],
  'sonar:antenna_offset_to_apply': [0.0, 0.0, 0.0],
  'sonar:transducer_offset_to_apply': [0.0, 0.

In [2]:
tmp_array = "s3://ausseabed-pl019-provided-data/temp-work/bunurong-test-100000-capacity.tiledb/"

In [5]:
config = tiledb.Config(
    {
        "vfs.s3.aws_access_key_id": creds.access_key, 
        "vfs.s3.aws_secret_access_key": creds.secret_key,
        "sm.consolidation.buffer_size": 536870912,
    }
)
ctx = tiledb.Ctx(config=config)

In [None]:
# Pass the config/context built in the previous cell; without them the call
# falls back to the default context, so neither the S3 credentials nor the
# "sm.consolidation.buffer_size" setting configured there would be applied.
tiledb.consolidate(tmp_array, config=config, ctx=ctx)