In [2]:
import boto3
import io
import urllib
import s3fs
import json
from pathlib import Path
import attr
import numpy
import tiledb
import tiledb.cloud
from tiledb.cloud.compute import DelayedArrayUDF, Delayed
import pandas
import geopandas
import fiona
from fiona.session import AWSSession
import pystac
from scipy.stats import skew, kurtosis
import uuid

In [3]:
import pystac
from pystac.extensions.projection import ProjectionExtension
from pystac.extensions.pointcloud import (
    PointcloudExtension,
    SchemaType,
    PhenomenologyType,
    Schema,
    Statistic,
)

In [2]:
from reap_gsf import reap, data_model
from bathy_datasets import rhealpix, storage, geometry, asb_spreadsheet, stac_metadata

In [5]:
# Resolve AWS credentials through the default boto3 session; `creds` is reused
# below for s3fs, TileDB, fiona and the remote UDF calls.
session = boto3.Session()
creds = session.get_credentials()

In [6]:
# S3 filesystem handle authenticated with the session credentials.
# The listings cache is switched off (use_listings_cache=False).
fs = s3fs.S3FileSystem(key=creds.access_key, secret=creds.secret_key, use_listings_cache=False)

In [7]:
# Random identifier embedded in every output name built below.
# A fresh value is generated on each run of this cell.
uid = uuid.uuid4()

In [8]:
# survey_url: S3 prefix searched for the survey's *.gsf files.
# outdir_uri: S3 prefix under which every output of this notebook is written.
# asb_metadata_uri: JSON document with the survey's spreadsheet metadata.
survey_url = "s3://ausseabed-pl019-provided-data/DeakinUniversity/WilsonsProm_WestGlennie_Refuge_SRL/"
outdir_uri = "s3://ausseabed-pl019-ingested-data/L2/WilsonsProm_WestGlennie_Refuge_SRL/"
asb_metadata_uri = "s3://ausseabed-pl019-provided-data/DeakinUniversity/WilsonsProm_WestGlennie_Refuge_SRL/metadata/spreadsheet-metadata.json"

In [9]:
# Output naming: every artefact is "<base_prefix>_<uid>_<suffix>" under outdir_uri.
base_prefix = "ga_ausseabed"
array_name = f"{base_prefix}_{uid}_bathymetry"
# S3 location of the TileDB array.
array_uri = f"{outdir_uri}{array_name}.tiledb"
# TileDB Cloud URI of the same array; the "sixy6e" namespace is hard-coded here
# and must match the namespace used when the array is registered.
tiledb_array_uri = f"tiledb://sixy6e/{array_name}"
# GeoJSON outputs: per-cell sounding density, coverage, and the STAC metadata document.
soundings_cell_density_uri = f"{outdir_uri}{base_prefix}_{uid}_soundings-cell-density-resolution-12.geojson"
coverage_uri = f"{outdir_uri}{base_prefix}_{uid}_coverage.geojson"
stac_md_uri = f"{outdir_uri}{base_prefix}_{uid}_stac-metadata.geojson"

In [10]:
# Collect the GSF files found under the survey prefix.
# NOTE(review): the "**.gsf" pattern relies on the installed fsspec treating a
# "**" that is fused to a suffix as a recursive match — confirm against the
# fsspec version in use (a "**/*.gsf" pattern may be required).
files = fs.glob(survey_url + "**.gsf")
len(files)

10

In [11]:
# Inspection only: list what currently exists under the L2 output prefix.
fs.ls("s3://ausseabed-pl019-ingested-data/L2/")

['ausseabed-pl019-ingested-data/L2/',
 'ausseabed-pl019-ingested-data/L2/WilsonsProm_WestGlennie_Refuge_SRL']

In [12]:
def get_attributes(json_uri):
    """
    Temporary helper that lists the attributes found in a sample GSF file,
    so that only attributes the GSF actually contains are used.

    The JSON metadata document at ``json_uri`` is read through ``fs`` and
    its ``gsf_uri`` entry is handed to the registered
    ``sixy6e/retrieve_stream`` UDF. The resulting stream is decoded by
    ``sixy6e/decode_gsf`` for ``slice(10)`` (presumably the first ten
    ping records -- confirm against the UDF).

    Returns the column names of the decoded dataframe as a list.
    """
    with fs.open(json_uri) as handle:
        metadata = json.loads(handle.read())

    retrieve = Delayed("sixy6e/retrieve_stream", name="retrieve")
    stream_task = retrieve(metadata["gsf_uri"], creds.access_key, creds.secret_key)

    decode = Delayed("sixy6e/decode_gsf", name="decode", image_name="3.7-geo")
    decode_task = decode(stream_task, slice(10))

    # compute() yields (dataframe, file info); only the dataframe is needed here.
    sample_frame, _ = decode_task.compute()
    return list(sample_frame.columns)


def get_sonar_metadata(json_uri):
    """
    Temporary helper that pulls metadata out of a sample GSF file.

    The JSON metadata document at ``json_uri`` supplies the ``gsf_uri``
    that is retrieved and decoded remotely (``sixy6e/retrieve_stream``
    followed by ``sixy6e/decode_gsf`` on ``slice(10)``). Record 0 of file
    info entries 3 and 6 is then read from the retrieved stream; the
    fields of the second record are copied onto the first.

    NOTE(review): indices 3 and 6 are assumed to select the record types
    holding the sonar metadata and the processing history -- confirm
    against the layout returned by ``decode_gsf``.

    Returns the first record's mapping, extended with the second record's
    fields.
    """
    with fs.open(json_uri) as handle:
        metadata = json.loads(handle.read())

    stream_task = Delayed("sixy6e/retrieve_stream", name="retrieve")(
        metadata["gsf_uri"], creds.access_key, creds.secret_key
    )
    decode_task = Delayed("sixy6e/decode_gsf", name="decode", image_name="3.7-geo")(
        stream_task, slice(10)
    )
    # compute() yields (dataframe, file info); only the file info is needed here.
    _, file_info = decode_task.compute()

    sonar_metadata = file_info[3].record(0).read(stream_task.result()[0])
    history_record = file_info[6].record(0).read(stream_task.result()[0])

    # Fold the attrs-based history record into the sonar metadata mapping.
    for field_name, field_value in attr.asdict(history_record).items():
        sonar_metadata[field_name] = field_value

    return sonar_metadata


def reduce_region_codes(results):
    """
    Reduce step of the map-reduce over region_code counts.

    Every element of ``results`` is indexed as
    ``(counts_frame, (start, end))``: a dataframe with ``region_code`` and
    ``count`` columns, and a pair of timestamps.

    Returns a tuple ``(cell_count, [earliest_start, latest_end])`` where
    ``cell_count`` is a dataframe with the counts summed per region_code
    and the two timestamps are python ``datetime`` objects.
    """
    count_frames = []
    starts = []
    ends = []
    for result in results:
        count_frames.append(result[0])
        interval = result[1]
        starts.append(interval[0])
        ends.append(interval[1])

    # Stack the partial counts and total them per region code.
    merged = pandas.concat(count_frames)
    cell_count = (
        merged.groupby(["region_code"])["count"]
        .sum()
        .to_frame("count")
        .reset_index()
    )

    # Overall time span: earliest start and latest end across all results.
    earliest = pandas.Series(starts).min().to_pydatetime()
    latest = pandas.Series(ends).max().to_pydatetime()

    return cell_count, [earliest, latest]


def gather_stats(results):
    """
    Merge the per-task statistics mappings into one dict.

    ``results`` is an iterable of mappings; when the same key occurs in
    several of them, the value from the later mapping is kept.
    """
    return {key: item[key] for item in results for key in item}

In [13]:
# Use the first file's JSON sidecar (same path with a .json suffix) as the sample
# for deciding which attributes the array should carry.
required_attributes = get_attributes(files[0].replace(".gsf", ".json"))

In [14]:
# Sonar metadata is likewise sampled from the first file only.
# Note this repeats the remote retrieve/decode already done by get_attributes.
sonar_metadata = get_sonar_metadata(files[0].replace(".gsf", ".json"))

In [15]:
# TileDB context carrying the S3 credentials, used for direct access to array_uri.
config = tiledb.Config(
        {"vfs.s3.aws_access_key_id": creds.access_key, "vfs.s3.aws_secret_access_key": creds.secret_key}
    )
ctx = tiledb.Ctx(config=config)

In [16]:
# Create the output array at array_uri with the sampled attribute list.
# NOTE(review): re-running this cell with the same uid presumably fails or
# clobbers the existing array — confirm create_mbes_array's behaviour.
storage.create_mbes_array(array_uri, required_attributes, ctx)

In [17]:
# Tuning knobs for the ingestion task graph built in the next cell.
# Files whose metadata "size" / 1024 / 1024 exceeds this value are skipped.
size_limit_mb = 500
# Number of independent task chains; tasks are dealt to the chains round-robin.
processing_node_limit = 30
# Number of pings covered by one slice.
ping_slice_step = 500
# Number of slices handed to a single ingestion task.
slices_per_node = 3

In [18]:
# Build the ingestion task graph.
#
# Every GSF file is cut into ping slices of `ping_slice_step` pings, and the
# slices are grouped `slices_per_node` at a time into one remote
# "sixy6e/ingest_gsf_slices" task. Tasks are dealt round-robin onto
# `processing_node_limit` chains; each task depends on the previous task of
# its chain, so at most `processing_node_limit` tasks are runnable at once.
# Files larger than `size_limit_mb` are recorded in `skipped_files` and not
# ingested.
node_counter = 0
skipped_files = []
tasks = []
tasks_dict = {n: [] for n in range(processing_node_limit)}
for pathname in files:
    # The JSON sidecar sits next to the GSF file and describes its contents.
    metadata_pathname = pathname.replace(".gsf", ".json")
    base_name = Path(pathname).stem
    with fs.open(metadata_pathname) as src:
        gsf_metadata = json.loads(src.read())

    if (gsf_metadata["size"] / 1024 / 1024) > size_limit_mb:
        skipped_files.append(pathname)
        continue

    ping_count = gsf_metadata["file_record_types"]["GSF_SWATH_BATHYMETRY_PING"]["record_count"]
    slices = [slice(start, start+ping_slice_step) for start in numpy.arange(0, ping_count, ping_slice_step)]
    slice_chunks = [slices[i:i+slices_per_node] for i in range(0, len(slices), slices_per_node)]

    for slice_chunk in slice_chunks:
        start_idx = slice_chunk[0].start
        # The task ingests every slice of the chunk, so the label has to end at
        # the stop of the LAST slice (previously the first slice's stop was
        # used, which under-reported the range). For the final chunk of a file
        # the stop may exceed ping_count because slices are built in fixed steps.
        end_idx = slice_chunk[-1].stop
        task_name = f"{base_name}-{start_idx}-{end_idx}-{node_counter}"
        task = Delayed("sixy6e/ingest_gsf_slices", name=task_name, image_name="3.7-geo")(gsf_metadata["gsf_uri"], creds.access_key, creds.secret_key, array_uri, slice_chunk)

        # Serialise the tasks that share a chain.
        if tasks_dict[node_counter]:
            task.depends_on(tasks_dict[node_counter][-1])

        tasks.append(task)
        tasks_dict[node_counter].append(task)

        # Move on to the next chain, wrapping around after the last one.
        node_counter = (node_counter + 1) % processing_node_limit

# Local reduce step combining the region_code counts and timestamps of all tasks.
reduce_task = Delayed(reduce_region_codes, "reduce-region_codes-timestamps", local=True)(tasks)

In [19]:
# Render the ingestion task graph for a visual check before running it.
reduce_task.visualize()

Visualize(value='{"nodes": ["72014d4a-a52b-48f4-af5d-d1d35f42a432", "d8b2f4d6-a531-481f-83ed-5ae275768e18", "c…

In [20]:
# Number of ingestion tasks queued (excludes the local reduce task).
len(tasks)

66

In [21]:
# Execute the whole graph: the remote ingestion tasks, then the local reduce.
# Returns the per-region_code counts and the [start, end] timestamps.
cell_count_df, start_end_timestamps = reduce_task.compute()

In [22]:
# Coarsen the region codes to their first 13 characters and re-total the
# counts per coarsened code.
# NOTE(review): 13 characters presumably corresponds to rHEALPix resolution 12
# (one face letter plus 12 digits) — confirm.
truncated_counts = pandas.DataFrame(
    {
        "region_code": cell_count_df.region_code.str.slice(0, 13),
        "count": cell_count_df["count"],
    }
)
resolution12_df = (
    truncated_counts.groupby(["region_code"])["count"]
    .sum()
    .to_frame("count")
    .reset_index()
)

In [23]:
# Attach a geometry for every region code (computed by rhealpix_geo_boundary).
# This adds a column to resolution12_df in place.
resolution12_df["geometry"] = rhealpix.rhealpix_geo_boundary(resolution12_df.region_code.values)

In [24]:
# Promote to a GeoDataFrame, declaring the geometries as EPSG:4326.
gdf = geopandas.GeoDataFrame(resolution12_df, crs="epsg:4326")

In [25]:
# Write the per-cell sounding counts to S3 as GeoJSONSeq; the fiona environment
# carries the AWS credentials needed for the write.
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    gdf.to_file(soundings_cell_density_uri, driver="GeoJSONSeq", coordinate_precision=11)

In [26]:
# Dissolve the cell geometries with bathy_datasets' geometry.dissolve; the result
# is written to coverage_uri in the next cell.
dissolved = geopandas.GeoDataFrame(geometry.dissolve(gdf), crs="epsg:4326")

  aout[:] = out
  aout[:] = out


In [27]:
# Write the dissolved coverage to S3 as GeoJSONSeq, again under a fiona
# environment carrying the AWS credentials.
with fiona.Env(session=AWSSession(aws_access_key_id=creds.access_key, aws_secret_access_key=creds.secret_key)):
    dissolved.to_file(coverage_uri, driver="GeoJSONSeq", coordinate_precision=11)

In [28]:
# Grid definition used below to look up the cell width at a given resolution.
dggs = rhealpix.RhealpixDGGS.from_ellipsoid()

In [29]:
# Approximate covered area: number of occupied cells times the area of one
# resolution-12 cell.
# NOTE(review): assumes cell_width() is in metres, so dividing the squared
# width by 10000 yields hectares — confirm the unit.
cell_width = dggs.cell_width(12)
area_ha = len(gdf) * cell_width ** 2 / 10000
sonar_metadata["area_ha"] = area_ha
area_ha

407.5839020477999

In [30]:
# Load the survey's spreadsheet metadata (JSON) from S3.
with fs.open(asb_metadata_uri) as src:
    asb_metadata = json.loads(src.read())

In [31]:
# Register the S3-hosted array with TileDB Cloud so it can be addressed through
# tiledb_array_uri; the survey abstract is used as the array description.
tiledb.cloud.register_array(
    uri=array_uri,
    namespace="sixy6e", # Optional: a username or organization; must match the namespace in tiledb_array_uri
    array_name=array_name,
    description=asb_metadata["survey_general"]["abstract"],  # Optional
    access_credentials_name="AusSeabedGMRT-PL019"
)

In [32]:
# Read the schema, domain and non-empty domain of the ingested array; they drive
# the statistics tasks built in the next cell.
with tiledb.open(array_uri, ctx=ctx) as ds:
    schema = ds.schema
    domain = ds.domain
    non_empty_domain = ds.nonempty_domain()

In [33]:
stats_tasks = []

for i in range(domain.ndim):
    attribute = domain.dim(i).name
    name = f"{attribute}_basic_stats"
    
    stats_tasks.append(
        DelayedArrayUDF(tiledb_array_uri, "sixy6e/basic-statistics", attrs=[attribute], name=name, attribute=attribute)(non_empty_domain)
    )

for i in range(schema.nattr):
    attribute = schema.attr(i).name
    name = f"{attribute}_basic_stats"
    
    if attribute in ["timestamp", "region_code"]:
        continue
        
    stats_tasks.append(
        DelayedArrayUDF(tiledb_array_uri, "sixy6e/basic-statistics", attrs=[attribute], name=name, attribute=attribute)(non_empty_domain)
    )

stats_report = Delayed(gather_stats, local=True, name="stats-report")(stats_tasks)

In [34]:
# Render the statistics task graph for a visual check before running it.
stats_report.visualize()

Visualize(value='{"nodes": ["b5a3a1ac-b4c6-428e-99c3-3f6377532ed8", "807ed2cb-9bd1-40aa-909f-48fa34181c9f", "c…

In [35]:
# Run the statistics tasks and gather their results into one dict.
stats_results = stats_report.compute()

In [36]:
#tiledb.cloud.deregister_array(tiledb_array_uri)

In [37]:
# CRS description stored as array metadata in the next cell.
# NOTE(review): the second key is spelled "vertical_data" while the first is
# "horizontal_datum" — this looks like a typo for "vertical_datum". Confirm what
# readers of this metadata expect before renaming the key.
# NOTE(review): epsg:4326 is normally a horizontal CRS; confirm it is the
# intended vertical reference.
crs_info = {
    "horizontal_datum": "epsg:4326",
    "vertical_data": "epsg:4326",
}

In [38]:
# Persist the CRS info and the statistics on the array as JSON strings.
# stac_metadata.Encoder is passed as the JSON encoder class for the statistics.
with tiledb.open(array_uri, "w", ctx=ctx) as ds:
    ds.meta["crs_info"] = json.dumps(crs_info)
    ds.meta["basic_statistics"] = json.dumps(stats_results, cls=stac_metadata.Encoder)

In [1]:
# Assemble the dataset (STAC) metadata from everything gathered above.
# This cell needs the `bathy_datasets` import cell to have run in the current
# kernel (stac_metadata is imported there); running it on a fresh kernel raises
# NameError.
dataset_metadata = stac_metadata.prepare(
    uid,
    sonar_metadata,
    stats_results,
    asb_metadata,
    array_uri,
    coverage_uri,
    soundings_cell_density_uri,
    creds.access_key,
    creds.secret_key,
    start_end_timestamps,
    outdir_uri,
    stac_md_uri,
)

NameError: name 'stac_metadata' is not defined

In [40]:
# Display the prepared metadata for inspection.
dataset_metadata

{'type': 'Feature',
 'stac_version': '1.0.0',
 'id': '76826530-a376-497a-a053-06053e9855db',
 'properties': {'sonar:processed_datetime': datetime.datetime(2016, 6, 18, 0, 37, 21, 662000, tzinfo=datetime.timezone.utc),
  'sonar:platform_type': 'surface_ship',
  'sonar:full_raw_data': False,
  'sonar:roll_compensated': True,
  'sonar:pitch_compensated': True,
  'sonar:heave_compensated': True,
  'sonar:tide_compensated': True,
  'sonar:number_of_receivers': 1,
  'sonar:number_of_transmitters': 1,
  'sonar:depth_calculation': 'corrected',
  'sonar:ray_tracing': True,
  'sonar:msb_applied_to_attitude': False,
  'sonar:heave_removed_from_gps_tc': False,
  'sonar:utc_offset': 'unknown',
  'sonar:roll_reference': 'unknown',
  'sonar:draft_to_apply': 0.0,
  'sonar:pitch_to_apply': 0.0,
  'sonar:roll_to_apply': 0.0,
  'sonar:gyro_to_apply': 0.0,
  'sonar:position_offset_to_apply': [0.0, 0.0, 0.0],
  'sonar:antenna_offset_to_apply': [0.0, 0.0, 0.0],
  'sonar:transducer_offset_to_apply': [0.0, 0.