In [1]:
import requests
import toolz
from datetime import datetime
import pystac

from pathlib import Path

In [2]:
# GeoNetwork record to read (the Argo dataset) and the catalogue endpoint
# that renders a record as JSON; `{dataset}` is filled in with the record ID.
DATASET = "4402cb50-e20a-44ee-93e6-4728259250d2"
GEONETWORK = (
    "https://catalogue-imos.aodn.org.au/geonetwork/srv/api/0.1"
    "/records/{dataset}/formatters/json"
)
# Identifier given to the STAC collection generated from that record.
COLLECTION_NAME = "IMOS_Argo_South_Pacific"

In [3]:
#curl -X GET "https://catalogue-imos.aodn.org.au/geonetwork/srv/api/0.1/records/4402cb50-e20a-44ee-93e6-4728259250d2/formatters/json"
# -H "accept: application/json;charset=utf-8"

# GeoNetwork wants the response format requested through the HTTP Accept
# header rather than through a URL parameter.
headers = {
    "Accept": "application/json",
}

# requests applies no timeout by default; bound the wait so a stalled server
# cannot hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

response = requests.get(
    GEONETWORK.format(dataset=DATASET),
    headers=headers,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
# Stop here on an HTTP error status instead of trying to parse an error
# response as if it were the metadata record.
response.raise_for_status()
full_document = response.json()

In [4]:
# Optional: uncomment the lines below to dump the raw GeoNetwork response to a file for reference
# import fsspec
# with fsspec.open(f"{DATASET}.json", "wb") as f:
#     f.write(response.content)

In [4]:
def geojson_polygon_coordinate_array(geometry):
    """Build a GeoJSON polygon coordinate array from a GML bounding polygon.

    Only the exterior ring is read, so polygons with holes are not supported.

    Args:
        geometry: Nested mapping (JSON rendering of the XML) that contains
            ``gex:EX_BoundingPolygon`` -> ... -> ``gml:posList`` -> ``#text``.

    Returns:
        A one-element list holding the exterior ring, itself a list of
        two-element ``[a, b]`` float pairs in the order they appear in the
        posList.

    Raises:
        ValueError: If the posList cannot be found, contains a non-numeric
            token, or holds an odd number of ordinates.
    """
    # NOTE(review): pairs are emitted in posList order without swapping axes.
    # GeoJSON expects [lon, lat] -- confirm the source posList uses that order.
    posList_path = [
        "gex:EX_BoundingPolygon",
        "gex:polygon",
        "gml:MultiSurface",
        "gml:surfaceMember",
        "gml:Polygon",
        "gml:exterior",
        "gml:LinearRing",
        "gml:posList",
        "#text"
    ]
    pos_list = toolz.get_in(posList_path, geometry)
    if pos_list is None:
        raise ValueError(
            "geometry has no gex:EX_BoundingPolygon exterior gml:posList text"
        )

    # split() with no argument tolerates newlines, tabs and repeated spaces,
    # which split(" ") would turn into empty tokens that float() rejects.
    ordinates = [float(token) for token in pos_list.split()]
    if len(ordinates) % 2 != 0:
        raise ValueError(
            f"gml:posList has an odd number of ordinates ({len(ordinates)})"
        )

    exterior_ring = [[first, second] for first, second in zip(ordinates[0::2], ordinates[1::2])]
    return [exterior_ring]


def geojson_multipolygon(geometries):
    """Assemble a GeoJSON MultiPolygon mapping from XML-derived geometries.

    Each geometry is converted with ``geojson_polygon_coordinate_array`` and
    the results are collected, in input order, under ``"coordinates"``.
    """
    polygon_arrays = []
    for geometry in geometries:
        polygon_arrays.append(geojson_polygon_coordinate_array(geometry))
    return {"type": "MultiPolygon", "coordinates": polygon_arrays}

def extract_date(gml_time):
    """Read the ``gml:TimeInstant``/``gml:timePosition`` of a GML time node.

    Args:
        gml_time: Nested mapping for one end of a time period (for example
            the value under ``gml:begin`` or ``gml:end``).

    Returns:
        A ``datetime`` for the time position, or ``None`` when no time
        position is present (for example a nil reason) or it is empty.

    Raises:
        ValueError: If the time position is present but cannot be parsed.
    """
    # See if we have a nil reason: the lookup yields None when the path is absent.
    datetime_string = toolz.get_in(['gml:TimeInstant', 'gml:timePosition'], gml_time)
    # Presumably the text may also arrive wrapped under "#text", like the other
    # elements read from this document -- harmless if it never does.
    if isinstance(datetime_string, dict):
        datetime_string = datetime_string.get("#text")
    if datetime_string is None:
        return None

    text = datetime_string.strip()
    if not text:
        return None

    try:
        # Plain calendar dates such as "1999-10-01" (zero padding optional).
        return datetime(*[int(part) for part in text.split("-")])
    except ValueError:
        # A time component is present (e.g. "1999-10-01T00:00:00"), which the
        # dash split cannot handle. Older fromisoformat() versions reject a
        # trailing "Z", so spell the UTC offset out explicitly.
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        return datetime.fromisoformat(text)


In [5]:
# The first mri:extent entry of the identification section carries the
# EX_Extent that both the spatial and temporal extents are read from.
gml_extent = toolz.get_in(
    ["mdb:identificationInfo", "mri:MD_DataIdentification", "mri:extent"],
    full_document,
    no_default=True,
)[0]["gex:EX_Extent"]

# NOTE(review): pystac.SpatialExtent is documented as taking bounding boxes,
# but it is given polygon coordinate arrays here, and `spatial_extent` is not
# used by the later cells visible in this notebook -- confirm the intent.
spatial_extent = pystac.SpatialExtent(
    list(map(geojson_polygon_coordinate_array, gml_extent["gex:geographicElement"]))
)

# Temporal coverage: a single gml:TimePeriod with a begin and an end instant.
gml_time = toolz.get_in(
    ["gex:temporalElement", 'gex:EX_TemporalExtent', 'gex:extent', 'gml:TimePeriod'],
    gml_extent,
    no_default=True,
)
start_time = extract_date(gml_time["gml:begin"])
end_time = extract_date(gml_time["gml:end"])
temporal_interval = [start_time, end_time]
temporal_extent = pystac.TemporalExtent([temporal_interval])

In [6]:
# Let's extract all the links from the GN document
distribution = toolz.get_in(["mdb:distributionInfo", 'mrd:MD_Distribution', 'mrd:transferOptions', 'mrd:MD_DigitalTransferOptions', 'mrd:onLine'], full_document, no_default=True)
links = []
for option in distribution:
    target = toolz.get_in(['cit:CI_OnlineResource','cit:linkage', 'gco:CharacterString', "#text"], option, no_default=True)
    name = toolz.get_in(['cit:CI_OnlineResource','cit:name', 'gco:CharacterString', "#text"], option, no_default=True)

    links.append(pystac.Link(
        target=target,
        title=name,
        rel=pystac.RelType.ALTERNATE
        )
    )

In [7]:
# Title and abstract come from the identification section of the record.
data_doc = toolz.get_in(["mdb:identificationInfo", "mri:MD_DataIdentification"], full_document)
abstract = toolz.get_in(["mri:abstract", "gco:CharacterString", "#text"], data_doc, no_default=True)
title = toolz.get_in(["mri:citation", "cit:CI_Citation", "cit:title", "gco:CharacterString", "#text"], data_doc, no_default=True)

# Turns out our extent is too precise... might need to think about this
# (a whole-globe bounding box is used for now).
extent = pystac.Extent(
    spatial=pystac.SpatialExtent([[-180, -90, 180, 90]]),
    temporal=temporal_extent
)

# Build a new list instead of appending to `links`: that list is created by an
# earlier cell, so appending here would add a duplicate "describedby" link
# every time this cell is re-run.
collection_links = [
    *links,
    pystac.Link(
        target=f"https://catalogue-imos.aodn.org.au/geonetwork/srv/eng/catalog.search#/metadata/{DATASET}",
        title="Human readable dataset overview and reference",
        rel="describedby"
    ),
]

# Assets are keyed by asset key because Collection.add_asset takes
# (key, asset); none are defined yet.
assets = {}

collection = pystac.Collection(
    id=COLLECTION_NAME,
    description=abstract,
    extent=extent,
    title=title,
    license="CC-BY-4.0",
    extra_fields={
        "imos_id": DATASET,
    }
)

collection.add_links(collection_links)
for asset_key, asset in assets.items():
    collection.add_asset(asset_key, asset)

collection.set_self_href(f"https://catalogue-imos.aodn.org.au/stac/{COLLECTION_NAME}/")

# Left as the last expression so the list of schemas validated against is
# shown as the cell output.
collection.validate()

['https://schemas.stacspec.org/v1.0.0/collection-spec/json-schema/collection.json']

In [8]:
# The collection is written to documents/<COLLECTION_NAME>_stac-collection.json,
# i.e. next to `dest_path`, not inside it.
dest_path = Path("./documents") / COLLECTION_NAME
dest_file = str(dest_path) + "_stac-collection.json"

# Only the parent directory has to exist; creating `dest_path` itself would
# leave behind an empty directory named after the collection. exist_ok makes
# the cell safe to re-run.
dest_path.parent.mkdir(parents=True, exist_ok=True)

collection.save_object(dest_href=dest_file)


In [10]:
# Make sure we can load as a smoke test: read back the file written by the
# save cell (`dest_file`) rather than a hard-coded file name, so this also
# works on a fresh kernel run from top to bottom.
collection = pystac.Collection.from_file(dest_file)
assert collection.id == COLLECTION_NAME, "reloaded collection has an unexpected id"
