In [1]:
import numpy as np
import pandas as pd
import pystac

In [2]:
# Set up pystac to read/write from S3

# Name of the AWS profile passed to boto3.setup_default_session() when
# CustomStacIO is instantiated below.
# NOTE(review): presumably this profile must be configured locally and have
# access to the catalog bucket — adjust to your own profile name.
aws_profile = "edge-admin"

from urllib.parse import urlparse
import boto3
from pystac import Link
from pystac.stac_io import DefaultStacIO, StacIO
from typing import Union, Any


class CustomStacIO(DefaultStacIO):
    """StacIO implementation that reads and writes ``s3://`` hrefs with boto3.

    Anything that is not an ``s3://`` href is delegated unchanged to
    ``DefaultStacIO``.
    """

    def __init__(self):
        # All S3 access below goes through the default boto3 session, which is
        # configured here from the notebook-level ``aws_profile``.
        boto3.setup_default_session(profile_name=aws_profile)
        self.s3 = boto3.resource("s3")
        super().__init__()

    @staticmethod
    def _s3_location(target: Union[str, Link]):
        """Return ``(bucket, key)`` if ``target`` points at S3, else ``None``.

        ``target`` may be an href string or a ``Link``. A ``Link`` is resolved
        to its absolute href first because ``urlparse`` only understands
        strings; anything that does not resolve to a string (an unresolvable
        link, a path-like object) is reported as "not S3".
        """
        href = target.get_absolute_href() if isinstance(target, Link) else target
        if not isinstance(href, str):
            return None
        parsed = urlparse(href)
        if parsed.scheme != "s3":
            return None
        # netloc is the bucket; the path carries a leading "/" that is not
        # part of the object key.
        return parsed.netloc, parsed.path[1:]

    def read_text(self, source: Union[str, Link], *args: Any, **kwargs: Any) -> str:
        """Return the UTF-8 decoded content of ``source``.

        S3 hrefs are fetched with boto3; every other source is passed through
        to ``DefaultStacIO.read_text`` exactly as received.
        """
        location = self._s3_location(source)
        if location is None:
            return super().read_text(source, *args, **kwargs)

        bucket, key = location
        obj = self.s3.Object(bucket, key)
        return obj.get()["Body"].read().decode("utf-8")

    def write_text(
        self, dest: Union[str, Link], txt: str, *args: Any, **kwargs: Any
    ) -> None:
        """Write ``txt`` to ``dest``.

        S3 hrefs are written with boto3; every other destination is passed
        through to ``DefaultStacIO.write_text`` exactly as received.
        """
        location = self._s3_location(dest)
        if location is None:
            super().write_text(dest, txt, *args, **kwargs)
            return

        bucket, key = location
        # NOTE(review): ContentEncoding sets the object's Content-Encoding
        # metadata; confirm "utf-8" is the intended value for that header.
        self.s3.Object(bucket, key).put(Body=txt, ContentEncoding="utf-8")


# Make every subsequent pystac read/write go through CustomStacIO.
StacIO.set_default(CustomStacIO)

## Load the catalog
This will create a Catalog object, but will not load the metadata items yet.

In [3]:
# Location of the root catalog document in the index bucket.
CATALOG_HREF = 's3://aodn-data-index-nonproduction/catalog.json'

catalog = pystac.Catalog.from_file(CATALOG_HREF)
catalog

## Get all s3 paths for files in a collection
Querying by metadata uuid

In [4]:
uuid = "279a50e3-21a5-4590-85a0-71f963efab82"
# recursive=True searches the whole catalog tree, not only direct children.
collection = catalog.get_child(uuid, recursive=True)
# get_child returns None for an unknown id; fail here with a clear message
# instead of with an AttributeError in a later cell.
if collection is None:
    raise ValueError(f"No collection with id {uuid!r} found in the catalog")
collection

In [5]:
# Collect the href of the 'data' asset of every item below the collection.
# get_items(recursive=True) is used instead of the deprecated get_all_items(),
# matching the other cells of this notebook.
s3_paths = [item.assets['data'].href for item in collection.get_items(recursive=True)]
s3_paths

['s3://imos-data/IMOS/ANMN/SA/SAM1DS/gridded_timeseries/IMOS_ANMN-SA_TZ_20081211_SAM1DS_FV02_TEMP-gridded-timeseries_END-20090604_C-20220622.nc',
 's3://imos-data/IMOS/ANMN/SA/SAM2CP/gridded_timeseries/IMOS_ANMN-SA_TZ_20081020_SAM2CP_FV02_TEMP-gridded-timeseries_END-20100317_C-20220622.nc',
 's3://imos-data/IMOS/ANMN/SA/SAM3MS/gridded_timeseries/IMOS_ANMN-SA_TZ_20110222_SAM3MS_FV02_TEMP-gridded-timeseries_END-20130630_C-20220622.nc',
 's3://imos-data/IMOS/ANMN/SA/SAM4CY/gridded_timeseries/IMOS_ANMN-SA_TZ_20090205_SAM4CY_FV02_TEMP-gridded-timeseries_END-20100316_C-20220622.nc',
 's3://imos-data/IMOS/ANMN/SA/SAM5CB/gridded_timeseries/IMOS_ANMN-SA_TZ_20090206_SAM5CB_FV02_TEMP-gridded-timeseries_END-20241119_C-20250125.nc',
 's3://imos-data/IMOS/ANMN/SA/SAM6IS/gridded_timeseries/IMOS_ANMN-SA_TZ_20090205_SAM6IS_FV02_TEMP-gridded-timeseries_END-20090602_C-20220622.nc',
 's3://imos-data/IMOS/ANMN/SA/SAM7DS/gridded_timeseries/IMOS_ANMN-SA_TZ_20091215_SAM7DS_FV02_TEMP-gridded-timeseries_END-201

This also works with higher-level metadata items that contain several collections.

In [6]:
uuid = "6c981d98-d7fb-4120-9ebe-347ef1188ae0"
# recursive=True searches the whole catalog tree, not only direct children.
collection = catalog.get_child(uuid, recursive=True)
# get_child returns None for an unknown id; fail here with a clear message
# instead of with an AttributeError in a later cell.
if collection is None:
    raise ValueError(f"No collection with id {uuid!r} found in the catalog")
collection

In [7]:
# Walk every item below the collection, including those in nested collections,
# and keep the href of each item's 'data' asset.
nested_items = collection.get_items(recursive=True)
s3_paths = [item.assets['data'].href for item in nested_items]
len(s3_paths)

2337

## Identify duplicate files


In [8]:
# Get all items in ANMN collection
anmn_uuid = "f9c151bd-d95b-4af6-8cb7-21c05b7b383b"
collection = catalog.get_child(anmn_uuid, recursive=True)
# get_child returns None for an unknown id; fail here with a clear message
# instead of with an AttributeError on the next line.
if collection is None:
    raise ValueError(f"No collection with id {anmn_uuid!r} found in the catalog")
items = collection.get_items(recursive=True)

# Convert to DataFrame: one row per item, indexed by the Item object itself so
# that later cells can reach its assets. Columns are the item's properties
# plus the id of the collection the item belongs to.
df = pd.DataFrame.from_dict(
    {
        item: item.properties | {"collection": item.collection_id}
        for item in items
    },
    orient="index",
)

In [9]:
# Items that agree on all of these columns are reported as duplicates of
# each other.
duplicate_key_columns = [
    'deployment_code',
    'collection',
    'file_version',
    'instrument_serial_number',
    'toolbox_input_file',
]

# Only items that record a toolbox_input_file take part in the comparison.
has_input_file = df['toolbox_input_file'].notna()
duplicates = df[has_input_file]

# keep=False flags every member of a duplicated group, not just the repeats.
is_duplicated = duplicates.duplicated(duplicate_key_columns, keep=False)
duplicates = duplicates[is_duplicated]
duplicates

Unnamed: 0,Conventions,acknowledgement,author,author_email,citation,contributor_email,contributor_name,contributor_role,data_centre,data_centre_email,...,distribution_statement,project_acknowledgement,instrument_burst_duration,instrument_burst_interval,instrument_beam_angle,quality_control_set,instrument_average_interval,netcdf_group_name,toolbox_parser,rejected_files
<Item id=IMOS_ANMN_SA_CJ1_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240521T081400Z_CJ1_FV01_Profile-SBE19plus_C-20240905T020756Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
<Item id=IMOS_ANMN_SA_CJ1_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240521T081400Z_CJ1_FV01_Profile-SBE19plus_C-20241220T054611Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
<Item id=IMOS_ANMN_SA_CJ2_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240521T060400Z_CJ2_FV01_Profile-SBE19plus_C-20240905T020757Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
<Item id=IMOS_ANMN_SA_CJ2_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240521T060400Z_CJ2_FV01_Profile-SBE19plus_C-20241220T054612Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
<Item id=IMOS_ANMN_SA_CJ2_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240924T135900Z_CJ2_FV01_Profile-SBE19plus_C-20241009T025854Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
<Item id=IMOS_ANMN_SA_VBM100_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20230623T013900Z_VBM100_FV01_Profile-SBE19plus_C-20250114T005707Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
<Item id=IMOS_ANMN_SA_VBM100_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240718T023500Z_VBM100_FV01_Profile-SBE19plus_C-20241015T003538Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
<Item id=IMOS_ANMN_SA_VBM100_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240718T023500Z_VBM100_FV01_Profile-SBE19plus_C-20250116T062051Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,
<Item id=IMOS_ANMN_SA_VBM100_Biogeochem_profiles_IMOS_ANMN-SA_CDEFKOSTUZ_20240718T023700Z_VBM100_FV01_Profile-SBE19plus_C-20241015T002644Z_nc>,"CF-1.6,IMOS-1.4",Any users of IMOS data are required to clearly...,"de Oliveira, Hugo",hugo.oliveira@sa.gov.au,"The citation in a list of references is: ""IMOS...",,,,Australian Ocean Data Network (AODN),info@aodn.org.au,...,,,,,,,,,,


In [10]:
# List the 'data' asset href of every item in df that starts with this prefix.
href_prefix = 's3://imos-data/IMOS/ANMN/SA/VBM100/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_2024'
data_hrefs = (item.assets['data'].href for item in df.index)
[href for href in data_hrefs if href.startswith(href_prefix)]

['s3://imos-data/IMOS/ANMN/SA/VBM100/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240407T063700Z_VBM100_FV01_Profile-SBE19plus_C-20240905T023025Z.nc',
 's3://imos-data/IMOS/ANMN/SA/VBM100/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240718T023500Z_VBM100_FV01_Profile-SBE19plus_C-20241015T003538Z.nc',
 's3://imos-data/IMOS/ANMN/SA/VBM100/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240718T023500Z_VBM100_FV01_Profile-SBE19plus_C-20250116T062051Z.nc',
 's3://imos-data/IMOS/ANMN/SA/VBM100/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240718T023700Z_VBM100_FV01_Profile-SBE19plus_C-20241015T002644Z.nc',
 's3://imos-data/IMOS/ANMN/SA/VBM100/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240718T023700Z_VBM100_FV01_Profile-SBE19plus_C-20250116T062050Z.nc',
 's3://imos-data/IMOS/ANMN/SA/VBM100/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20241021T222800Z_VBM100_FV01_Profile-SBE19plus_C-20250113T230053Z.nc']

In [11]:
# The index of `duplicates` holds the Item objects, so their 'data' asset
# hrefs can be read straight from it.
duplicate_items = duplicates.index
[item.assets['data'].href for item in duplicate_items]

['s3://imos-data/IMOS/ANMN/SA/CJ1/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240521T081400Z_CJ1_FV01_Profile-SBE19plus_C-20240905T020756Z.nc',
 's3://imos-data/IMOS/ANMN/SA/CJ1/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240521T081400Z_CJ1_FV01_Profile-SBE19plus_C-20241220T054611Z.nc',
 's3://imos-data/IMOS/ANMN/SA/CJ2/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240521T060400Z_CJ2_FV01_Profile-SBE19plus_C-20240905T020757Z.nc',
 's3://imos-data/IMOS/ANMN/SA/CJ2/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240521T060400Z_CJ2_FV01_Profile-SBE19plus_C-20241220T054612Z.nc',
 's3://imos-data/IMOS/ANMN/SA/CJ2/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240924T135900Z_CJ2_FV01_Profile-SBE19plus_C-20241009T025854Z.nc',
 's3://imos-data/IMOS/ANMN/SA/CJ2/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240924T135900Z_CJ2_FV01_Profile-SBE19plus_C-20241220T055637Z.nc',
 's3://imos-data/IMOS/ANMN/SA/CJ3/Biogeochem_profiles/IMOS_ANMN-SA_CDEFKOSTUZ_20240521T040600Z_CJ3_FV01_Profile-SBE19plus_C-

In [12]:
# The index labels are the pystac Item objects themselves, so this displays
# the first item flagged as a duplicate.
duplicates.index[0]

## Identify input files for creating a product
For example, the Moorings - Hourly time-series product is generated from hundreds of individual instrument-deployment files at each mooring site.

Code adapted from [this script](https://github.com/aodn/python-aodndata/blob/master/aodndata/moorings/moorings_product_trigger.py).

In [13]:
# Variables that are considered when pivoting the file list; every other
# variable name found in a file is ignored.
INCLUDED_VARIABLES = {'TEMP', 'PSAL', 'CPHL', 'CHLF', 'CHLU', 'TURB', 'DOX1', 'DOX2', 'DOXS', 'PAR', 'VCUR'}


def pivot_variables(df: pd.DataFrame) -> pd.DataFrame:
    """Rearrange the file-list data frame so that each row lists one variable only
    (multiple rows per file where needed).

    Args:
        df: frame with a 'variables' column holding, for each file, a mapping
            keyed by variable name.

    Returns:
        ``df`` without the 'variables' column and with a new 'variable'
        column: one row per (file, variable) pair for the variables listed in
        INCLUDED_VARIABLES. Files with none of those variables, or with no
        variable mapping at all (e.g. NaN), produce no rows. Rows of one file
        are ordered by variable name so the result is the same on every run.

    Raises:
        AssertionError: if ``df`` has no 'variables' column.
    """
    assert 'variables' in df.columns
    files_vars = []
    for index, variables in df['variables'].items():
        # Files without variable metadata show up as NaN/None rather than a
        # mapping; skip them instead of failing on ``.keys()``.
        if not hasattr(variables, 'keys'):
            continue
        # Sort because set iteration order is not stable between runs.
        wanted = sorted(set(variables.keys()) & INCLUDED_VARIABLES)
        files_vars.extend((index, v) for v in wanted)

    files_vars = pd.DataFrame(files_vars, columns=['i', 'variable']).set_index('i')

    # Right join: keep exactly the (file, variable) pairs collected above.
    return df.drop(columns='variables').join(files_vars, how='right')

In [14]:
possible_sites = ['NRSDAR', 'NRSESP', 'NRSKAI', 'NRSMAI', 'NRSNIN', 'NRSNSI',
                  'NRSPHB', 'NRSROT', 'NRSYON', 'SEQ200', 'SEQ400', 'DARBGF']
aggregated_uuid = "moorings-aggregated-timeseries-product"
hourly_uuid = "efd8201c-1eca-412e-9ad2-0534e96cea14"


def load_product_files(collection_id: str, columns: list) -> pd.DataFrame:
    """Return the given property columns for every item below a collection.

    Args:
        collection_id: id of the collection, looked up recursively in ``catalog``.
        columns: names of the item properties to keep.

    Returns:
        A frame indexed by item with exactly ``columns``. A collection without
        items yields an empty frame that still has those columns, so callers
        can test ``.empty`` instead of hitting a KeyError.

    Raises:
        ValueError: if the catalog has no collection with that id.
        KeyError: if the collection has items but a requested column is missing.
    """
    product_collection = catalog.get_child(collection_id, recursive=True)
    if product_collection is None:
        raise ValueError(f"No collection with id {collection_id!r} found in the catalog")
    frame = pd.DataFrame.from_dict(
        {item: item.properties for item in product_collection.get_items(recursive=True)},
        orient="index",
    )
    if frame.empty:
        return pd.DataFrame(columns=columns)
    return frame[columns]


# The product collections do not depend on the site, so they are read from the
# catalog once instead of once per site.
# NOTE(review): neither product frame is filtered by site, so every site is
# compared against the products of all sites — confirm whether product items
# carry a site_code property and should be restricted to the current site.

# Aggregated products
aggregated_products = pivot_variables(
    load_product_files(aggregated_uuid, ['created', 'cube:variables']).rename(
        columns={'cube:variables': 'variables'}
    )
)

# Hourly products
hourly_files = load_product_files(hourly_uuid, ['created'])
products_updated = hourly_files.created.max()

for site_code in possible_sites:
    site_files = df[df['site_code'] == site_code]
    source_index = site_files['file_version'] == 'Level 1 - Quality Controlled Data'

    # We will need to filter out aggregated products once we index them.
    # Can be done filtering by collection id, possibly after going through the
    # collections to see which ones are aggregations
    source_files = site_files.loc[source_index, ['created', 'cube:variables']].rename(
        columns={'cube:variables': 'variables'}
    )
    source_files = pivot_variables(source_files)

    # when were data for each variable updated?
    vars_updated = source_files.groupby('variable').created.max().rename('source_updated')
    # Right join: one row per (product file, variable) for every variable this
    # site has source data for; 'created' is missing where no product lists it.
    aggregated_files = aggregated_products.join(vars_updated, on='variable', how='right')

    # which variables have newer data than the product, or missing product?
    needs_update = aggregated_files.created.isna() | (
        aggregated_files.created < aggregated_files.source_updated
    )
    new_vars = set(aggregated_files.loc[needs_update, 'variable'])

    # reprocess hourly products if newer source files exist, or product files missing?
    process_hourly = bool(
        hourly_files.empty or (source_files.created > products_updated).any()
    )

    if not new_vars and not process_hourly:
        print(f"{site_code}: All products up to date")
    else:
        # products to generate
        # VCUR is used as a proxy for all velocity variables - if included, need to handle separately
        products = set()
        if 'VCUR' in new_vars:
            new_vars.remove('VCUR')
            products.update({'velocity_aggregated', 'velocity_hourly'})
        if new_vars:
            products.add('aggregated')
        if process_hourly:
            # gridded is created from hourly, so need to process both
            products.update({'hourly', 'gridded'})

        # create manifest (sorted so the printed manifest is the same on every run)
        manifest = {'site_code': site_code,
                    'variables': sorted(new_vars),
                    'products': sorted(products)
                    }
        print(manifest)

NRSDAR: All products up to date
NRSESP: All products up to date
NRSKAI: All products up to date
NRSMAI: All products up to date
NRSNIN: All products up to date
NRSNSI: All products up to date
NRSPHB: All products up to date
NRSROT: All products up to date
NRSYON: All products up to date
SEQ200: All products up to date
SEQ400: All products up to date
DARBGF: All products up to date
