## Access {{ dataset_name }} data in Parquet

A Jupyter notebook showing how to access and plot {{ dataset_name }} data available as a [Parquet](https://parquet.apache.org) dataset on S3.

In [None]:
# Dataset name without the ".parquet" suffix; the suffix is appended further
# down when the dataset identifier (`dname`) is built.
dataset_name = "dataset_parquet_name"

## Install/Update packages and Load common functions

In [None]:
# Bootstrap step: install the `uv` installer with pip through a shell escape.
# Only run once, then restart the session if needed.
!pip install uv

import os
import sys
import platform

def is_colab():
    """Tell whether the `google.colab` package can be imported.

    Returns:
        bool: True when `import google.colab` succeeds, False when it raises
        ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

# Work out which requirements file to install from: a requirements.txt found
# in the notebook's current working directory wins; otherwise fall back to the
# copy published online in the GitHub repository.
current_dir = os.getcwd()
local_requirements = os.path.join(current_dir, 'requirements.txt')
requirements_path = (
    local_requirements
    if os.path.exists(local_requirements)
    else 'https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/notebooks/requirements.txt'
)

# Install packages using uv and the determined requirements file.
# Each branch only decides WHICH shell commands are needed; they are run, in
# order, by the loop at the end.
if is_colab():  # For Google Colab
    import xarray as xr
    xr.set_options(display_style='text')
    install_commands = [
        f'uv pip install --system -r {requirements_path}',
        'uv pip install --system pyopenssl --upgrade',
    ]
elif 'jupyter' in platform.uname().node:  # For Nectar Instance https://jupyterhub.rc.nectar.org.au
    install_commands = [f'uv pip install --system -r {requirements_path}']
else:  # If running locally
    install_commands = [
        'uv venv',
        f'uv pip install -r {requirements_path}',
    ]

for command in install_commands:
    os.system(command)

In [None]:
import os
import re
import requests
import importlib.util
from packaging.version import Version, InvalidVersion

# File name (relative to the current working directory) under which the
# DataQuery helper module is stored locally.
DATAQUERY_PATH = "DataQuery.py"
# Raw GitHub URL of the reference copy of DataQuery.py on the main branch.
DATAQUERY_URL = "https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/aodn_cloud_optimised/lib/DataQuery.py"


def load_dataquery():
    """Keep the local DataQuery.py in sync with the remote reference copy.

    The remote file is fetched first. If it yields no version, nothing is
    changed. Otherwise the local version is looked up and the remote code is
    written locally when the local version is missing or older than the
    remote one. A status line is printed in every case.

    Returns:
        None
    """
    latest, source = get_remote_version_and_code()
    if latest is None:
        print("❌ Remote file does not contain a valid __version__, skipping update.")
        return

    # Only look at the local file once we know there is something to compare to.
    installed = get_local_version()

    if installed is not None and not (latest > installed):
        print(f"✅ Local version {installed} is up to date (remote: {latest})")
        return

    if installed is None:
        print("⚠️ Local file has no version or is missing. Downloading remote file.")
    else:
        print(f"🔄 Updating: local version {installed} < remote version {latest}")
    write_dataquery(source)


def get_local_version():
    """Return the version declared in the local DataQuery.py, if any.

    The file is read as text and its ``__version__ = "..."`` assignment is
    extracted with the same regular expression that is applied to the remote
    copy. The module is deliberately not imported or executed here: running it
    only to read one string would trigger all of its imports, and any failure
    while doing so would make the version look missing.

    Returns:
        Version | None: The parsed version, or None when the file does not
        exist, declares no ``__version__`` string literal, or cannot be
        read/parsed (in which case the error is printed).
    """
    if not os.path.exists(DATAQUERY_PATH):
        return None
    try:
        with open(DATAQUERY_PATH, "r", encoding="utf-8") as f:
            code = f.read()
        match = re.search(r'^__version__\s*=\s*["\']([^"\']+)["\']', code, re.MULTILINE)
        return Version(match.group(1)) if match else None
    except Exception as e:
        # Best effort: an unreadable file or invalid version string is
        # reported and treated as "no local version".
        print(f"Error reading local version: {e}")
        return None


def get_remote_version_and_code():
    """Download the reference DataQuery.py and extract its version.

    Returns:
        tuple: ``(version, code)`` where

        * ``(Version, str)`` -- download succeeded and a
          ``__version__ = "..."`` line was found;
        * ``(None, str)`` -- download succeeded but no such line was found;
        * ``(None, None)`` -- the request failed or any other error was
          raised (the error is printed).
    """
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the notebook cell indefinitely.
        response = requests.get(DATAQUERY_URL, timeout=30)
        response.raise_for_status()
        code = response.text
        # Match a module-level assignment such as: __version__ = "1.2.3"
        match = re.search(r'^__version__\s*=\s*["\']([^"\']+)["\']', code, re.MULTILINE)
        if match:
            version_str = match.group(1)
            return Version(version_str), code
        else:
            return None, code
    except Exception as e:
        print(f"Error fetching remote file: {e}")
        return None, None


def write_dataquery(code):
    """Write ``code`` to the local DataQuery file, replacing any existing copy.

    Args:
        code (str): Full source text to store at ``DATAQUERY_PATH``
            (written as UTF-8).
    """
    with open(DATAQUERY_PATH, "w", encoding="utf-8") as f:
        f.write(code)
    # Report the path actually written instead of a hard-coded file name.
    print(f"📥 Wrote updated {DATAQUERY_PATH}")


# Fetch or refresh the local DataQuery.py so that it can be imported afterwards.
load_dataquery()

In [None]:
from DataQuery import GetAodn

# Understanding the Dataset

## Get partition keys
Partitioning in Parquet involves organising data files based on the values of one or more columns, known as partition keys. When data is written to Parquet files with partitioning enabled, the files are physically stored in a directory structure that reflects the partition keys. This directory structure makes it easier to retrieve and process specific subsets of data based on the partition keys.

In [None]:
# GetAodn comes from the DataQuery module imported above.
aodn = GetAodn()
# Dataset identifier: the dataset name with a ".parquet" suffix.
dname = f'{dataset_name}.parquet'
# %time prints how long the call took.
%time aodn_dataset = aodn.get_dataset(dname)

In [None]:
# Show the schema of the dataset's partitioning, i.e. its partition keys.
# NOTE(review): `aodn_dataset.dataset` looks like a pyarrow dataset object -- confirm.
aodn_dataset.dataset.partitioning.schema

## List unique partition values

In [None]:
%%time
# 'YOUR_PARTITION_KEY' is a placeholder: replace it with one of the partition
# keys of the dataset.
unique_partition_value = aodn_dataset.get_unique_partition_values('YOUR_PARTITION_KEY')
# Only the first two values are printed to keep the output short.
print(list(unique_partition_value)[0:2])  # showing a subset only

## Visualise Spatial Extent of the dataset
In this section, we plot the polygons where data exists. This helps when creating a bounding box that actually contains data.

In [None]:
# Plot the spatial extent of the dataset (the polygons where data exists,
# as described in the section text).
aodn_dataset.plot_spatial_extent()

## Get Temporal Extent of the dataset

Similarly to the spatial extent, we retrieve the minimum and maximum timestamp partition values of the dataset. These are not necessarily an accurate representation of the TIME values, as the timestamp partition can be yearly, monthly, etc., but they give an idea of the temporal coverage.

In [None]:
%%time
# Temporal extent of the dataset; per the section text it is derived from the
# timestamp partition values, so it is only approximate.
aodn_dataset.get_temporal_extent()

## Read Metadata

For every Parquet dataset, we create a sidecar file named **_common_metadata** in the root of the dataset. It contains the variable attributes.

In [None]:
# Read the dataset metadata (the variable attributes, per the section text).
metadata = aodn_dataset.get_metadata()
# Bare expression on the last line so the notebook displays the result.
metadata

# Data Query and Plot

## Create a TIME and BoundingBox filter

In [None]:
%%time
# Query a time window combined with a latitude/longitude bounding box.
# lat_varname / lon_varname name the coordinate variables; adjust them, and
# the bounds, to the dataset at hand.
df = aodn_dataset.get_data(
    date_start='2022-12-01',
    date_end='2023-01-01',
    lat_min=-34,
    lat_max=-28,
    lon_min=151,
    lon_max=160,
    lat_varname='latitude',
    lon_varname='longitude',
)
df.info()

## Create a TIME and scalar/number filter

In [None]:
%%time
# Query a time window combined with a scalar filter passed as a
# {column: value} dict. 'YOUR_PARTITION_KEY' and the value are placeholders:
# replace them with a real key and one of its values.
df = aodn_dataset.get_data(
    date_start='2006-07-12',
    date_end='2023-02-05',
    scalar_filter={'YOUR_PARTITION_KEY': 1901740},
)
df.info()