In [1]:
import os
import json
import shutil

from typing import List, Dict
from datetime import datetime
from hashlib import md5

## Defining your job inputs

The following cell is tagged with "parameters", which allows papermill to identify the cell containing per-run parameters.
Cell tags may be accessed using the double-gear icon in JupyterLab's left-hand gutter.

All variables defined in the following cell are treated as job input parameters, and will be accessible through the `_context.json` file at runtime.

For more information, visit https://papermill.readthedocs.io/en/latest/

In [2]:
# Job input parameters

# GeoJSON Feature describing the area of interest.
# A GeoJSON Polygon's "coordinates" member is a list of linear rings (exterior ring first),
# so the single ring below is wrapped in an outer list.
# The Feature's properties.name is used as the label of the generated data product.
polygon: Dict = {
  "type": "Feature",
  "geometry": {
    "type": "Polygon",
    "coordinates": [[[-180, -90], [-180, 90], [180, 90], [180, -90], [-180, -90]]]
  },
  "properties": {
    "name": "Earth"
  }
}
track_number: int = 0

# Bounds of the "active" pixel window
# NOTE(review): pixel units and whether the max bounds are inclusive are not visible here - confirm with the consumer
active_pixel_x_min: int = 20
active_pixel_x_max: int = 30
active_pixel_y_min: int = 20
active_pixel_y_max: int = 30

# Bounds of the "outer" pixel window (the defaults enclose the active window)
outer_pixel_x_min: int = 15
outer_pixel_x_max: int = 35
outer_pixel_y_min: int = 15
outer_pixel_y_max: int = 35

# Temporal bounds of the product; parsed downstream with the '%Y-%m-%d' format
start_date: str = "2000-01-01"
end_date: str = "2100-01-01"

# PCM-System Parameters
# These use reserved-prefix parameter names (_*) and are also parsed during `notebook-pge-wrapper specs` to generate the hysds-io and job-spec
_time_limit = 600
_soft_time_limit = 600
_disk_usage = "1GB"
_submission_type = "individual"
_required_queue = "factotum-job_worker-small"
_label = "Create Volcano Anomaly AOI"

## Defining your job outputs and metadata files

The following cell contains the functions necessary to create a trivial data product for ingestion into the PCM data product catalog.

These functions should be augmented to include your desired dataset definition data, metadata and job output files

It is also typical to include important fields (e.g. track number, orbit direction and temporal bound timestamps) in the dataset id

In [3]:
# Absolute path of the directory the notebook is executed from; the functions below read and write their files relative to it
working_dir = os.path.abspath(os.path.curdir)

def generate_dummy_context_file(params: Dict = None) -> None:
    """Writes a stand-in _context.json to the working directory, for local development only.

    When run in HySDS, a _context.json file will already be present in the working directory.
    The dummy file carries a `run_timestamp` plus a `job_specification.params` list of
    {"name": ..., "value": ...} entries, which is the structure generate_data_product reads.

    Args:
        params: Mapping of job parameter name to value. When omitted, the defaults declared
            in the notebook's "parameters" cell are used.
    """
    if params is None:
        # Resolved lazily so the parameter-cell names are only needed when no explicit params are given
        params = {
            'polygon': polygon,
            'track_number': track_number,
            'active_pixel_x_min': active_pixel_x_min,
            'active_pixel_x_max': active_pixel_x_max,
            'active_pixel_y_min': active_pixel_y_min,
            'active_pixel_y_max': active_pixel_y_max,
            'outer_pixel_x_min': outer_pixel_x_min,
            'outer_pixel_x_max': outer_pixel_x_max,
            'outer_pixel_y_min': outer_pixel_y_min,
            'outer_pixel_y_max': outer_pixel_y_max,
            'start_date': start_date,
            'end_date': end_date,
        }

    context = {
        'run_timestamp': datetime.now().isoformat(),
        'job_specification': {
            'params': [{'name': name, 'value': value} for name, value in params.items()]
        },
    }

    filepath: str = os.path.join(working_dir, '_context.json')
    print(f'Writing dummy context to {filepath}')
    # Plain write mode is sufficient: the file is never read back through this handle
    with open(filepath, 'w') as context_file:
        json.dump(context, context_file)

def generate_dataset_id(id_prefix: str, context_str: str) -> str:
    """Builds the ID of the data product as `<id_prefix>-<current local time, ISO 8601>-<hash>`.

    The hash is the first five hex characters of the MD5 digest of `context_str`; together with
    the timestamp it is what keeps IDs from colliding. The ID is printed and returned.
    """
    context_digest = md5(context_str.encode()).hexdigest()
    timestamp = datetime.now().isoformat()

    job_id = '-'.join((id_prefix, timestamp, context_digest[:5]))

    print(f'Generated job ID: {job_id}')
    return job_id


def generate_dataset_file(dataset_id: str, **kwargs) -> None:
    """Writes `<dataset_id>.dataset.json` into the `<working_dir>/<dataset_id>` directory.

    The file always carries a 'version' (default 'v1.0'). Of the remaining keyword arguments,
    only 'label', 'location', 'starttime' and 'endtime' are copied across, and only when
    supplied; any other keyword is ignored. The target directory must already exist.
    """
    target_path: str = os.path.join(working_dir, dataset_id, f'{dataset_id}.dataset.json')

    # 'location' must adhere to geoJSON "geometry" format
    recognised_keys = ('label', 'location', 'starttime', 'endtime')

    payload: dict = {'version': kwargs.get('version', 'v1.0')}
    payload.update({key: kwargs[key] for key in recognised_keys if key in kwargs})

    with open(target_path, 'w+') as handle:
        print(f'Writing to {target_path}')
        json.dump(payload, handle)
    
def generate_metadata_file(dataset_id: str, metadata: Dict) -> None:
    """Serialises `metadata` as JSON to `<working_dir>/<dataset_id>/<dataset_id>.met.json`.

    The target directory must already exist; an existing file is overwritten.
    """
    product_dir = os.path.join(working_dir, dataset_id)
    met_path: str = os.path.join(product_dir, f'{dataset_id}.met.json')

    with open(met_path, 'w+') as handle:
        print(f'Writing to {met_path}')
        json.dump(metadata, handle)
        

        
def _location_from_feature(feature: Dict) -> Dict:
    """Returns the GeoJSON geometry of `feature`, wrapping a bare Polygon ring in the list of rings GeoJSON requires"""
    geometry = feature.get('geometry')
    if not isinstance(geometry, dict) or 'coordinates' not in geometry:
        raise ValueError('The "polygon" job parameter must be a GeoJSON Feature with a geometry')

    coordinates = geometry['coordinates']
    # A Polygon whose first element is a position ([x, y]) rather than a ring ([[x, y], ...])
    # is missing one level of nesting; this is the shape of this notebook's original default polygon.
    is_bare_ring = (
        geometry.get('type') == 'Polygon'
        and bool(coordinates)
        and isinstance(coordinates[0], (list, tuple))
        and bool(coordinates[0])
        and isinstance(coordinates[0][0], (int, float))
    )
    if is_bare_ring:
        return {**geometry, 'coordinates': [coordinates]}
    return geometry


def generate_data_product(working_dir: str = working_dir, id_prefix: str = 'VOLCANO_ANOMALY_AOI') -> None:
    """Generates metadata/dataset files and packages them in a specially-named directory, for ingestion into the data product catalog.

    Reads `<working_dir>/_context.json`, flattens its `job_specification.params` list into a
    name -> value mapping, creates the `<working_dir>/<dataset_id>` directory and writes:
      * `<dataset_id>.met.json`     - all job parameters, verbatim
      * `<dataset_id>.dataset.json` - label (the polygon's properties.name), location (the
        polygon's geometry), starttime and endtime (from start_date / end_date)

    Args:
        working_dir: Directory holding _context.json, in which the product directory is created.
            NOTE: generate_metadata_file / generate_dataset_file write relative to the
            module-level working_dir, so passing a different directory here is not supported by them.
        id_prefix: Prefix of the generated dataset ID.

    Raises:
        ValueError: If the "polygon" parameter is not a GeoJSON Feature carrying a geometry.
        KeyError: If the context lacks `job_specification.params`.
    """
    context_filepath: str = os.path.join(working_dir, '_context.json')
    with open(context_filepath) as context_file:
        context_str: str = context_file.read()

    context = json.loads(context_str)

    dataset_id: str = generate_dataset_id(id_prefix, context_str)

    data_product = {param['name']: param['value'] for param in context['job_specification']['params']}

    # Validate and derive everything up front, so that bad input does not leave a
    # half-built product directory behind.
    geo_json = data_product.get('polygon')
    if not isinstance(geo_json, dict):
        raise ValueError('The "polygon" job parameter must be a GeoJSON Feature object')

    label = (geo_json.get('properties') or {}).get('name')
    # The product's spatial extent is the area of interest itself, not a fixed placeholder
    location = _location_from_feature(geo_json)
    starttime = datetime.strptime(data_product.get('start_date'), '%Y-%m-%d').isoformat()
    endtime = datetime.strptime(data_product.get('end_date'), '%Y-%m-%d').isoformat()

    data_product_dir = os.path.join(working_dir, dataset_id)
    print(f'Generating data product at {data_product_dir}')

    os.mkdir(data_product_dir)

    print(data_product)

    generate_metadata_file(dataset_id, data_product)

    generate_dataset_file(dataset_id,
                          label=label,
                          location=location,
                          starttime=starttime,
                          endtime=endtime
                         )
        
    
    


## Defining your job's high-level execution flow

The following cell contains a trivial set of procedural calls, which will be run when the notebook is executed.

In [4]:
# Build the data product from the _context.json found in the working directory,
# using the default working directory and ID prefix
generate_data_product()

# Only reached when the call above did not raise
print('PGE execution complete!')


Generated job ID: VOLCANO_ANOMALY_AOI-2021-06-15T23:22:30.399944-a4909
Generating data product at /home/jovyan/create_volcano_anomaly_aoi/notebook_pges/VOLCANO_ANOMALY_AOI-2021-06-15T23:22:30.399944-a4909
{'polygon': {'geometry': {'coordinates': [[-180, -90], [-180, 90], [180, 90], [180, -90], [-180, -90]], 'type': 'Polygon'}, 'properties': {'name': 'Earth'}, 'type': 'Feature'}, 'track_number': 124.0, 'active_pixel_x_min': 20.0, 'active_pixel_x_max': 30.0, 'active_pixel_y_min': 20.0, 'active_pixel_y_max': 30.0, 'outer_pixel_x_min': 15.0, 'outer_pixel_x_max': 35.0, 'outer_pixel_y_min': 15.0, 'outer_pixel_y_max': 35.0, 'start_date': '2000-01-01', 'end_date': '2100-01-01'}
Writing to /home/jovyan/create_volcano_anomaly_aoi/notebook_pges/VOLCANO_ANOMALY_AOI-2021-06-15T23:22:30.399944-a4909/VOLCANO_ANOMALY_AOI-2021-06-15T23:22:30.399944-a4909.met.json
Writing to /home/jovyan/create_volcano_anomaly_aoi/notebook_pges/VOLCANO_ANOMALY_AOI-2021-06-15T23:22:30.399944-a4909/VOLCANO_ANOMALY_AOI-202