#!/bin/bash
# Universal entrypoint script for containerized tooling for use with AWS Batch
# that handles data staging of predefined inputs and outputs.
#
# Environment Variables
# JOB_WORKFLOW_NAME
# Optional
# Name of the parent workflow for this job. Used with JOB_WORKFLOW_EXECUTION_ID
# to generate a unique prefix for workflow outputs.
#
# JOB_WORKFLOW_EXECUTION_ID
# Optional
# Unique identifier for the current workflow run. Used with JOB_WORKFLOW_NAME
# to generate a unique prefix for workflow outputs.
#
# JOB_AWS_CLI_PATH
# Required if staging data from S3
# Default: /opt/miniconda/bin
# Path to add to the PATH environment variable so that the AWS CLI can be
# located. Use this when bind-mounting the AWS CLI from the host; it must be
# packaged in a self-contained way (i.e. not depending on OS/distribution
# specific shared libraries). The AWS CLI installed with `conda` is
# sufficiently self-contained. A standard Python virtualenv does not work.
#
# JOB_DATA_ISOLATION
# Optional
# Default: null
# Set to 1 if the container needs to use an isolated data space - e.g.
# it will operate in a volume mounted from the host for scratch
#
# JOB_INPUTS
# Optional
# Default: null
# A space-delimited list of S3 object URLs, e.g.:
# s3://{prefix1}/{key_pattern1} [s3://{prefix2}/{key_pattern2} [...]]
# for files that the job will use as inputs
#
# JOB_OUTPUTS
# Optional
# Default: null
# A space-delimited list of files, e.g.:
# file1 [file2 [...]]
# that the job generates and that will be retained, i.e. transferred back to S3
#
# JOB_OUTPUT_PREFIX
# Required if JOB_OUTPUTS need to be stored on S3
# Default: null
# S3 location (e.g. s3://bucket/prefix) where job outputs will be stored
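#
# JOB_VERBOSE
# Optional
# Default: null
# Set to any non-empty value to enable `set -x` command tracing for this
# script
#
# Example invocation (hypothetical bucket, prefix, and tool names):
# JOB_INPUTS="s3://my-bucket/inputs/sample.txt" \
# JOB_OUTPUTS="result.txt" \
# JOB_OUTPUT_PREFIX="s3://my-bucket/outputs" \
# ./entrypoint.aws.sh my-tool --in sample.txt --out result.txt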
set -e # exit on error
if [[ $JOB_VERBOSE ]]; then
    set -x # echo commands as they execute
fi
DEFAULT_AWS_CLI_PATH=/opt/miniconda/bin
AWS_CLI_PATH=${JOB_AWS_CLI_PATH:-$DEFAULT_AWS_CLI_PATH}
PATH=$PATH:$AWS_CLI_PATH
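# with the default, the `aws` executable is expected at /opt/miniconda/bin/aws
# (e.g. a conda-installed AWS CLI)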
# ensure that JOB_INPUT_PREFIX is fully evaluated if present
if [[ $JOB_INPUT_PREFIX ]]; then
    JOB_INPUT_PREFIX=`echo "$JOB_INPUT_PREFIX" | envsubst`
fi
# # meta-variable that isn't passed in to the container as an environment variable
# # but may be referenced in JOB_INPUTS
# JOB_INPUT_PREFIX=$JOB_OUTPUT_PREFIX
# if [[ $JOB_WORKFLOW_NAME && $JOB_WORKFLOW_EXECUTION_ID ]]; then
# JOB_INPUT_PREFIX=$JOB_INPUT_PREFIX/$JOB_WORKFLOW_NAME/$JOB_WORKFLOW_EXECUTION_ID
# fi
# export JOB_INPUT_PREFIX
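# Example (hypothetical values): with JOB_INPUT_PREFIX=s3://my-bucket/run-1,
# a JOB_INPUTS entry of ${JOB_INPUT_PREFIX}/reads.txt expands via envsubst
# (see stage_in below) to s3://my-bucket/run-1/reads.txt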
if [[ $JOB_DATA_ISOLATION == 1 ]]; then
    ## AWS Batch places multiple jobs on an instance.
    ## To avoid file path clobbering when using a host-mounted scratch volume,
    ## use the Job ID and Job Attempt to create a unique path.
    if [[ $AWS_BATCH_JOB_ID ]]; then
        GUID="$AWS_BATCH_JOB_ID/$AWS_BATCH_JOB_ATTEMPT"
    else
        # fallback: hash the nanosecond timestamp and PID to avoid collisions
        # between jobs that start within the same second
        GUID=`date +%s%N-$$ | md5sum | cut -d " " -f 1`
    fi
    mkdir -p "$GUID"
    cd "$GUID"
fi
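# e.g. (hypothetical IDs) a Batch job with AWS_BATCH_JOB_ID=<job-id> and
# AWS_BATCH_JOB_ATTEMPT=1 runs in the scratch subdirectory ./<job-id>/1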
function stage_in() (
    # Loops over a space-delimited list of inputs (patterns allowed) given as
    # S3 URLs:
    #     s3://{prefix1}/{key_pattern1} [s3://{prefix2}/{key_pattern2} [...]]
    # and uses the AWS CLI to download the matching objects.
    # The `noglob` option is needed so that patterns are not expanded against
    # the local filesystem. Because the function body is a subshell, this
    # setting is local to the function.
    set -o noglob
    for item in "$@"; do
        item=`echo "$item" | envsubst`
        if [[ $item =~ ^s3:// ]]; then
            local item_key=`basename "$item"`
            local item_prefix=`dirname "$item"`
            echo "[input] remote: $item ==> ./$item_key"
            aws s3 cp \
                --no-progress \
                --recursive \
                --exclude "*" \
                --include "${item_key}" \
                "${item_prefix}" .
        else
            echo "[input] local: $item"
        fi
    done
)
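# For example (hypothetical bucket), a JOB_INPUTS entry of
#     s3://my-bucket/data/*.txt
# is staged with:
#     aws s3 cp --no-progress --recursive --exclude "*" --include "*.txt" \
#         s3://my-bucket/data .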
function stage_out() (
    # Loops over a space-delimited list of output filenames:
    #     file1 [file2 [...]]
    # and uses the AWS CLI to upload them to S3 (if JOB_OUTPUT_PREFIX is an
    # S3 URL).
    for item in "$@"; do
        if [[ ! -f $item ]]; then
            # If an expected output is not found it is generally considered an
            # error. To suppress this error when using glob expansion you can
            # set the `nullglob` option (`shopt -s nullglob`)
            echo "[output] ERROR: $item does not exist" 1>&2
            exit 1
        else
            if [[ $JOB_OUTPUT_PREFIX && $JOB_OUTPUT_PREFIX =~ ^s3:// ]]; then
                local item_key=`basename "$item"`
                local output_prefix=$JOB_OUTPUT_PREFIX
                if [[ $JOB_WORKFLOW_NAME && $JOB_WORKFLOW_EXECUTION_ID ]]; then
                    output_prefix=$output_prefix/$JOB_WORKFLOW_NAME/$JOB_WORKFLOW_EXECUTION_ID
                fi
                echo "[output] remote: ./$item ==> $output_prefix/${item_key}"
                aws s3 cp \
                    --no-progress \
                    "./$item" "$output_prefix/${item_key}"
            elif [[ $JOB_OUTPUT_PREFIX && ! $JOB_OUTPUT_PREFIX =~ ^s3:// ]]; then
                echo "[output] ERROR: unsupported remote output destination $JOB_OUTPUT_PREFIX" 1>&2
                exit 1
            else
                echo "[output] local: ./$item"
            fi
        fi
    done
)
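# For example (hypothetical values), with JOB_OUTPUT_PREFIX=s3://my-bucket/out,
# JOB_WORKFLOW_NAME=align, and JOB_WORKFLOW_EXECUTION_ID=1234, an output file
# result.txt is uploaded to s3://my-bucket/out/align/1234/result.txt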
# The command is specified in the job submission's container overrides, which
# gives the user flexibility to specify tooling options as needed.
#
# Note that AWS Batch has an implicit 8 KB limit on the amount of data allowed
# in container overrides, which includes environment variable data.
COMMAND=`echo "$*" | envsubst`
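# e.g. (hypothetical) a submitted command of "my-tool ${JOB_INPUT_PREFIX}/reads.txt"
# has ${JOB_INPUT_PREFIX} expanded by envsubst before execution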
# dump the environment to the job log for debugging
printenv
# note: JOB_INPUTS and JOB_OUTPUTS are intentionally left unquoted so that
# the space-delimited lists are word-split into individual arguments
stage_in $JOB_INPUTS
echo "[command]: $COMMAND"
bash -c "$COMMAND"
stage_out $JOB_OUTPUTS