#!/bin/bash
# Universal entrypoint script for containerized tooling for use with AWS Batch
# that handles data staging of predefined inputs and outputs.
#
# Environment Variables
# JOB_WORKFLOW_NAME
# Optional
# Name of the parent workflow for this job. Used with JOB_WORKFLOW_EXECUTION_ID
# to generate a unique prefix for workflow outputs.
#
# JOB_WORKFLOW_EXECUTION_ID
# Optional
# Unique identifier for the current workflow run. Used with JOB_WORKFLOW_NAME
# to generate a unique prefix for workflow outputs.
#
# JOB_AWS_CLI_PATH
# Required if staging data from S3
# Default: /opt/miniconda/bin
# Path to add to the PATH environment variable so that the AWS CLI can be
# located. Use this when bind-mounting the AWS CLI from the host; it must be
# packaged in a self-contained way (i.e. not depending on OS/distribution
# specific shared libraries). The AWS CLI installed with `conda` is
# sufficiently self-contained. A standard Python virtualenv does not work.
#
# JOB_DATA_ISOLATION
# Optional
# Default: null
# Set to 1 if the container needs to use an isolated data space - e.g.
# it will operate in a volume mounted from the host for scratch
#
# JOB_INPUTS
# Optional
# Default: null
# A space-delimited list of S3 object URLs, e.g.:
# s3://{prefix1}/{key_pattern1} [s3://{prefix2}/{key_pattern2} [...]]
# for files that the job will use as inputs
#
# JOB_OUTPUTS
# Optional
# Default: null
# A space-delimited list of files, e.g.:
# file1 [file2 [...]]
# that the job generates and that will be retained, i.e. transferred back to S3
#
# JOB_OUTPUT_PREFIX
# Required if JOB_OUTPUTS need to be stored on S3
# Default: null
# S3 location (e.g. s3://bucket/prefix) where job outputs will be stored
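#
# JOB_VERBOSE
# Optional
# Default: null
# Set to any non-empty value to enable `set -x` command tracing for this
# script
#
# Example invocation (hypothetical bucket, prefix, and tool names):
# JOB_INPUTS="s3://my-bucket/inputs/sample.txt" \
# JOB_OUTPUTS="result.txt" \
# JOB_OUTPUT_PREFIX="s3://my-bucket/outputs" \
# ./entrypoint.aws.sh my-tool --in sample.txt --out result.txt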
set -e # exit on error
if [[ $JOB_VERBOSE ]]; then
    set -x # echo commands as they execute
fi
DEFAULT_AWS_CLI_PATH=/opt/miniconda/bin
AWS_CLI_PATH=${JOB_AWS_CLI_PATH:-$DEFAULT_AWS_CLI_PATH}
PATH=$PATH:$AWS_CLI_PATH
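# with the default, the `aws` executable is expected at /opt/miniconda/bin/aws
# (e.g. a conda-installed AWS CLI)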
# ensure that JOB_INPUT_PREFIX is fully evaluated if present
if [[ $JOB_INPUT_PREFIX ]]; then
    JOB_INPUT_PREFIX=`echo "$JOB_INPUT_PREFIX" | envsubst`
fi
# # meta-variable that isn't passed in to the container as an environment variable
# # but may be referenced in JOB_INPUTS
# JOB_INPUT_PREFIX=$JOB_OUTPUT_PREFIX
# if [[ $JOB_WORKFLOW_NAME && $JOB_WORKFLOW_EXECUTION_ID ]]; then
# JOB_INPUT_PREFIX=$JOB_INPUT_PREFIX/$JOB_WORKFLOW_NAME/$JOB_WORKFLOW_EXECUTION_ID
# fi
# export JOB_INPUT_PREFIX
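# Example (hypothetical values): with JOB_INPUT_PREFIX=s3://my-bucket/run-1,
# a JOB_INPUTS entry of ${JOB_INPUT_PREFIX}/reads.txt expands via envsubst
# (see stage_in below) to s3://my-bucket/run-1/reads.txt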
if [[ $JOB_DATA_ISOLATION == 1 ]]; then
    ## AWS Batch places multiple jobs on an instance.
    ## To avoid file path clobbering when using a host-mounted scratch volume,
    ## use the Job ID and Job Attempt to create a unique path.
    if [[ $AWS_BATCH_JOB_ID ]]; then
        GUID="$AWS_BATCH_JOB_ID/$AWS_BATCH_JOB_ATTEMPT"
    else
        # fallback: hash the nanosecond timestamp and PID to avoid collisions
        # between jobs that start within the same second
        GUID=`date +%s%N-$$ | md5sum | cut -d " " -f 1`
    fi
    mkdir -p "$GUID"
    cd "$GUID"
fi
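# e.g. (hypothetical IDs) a Batch job with AWS_BATCH_JOB_ID=<job-id> and
# AWS_BATCH_JOB_ATTEMPT=1 runs in the scratch subdirectory ./<job-id>/1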
function stage_in() (
    # Loops over a space-delimited list of inputs (patterns allowed) given as
    # S3 URLs:
    #     s3://{prefix1}/{key_pattern1} [s3://{prefix2}/{key_pattern2} [...]]
    # and uses the AWS CLI to download the matching objects.
    # The `noglob` option is needed so that patterns are not expanded against
    # the local filesystem. Because the function body is a subshell, this
    # setting is local to the function.
    set -o noglob
    for item in "$@"; do
        item=`echo "$item" | envsubst`
        if [[ $item =~ ^s3:// ]]; then
            local item_key=`basename "$item"`
            local item_prefix=`dirname "$item"`
            echo "[input] remote: $item ==> ./$item_key"
            aws s3 cp \
                --no-progress \
                --recursive \
                --exclude "*" \
                --include "${item_key}" \
                "${item_prefix}" .
        else
            echo "[input] local: $item"
        fi
    done
)
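# For example (hypothetical bucket), a JOB_INPUTS entry of
#     s3://my-bucket/data/*.txt
# is staged with:
#     aws s3 cp --no-progress --recursive --exclude "*" --include "*.txt" \
#         s3://my-bucket/data .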
function stage_out() (
    # Loops over a space-delimited list of output filenames:
    #     file1 [file2 [...]]
    # and uses the AWS CLI to upload them to S3 (if JOB_OUTPUT_PREFIX is an
    # S3 URL).
    for item in "$@"; do
        if [[ ! -f $item ]]; then
            # If an expected output is not found it is generally considered an
            # error. To suppress this error when using glob expansion you can
            # set the `nullglob` option (`shopt -s nullglob`)
            echo "[output] ERROR: $item does not exist" 1>&2
            exit 1
        else
            if [[ $JOB_OUTPUT_PREFIX && $JOB_OUTPUT_PREFIX =~ ^s3:// ]]; then
                local item_key=`basename "$item"`
                local output_prefix=$JOB_OUTPUT_PREFIX
                if [[ $JOB_WORKFLOW_NAME && $JOB_WORKFLOW_EXECUTION_ID ]]; then
                    output_prefix=$output_prefix/$JOB_WORKFLOW_NAME/$JOB_WORKFLOW_EXECUTION_ID
                fi
                echo "[output] remote: ./$item ==> $output_prefix/${item_key}"
                aws s3 cp \
                    --no-progress \
                    "./$item" "$output_prefix/${item_key}"
            elif [[ $JOB_OUTPUT_PREFIX && ! $JOB_OUTPUT_PREFIX =~ ^s3:// ]]; then
                echo "[output] ERROR: unsupported remote output destination $JOB_OUTPUT_PREFIX" 1>&2
                exit 1
            else
                echo "[output] local: ./$item"
            fi
        fi
    done
)
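# For example (hypothetical values), with JOB_OUTPUT_PREFIX=s3://my-bucket/out,
# JOB_WORKFLOW_NAME=align, and JOB_WORKFLOW_EXECUTION_ID=1234, an output file
# result.txt is uploaded to s3://my-bucket/out/align/1234/result.txt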
# The command is specified in the job submission's container overrides, which
# gives the user flexibility to specify tooling options as needed.
#
# Note that AWS Batch has an implicit 8 KB limit on the amount of data allowed
# in container overrides, which includes environment variable data.
COMMAND=`echo "$*" | envsubst`
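# e.g. (hypothetical) a submitted command of "my-tool ${JOB_INPUT_PREFIX}/reads.txt"
# has ${JOB_INPUT_PREFIX} expanded by envsubst before execution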
# dump the environment to the job log for debugging
printenv
# note: JOB_INPUTS and JOB_OUTPUTS are intentionally left unquoted so that
# the space-delimited lists are word-split into individual arguments
stage_in $JOB_INPUTS
echo "[command]: $COMMAND"
bash -c "$COMMAND"
stage_out $JOB_OUTPUTS