/
monitor_wdl_pipeline.sh
executable file
·247 lines (209 loc) · 7.48 KB
/
monitor_wdl_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/bin/bash
# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
# monitor_wdl_pipeline.sh
#
# Simple script that can be used to monitor the status of a WDL pipeline
# run by the Broad's Cromwell (https://github.com/broadinstitute/cromwell)
# on GCP.
#
# The script accepts an operation ID for a pipeline, extracts the
# LOGGING, WORKSPACE, and OUTPUT directories from the operation and then
# examines these directories to glean some insights into the status of the
# operation.
#
# Note: if the WORKSPACE and/or OUTPUT directories for the specified operation
# are already populated (for example by another operation), this script
# will emit incorrect output.
set -o errexit
set -o nounset
# Resolve the directory that holds this script so the helper library can be
# sourced no matter which working directory the script is launched from.
# Assignment and `readonly` are separate statements so that a failing command
# substitution is seen by errexit instead of being masked by `readonly`.
SCRIPT_DIR=$(dirname -- "${0}")
readonly SCRIPT_DIR
# CDPATH is cleared for this one command so `cd` cannot resolve a relative
# path elsewhere (and echo it); quoting keeps paths with spaces intact.
REPO_ROOT=$(CDPATH='' cd -- "${SCRIPT_DIR}/" && pwd)
readonly REPO_ROOT

# Bring in polling utility functions
# shellcheck source=/dev/null
source "${REPO_ROOT}/operations_util.sh"
# FUNCTIONS
# gsutil_ls
#
# Run "gsutil ls" masking stderr output and the non-zero exit code.
#
# Arguments: passed through to "gsutil ls" exactly as received.
# Outputs:   whatever "gsutil ls" writes to stdout (possibly nothing).
# Returns:   always 0, so an empty or missing path does not trip errexit.
function gsutil_ls() {
  # "$@" (not unquoted $*) keeps every argument as a single word and stops
  # the local shell from word-splitting or glob-expanding patterns such as
  # "gs://bucket/dir/**" before gsutil sees them.
  gsutil ls "$@" 2>/dev/null || true
}
readonly -f gsutil_ls
# line_count
#
# Emit the number of lines of text in a string.
#
# Arguments: $1 - the text to measure (may be empty or omitted).
# Outputs:   the line count on stdout; "0" for an empty string.
function line_count() {
  local lines="${1:-}"
  local count

  if [[ -z "${lines}" ]]; then
    # An empty string piped through "wc -l" would report 1, so special-case it.
    echo "0"
  else
    count=$(printf '%s\n' "${lines}" | wc -l)
    # Arithmetic expansion strips the leading padding some wc builds emit.
    echo "$((count))"
  fi
}
# -f is required: without it `readonly` marks a *variable* called line_count
# and leaves the function itself unprotected.
readonly -f line_count
# indent
#
# Indent stdin two spaces.
#
# Inputs:  text on stdin.
# Outputs: the same text on stdout with two spaces prepended to every line.
function indent() {
  # The replacement is exactly two spaces, matching the contract above.
  sed -e 's#^#  #'
}
readonly -f indent
# MAIN
# Positional arguments:
#   $1  OPERATION-ID    (required) operation to monitor
#   $2  location        (optional) defaults to us-central1
#   $3  poll-interval   (optional) whole seconds between polls, defaults to 60
#   $4  poll-wait-max   (optional) whole seconds to wait in total; empty means
#                       wait forever
if [[ $# -lt 1 ]]; then
  # Usage is a diagnostic, so it goes to stderr (the previous "2>&1 echo"
  # redirected stderr into stdout and left the message on stdout).
  echo "Usage: $0 OPERATION-ID <location> <poll-interval> <poll-wait-max>" >&2
  exit 1
fi

readonly OPERATION_ID="${1}"
readonly LOCATION="${2:-us-central1}" # Default: us-central1
readonly POLL_INTERVAL_SECONDS="${3:-60}" # Default: 60 seconds between requests
readonly POLL_WAIT_MAX="${4:-}" # Default: wait forever

# Both timing values are later used in integer arithmetic and integer
# comparisons, so reject anything else up front instead of failing mid-poll.
if [[ ! "${POLL_INTERVAL_SECONDS}" =~ ^[0-9]+$ ]]; then
  echo "poll-interval must be a whole number of seconds: ${POLL_INTERVAL_SECONDS}" >&2
  exit 1
fi
if [[ -n "${POLL_WAIT_MAX}" && ! "${POLL_WAIT_MAX}" =~ ^[0-9]+$ ]]; then
  echo "poll-wait-max must be a whole number of seconds: ${POLL_WAIT_MAX}" >&2
  exit 1
fi
printf 'Location: %s\n' "${LOCATION}"

# Get GCS paths from the operation.
# The helpers used here are not defined in this file; presumably they come
# from the sourced operations_util.sh (verify there). Each is called with the
# operation ID, the location and the metadata field to read.
LOGGING=$(get_operation_logging "${OPERATION_ID}" "${LOCATION}" "metadata.pipeline.actions")
WORKSPACE=$(get_operation_value "${OPERATION_ID}" "${LOCATION}" "metadata.pipeline.environment.WORKSPACE")
OUTPUTS=$(get_operation_value "${OPERATION_ID}" "${LOCATION}" "metadata.pipeline.environment.OUTPUTS")

# Echo the three values back, one per line, so the user can see what will be
# inspected.
printf '%s\n' \
  "Logging: ${LOGGING}" \
  "Workspace: ${WORKSPACE}" \
  "Outputs: ${OUTPUTS}"
# Loop until operation complete or POLL_WAIT_MAX
#
# The *_COUNT trackers start at -1 so that the first pass through the loop
# always reports a value (a real count is never negative); afterwards each
# figure is only re-printed when it changes.
POLL_WAIT_TOTAL=0
LOGS_COUNT=-1
PREEMPT_COUNT=-1
OUTPUT_COUNT=-1

while [[ $(get_operation_done_status "${OPERATION_ID}" "${LOCATION}") != "true" ]]; do
  echo
  echo "$(date '+%Y-%m-%d %H:%M:%S'): operation not complete"

  # Check that we haven't been polling too long
  if [[ -n "${POLL_WAIT_MAX}" ]] && \
     [[ "${POLL_WAIT_TOTAL}" -ge "${POLL_WAIT_MAX}" ]]; then
    echo "Total wait time (${POLL_WAIT_TOTAL} seconds) has exceeded the max (${POLL_WAIT_MAX})."
    exit 2
  fi

  # Gather info. These directories can be empty for a while during execution
  # Logs should be 0 to 3 files and should be the first to show up
  #
  # The workspace listing is made relative to WORKSPACE. The sed program is
  # double-quoted as a whole so the path is never word-split or glob-expanded
  # by the shell.
  GS_WS=$(gsutil_ls "${WORKSPACE}/**" | sed -e "s#^${WORKSPACE}/##")
  GS_OUT=$(gsutil_ls "${OUTPUTS}")

  # The Cromwell filesystem is going to be:
  #   <workspace>/<workflow-name>/<cromwell-job-id>/call-<stage>
  # with some variety to what is under each "call-<stage>"
  #
  # If the call is unsharded (not a scatter), then the call-<stage>
  # directory contains objects like:
  #   script -- The code that gets executed on the VM.
  #   <stage>-rc.txt -- Contains the return code for the stage.
  #     This file will not exist until the stage completes.
  #     If the VM is preempted, this file will never be written.
  #   <stage>-stdout.txt -- stdout captured during execution of exec.sh.
  #   <stage>-stderr.txt -- stderr captured during execution of exec.sh.
  #   <stage>.txt -- Genomics Pipelines operations log.
  #
  # NOTE(review): the patterns below look for objects named exactly "script"
  # and "rc" and for names ending in ".log"; the "<stage>-rc.txt" / "exec.sh" /
  # "<stage>.txt" names in the list above appear to describe an older layout.
  # Confirm against a real workspace.
  #
  # If the stage is sharded, then for each shard there will be a folder under
  # call-<stage> named "shard-<n>" where numbering starts at 0.
  #
  # If the Pipelines VM was preempted, then a subdirectory will be found under
  # call-<stage> named "attempt-<m>" where numbering starts at 2.
  #
  # For sharded and preemptible stages then, one may find:
  #   call-<stage>/shard-<n>/attempt-<m>

  # From the list of the files in the workspace directory, extract useful
  # sets of files from which we can glean status:
  #   All "script" files: indicates that the call and/or shard and/or attempt
  #     has started.
  #   All "rc" files: indicates that the call and/or shard and/or attempt
  #     has completed.
  #   All "attempt-*/script" files: indicates that a previous attempt failed
  #     and is assumed to be due to preemption (this is *not* checked
  #     explicitly).
  #
  # Each pattern is anchored to a whole final path component. A bare substring
  # match would also pick up every object under a call whose name merely
  # contains the text (e.g. "call-QuantifyTranscripts" contains "script") and
  # any object whose name contains "rc".
  # "|| true" keeps errexit from firing when grep finds nothing.
  WS_SCRIPTS=$(echo "${GS_WS}" | grep -E '(^|/)script$' || true)
  WS_RCS=$(echo "${GS_WS}" | grep -E '(^|/)rc$' || true)
  WS_PREEMPTS=$(echo "${GS_WS}" | grep -E '(^|/)attempt-[0-9]+/script$' || true)
  # The dot is escaped so only a real ".log" suffix matches (the former
  # pattern '/*.log$' also matched names such as "catalog").
  WS_LOGS=$(echo "${GS_WS}" | grep '\.log$' || true)

  # Emit status
  GS_LOGS_COUNT=$(line_count "${WS_LOGS}")
  if [[ "${GS_LOGS_COUNT}" -ne "${LOGS_COUNT}" ]]; then
    if [[ -n "${WS_LOGS}" ]]; then
      echo "Operation logs found: "
      # Drop the leading <workflow-name>/<cromwell-job-id>/ components.
      echo "${WS_LOGS}" | sed -E -e 's#[^/]+/[^/]+/##' | indent
    else
      echo "No operations logs found."
    fi
    LOGS_COUNT="${GS_LOGS_COUNT}"
  fi

  if [[ -n "${WS_SCRIPTS}" ]]; then
    if [[ -n "${WS_RCS}" ]]; then
      echo "Calls (including shards) completed: $(line_count "${WS_RCS}")"
    fi

    # To determine what is running, find all call or call/shard paths that
    # have a "script" with no "rc" file.
    #
    # Both lists are reduced to call[/shard] paths (workflow/job prefix,
    # trailing file name and any attempt-<m> component removed) and sorted
    # uniquely so they can be compared with comm.
    WS_CALLS_STARTED=$(
      echo "${WS_SCRIPTS}" \
        | sed -E \
            -e 's#[^/]+/[^/]+/##' \
            -e 's#/script$##' \
            -e 's#/attempt-[0-9]+##' \
        | sort -u)
    WS_CALLS_COMPLETE=$(
      echo "${WS_RCS}" \
        | sed -E \
            -e 's#[^/]+/[^/]+/##' \
            -e 's#/rc$##' \
            -e 's#/attempt-[0-9]+##' \
        | sort -u)

    # comm -2 -3 keeps only the lines unique to the first input:
    # started but not complete.
    IN_PROGRESS=$(
      comm -2 -3 \
        <(echo "${WS_CALLS_STARTED}") \
        <(echo "${WS_CALLS_COMPLETE}"))

    if [[ -n "${IN_PROGRESS}" ]]; then
      echo "Calls started but not complete:"
      echo "${IN_PROGRESS}" | indent
    else
      echo "No calls currently in progress."
      echo "(Transitioning to next stage or copying final output)." | indent
    fi

    WS_PREEMPT_COUNT=$(line_count "${WS_PREEMPTS}")
    if [[ "${WS_PREEMPT_COUNT}" -ne "${PREEMPT_COUNT}" ]]; then
      echo "Total Preemptions: ${WS_PREEMPT_COUNT}"
      PREEMPT_COUNT="${WS_PREEMPT_COUNT}"
    fi
  fi

  GS_OUTPUT_COUNT=$(line_count "${GS_OUT}")
  if [[ "${GS_OUTPUT_COUNT}" -ne "${OUTPUT_COUNT}" ]]; then
    echo "There are ${GS_OUTPUT_COUNT} output files"
    OUTPUT_COUNT="${GS_OUTPUT_COUNT}"
  fi

  echo "Sleeping ${POLL_INTERVAL_SECONDS} seconds"
  sleep "${POLL_INTERVAL_SECONDS}"
  # Only time spent sleeping is counted towards POLL_WAIT_MAX.
  POLL_WAIT_TOTAL=$((POLL_WAIT_TOTAL + POLL_INTERVAL_SECONDS))
done
# Final report, printed once the polling loop has seen the operation finish:
# operation status, output listing, preemption retries and logging listing.
echo
echo "$(date '+%Y-%m-%d %H:%M:%S'): operation complete"
echo
echo "Completed operation status information"
get_operation_status "${OPERATION_ID}" "${LOCATION}" | indent

echo
echo "Operation output"
gsutil_ls "${OUTPUTS}" | indent

echo
echo "Preemption retries:"
# The wildcard is quoted so it reaches gsutil untouched; unquoted, the local
# shell would attempt pathname expansion and word splitting on it first.
WS_PREEMPTS=$(gsutil_ls "${WORKSPACE}/**/attempt-*/script")
if [[ -n "${WS_PREEMPTS}" ]]; then
  # Show each retry relative to the workflow/job prefix, without the
  # trailing "/script" object name.
  echo "${WS_PREEMPTS}" \
    | sed -E \
        -e "s#^${WORKSPACE}/[^/]+/[^/]+/##" \
        -e 's#/script$##' \
    | indent
  echo "Total preemptions: " "$(line_count "${WS_PREEMPTS}")"
else
  echo "None" | indent
fi

echo
echo "Logging output"
gsutil_ls "${LOGGING}" | indent