From 5a39a81b5b0012356f1d3d2807cac09a6a3c296e Mon Sep 17 00:00:00 2001 From: Scott Frazer Date: Fri, 17 Jul 2015 13:08:06 -0400 Subject: [PATCH] Documenation updates --- DEVELOPER.md | 809 ++++++++++++++++++++++++++++++++++++++++++++ SPEC.md | 982 ++++++++---------------------------------------------- python/wdl/toc.py | 9 +- 3 files changed, 947 insertions(+), 853 deletions(-) create mode 100644 DEVELOPER.md diff --git a/DEVELOPER.md b/DEVELOPER.md new file mode 100644 index 0000000..136c7ec --- /dev/null +++ b/DEVELOPER.md @@ -0,0 +1,809 @@ +# Workflow Execution Algorithm + +This section outlines an example algorithm to execute tasks and workflows. It's presented here as a suggestion algorithm and as a learning tool. The algorithm here is the one used by the reference implementation. + +In this algorithm, workflows and tasks are executed the same way. Tasks should be considered one-step workflows. The rest of this document will refer only to executing workflows. + +Execution relies on two data structures: a symbol table and an execution table. + +The symbol table holds variables and their values and some meta-data about them (e.g. their type and whether they're an input or output). + +The execution table has an entry for each task that needs to be executed and its current status of execution. + +The details of these tables and the algorithm is shown in the following sections. + +## Symbol Table Construction + +Executing a workflow first depends on determining the variables for the workflow and constructing a **Symbol Table** consisting of all of these variables with null values initially. + +Determining the **Symbol Table** entries is done by examining all the tasks in the workflow and extracting every input (everything declared in `${`...`}`) and every output variable. The name of the variable is prepended with the tasks name, and task aliases may be used at the workflow level. 
Here is an example: + +``` +task t1 { + command {python script.py ${File path} -n ${sample_name}} + output {Array[String] result = "${sample_name}.data.txt"} + runtime {docker: "broadinstitute/job-runner:${docker_version}"} +} +task t2 { + command {wc -l ${infile1} ${infile2}} + output {int lines = stdout()} +} +workflow t3 { + call t1 + call t2 as renamed { + input: infile2=t1.result + } +} +``` + +The initial symbol table construction would be: + +|Variable |Value | +|-------------------------|----------------| +|t1.path |null | +|t1.sample_name |null | +|t1.docker_version |null | +|t1.result |null | +|renamed.infile1 |null | +|renamed.infile2 |null | + +## Symbol Table Initial Population + +Next is to populate the initial values. First, populate internal references. Any value that starts with a '%ref:' is a reference to a different value in the symbol table. In this case, we populate a reference from `renamed.infile2` to `t1.result`: + +|Variable |Value | +|-------------------------|----------------| +|t1.path |null | +|t1.sample_name |null | +|t1.docker_version |null | +|t1.result |null | +|renamed.infile1 |null | +|renamed.infile2 |%ref:t1.result | + +The client that is running the workflow MUST provide a valid value (matching or compatible type) for every *input* variable, which is anything inside `${`...`}`. 
+ +In this case, let's say the client submits these values: + +|Variable |Value | +|-------------------------|---------------------------| +|t1.path |/path/to/input.txt | +|t1.sample_name |my_sample | +|t1.docker_version |latest | +|renamed.infile1 |/path/to/another_input.txt | + +The resulting symbol table would look like: + +|**Variable** |**Value** | +|-------------------------|---------------------------| +|t1.path |/path/to/input.txt | +|t1.sample_name |my_sample | +|t1.docker_version |latest | +|**t1.result** |**null** | +|renamed.infile1 |/path/to/another_input.txt | +|renamed.infile2 |%ref:t1.result | + +Notice how the only variable in the symbol table left unassigned is `t1.result` (and by reference, `renamed.infile2` is also unassigned). + +## Execution Table Construction + +The execution table has one entry for each task with its potentially aliased name. Initially the **Status** column is set to `not_started`. This table should include any additional columns to link the execution of this task to the client system. For example, if jobs are run in Sun GridEngine, there might be an additional column linking to a table which has the SGE Job ID and the current state of that job in SGE. Alternatively there might simply be a foreign key to another table which contains implementation-specific information about the particular job. + +Valid states for the **Status** field are: + +* `not_started` - This task was not yet started because not all of its input parameters are defined +* `started` - All inputs to this task are defined and the task was launched in an implementation specific way +* `successful` - Execution of this task has completed with a zero return code. +* `failed` - Execution of this task has completed with a non-zero return code. +* `skipped` - This task cannot be run because an upstream task that was supposed to produce one of its inputs is in either `error` or `failed` states. 
+* `error` - The task has finished with a zero return code, but one of the outputs for the task doesn't match its declared type. Either the format of the file was wrong or the file does not exist. + +Terminal states are: `skipped`, `successful`, `failed`, or `error` + +A workflow execution table would include at least this: + +|**Task** |**Index** |**Status** |... | +|-------------------------|------------|------------|---------| +|task1 | |not_started | | +|task2 | |not_started | | + +The **Index** field is left blank unless this task is part of a series of scatter tasks, in which case this is an integer >= 0. Having an index value here means that some of the inputs to the task in the symbol table can be arrays and this index is used to index those arrays. + +## Workflow Evaluation + +After both the symbol table and execution table are constructed, the algorithm must examine all tasks in the execution table with a `not_started` state (i.e. all of them). It finds all parameters to the task in the symbol table and if all of them have a value specified, the task is started and the status field gets set to `started`. + +The workflow engine is responsible for monitoring the progress of the jobs it runs in an implementation specific way. Once a job completes, the workflow engine must set the job to either `successful` or `failed`. The workflow engine must then evaluate the task's output mappings and write them to the symbol table. If evaluation of the output mappings fails, set the status to `error`. Otherwise, set the status to `successful` if the return code is zero and `failed` otherwise. + +> **TODO**: Write a section about evaluating output mappings + +When evaluating if a `not_started` job should be started, the workflow engine must set the status to `skipped` and leave the return code undefined if any of its inputs references a job that is in either of these states: `error`, `failed`, or `skipped`. 
+ +> **Job Avoidance**: When examining a `not_started` task, the engine might have a way of noticing that this particular task was already run in the past and we have results for it already. If that's the case, the workflow engine may set the state immediately to `successful` and fulfill its output mappings contract. + +## Workflow Termination + +A workflow is defined as finished if the following conditions are met: + +* All entries in the **Execution Table** are in a terminal state. + +## Example 1: Two-step parallel workflow + +``` +task grep_words { + command { + grep '^${start}' ${File infile} + } + output { + Array[String] words = read_lines(stdout()) + } +} +workflow wf { + File dictionary + call grep_words as grep_pythonic_words { + input: start="pythonic", infile=dictionary + } + call grep_words as grep_workf_words { + input: start="workf", infile=dictionary + } +} +``` + +Given these parameter values: + +|Variable |Value | +|-------------------------|---------------------------| +|wf.dictionary |/usr/share/dict/words | + +At start of execution, **Symbol Table** looks like this: + +|Name |Value |Type |I/O | +|-----------------------------|---------------------------------------|-------------|------| +|wf.dictionary |/usr/share/dict/words |file |input | +|wf.grep_pythonic_words.start |pythonic |String |input | +|wf.grep_pythonic_words.infile|%ref:wf.dictionary |file |input | +|wf.grep_pythonic_words.words |['pythonic', 'pythonical'] |Array[String]|output| +|wf.grep_workf_words.start |workf |String |input | +|wf.grep_workf_words.infile |%ref:wf.dictionary |file |input | +|wf.grep_workf_words.words |['workfellow', 'workfolk', 'workfolks']|Array[String]|output| + +And **Execution Table** looks like this: + +|Name |Status |Index|PID |rc | +|----------------------|-----------|-----|----|----| +|wf.grep_pythonic_words|not_started| | | | +|wf.grep_workf_words |not_started| | | | + +Any fields beyond **Task**, **Status**, and **Index** are optional and implementation 
specific. In this example, **PID** and **rc** capture the fact that these jobs are running as subprocesses. + +The engine will then examine all tasks with the `not_started` status and look up all their input variables in the **symbol table**. If there is a value for each of the inputs, the task is started and the status is changed to `started`. The status of `started` means that the workflow engine has started the job in an implementation-specific way. Or it might mean simply launching a sub-process in which case the *System Status* field might not be necessary (and perhaps there would be a column for PID). + +|Name |Status |Index|PID |rc | +|----------------------|-----------|-----|----|----| +|wf.grep_pythonic_words|started | |1643| | +|wf.grep_workf_words |started | |1644| | + +The workflow engine is responsible for updating all fields besides the first three in an implementation specific way. Once the workflow engine determines that a job is done executing, it updates the **Status** field to either `successful` or `failed`. + +|Name |Status |Index|PID |rc | +|----------------------|-----------|-----|----|----| +|wf.grep_pythonic_words|successful | |1643|0 | +|wf.grep_workf_words |started | |1644| | + +Upon a job finishing and the return code being set, the `outputs` section is processed for that task. In the case of the above example, when `wf.grep_pythonic_words` finishes the output mapping `Array[String] words = read_lines(stdout())` is processed. The workflow engine looks for the file called `stdout` in the current working directory where the process was launched. If it is not found, the task's status gets set to `error` and optionally an error message field gets set describing the nature of why the field was set. + +A workflow is defined as finished when all the tasks in the workflow are in a terminal state (i.e. `skipped`, `successful`, `failed`, or `error`). For this example, let's say the first task succeeds and the second task fails. 
The terminal state of the workflow is: + +|Name |Status |Index|PID |rc | +|----------------------|-----------|-----|----|----| +|wf.grep_pythonic_words|successful | |1643|0 | +|wf.grep_workf_words |failed | |1644|1 | + +And now since all entries in the execution table are in a terminal state, the workflow is finished. + +## Example 2: Loops + +This is a less common case, and likely will only be used in the iterative scatter-gather tasks. + +``` +task scatter_task { + command <<< + egrep ^.{${Int count}}$ ${File in} || exit 0 + >>> + output { + Array[String] words = read_lines(stdout()) + } +} + +task gather_task { + command { + python3 < 3) { + scatter(filename in files) { + call scatter_task { + input: in=filename, count=count + } + } + call gather_task { + input: count=count, word_lists=scatter_task.words + output: count=count + } + } +} +``` + +If we run the task specifying the value for the two required parameters as: + +|Parameter|Value | +|---------|--------------------------------| +|wf.files |["test/languages", "test/words"]| +|wf.count |6 | + +In this example, the file `test/languages` contains: + +``` +c +python +java +c++ +js +scala +d +c# +objective-c +php +vb +sql +R +shell +lisp +``` + +and `test/words` contains: + +``` +ace +act +bad +bag +cage +cake +call +doggy +daily +dairy +``` + +**Symbol Table Construction** + +The symbol table starts out with the following values defined as a combination of initialized values and user-provided values + +|Name |Value |Type |I/O | +|-----------------------------|--------------------------------|--------------------|------| +|wf.files |['test/languages', 'test/words']|Array[File] |input | +|wf.count |6 |Int |input | +|wf._w5._s6.filename |%ref:wf.files |Array[File] |input | +|wf._w5._s6.scatter_task.count|%ref:wf.count |String |input | +|wf._w5._s6.scatter_task.in |%ref:wf._w5._s6.filename |File |input | +|wf._w5._s6.scatter_task.words| |Array[Array[String]]|output| +|wf._w5.gather_task.count |%ref:wf.count |Int 
|input | +|wf._w5.gather_task.word_lists|%expr:scatter_task.words |Array[Array[String]]|input | +|wf._w5.gather_task.count |%ref:wf.count |Int |output| + +The fully qualified names in this symbol table also include the scopes that a `call` lives inside. `_s6` refers to a scatter block with an arbitrary unique ID of 6. `_w5` refers to a while loop with arbitrary unique ID of 5. + +If `scatter_task` were not being executed inside of a scatter block, the output (`wf._w5._s6.scatter_task.words`) would only be of type `Array[String]`, but any task executed in a scatter block will have all of its outputs in an array (one entry for each scatter). This is why `wf._w5._s6.scatter_task.words` has type `Array[Array[String]]`. Since the scatter and gather steps are inside of a loop, the entries that exist initially in the symbol table are only place holders that will be copied for each iteration. + +**Execution Table Construction** + +With this particular task, all of the calls happen inside of the loop. Since we don't know initially if the loop will be executed even once (we might not run `scatter_task` or `gather_task` even once) based on what the loop condition evaluates to. Because of this, only one entry exists in the execution table initially, and that entry is for the loop itself. + +|Name |Status |Index|Iter|PID |rc | +|------|-----------|-----|----|----|----| +|wf._w5|not_started| |0 | | | + +Every time the execution table is evaluated (for example, by polling or on an event), the loop table is also evaluated. The loop's condition is evaluated under the following circumstance: + +1. Execution Table status for the loop is `not_started` AND iteration == 0 +2. Execution Table status for the loop is `running` AND iteration > 0 AND all entries in the execution table for the current iteration are `successful` + +If either of these conditions is met, the loop is looked up and the condition evaluated. 
The following actions take place based on the evaluation of the loop condition: + +* Condition evaluates to `undefined`: this means one of the variables referenced in the conditional is undefined. + * Do nothing in this case. It should never be the case that a condition evaluates to `true` or `false` on one pass and then `undefined` on a subsequent pass. +* Condition evaluates to `true`: + * Increment the iteration field on the execution table. + * Set status to `running` on the execution table for this loop, if it's not already set. + * Add all child nodes of the loop to the execution table and symbol table with a suffix of `._i1` where the number at the end is the current iteration number. They should be in state `not_started`. If any child nodes of the loop are themselves loops, add those to the control flow table. +* Condition evaluates to `false`: + * Mark the loop in the execution table as `successful` + +If the loop is `running` in the execution table, but any of the tasks in the current iteration is NOT `successful`, then mark the loop as `failed` which will prevent any further iterations. Let any tasks that are not in a terminal state finish. + +After the tables are constructed, since the loop is `not_started` and the iteration count is 0, the condition is evaluated. The condition is `count > 3`, which is looked up in the symbol table and has a value of `true`. 
Therefore: + +* Increment iteration value to 1 +* Set status to `running` +* Add nodes to the symbol table and execution table for this iteration + +**Symbol Table** + +|Name |Value |Type |I/O | +|---------------------------------|--------------------------------|--------------------|------| +|wf.files |['test/languages', 'test/words']|Array[File] |input | +|wf.count |6 |Int |input | +|wf._w5._s6.filename |%ref:wf.files |Array[File] |input | +|wf._w5._s6.scatter_task.count |%ref:wf.count |String |input | +|wf._w5._s6.scatter_task.in |%ref:wf._w5._s6.filename |file |input | +|wf._w5._s6.scatter_task.words | |Array[Array[String]]|output| +|wf._w5.gather_task.count |%ref:wf.count |Int |input | +|wf._w5.gather_task.word_lists |%expr:scatter_task.words |Array[Array[String]]|input | +|wf._w5.gather_task.count |%ref:wf.count |Int |output| +|wf._w5._s6.scatter_task._i1.count|%ref:wf.count |String |input | +|wf._w5._s6.scatter_task._i1.in |%ref:wf._w5._s6.filename |file |input | +|wf._w5._s6.scatter_task._i1.words| |Array[Array[String]]|output| +|wf._w5.gather_task._i1.count |%ref:wf.count |Int |input | +|wf._w5.gather_task._i1.word_lists|%expr:scatter_task.words |Array[Array[String]]|input | +|wf._w5.gather_task._i1.count |%ref:wf.count |Int |output| + +**Execution Table** + +|Name |Status |Index|Iter|PID |rc | +|---------------------------|-----------|-----|----|----|----| +|wf._w5 |started | |1 | | | +|wf._w5._s6.scatter_task._i1|not_started|0 | | | | +|wf._w5._s6.scatter_task._i1|not_started|1 | | | | +|wf._w5.gather_task._i1 |not_started| | | | | + +When the workflow engine then evaluates the execution table again, it sees that both entries for `wf._w5._s6.scatter_task._i1` are ready to run. 
+ +The workflow engine then runs the commands for the two `scatter_task` calls in the current iteration: + +``` +egrep ^.{6}$ test/languages || exit 0 +``` + +``` +egrep ^.{6}$ python/test/words || exit 0 +``` + +**Symbol Table** + +|Name |Value |Type |I/O | +|---------------------------------|--------------------------------|--------------------|------| +|wf.files |['test/languages', 'test/words']|Array[File] |input | +|wf.count |6 |Int |input | +|wf._w5._s6.filename |%ref:wf.files |Array[File] |input | +|wf._w5._s6.scatter_task.count |%ref:wf.count |String |input | +|wf._w5._s6.scatter_task.in |%ref:wf._w5._s6.filename |file |input | +|wf._w5._s6.scatter_task.words | |Array[Array[String]]|output| +|wf._w5.gather_task.count |%ref:wf.count |Int |input | +|wf._w5.gather_task.word_lists |%expr:scatter_task.words |Array[Array[String]]|input | +|wf._w5.gather_task.count |%ref:wf.count |Int |output| +|wf._w5._s6.scatter_task._i1.count|%ref:wf.count |String |input | +|wf._w5._s6.scatter_task._i1.in |%ref:wf._w5._s6.filename |file |input | +|wf._w5._s6.scatter_task._i1.words|[['python'], []] |Array[Array[String]]|output| +|wf._w5.gather_task._i1.count |%ref:wf.count |Int |input | +|wf._w5.gather_task._i1.word_lists|%expr:scatter_task.words |Array[Array[String]]|input | +|wf._w5.gather_task._i1.count |%ref:wf.count |Int |output| + +The only thing that changed was a new value for `wf._w5._s6.scatter_task._i1.words` + +**Execution Table** + +|Name |Status |Index|Iter|PID |rc | +|---------------------------|-----------|-----|----|-----|----| +|wf._w5 |started | |1 | | | +|wf._w5._s6.scatter_task._i1|successful |0 | |47844|0 | +|wf._w5._s6.scatter_task._i1|successful |1 | |47846|0 | +|wf._w5.gather_task._i1 |not_started| | | | | + +Then, the `gather_task` step is ready because the value for `wf._w5._s6.scatter_task._i1.words` is now populated, so the engine runs the command: + +``` +python3 < grep-output.tsv + } + output { + File grepped = "grep-output.tsv" + } +} + +workflow 
cut_grep { + call cut_sh + call grep {input:file1=cut_sh.out2} + output { + grep.*, cut_sh.out1 + } +} +``` + +## Example 7: Output a Map + +Task that operates on a file that's a map of string->file (sample_id -> clean_bam_file) + +``` +task map-test { + command { sh foobar.sh ${Map[String, file] in} > something.tsv } + output { Map[String, Int] out = "something.tsv" } +} +``` + +# Notes & Things to Clarify + +## Workflow output syntax + +See [corresponding section](#outputs) for details + +## public/private -or- export statements + +We need a way to declare which parts of a WDL file are exported and which parts are private + +Perhaps a NodeJS-like model of explicit exports: + +``` +export (task1, task2, wf) +``` + +Perhaps assume that everything that isn't marked public is automatically private? + +``` +task t1 {...} +task t2 {...} +public wf my_wf {...} +``` + +## Seven Bridges Considerations + +1) Dependencies on ports in addition to dependencies on tasks (e.g. ps.procs instead of just ps) +2) Defining engine behavior around setting current working directory for tasks. Dir must be empty, task writes outputs to directory, support relative paths. Absolute paths may work, engine specific. +3) Enumerating success return codes? Wrapper script? extra definition on the task? Special output mapping (Int rc = ...)? +4) Defining objects (class Sample...) +5) Heterogeneous arrays +6) Supplying values from an object onto the command line + - Sample s; ${s.id} ${s.bam}; ??? + - Array[Sample] a; ${a.join(', ')}; ??? +7) What about files that travel together, like .bam and .bai files? +8) "docker" runtime format +9) Expressions in runtime section +10) Specifying environment variables and assembling / localizing a configuration file (e.g. in /etc) + +## Various loop issues + +* What if another downstream task sets a variable used in the loop expression? How do we know when we're *really* done executing the loop? 
+ * Solution: variables in the loop condition may only be set from within the loop otherwise it's a syntax error +* What if variables in the condition is initialized but nothing inside the loop changes the var? + * Solution: Check that **exactly one** task inside the loop sets at least one variable in the loop condition -or- don't check and let it either infinitely loop or set a max iteration amount +* What if there's a completely stand-alone task inside a loop? + * Solution: This task would be pointless because it'd always run with the same inputs. This should be an error +* How are loop conditions evaluated? + * Should return one of three values: `undefined` when one of the variables is undefined, or `true` or `false` if everything is defined. + + +## Explicit vs. Implicit output mappings + +Problem: It's not obvious from this output mapping if it should parse it as TSV or JSON: + +``` +Map[String, String] my_var = stdout() +``` + +We could just say "all outputs for a task must use the exact same serialization method". + +Though, the more I play around with this syntax the more I don't like the implicit nature of it. + +* The syntax doesn't guide the user well. It's a convention, and not an obvious one. +* It *looks* like a type mismatch +* Trying to do something like `Map[String, String] my_var = subdir + "my_output"` looks more like a type mismatch. + +Since function calls are *very* easily supported in WDL, I propose going back to two built-in functions: `tsv()` and `json()` + +* `Map[String, String] blah = tsv(subdir + "somefile.tsv")` +* `Array[File] out = json(stdout())` + +What about reading primitives from files? + +* `int c = stdout()` +* `int c = read_int(stdout())` + +## Additional Syntax Checks + +* One `command` section +* `sep` is specified if postfix quantifier is a `+` or `*` +* Scatter block must contain at least one task that uses the iteration variable + * If it doesn't, it must depend on something that does. 
+* A loop expression can not be statically 'true' (e.g. `true`, `1==1`, etc) or else this is an infinite loop by definition. +* A loop must be able to modify the condition. One of the outputs of one of the tasks must alter one of the identifiers in the loop condition +* No circular dependencies? +* Tasks outside of the loop cannot set a variable defined in the loop. + +# Implementations + +Current implementations of the workflow description language description: + +* [Reference implementation (Python)](python/) diff --git a/SPEC.md b/SPEC.md index 4f4ac9c..6502643 100644 --- a/SPEC.md +++ b/SPEC.md @@ -26,13 +26,15 @@ * [Sections](#sections) * [Command Section](#command-section) * [Command Parts](#command-parts) - * [Command Parts (alternative heredoc syntax)](#command-parts-alternative-heredoc-syntax) + * [Alternative heredoc syntax](#alternative-heredoc-syntax) + * [Parameters Used Multiple Times](#parameters-used-multiple-times) * [Command Part Options](#command-part-options) * [sep](#sep) * [true and false](#true-and-false) * [serialize](#serialize) * [default](#default) * [Outputs Section](#outputs-section) + * [String Interpolation](#string-interpolation) * [Runtime Section](#runtime-section) * [serialize](#serialize) * [docker](#docker) @@ -47,7 +49,7 @@ * [Example 5: Word Count](#example-5-word-count) * [Example 6: tmap](#example-6-tmap) * [Workflow Definition](#workflow-definition) - * [Call Sub-Tasks](#call-sub-tasks) + * [Call Statement](#call-statement) * [Scatter](#scatter) * [Loops](#loops) * [Conditionals](#conditionals) @@ -55,11 +57,15 @@ * [Examples](#examples) * [Example 1: dRanger](#example-1-dranger) * [Variable Resolution & Scoping](#variable-resolution--scoping) -* [Standard Library Functions](#standard-library-functions) +* [Namespaces](#namespaces) + * [Additional Namespace Requirements](#additional-namespace-requirements) +* [Standard Library](#standard-library) * [mixed stdout()](#mixed-stdout) * [mixed stderr()](#mixed-stderr) - * [mixed 
tsv(String|File|Uri)](#mixed-tsvstringfileuri) - * [mixed json(String|File|Uri)](#mixed-jsonstringfileuri) + * [Array\[String\] read_lines(String|File|Uri)](#arraystring-read_linesstringfileuri) + * [Array\[Array\[String\]\] read_tsv(String|File|Uri)](#arrayarraystring-read_tsvstringfileuri) + * [Map\[String, String\] read_map(String|File|Uri)](#mapstring-string-read_mapstringfileuri) + * [mixed read_json(String|File|Uri)](#mixed-read_jsonstringfileuri) * [Int read_int(String|File|Uri)](#int-read_intstringfileuri) * [String read_string(String|File|Uri)](#string-read_stringstringfileuri) * [Float read_float(String|File|Uri)](#float-read_floatstringfileuri) @@ -86,26 +92,6 @@ * [object serialization as TSV](#object-serialization-as-tsv) * [object serialization as JSON](#object-serialization-as-json) * [Array\[object\]](#arrayobject) -* [Workflow Execution Algorithm](#workflow-execution-algorithm) - * [Symbol Table Construction](#symbol-table-construction) - * [Symbol Table Initial Population](#symbol-table-initial-population) - * [Execution Table Construction](#execution-table-construction) - * [Workflow Evaluation](#workflow-evaluation) - * [Workflow Termination](#workflow-termination) - * [Example 1: Two-step parallel workflow](#example-1-two-step-parallel-workflow) - * [Example 2: Loops](#example-2-loops) - * [Example 3: Nested Scatter](#example-3-nested-scatter) - * [Example 4: Nested Workflows and Scope](#example-4-nested-workflows-and-scope) - * [Example 5: Workflow scattering](#example-5-workflow-scattering) - * [Example 6: Workflow Outputs](#example-6-workflow-outputs) - * [Example 7: Output a Map](#example-7-output-a-map) -* [Notes & Things to Clarify](#notes--things-to-clarify) - * [Workflow output syntax](#workflow-output-syntax) - * [public/private -or- export statements](#publicprivate--or--export-statements) - * [Various loop issues](#various-loop-issues) - * [Explicit vs. 
Implicit output mappings](#explicit-vs-implicit-output-mappings) - * [Additional Syntax Checks](#additional-syntax-checks) -* [Implementations](#implementations) @@ -508,7 +494,7 @@ This command would be parsed as: * `${File input}` - command_part_var -### Command Parts (alternative heredoc syntax) +#### Alternative heredoc syntax Sometimes a command is sufficiently long enough or might use `{` characters that using a different set of delimiters would make it more clear. In this case, enclose the command in `<<<`...`>>>`, as follows: @@ -530,6 +516,26 @@ Parsing of this command should be the same as the prior section describes. > **Note**: the parser should strip any whitespace common to each line in the command. In the example above, 2 characters should be stripped from the beginning of each line. +#### Parameters Used Multiple Times + +In some cases it is desirable to use the same parameter twice in a command. A parameter can be declared multiple times if each declaration is the same. + +For example, + +``` +task test { + command { + ./script ${x} ${String x} ${x} + } +} +``` + +Since the default type if one isn't specified is `String`, this should be allowed. If a value of `foo` is specified for `x`, then the command would instantiate to: + +``` +./script foo foo foo +``` + ### Command Part Options ``` @@ -606,17 +612,17 @@ For example, if a task's output section looks like this: ``` output { - Int threshold = "threshold.txt" + Int threshold = read_int("threshold.txt") } ``` Then the task is expecting a file called "threshold.txt" in the current working directory where the task was executed. Inside of that file must be one line that contains only an integer and whitespace. See the [Data Types & Serialization](#data-types--serialization) section for more details. 
-The filename strings may also contain variable definitions themselves: +The filename strings may also contain variable definitions themselves (see the [String Interpolation](#string-interpolation) section below for more details): ``` output { - Array[String] quality_scores = "${sample_id}.scores.txt" + Array[String] quality_scores = read_lines("${sample_id}.scores.txt") } ``` @@ -626,6 +632,22 @@ If this is the case, then `sample_id` is considered an input to the task. Finally, glob-style `*` may be used in the filename. The glob may only match more than 1 file if the output is of type `array` +### String Interpolation + +Within tasks, any string literal can use string interpolation to access the value of any of the task's inputs. The most obvious example of this is being able to define an output file which is named as function of its input. For example: + +``` +task example { + python analysis.py --prefix=${prefix} ${File bam} + output { + File analyzed = "${prefix}.out" + File bam_sibling = "${bam}.suffix" + } +} +``` + +Any `${identifier}` inside of a string literal must be replaced with the value of the identifier. If prefix were specified as `foobar`, then `"${prefix}.out"` would be evaluated to `"foobar.out"`. + ### Runtime Section ``` @@ -833,17 +855,19 @@ workflow wf { } ``` -### Call Sub-Tasks +### Call Statement ``` -$call = 'call' $namespaced_task ('as' $identifier)? $call_body? -$namespaced_task = $identifier ('.' $identifier)* +$call = 'call' $namespaced_task_or_wf ('as' $identifier)? $call_body? +$namespaced_task_or_wf = $identifier ('.' $identifier)* $call_body = '{' $inputs? $outputs? '}' $inputs = 'input' ':' $variable_mappings $variable_mappings = $identifier '=' $expression (',' $identifier '=' $expression)* ``` -A workflow may call other tasks via the `call` keyword. 
The `$namespaced_task` is the reference to which task to run, usually this is an identifier or it may use the dot notation if the task was included via an [import statement](#import-statements). All `calls` must be uniquely identifiable, which is why one would use the `as alias` syntax. +A workflow may call other tasks/workflows via the `call` keyword. The `$namespaced_task_or_wf` is the reference to which task to run, usually this is an identifier or it may use the dot notation if the task was included via an [import statement](#import-statements). All `calls` must be uniquely identifiable, which is why one would use the `as alias` syntax. + +A `call` statement may reference a workflow too (e.g. `call other_workflow`). ``` import "lib" @@ -866,11 +890,11 @@ As an example, here is a workflow in which the second task references an output ``` task task1 { command {python do_stuff.py} - output {file results = stdout()} + output {File results = stdout()} } task task2 { command {python do_stuff2.py ${File foobar}} - output {file results = stdout()} + output {File results = stdout()} } workflow wf { call task1 @@ -1059,9 +1083,28 @@ Returns either a `File` or `Uri` of the stdout that this task generated. Returns either a `File` or `Uri` of the stderr that this task generated. -## mixed tsv(String|File|Uri) +## Array[String] read_lines(String|File|Uri) + +Given a file-like object (`String, `File`, or `Uri`) as a parameter, this will read each line as a string and return an `Array[String]` representation of the lines in the file. + +The order of the lines in the returned `Array[String]` must be the order in which the lines appear in the file-like object. -the `tsv()` function takes one parameter, which is a file-like object (`String`, `File`, or `URI`) and returns either an `Array`, `Object`, or `Map` depending on the contents of that file. 
+This task would `grep` through a file and return all strings that matched the pattern: + +``` +task do_stuff { +  command { +    grep '${pattern}' ${File input} +  } +  output { +    Array[String] matches = read_lines(stdout()) +  } +} +``` + +## Array[Array[String]] read_tsv(String|File|Uri) + +The `read_tsv()` function takes one parameter, which is a file-like object (`String`, `File`, or `Uri`) and returns an `Array[Array[String]]` representing the table from the TSV file. If the parameter is a `String`, this is assumed to be a local file path relative to the current working directory of the task. @@ -1073,16 +1116,61 @@ task do_stuff {     python do_stuff.py ${File input}   }   output { -    Array[File] outputs = tsv("./results/file_list.tsv") +    Array[Array[String]] output_table = read_tsv("./results/file_list.tsv")   } } ``` -Then when the task finishes, to fulfull the `outputs` variable, `./results/file_list.tsv` must contain a single-column TSV with file paths that are valid. +Then when the task finishes, to fulfill the `output_table` variable, `./results/file_list.tsv` must be a valid TSV file or an error will be reported. + +## Map[String, String] read_map(String|File|Uri) -## mixed json(String|File|Uri) +Given a file-like object (`String`, `File`, or `Uri`) as a parameter, this will read each line from a file and expect the line to have the format `col1\tcol2`. In other words, the file-like object must be a two-column TSV file. 
-This function works exactly like `tsv()` except that the parameter is expected to be a path to a JSON file containing the data structure +If the parameter is a `String`, this is assumed to be a local file path relative to the current working directory of the task. + +The following task would write a two-column TSV to standard out and that would be interpreted as a `Map[String, String]`: + +``` +task do_stuff { +  command { +    ./script --flags=${flags} ${File input} +  } +  output { +    Map[String, String] mapping = read_map(stdout()) +  } +} +``` + +## mixed read_json(String|File|Uri) + +The `read_json()` function takes one parameter, which is a file-like object (`String`, `File`, or `Uri`) and returns a data type which matches the data structure in the JSON file. The mapping of JSON type to WDL type is: + +|JSON Type|WDL Type| +|---------|--------| +|object|`Map[String, ?]`| +|array|`Array[?]`| +|number|`Int` or fallback `Float`| +|string|`String`| +|boolean|`Boolean`| +|null|???| + +If the parameter is a `String`, this is assumed to be a local file path relative to the current working directory of the task. + +For example, if I write a task that outputs a file to `./results/file_list.json`, and my task is defined as: + +``` +task do_stuff { +  command { +    python do_stuff.py ${File input} +  } +  output { +    Map[String, String] output_table = read_json("./results/file_list.json") +  } +} +``` + +Then when the task finishes, to fulfill the `output_table` variable, `./results/file_list.json` must be a valid JSON file or an error will be reported. ## Int read_int(String|File|Uri) @@ -1092,6 +1180,8 @@ The `read_int()` function takes a file path which is expected to contain 1 line The `read_string()` function takes a file path which is expected to contain 1 line with 1 string on it. This function returns that string. 
+No trailing newline characters should be included + ## Float read_float(String|File|Uri) The `read_float()` function takes a file path which is expected to contain 1 line with 1 floating point number on it. This function returns that float. @@ -1476,813 +1566,3 @@ Where `/jobs/564759/sample.json` would contain: In the case of TSV serialization, the only difference between `Array[object]` and `object` is that an array would contain more rows. In the case of JSON serialization, it would be a list of objects. - -# Workflow Execution Algorithm - -This section outlines an example algorithm to execute tasks and workflows. It's presented here as a suggestion algorithm and as a learning tool. The algorithm here is the one used by the reference implementation. - -In this algorithm, workflows and tasks are executed the same way. Tasks should be considered one-step workflows. The rest of this document will refer only to executing workflows. - -Execution relies on two data structures: a symbol table and an execution table. - -The symbol table holds variables and their values and some meta-data about them (e.g. their type and whether they're an input or output). - -The execution table has an entry for each task that needs to be executed and its current status of execution. - -The details of these tables and the algorithm is shown in the following sections. - -## Symbol Table Construction - -Executing a workflow first depends on determining the variables for the workflow and constructing a **Symbol Table** consisting of all of these variables with null values initially. - -Determining the **Symbol Table** entries is done by examining all the tasks in the workflow and extracting every input (everything declared in `${`...`}`) and every output variable. The name of the variable is prepended with the tasks name, and task aliases may be used at the workflow level. 
Here is an example: - -``` -task t1 { - command {python script.py ${File path} -n ${sample_name}} - output {Array[String] result = "${sample_name}.data.txt"} - runtime {docker: "broadinstitute/job-runner:${docker_version}"} -} -task t2 { - command {wc -l ${infile1} ${infile2}} - output {int lines = stdout()} -} -workflow t3 { - call t1 - call t2 as renamed { - input: infile2=t1.result - } -} -``` - -The initial symbol table construction would be: - -|Variable |Value | -|-------------------------|----------------| -|t1.path |null | -|t1.sample_name |null | -|t1.docker_version |null | -|t1.result |null | -|renamed.infile1 |null | -|renamed.infile2 |null | - -## Symbol Table Initial Population - -Next is to populate the initial values. First, populate internal references. Any value that starts with a '%ref:' is a reference to a different value in the symbol table. In this case, we populate a reference from `renamed.infile2` to `t1.result`: - -|Variable |Value | -|-------------------------|----------------| -|t1.path |null | -|t1.sample_name |null | -|t1.docker_version |null | -|t1.result |null | -|renamed.infile1 |null | -|renamed.infile2 |%ref:t1.result | - -The client that is running the workflow MUST provide a valid value (matching or compatible type) for every *input* variable, which is anything inside `${`...`}`. 
- -In this case, let's say the client submits these values: - -|Variable |Value | -|-------------------------|---------------------------| -|t1.path |/path/to/input.txt | -|t1.sample_name |my_sample | -|t1.docker_version |latest | -|renamed.infile1 |/path/to/another_input.txt | - -The resulting symbol table would look like: - -|**Variable** |**Value** | -|-------------------------|---------------------------| -|t1.path |/path/to/input.txt | -|t1.sample_name |my_sample | -|t1.docker_version |latest | -|**t1.result** |**null** | -|renamed.infile1 |/path/to/another_input.txt | -|renamed.infile2 |%ref:t1.result | - -Notice how the only variable in the symbol table left unassigned is `t1.result` (and by reference, `renamed.infile2` is also unassigned). - -## Execution Table Construction - -The execution table has one entry for each task with its potentially aliased name. Initially the **Status** column is set to `not_started`. This table should include any additional columns to link the execution of this task to the client system. For example, if jobs are run in Sun GridEngine, there might be an additional column linking to a table which has the SGE Job ID and the current state of that job in SGE. Alternatively there might simply be a foreign key to a another table which contains implementation-specific information about the particular job. - -Valid states for the **Status** field are: - -* `not_started` - This task was not yet started because not all of its input parameters are defined -* `started` - All inputs to this task are defined and the task was launched in an implementation specific way -* `successful`- Execution of this task has completed with a zero return code. -* `failed` - Execution of this task has completed with a non-zero return code. -* `skipped` - This task cannot be run because an upstream task that was supposed to produce one of its inputs is in either `error` or `failed` states. 
-* `error` - The task has finished with a zero return code, but one of the outputs for the task doesn't match its declared type. Either the format of the file was wrong or the file does not exist. - -Terminal states are: `skipped`, `successful`, `failed`, or `error` - -A workflow execution table would include at least this: - -|**Task** |**Index** |**Status** |... | -|-------------------------|------------|------------|---------| -|task1 | |not_started | | -|task2 | |not_started | | - -The **Index** field is left blank unless this task is part of a series of scatter tasks, in which case this is an integer >= 0. Having an index value here means that some of the inputs to the task in the symbol table can be arrays and this index is used to index those arrays. - -## Workflow Evaluation - -After both the symbol table and execution table are constructed, the algorithm must examine all tasks in the execution table with a `not_started` state (i.e. all of them). It finds all parameters to the task in the symbol table and if all of them have a value specified, the task is started and the status field gets set to `started`. - -The workflow engine is responsible for monitoring the progress of the jobs it runs in an implementation specific way. Once a job completes, The workflow engine must set the job to either `successful` or `failed`. The workflow engine must then evaluate the tasks's output mappings and write them to the symbol table. If evaluation of the output mappings fails, set the status to `error`. Otherwise, set the status to `successful` if the return code is zero and `failed` otherwise. - -> **TODO**: Write a section about evaluating output mappings - -When evaluating if a `not_started` job should be started, the workflow engine must set the status to `skipped` and leave the return code undefined if any of is inputs references a job that is in either of these states: `error`, `failed`, or `skipped`. 
- -> **Job Avoidance**: When examining a `not_started` task, the engine might have a way of noticing that this particular task was already run in the past and we have results for it already. If that's the case, the workflow engine may set the state immediately to `finished` and fulfill its output mappings contract. - -## Workflow Termination - -A workflow is defined as finished if the following conditions are met: - -* All entries in the **Execution Table** are in a terminal state. - -## Example 1: Two-step parallel workflow - -``` -task grep_words { - command { - grep '^${start}' ${File infile} - } - output { - Array[String] words = tsv(stdout()) - } -} -workflow wf { - File dictionary - call grep_words as grep_pythonic_words { - input: start="pythonic", infile=dictionary - } - call grep_words as grep_workf_words { - input: start="workf", infile=dictionary - } -} -``` - -Given these parameters values: - -|Variable |Value | -|-------------------------|---------------------------| -|wf.dictionary |/usr/share/dict/words | - -At start of execution, **Symbol Table** looks like this: - -|Name |Value |Type |I/O | -|-----------------------------|---------------------------------------|-------------|------| -|wf.dictionary |/usr/share/dict/words |file |input | -|wf.grep_pythonic_words.start |pythonic |String |input | -|wf.grep_pythonic_words.infile|%ref:wf.dictionary |file |input | -|wf.grep_pythonic_words.words |['pythonic', 'pythonical'] |Array[String]|output| -|wf.grep_workf_words.start |workf |String |input | -|wf.grep_workf_words.infile |%ref:wf.dictionary |file |input | -|wf.grep_workf_words.words |['workfellow', 'workfolk', 'workfolks']|Array[String]|output| - -And **Execution Table** looks like this: - -|Name |Status |Index|PID |rc | -|----------------------|-----------|-----|----|----| -|wf.grep_pythonic_words|not_started| | | | -|wf.grep_workf_words |not_started| | | | - -Any field beyond **Task**, **Status**, and **Index** are optional and implementation 
specific. In this example, **PID** and **rc** capture the fact that these jobs are running as subprocesses. - -The engine will then examine all tasks with a the `not_started` status and look up all their input variables in the **symbol table**. If there is a value for each of the inputs, the task is started and the status is changed to `started`. The status of `started` means that the workflow engine has started the job in an implementation-specific way. Or it might mean simply launching a sub-process in which case the *System Status* field might not be necessary (and perhaps there would be a column for PID). - -|Name |Status |Index|PID |rc | -|----------------------|-----------|-----|----|----| -|wf.grep_pythonic_words|started | |1643| | -|wf.grep_workf_words |started | |1644| | - -The workflow engine is responsible for updating all fields besides the first three in an implementation specific way. Once the workflow engine determines that a job is done executing, it updates the **Status** field to either `successful` or `failed`. - -|Name |Status |Index|PID |rc | -|----------------------|-----------|-----|----|----| -|wf.grep_pythonic_words|successful | |1643|0 | -|wf.grep_workf_words |started | |1644| | - -Upon a job finishing and the return code being set, the `outputs` section is processed for that task. In the case of the above example, when `wf.grep_pythonic_words` finishes the output mapping `Array[String] words = tsv(stdout())` is processed. The workflow engine looks for the file called `stdout` in the current working directory where the process was launched. If it is not found, the task's status gets set to `error` and optionally an error message field gets set describing the nature of why the field was set. - -A workflow is defined as finished when all the tasks in the workflow are in a terminal state (i.e. `skipped`, `successful`, `failed`, or `error`). For this example, let's say the first task succeeds and the second task fails. 
The terminal state of the workflow is: - -|Name |Status |Index|PID |rc | -|----------------------|-----------|-----|----|----| -|wf.grep_pythonic_words|successful | |1643|0 | -|wf.grep_workf_words |failed | |1644|1 | - -And now since all entries in the execution table are in a terminal state, the workflow is finished. - -## Example 2: Loops - -This is a less common case, and likely will only be used in the iterative scatter-gather tasks. - -``` -task scatter_task { - command <<< - egrep ^.{${Int count}}$ ${File in} || exit 0 - >>> - output { - Array[String] words = tsv(stdout()) - } -} - -task gather_task { - command { - python3 < 3) { - scatter(filename in files) { - call scatter_task { - input: in=filename, count=count - } - } - call gather_task { - input: count=count, word_lists=scatter_task.words - output: count=count - } - } -} -``` - -If we run the task specifying the value for the two required parameters as: - -|Parameter|Value | -|---------|--------------------------------| -|wf.files |["test/languages", "test/words"]| -|wf.count |6 | - -In this example, the file `test/languages` contains: - -``` -c -python -java -c++ -js -scala -d -c# -objective-c -php -vb -sql -R -shell -lisp -``` - -and `test/words` contains: - -``` -ace -act -bad -bag -cage -cake -call -doggy -daily -dairy -``` - -**Symbol Table Construction** - -The symbol table starts out with the following values defined as a combination of initialized values and user-provided values - -|Name |Value |Type |I/O | -|-----------------------------|--------------------------------|--------------------|------| -|wf.files |['test/languages', 'test/words']|Array[File] |input | -|wf.count |6 |Int |input | -|wf._w5._s6.filename |%ref:wf.files |Array[File] |input | -|wf._w5._s6.scatter_task.count|%ref:wf.count |String |input | -|wf._w5._s6.scatter_task.in |%ref:wf._w5._s6.filename |File |input | -|wf._w5._s6.scatter_task.words| |Array[Array[String]]|output| -|wf._w5.gather_task.count |%ref:wf.count |Int |input 
| -|wf._w5.gather_task.word_lists|%expr:scatter_task.words |Array[Array[String]]|input | -|wf._w5.gather_task.count |%ref:wf.count |Int |output| - -The fully qualified names in this symbol table also include the scopes that a `call` lives inside. `_s6` refers to a scatter block with an arbitrary unique ID of 6. `_w5` refers to a while loop with arbitrary unique ID of 5. - -If `scatter_task` were not being executed inside of a scatter block, the output (`wf._w5._s6.scatter_task.words`) would only be of type `Array[String]`, but any task executed in a scatter block will have all of its outputs in an array (one entry for each scatter). This is why `wf._w5._s6.scatter_task.words` has type `Array[Array[String]]`. Since the scatter and gather steps are inside of a loop, the entries that exist initially in the symbol table are only place holders that will be copied for each iteration. - -**Execution Table Construction** - -With this particular task, all of the calls happen inside of the loop. Since we don't know initially if the loop will be executed even once (we might not run `scatter_task` or `gather_task` even once) based on what the loop condition evaluates to. Because of this, only one entry exists in the execution table initially, and that entry is for the loop itself. - -|Name |Status |Index|Iter|PID |rc | -|------|-----------|-----|----|----|----| -|wf._w5|not_started| |0 | | | - -Every time the execution table is evaluated (for example, by polling or on an event), the loop table is also evaluated. The loop's condition is evaluated under the following circumstance: - -1. Execution Table status for the loop is `not_started` AND iteration == 0 -2. Execution Table status for the loop is `running` AND iteration > 0 AND all entries in the execution table for the current iteration are `successful` - -If either of these conditions is met, the loop is looked up and the condition evaluated. 
The following actions take place based on the evaluation of the loop condition: - -* Condition evalutes to `undefined`: this means one of the variables referenced in the conditional is undefined. - * Do nothing in this case. It should never be the case that a condition evaluates to `true` or `false` on one pass and then `undefined` on subsequent pass. -* Condition evaluates to `true`: - * Increment the iteration field on the execution table. - * Set status to `running` on the execution table for this loop, if it's not already set. - * Add all child nodes of the loop to the execution table and symbol table with a suffix of `._i1` where the number at the end is the current iteration number. They should be in state `not_started`. If any child nodes of the loop are themselves loops, add those to the control flow table. -* Condition evaluates to `false`: - * Mark the loop in the execution table as `successful` - -If the loop is `running` in the execution table, but any of the tasks in the current iteration is NOT `successful`, then mark the loop as `failed` which will prevent any further iterations. Let any tasks that are not in a terminal state finish. - -After the tables are constructed, since the loop is `not_started` and the iteration count is 0, the condition is evaluated. The condition is `count > 3`, which is looked up in the symbol table and has a value of `true`. 
Therefore: - -* Increment iteration value to 1 -* Set status to `running` -* Add nodes to the symbol table and execution table for this iteration - -**Symbol Table** - -|Name |Value |Type |I/O | -|---------------------------------|--------------------------------|--------------------|------| -|wf.files |['test/languages', 'test/words']|Array[File] |input | -|wf.count |6 |Int |input | -|wf._w5._s6.filename |%ref:wf.files |Array[File] |input | -|wf._w5._s6.scatter_task.count |%ref:wf.count |String |input | -|wf._w5._s6.scatter_task.in |%ref:wf._w5._s6.filename |file |input | -|wf._w5._s6.scatter_task.words | |Array[Array[String]]|output| -|wf._w5.gather_task.count |%ref:wf.count |Int |input | -|wf._w5.gather_task.word_lists |%expr:scatter_task.words |Array[Array[String]]|input | -|wf._w5.gather_task.count |%ref:wf.count |Int |output| -|wf._w5._s6.scatter_task._i1.count|%ref:wf.count |String |input | -|wf._w5._s6.scatter_task._i1.in |%ref:wf._w5._s6.filename |file |input | -|wf._w5._s6.scatter_task._i1.words| |Array[Array[String]]|output| -|wf._w5.gather_task._i1.count |%ref:wf.count |Int |input | -|wf._w5.gather_task._i1.word_lists|%expr:scatter_task.words |Array[Array[String]]|input | -|wf._w5.gather_task._i1.count |%ref:wf.count |Int |output| - -**Execution Table** - -|Name |Status |Index|Iter|PID |rc | -|---------------------------|-----------|-----|----|----|----| -|wf._w5 |started | |1 | | | -|wf._w5._s6.scatter_task._i1|not_started|0 | | | | -|wf._w5._s6.scatter_task._i1|not_started|1 | | | | -|wf._w5.gather_task._i1 |not_started| | | | | - -When the workflow engine then evaluates the execution table again, it sees that both entries for `wf._w5._s6.scatter_task._i1` are ready to run. 
- -The workflow engine then runs the commands for the two `scatter_task` calls in the current iteration: - -``` -egrep ^.{6}$ test/languages || exit 0 -``` - -``` -egrep ^.{6}$ python/test/words || exit 0 -``` - -**Symbol Table** - -|Name |Value |Type |I/O | -|---------------------------------|--------------------------------|--------------------|------| -|wf.files |['test/languages', 'test/words']|Array[File] |input | -|wf.count |6 |Int |input | -|wf._w5._s6.filename |%ref:wf.files |Array[File] |input | -|wf._w5._s6.scatter_task.count |%ref:wf.count |String |input | -|wf._w5._s6.scatter_task.in |%ref:wf._w5._s6.filename |file |input | -|wf._w5._s6.scatter_task.words | |Array[Array[String]]|output| -|wf._w5.gather_task.count |%ref:wf.count |Int |input | -|wf._w5.gather_task.word_lists |%expr:scatter_task.words |Array[Array[String]]|input | -|wf._w5.gather_task.count |%ref:wf.count |Int |output| -|wf._w5._s6.scatter_task._i1.count|%ref:wf.count |String |input | -|wf._w5._s6.scatter_task._i1.in |%ref:wf._w5._s6.filename |file |input | -|wf._w5._s6.scatter_task._i1.words|[['python'], []] |Array[Array[String]]|output| -|wf._w5.gather_task._i1.count |%ref:wf.count |Int |input | -|wf._w5.gather_task._i1.word_lists|%expr:scatter_task.words |Array[Array[String]]|input | -|wf._w5.gather_task._i1.count |%ref:wf.count |Int |output| - -The only thing that changed was a new value for `wf._w5._s6.scatter_task._i1.words` - -**Execution Table** - -|Name |Status |Index|Iter|PID |rc | -|---------------------------|-----------|-----|----|-----|----| -|wf._w5 |started | |1 | | | -|wf._w5._s6.scatter_task._i1|successful |0 | |47844|0 | -|wf._w5._s6.scatter_task._i1|successful |1 | |47846|0 | -|wf._w5.gather_task._i1 |not_started| | | | | - -Then, the `gather_task` step is ready because the value for `wf._w5._s6.scatter_task._i1.words` is now populated, so the engine runs the command: - -``` -python3 < grep-output.tsv - } - output { - File grepped = "grep-output.tsv" - } -} - -workflow 
cut_grep { - call cut_sh - call grep {input:file1=cut_sh.out2} - output { - grep.*, cut_sh.out1 - } -} -``` - -## Example 7: Output a Map - -Task that operates on a file that's a map of string->file (sample_id -> clean_bam_file) - -``` -task map-test { - command { sh foobar.sh ${Map[String, file] in} > something.tsv } - output { Map[String, Int] out = "something.tsv" } -} -``` - -# Notes & Things to Clarify - -## Workflow output syntax - -See [corresponding section](#outputs) for details - -## public/private -or- export statements - -We need a way to declare which parts of a WDL file are exported and which parts are private - -Perhaps a NodeJS-like model of explicit exports: - -``` -export (task1, task2, wf) -``` - -Perhaps assume that everything that isn't marked public is automatically private? - -``` -task t1 {...} -task t2 {...} -public wf my_wf {...} -``` - -## Seven Bridges Considerations - -1) Dependencies on ports in addition to dependencies on tasks (e.g. ps.procs instead of just ps) -2) Defining engine behavior around setting current working directory for tasks. Dir must be empty, task writes outputs to directory, support relative paths. Absolute paths may work, engine specific. -3) Enumerating success return codes? Wrapper script? extra definition on the task? Special output mapping (Int rc = ...)? -4) Defining objects (class Sample...) -5) Heterogeneous arrays -6) Supplying values from an object onto the command line - - Sample s; ${s.id} ${s.bam}; ??? - - Array[Sample] a; ${a.join(', ')}; ??? -7) What about files that travel together, like .bam and .bai files? -8) "docker" runtime format -9) Expressions in runtime section -10) Specifying environment variables and assembling / localizing a configuration file (e.g. in /etc) - -## Various loop issues - -* What if another downstream task sets a variable used in the loop expression? How do we know when we're *really* done executing the loop? 
- * Solution: variables in the loop condition may only be set from within the loop otherwise it's a syntax error -* What if variables in the condition is initialized but nothing inside the loop changes the var? - * Solution: Check that **exactly one** task inside the loop sets at least one variable in the loop condition -or- don't check and let it either infinitely loop or set a max iteration amount -* What if there's a completely stand-alone task inside a loop? - * Solution: This task would be pointless because it'd always run with the same inputs. This should be an error -* How are loop conditions evaluated? - * Should return one of three values: `undefined` when one of the variables is undefined, or `true` or `false` if everything is defined. - - -## Explicit vs. Implicit output mappings - -Problem: It's not obvious from this output mapping if it should parse it as TSV or JSON: - -``` -Map[String, String] my_var = stdout() -``` - -We could just say "all outputs for a task must use the exact same serialization method". - -Though, the more I play around with this syntax the more I don't like the implicit nature of it. - -* The syntax doesn't guide the user well. It's a convention, and not an obvious one. -* It *looks* like a type mismatch -* Trying to do something like `Map[String, String] my_var = subdir + "my_output"` looks more like a type mismatch. - -Since function calls are *very* easily supported in WDL, I propose going back to two built-in functions: `tsv()` and `json()` - -* `Map[String, String] blah = tsv(subdir + "somefile.tsv")` -* `Array[File] out = json(stdout())` - -What about reading primitives from files? - -* `int c = stdout()` -* `int c = read_int(stdout())` - -## Additional Syntax Checks - -* One `command` section -* `sep` is specified if postfix quantifier is a `+` or `*` -* Scatter block must contain at least one task that uses the iteration variable - * If it doesn't, it must depend on something that does. 
-* A loop expression can not be statically 'true' (e.g. `true`, `1==1`, etc) or else this is an infinite loop by definition. -* A loop must be able to modify the condition. One of the outputs of one of the tasks must alter one of the identifiers in the loop condition -* No circular dependencies? -* Tasks outside of the loop cannot set a variable defined in the loop. - -# Implementations - -Current implementations of the workflow description language description: - -* [Reference implementation (Python)](python/) diff --git a/python/wdl/toc.py b/python/wdl/toc.py index 438db75..8df2342 100644 --- a/python/wdl/toc.py +++ b/python/wdl/toc.py @@ -1,8 +1,13 @@ import re +import sys +if len(sys.argv) < 2: +    print("Usage: toc.py [markdown file]") +    sys.exit(-1) +filename = sys.argv[1] as_link = lambda x: re.sub(r'[^a-zA-Z0-9-_]', '', x.lower().replace(' ', '-')) escape = lambda x: x.replace('[', '\\[').replace(']', '\\]') toc = [] -with open('README.md') as fp: +with open(filename) as fp:     contents = fp.read() for line in contents.split('\n'):     header = re.match(r'^(#+)(.*)', line) @@ -16,5 +21,5 @@ )) toc_re = re.compile(r'<\!---toc start-->(.*?)<\!---toc end-->', flags=re.DOTALL) (contents, replacements) = toc_re.subn('\n\n{}\n\n'.format('\n'.join(toc)), contents) -with open('README.md', 'w') as fp: +with open(filename, 'w') as fp:     fp.write(contents)