-
Notifications
You must be signed in to change notification settings - Fork 357
/
reference.conf
691 lines (600 loc) · 25.2 KB
/
reference.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
##################################
# Cromwell Reference Config File #
##################################
# This is the reference config file that contains all the default settings.
# The application config contains the default _override_ settings for cromwell.
# Make your edits/overrides in your cromwell.conf and be sure to `include required(classpath("application"))`.
# Add/See documented examples in cromwell.examples.conf.
# Defaults for the `webservice` stanza.
webservice {
  port = 8000
  interface = 0.0.0.0
  binding-timeout = 5s
  instance {
    name = "reference"
  }
}
akka {
  http.client.parsing.illegal-header-warnings = off

  actor.default-dispatcher.fork-join-executor {
    # Thread count = min(parallelism-factor * cpus, parallelism-max)
    # The commented-out values are the defaults set by Akka; uncomment them to tune.
    #parallelism-factor = 3.0
    #parallelism-max = 64
  }

  priority-mailbox {
    mailbox-type = "akka.dispatch.UnboundedControlAwareMailbox"
  }

  bard-actor-mailbox {
    mailbox-type = "akka.dispatch.UnboundedControlAwareMailbox"
  }

  dispatchers {
    # Dispatcher for actors that perform blocking I/O operations.
    # Keeps the whole system from slowing down while waiting for responses from, for instance, external resources.
    io-dispatcher {
      type = Dispatcher
      executor = "fork-join-executor"
      # Runs with the fork-join defaults; this can be tuned if needed.
    }

    # Dispatcher for actors that handle API operations.
    # Keeps the API responsive regardless of the load of workflows being run.
    api-dispatcher {
      type = Dispatcher
      executor = "fork-join-executor"
    }

    # Dispatcher for engine actors.
    # Because backends' behavior is unpredictable (potentially blocking or slow), the engine runs
    # on its own dispatcher so that backends cannot affect its performance.
    engine-dispatcher {
      type = Dispatcher
      executor = "fork-join-executor"
    }

    # Dispatcher used by supported backend actors.
    backend-dispatcher {
      type = Dispatcher
      executor = "fork-join-executor"
    }

    # Dispatcher used for the service registry.
    service-dispatcher {
      type = Dispatcher
      executor = "fork-join-executor"
    }

    # Dispatcher that bulkheads the health monitor from the rest of the system.
    # Throughput is set low to ensure the monitor is fairly low priority.
    health-monitor-dispatcher {
      type = Dispatcher
      executor = "thread-pool-executor"
      thread-pool-executor {
        fixed-pool-size = 4
      }
      throughput = 1
    }

    # Note that without further configuration, all other actors run on the default dispatcher.
  }

  coordinated-shutdown.phases {
    abort-all-workflows {
      # This phase gives Cromwell time to abort all workflows upon shutdown.
      # It is only used if system.abort-jobs-on-terminate = true.
      # Adjust the timeout to give Cromwell more or less time to abort workflows.
      timeout = 1 hour
      depends-on = [service-unbind]
    }

    stop-io-activity {
      # Adjust this timeout according to the maximum amount of time Cromwell
      # should be allowed to spend flushing its database queues.
      timeout = 30 minutes
      depends-on = [service-stop]
    }
  }
}
system {
  # If 'true', a SIGINT will trigger Cromwell to attempt to abort all currently running jobs before exiting.
  # Defaults to false in server mode, and true in run mode.
  # abort-jobs-on-terminate = false

  # If 'true', a SIGTERM or SIGINT will trigger Cromwell to attempt a graceful shutdown in server mode,
  # in particular clearing up all queued database writes before letting the JVM shut down.
  # The shutdown is a multi-phase process, each phase having its own configurable timeout. See the Dev Wiki for more details.
  graceful-server-shutdown = true

  # Cromwell will cap the number of running workflows at N.
  max-concurrent-workflows = 5000

  # Cromwell will launch up to N submitted workflows at a time, regardless of how many open workflow slots exist.
  # Deviating from 1 is not recommended for multi-runner setups due to possible deadlocks. [BW-962]
  max-workflow-launch-count = 1

  # Workflows will be grouped by the value of the specified field in their workflow options.
  #
  # Currently, hog-safety will limit concurrent jobs within a hog group:
  # for each hog-group and backend, only (concurrent-job-limit / hog-factor) jobs will be run at any given time.
  #
  # In the future, hog-groups may also apply to other finite resources.
  #
  # You can re-use an existing workflow option to allow it to also indicate hog-group,
  # or add a new field created specifically for hog-safety.
  #
  hog-safety {
    # Set this field in the workflow-options file to assign a hog group:
    workflow-option = "hogGroup"

    # Setting to '1' means that a hog group can use the full resources of the Cromwell instance if it needs to:
    hog-factor = 1

    # Time to wait before repeating token log messages - including "queue state" and "group X is a hog" style messages.
    # Leave at 0 to never log these types of message.
    token-log-interval-seconds = 0
  }

  # Number of seconds between workflow launches.
  new-workflow-poll-rate = 20

  # Number of WorkflowLogCopyRouter workers (set here because that router is initialized in code).
  number-of-workflow-log-copy-workers = 10

  # Default number of cache read workers.
  number-of-cache-read-workers = 25

  # The maximum number of file hashes a single job will ask for at a time. The job will not ask for any more hashes
  # until it receives a response to its outstanding request.
  file-hash-batch-size = 50

  # Maximum scatter width per scatter node. Cromwell will fail the workflow if the scatter width goes beyond N.
  max-scatter-width-per-scatter = 1000000

  # Total max. jobs that can be created per root workflow. If it goes beyond N, Cromwell will fail the workflow by:
  # - no longer creating new jobs
  # - letting the jobs that have already been started finish, and then failing the workflow
  total-max-jobs-per-root-workflow = 1000000

  # Option to allow Cromwell to delete intermediate output files once the workflow succeeds.
  delete-workflow-files = false

  io {
    # Global throttling - this is mostly useful for GCS and can be adjusted to match
    # the quota available on the GCS API.
    throttle {
      number-of-requests = 100000
      per = 100 seconds
    }

    # Number of times an I/O operation should be attempted before giving up and failing it.
    number-of-attempts = 5

    # I/O commands that are more than `command-backpressure-staleness` old at the time they begin to be processed will
    # trigger I/O backpressure. Keep this above gcs.max-batch-duration to avoid unwanted competition with that deadline
    # and consequent excessive backpressure.
    command-backpressure-staleness = 6 seconds

    # Extensions to an ongoing I/O backpressure will only be logged if they extend the backpressuring
    # by more than this duration.
    backpressure-extension-log-threshold = 1 second

    # Configures exponential backoff of I/O requests.
    backpressure-backoff {
      # Starting point
      min = 10 seconds
      # Maximum waiting time between attempts
      max = 5 minutes
      # How much longer to wait between each attempt
      multiplier = 2
      # Randomizes wait times to avoid large spikes. Must be between 0 and 1.
      randomization-factor = 0.9
    }

    # Amount of time after which an I/O operation will time out if no response has been received.
    # Note that a timeout may result in a workflow failure, so be careful not to set a timeout too low.
    # Unless you start experiencing timeouts under very heavy load there should be no reason to change the default values.
    timeout {
      default = 3 minutes
      # Copy can be a time-consuming operation, so its timeout can be set separately.
      copy = 1 hour
    }

    gcs {
      # `gcs.parallelism` governs GCS *batch operation* parallelism. Non-batch GCS operations like 'write' or
      # 'read lines' are processed as nio commands whose parallelism is governed by `nio.parallelism` below.
      parallelism = 10

      # Batches of non-zero size will be sent to Google with at most `max-batch-size` commands and not having
      # more than `max-batch-duration` between when the first and last command was batched.
      max-batch-size = 100
      max-batch-duration = 5 seconds
    }

    nio.parallelism = 10
  }

  # Maximum number of input file bytes allowed in order to read each type.
  # If exceeded, a FileSizeTooBig exception will be thrown.
  input-read-limits {
    lines = 512000000
    bool = 7
    int = 19
    float = 50
    string = 512000000
    json = 512000000
    tsv = 512000000
    map = 512000000
    object = 512000000
  }

  # Rate at which Cromwell updates its instrumentation gauge metrics (e.g. number of workflows running, queued, etc.).
  instrumentation-rate = 5 seconds

  job-rate-control {
    jobs = 20
    per = 10 seconds
  }

  # Restart checks do not involve the I/O actor at all so they may be able to run faster than the rate specified in
  # `job-rate-control`, enabling faster workflow restarts.
  job-restart-check-rate-control {
    jobs = 50
    per = 1 seconds

    # The maximum number of restart checks running against the database shouldn't be driven by backend job limits,
    # although the per-backend hog factor will be used for fair apportionment of restart tokens within a backend.
    max-jobs = 50

    # If logging of restart tokens is desired, set the following to a positive value.
    # token-log-interval-seconds = 300
  }

  # If enabled, Cromwell will not allocate new execution tokens to jobs whose hog groups are actively
  # experiencing quota exhaustion.
  quota-exhaustion-job-start-control {
    enabled = false

    # Threshold (in minutes) after which a group in the GROUP_METRICS_ENTRY table is no longer considered to be
    # actively experiencing quota exhaustion.
    threshold-minutes = 15

    # Logging interval for which groups are in active quota exhaustion state.
    logging-interval = 5 minutes
  }

  workflow-heartbeats {
    heartbeat-interval = 2 minutes
    ttl = 10 minutes
    write-failure-shutdown-duration = 5 minutes
    write-batch-size = 10000
    write-threshold = 10000
  }

  job-shell = "/bin/bash"

  # Cromwell reads this value into the JVM's `networkaddress.cache.ttl` setting to control DNS cache expiration.
  dns-cache-ttl = 3 minutes

  memory-retry-error-keys = ["OutOfMemory", "Killed"]
}
workflow-options {
  # Workflow options listed here are encrypted when stored in the database.
  encrypted-fields = []

  # AES-256 key used to encrypt the values in `encrypted-fields`.
  # NOTE(review): this default looks like a placeholder (a constant run of 'A' characters); deployments that
  # populate `encrypted-fields` should presumably override it with their own key - confirm.
  base64-encryption-key = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA="

  # Directory in which per-workflow logs are written.
  workflow-log-dir = "cromwell-workflow-logs"

  # When true, per-workflow logs are deleted after copying.
  workflow-log-temporary = true

  # workflow-failure-mode determines what happens to other calls when a call fails.
  # It can be either ContinueWhilePossible or NoNewCalls, and can also be overridden in workflow options.
  # Defaults to NoNewCalls. Uncomment to change:
  #workflow-failure-mode: "ContinueWhilePossible"
}
# Optional call-caching configuration.
call-caching {
  # Allows existing results to be re-used for jobs you have already run.
  # (default: false)
  enabled = false

  # Whether to invalidate a cache result forever if it cannot be reused. Disable this if you expect some cache copies
  # to fail for external reasons which should not invalidate the cache (e.g. auth differences between users).
  # (default: true)
  invalidate-bad-cache-results = true

  # The maximum number of times Cromwell will attempt to copy cache hits before giving up and running the job.
  max-failed-copy-attempts = 1000000
}
google {
  application-name = "cromwell"

  # Examples of other auths can be found in cromwell.examples.conf
  auths = [
    { name = "application-default", scheme = "application_default" }
  ]
}
# Filesystems available in this Cromwell instance.
# They can be enabled individually in the engine.filesystems stanza and in the config.filesystems stanza of backends.
# There is also a default built-in local filesystem, which can be referenced as "local".
filesystems {
  drs {
    class = "cromwell.filesystems.drs.DrsPathBuilderFactory"

    # Used to share a unique global object across all instances of the factory.
    global {
      # Class to instantiate and propagate to all factories. Takes a single typesafe config argument.
      class = "cromwell.filesystems.drs.DrsFileSystemConfig"

      config {
        resolver {
          url = "https://martha-url-here or https://drshub-url-here"

          # When true (and when possible), any DrsPath will be internally converted to another compatible Path such as a
          # GcsPath. This enables other optimizations such as using batch requests for GCS for hashes, using the
          # existing GCS downloader in Papi, etc.
          preresolve = false
        }

        access-token-acceptable-ttl = 1 minute
      }
    }
  }

  gcs.class = "cromwell.filesystems.gcs.GcsPathBuilderFactory"

  s3.class = "cromwell.filesystems.s3.S3PathBuilderFactory"

  http.class = "cromwell.filesystems.http.HttpPathBuilderFactory"

  ftp {
    # When the global configuration is used, the factory constructor needs a third argument whose type is that of
    # the global configuration.
    class = "cromwell.filesystems.ftp.FtpPathBuilderFactory"

    # Used to share a unique global object across all instances of the factory.
    global {
      # Class to instantiate and propagate to all factories. Takes a single typesafe config argument.
      class = "cromwell.filesystems.ftp.CromwellFtpFileSystems"

      # Configuration to be passed
      config {
        # All instances of FTP filesystems are cached and re-used across a single Cromwell instance.
        # This allows control over how many connections are established to a host for a specific user at any given time.
        # Cached filesystems are removed from the cache (and their associated connections closed) when they haven't been
        # used for the time configured below. This value should be sufficiently high to cover the duration of the
        # longest expected I/O operation.
        cache-ttl = 1 day

        # How long to wait trying to obtain a connection from the pool before giving up. Don't specify for no timeout.
        # obtain-connection-timeout = 1 hour

        # Maximum number of connections that will be established per user per host. This is across the entire Cromwell instance.
        max-connection-per-server-per-user = 30

        # Time after which a connection will be closed if idle.
        idle-connection-timeout = 1 hour

        # FTP connection port to use.
        connection-port = 21

        # FTP connection mode.
        connection-mode = "passive"
      }
    }
  }
}
docker {
  hash-lookup {
    # /!\ Attention /!\
    # If you disable this, call caching will be disabled for jobs with floating docker tags!
    enabled = true

    # Time (a duration, e.g. "20 minutes") before an entry expires from the docker hashes cache and needs to be
    # fetched again.
    cache-entry-ttl = "20 minutes"

    # Maximum number of elements to be kept in the cache. If the limit is reached, old elements will be removed from the cache.
    cache-size = 200

    # How docker hashes should be looked up. Possible values are "local" and "remote".
    # "local": Lookup hashes on the local docker daemon using the cli
    # "remote": Lookup hashes on docker hub, gcr, gar, quay
    method = "remote"

    # For docker repositories that support version 2 of the manifest schema, Cromwell can retrieve the compressed size
    # of the images. This compressed size is smaller than the actual size needed on disk when the docker image is pulled.
    # This factor is used to multiply the compressed size and get an approximation of the size needed on disk so that the
    # disk can be allocated appropriately.
    # See https://github.com/docker/hub-feedback/issues/331
    size-compression-factor = 3.0

    # Retry configuration for http requests - across all registries.
    max-time-between-retries = 1 minute
    max-retries = 3

    # Supported registries (Azure, Docker Hub, Google, Quay) can have additional configuration set separately.
    azure {
      # Worst case `ReadOps per minute` value from official docs
      # https://github.com/MicrosoftDocs/azure-docs/blob/main/includes/container-registry-limits.md
      throttle {
        number-of-requests = 1000
        per = 60 seconds
      }
      num-threads = 10
    }

    google {
      # Example of how to configure throttling, available for all supported registries.
      throttle {
        number-of-requests = 1000
        per = 100 seconds
      }
      # How many threads to allocate for GCR / GAR related work.
      num-threads = 10
    }

    dockerhub.num-threads = 10
    quay.num-threads = 10
  }
}
engine {
  # Tells the engine which filesystems are at its disposal for any IO operation it might need to perform.
  # For instance, WDL variables declared at the Workflow level will be evaluated using the filesystems declared here.
  # If you intend to be able to run workflows with this kind of declaration:
  # workflow {
  # String str = read_string("gs://bucket/my-file.txt")
  # }
  # you will need to provide the engine with a gcs filesystem.
  # Note that the default filesystem (local) is always available.
  filesystems {
    local.enabled = true
    http.enabled = true
  }
}
languages {
  default = WDL

  WDL {
    http-allow-list {
      enabled = false
      allowed-http-hosts = []
    }

    versions {
      default = "draft-2"

      "draft-2" {
        language-factory = "languages.wdl.draft2.WdlDraft2LanguageFactory"
        config {
          strict-validation = false
          enabled = true
        }
      }

      "1.0" {
        # WDL draft-3 was our in-progress name for what became WDL 1.0
        language-factory = "languages.wdl.draft3.WdlDraft3LanguageFactory"
        config {
          strict-validation = true
          enabled = true
        }
      }

      "biscayne" {
        # WDL biscayne is our in-progress name for what will become WDL 1.1
        language-factory = "languages.wdl.biscayne.WdlBiscayneLanguageFactory"
        config {
          strict-validation = true
          enabled = true
        }
      }

      "cascades" {
        # WDL cascades is our in-progress name for what will (probably) become WDL 2.0
        language-factory = "languages.wdl.cascades.WdlCascadesLanguageFactory"
        config {
          strict-validation = true
          enabled = true
        }
      }
    }
  }
}
ontology {
  # Uncomment to enable caching of ontologies. This improves performance when loading ontologies from remote IRIs.
  #cache {
  #  max-size = 20
  #}

  retries = 3
  pool-size = 3
  backoff-time = 2 seconds
}
# Other backend examples are in cromwell.example.backends
backend {
# Presumably the name of the entry under `providers` that is used by default - confirm against the backend loader.
default = "Local"
providers {
Local {
actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
config {
# The Local provider's settings are pulled in from this classpath resource.
# `required(...)` makes the include fail if the resource cannot be found.
include required(classpath("reference_local_provider_config.inc.conf"))
}
}
}
}
# Note: When adding a new actor that uses service registry pattern make sure that the new actor handles the graceful
# shutdown command ('ShutdownCommand'). See https://github.com/broadinstitute/cromwell/issues/2575
services {
  KeyValue {
    class = "cromwell.services.keyvalue.impl.SqlKeyValueServiceActor"
    config {
      # Similar to the metadata service config, see cromwell.examples.conf
      # db-batch-size = 200
      # db-flush-rate = 5 seconds
    }
  }

  MetadataService {
    class = "cromwell.services.metadata.impl.MetadataServiceActor"
    config {
      # See cromwell.examples.conf for details on the settings one can use here, as they depend on the implementation
      # being used.
    }
  }

  Instrumentation {
    # Default noop service - instrumentation metrics are ignored
    class = "cromwell.services.instrumentation.impl.noop.NoopInstrumentationServiceActor"

    # StatsD instrumentation service actor
    # class = "cromwell.services.instrumentation.impl.statsd.StatsDInstrumentationServiceActor"

    # Stackdriver instrumentation service actor
    # class = "cromwell.services.instrumentation.impl.stackdriver.StackdriverInstrumentationServiceActor"
  }

  HealthMonitor {
    class = "cromwell.services.healthmonitor.impl.HealthMonitorServiceActor"

    # Override the standard dispatcher. In particular this one has a low throughput so as to be lower in priority.
    dispatcher = "akka.dispatchers.health-monitor-dispatcher"

    config {
      check-engine-database = false
      check-papi-backends = []
    }
  }

  LoadController {
    class = "cromwell.services.loadcontroller.impl.LoadControllerServiceActor"
    config {
      # See cromwell.examples.conf for details on the settings one can use here
    }
  }

  Womtool {
    # Default - run within the Cromwell JVM
    class = "cromwell.services.womtool.impl.WomtoolServiceInCromwellActor"
  }

  GithubAuthVending {
    class = "cromwell.services.auth.impl.GithubAuthVendingActor"
    config {
      enabled = false
      auth.azure = false

      # Set this to the service from which Cromwell should retrieve the Github access token associated with the user's token.
      # ecm.base-url = ""
    }
  }

  # Bard is used for metrics collection in the Terra SaaS offering and is not applicable outside of it.
  Bard {
    class = "cromwell.services.metrics.bard.impl.BardEventingActor"
    config {
      enabled = false
      bard {
        base-url = ""
        connection-pool-size = 0
      }
    }
  }

  CostCatalogService {
    class = "cromwell.services.cost.GcpCostCatalogService"
    config {
      catalogExpirySeconds = 86400
    }
  }
}
include required(classpath("reference_database.inc.conf"))
# Configuration for load-control related values
load-control {
  ## Queue Size Thresholds ##
  # Cromwell centralizes some operations through singleton actors (possibly acting as routers).
  # This allows for more efficient control, throttling, and potentially batching of those operations, which overall improves throughput.
  # In order to do that, those operations are queued until they can be performed.
  # Those queues are for the most part unbounded in size, which can become a problem under heavy load.
  # Each actor can however let the load controller service know when it considers its work load to be abnormally high.
  # In the case of those queuing actors, this means that their queue size is over a certain threshold.
  # This section configures those threshold values.
  # They should be kept at a reasonable number, where "reasonable" depends on your system and how much load Cromwell is subjected to.
  # If they're too high they could end up using a lot of memory; if they're too small any small spike will be considered a high load and the system will automatically slow itself down.
  # Benchmarking is recommended to find the values that suit your use case best.
  # If you use the statsD instrumentation service, the queue size and throughput of these actors are instrumented, and looking at their values over time can also help you find the right configuration.
  job-store-read = 10000
  job-store-write = 10000

  # Call cache read actors are routed (several actors perform cache read operations), so
  # this threshold applies to each routee individually; set it to a lower value to account for that.
  # To change the number of routees, see the system.number-of-cache-read-workers config value.
  call-cache-read = 1000
  call-cache-write = 10000

  key-value-read = 10000
  key-value-write = 10000

  # Unlike the queues above, the I/O queue is bounded. This sets its size.
  io-queue-size = 10000

  # If the I/O queue is full, subsequent requests are rejected and must be retried later.
  # This time window specifies how much time without request rejection constitutes a "back to normal load" event.
  # The amount of time to be waited without request rejection can vary between `io-normal-window-minimum` and
  # `io-normal-window-maximum` depending on severity (how long I/O commands have been waiting to run).
  io-normal-window-minimum = 20 seconds
  io-normal-window-maximum = 60 seconds

  # metadata is an order of magnitude higher because its normal load is much higher than other actors
  metadata-write = 100000

  ## Backend specific ##
  # Google requests to the Pipelines API are also queued and batched
  papi-requests = 10000
  # Google requests to the Batch API are also queued and batched
  batch-requests = 10000

  ## Misc. ##
  # How often each actor should update its perceived load
  monitoring-frequency = 5 seconds
}
ga4gh.wes {
  # URLs for the site-specific auth process as well as contact info for security issues.
  # These aren't really compelling defaults, but there aren't really any compelling defaults.
  auth-instructions-url = "https://cromwell.readthedocs.io/en/stable/"
  contact-info-url = "https://cromwell.readthedocs.io/en/stable/"
}
workflow-state-callback {
  enabled = false

  ## The number of threads to allocate for performing callbacks
  # num-threads: 5
  # endpoint: "http://example.com/foo" # Can be overridden in workflow options
  # auth.azure: true

  ## Users can override the default retry behavior if desired
  # request-backoff {
  #   min: "3 seconds"
  #   max: "5 minutes"
  #   multiplier: 1.1
  # }
  # max-retries = 10
}