Skip to content

Commit

Permalink
demux changes to support novaseq (#868)
Browse files Browse the repository at this point in the history
* reduce demux memory footprint to 80% of system max

* remove force_gc=false default from viral-ngs

remove force_gc=false default from viral-ngs, as force_gc=true is now the default in Picard: https://broadinstitute.github.io/picard/command-line-overview.html#IlluminaBasecallsToSam

* maxReadsInRamPerTile, maxRecordsInRam, and forceGC in demux WDL workflow

* memory to 85%

* autoscale some NovaSeq demux params, allowing override by passed-in param values

* tune -Xmx memory limit for Novaseq demux

* max_mismatches 0 -> 1; minimum_base_quality 25 -> 10

max_mismatches 0 -> 1; minimum_base_quality 25 -> 10
  • Loading branch information
tomkinsc committed Jul 27, 2018
1 parent ebe4d98 commit dcc14e1
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 9 deletions.
38 changes: 38 additions & 0 deletions docker/run_tile_count.sh
@@ -0,0 +1,38 @@
#!/bin/bash
#
# Print the total number of tiles described by a RunInfo.xml file, computed
# from the attributes of its //Run/FlowcellLayout element:
#
#   LaneCount * SurfaceCount * SwathCount * TileCount
#
# Usage:    run_tile_count.sh path/to/RunInfo.xml
# Requires: xmllint
# Output:   the total tile count on stdout; all diagnostics on stderr, so a
#           caller capturing stdout never mistakes an error message for a count
# Exit:     0 on success (and when called with no arguments, which only
#           prints the usage line); 1 if the file does not exist or if an
#           attribute is missing or is not a non-negative integer
set -e -o pipefail

if [ $# -eq 0 ]; then
    echo "Usage: $(basename "$0") path/to/RunInfo.xml"
    exit 0
fi

run_info="$1"

if [ ! -e "$run_info" ]; then
    echo "The specified file does not exist: $run_info" >&2
    exit 1
fi

#######################################
# Read one attribute of //Run/FlowcellLayout and print it.
# Globals:   run_info (read)
# Arguments: $1 - attribute name, e.g. LaneCount
# Outputs:   the attribute value on stdout; an error message on stderr
# Returns:   0 if the value is a non-negative integer, 1 otherwise
#######################################
flowcell_layout_attr() {
    local attr="$1"
    local value

    # string() yields an empty string when the attribute is absent
    value=$(xmllint --xpath "string(//Run/FlowcellLayout/@${attr})" "$run_info") || return 1

    if [ -z "$value" ]; then
        echo "Could not parse ${attr} from RunInfo.xml. Please check RunInfo.xml is properly formatted" >&2
        return 1
    fi

    # The value is fed into shell arithmetic below, so accept digits only.
    case "$value" in
        *[!0-9]*)
            echo "${attr} in RunInfo.xml is not a non-negative integer: ${value}" >&2
            return 1
            ;;
    esac

    printf '%s\n' "$value"
}

# Parse the flowcell layout counts from the RunInfo.xml file
lane_count=$(flowcell_layout_attr LaneCount) || exit 1
surface_count=$(flowcell_layout_attr SurfaceCount) || exit 1
swath_count=$(flowcell_layout_attr SwathCount) || exit 1
tile_count=$(flowcell_layout_attr TileCount) || exit 1

# total data size more roughly tracks total tile count
total_tile_count=$((lane_count * surface_count * swath_count * tile_count))

echo "$total_tile_count"
2 changes: 1 addition & 1 deletion pipes/WDL/dx-launcher/demux_launcher.yml
Expand Up @@ -121,7 +121,7 @@ runSpec:
min_base_quality=20
demux_threads=20 # with NovaSeq-size output, OOM errors can sporadically occur with higher thread counts
echo "Detected $total_tile_count tiles, interpreting as NovaSeq run, executing on a $demux_instance_type machine."
echo " **Note: Q23 threshold used since NovaSeq with RTA3 writes only four Q-score values: 2, 12, 23, and 37.**"
echo " **Note: Q20 threshold used since NovaSeq with RTA3 writes only four Q-score values: 2, 12, 23, and 37.**"
echo " See: https://www.illumina.com/content/dam/illumina-marketing/documents/products/appnotes/novaseq-hiseq-q30-app-note-770-2017-010.pdf"
elif [ "$total_tile_count" -gt 1408 ]; then
tar_consolidation_instance_size="mem1_ssd2_x36"
Expand Down
53 changes: 47 additions & 6 deletions pipes/WDL/workflows/tasks/tasks_demux.wdl
Expand Up @@ -36,14 +36,18 @@ task illumina_demux {
String? sequencingCenter

String? flowcell
Int? minimumBaseQuality
Int? minimumBaseQuality = 10
Int? maxMismatches = 1
Int? minMismatchDelta
Int? maxNoCalls
String? readStructure
Int? minimumQuality = 10
Int? threads = 36
Int? threads = 30
String? runStartDate
Int? maxReadsInRamPerTile
Int? maxRecordsInRam
Boolean? forceGC=true


parameter_meta {
flowcell_tgz : "stream" # for DNAnexus, until WDL implements the File| type
Expand All @@ -52,6 +56,9 @@ task illumina_demux {
command {
set -ex -o pipefail

# find N% memory
mem_in_mb=`/opt/viral-ngs/source/docker/mem_in_mb_85.sh`

if [ -d /mnt/tmp ]; then
TMPDIR=/mnt/tmp
fi
Expand All @@ -61,8 +68,39 @@ task illumina_demux {
${flowcell_tgz} $FLOWCELL_DIR \
--loglevel=DEBUG

# find N% memory
mem_in_mb=`/opt/viral-ngs/source/docker/mem_in_mb_90.sh`
# Count the flowcell's tiles from RunInfo.xml; the count is used below to guess
# the instrument type. The script path and its argument must be separate words:
# quoting them together makes the shell look for a single command whose name
# contains a space, which fails and aborts the task under "set -e".
total_tile_count=$(/opt/viral-ngs/source/docker/run_tile_count.sh "$FLOWCELL_DIR/RunInfo.xml")

if [ "$total_tile_count" -le 50 ]; then
    echo "Detected $total_tile_count tiles, interpreting as MiSeq run."
elif [ "$total_tile_count" -le 150 ]; then
    echo "Detected $total_tile_count tiles, interpreting as HiSeq2k run."
elif [ "$total_tile_count" -le 896 ]; then
    echo "Detected $total_tile_count tiles, interpreting as HiSeq4k run."
elif [ "$total_tile_count" -le 1408 ]; then
    # NovaSeq-sized run: replace the memory figure computed earlier with the one
    # from mem_in_mb_80.sh (presumably 80% of system memory, going by the script
    # name -- confirm) and lower the auto-scaled thread count.
    mem_in_mb=$(/opt/viral-ngs/source/docker/mem_in_mb_80.sh)
    demux_threads=20 # with NovaSeq-size output, OOM errors can sporadically occur with higher thread counts
    echo "Detected $total_tile_count tiles, interpreting as NovaSeq run."
    echo " **Note: Q20 threshold used since NovaSeq with RTA3 writes only four Q-score values: 2, 12, 23, and 37.**"
    echo " See: https://www.illumina.com/content/dam/illumina-marketing/documents/products/appnotes/novaseq-hiseq-q30-app-note-770-2017-010.pdf"
else
    # More than 1408 tiles.
    # NOTE(review): $demux_instance_type is not set in the visible part of this
    # task; if it is unset here, demux_threads ends up empty -- confirm.
    demux_threads=$(echo "$demux_instance_type" | cut -dx -f2)
    echo "Tile count: $total_tile_count tiles (unknown instrument type)."
fi

# use the passed-in (or default) WDL value first, then fall back to the auto-scaled value.
# if the result is empty (nothing passed in and no auto-scaled value), no param is passed to the command
if [ -n "${minimumBaseQuality}" ]; then demux_min_base_quality="${minimumBaseQuality}"; else demux_min_base_quality="$demux_min_base_quality"; fi
if [ -n "$demux_min_base_quality" ]; then demux_min_base_quality="--minimum_base_quality=$demux_min_base_quality";fi

if [ -n "${threads}" ]; then demux_threads="${threads}"; else demux_threads="$demux_threads"; fi
if [ -n "$demux_threads" ]; then demux_threads="--threads=$demux_threads"; fi


if [ -n "${maxReadsInRamPerTile}" ]; then max_reads_in_ram_per_tile="${maxReadsInRamPerTile}"; else max_reads_in_ram_per_tile="$max_reads_in_ram_per_tile"; fi
if [ -n "$max_reads_in_ram_per_tile" ]; then max_reads_in_ram_per_tile="--max_reads_in_ram_per_tile=$max_reads_in_ram_per_tile"; fi

if [ -n "${maxRecordsInRam}" ]; then max_records_in_ram="${maxRecordsInRam}"; else max_records_in_ram="$max_records_in_ram"; fi
if [ -n "$max_records_in_ram" ]; then max_records_in_ram="--max_records_in_ram=$max_records_in_ram"; fi

# note that we are intentionally setting --threads to about 2x the core
# count. seems to still provide speed benefit (over 1x) when doing so.
Expand All @@ -76,15 +114,18 @@ task illumina_demux {
--outMetrics=metrics.txt \
--commonBarcodes=barcodes.txt \
${'--flowcell=' + flowcell} \
${'--minimum_base_quality=' + minimumBaseQuality} \
$demux_min_base_quality \
${'--max_mismatches=' + maxMismatches} \
${'--min_mismatch_delta=' + minMismatchDelta} \
${'--max_no_calls=' + maxNoCalls} \
${'--read_structure=' + readStructure} \
${'--minimum_quality=' + minimumQuality} \
${'--run_start_date=' + runStartDate} \
$max_reads_in_ram_per_tile \
$max_records_in_ram \
--JVMmemory="$mem_in_mb"m \
${'--threads=' + threads} \
$demux_threads \
${true='--force_gc=true' false="--force_gc=false" forceGC} \
--compression_level=5 \
--loglevel=DEBUG

Expand Down
3 changes: 1 addition & 2 deletions tools/picard.py
Expand Up @@ -458,7 +458,7 @@ def execute(
class ExtractIlluminaBarcodesTool(PicardTools):
subtoolName = 'ExtractIlluminaBarcodes'
jvmMemDefault = '8g'
defaults = {'read_structure': '101T8B8B101T', 'max_mismatches': 0, 'minimum_base_quality': 25, 'num_processors': 0}
defaults = {'read_structure': '101T8B8B101T', 'max_mismatches': 1, 'minimum_base_quality': 10, 'num_processors': 0}
option_list = (
'read_structure', 'max_mismatches', 'minimum_base_quality', 'min_mismatch_delta', 'max_no_calls',
'minimum_quality', 'compress_outputs', 'num_processors'
Expand Down Expand Up @@ -502,7 +502,6 @@ class IlluminaBasecallsToSamTool(PicardTools):
'max_reads_in_ram_per_tile': 200000,
'max_records_in_ram': 1000000,
'num_processors': 0,
'force_gc': False,
'include_non_pf_reads': False,
'compression_level': 7,
}
Expand Down

0 comments on commit dcc14e1

Please sign in to comment.