diff --git a/assembly.py b/assembly.py index ad03bffb1..d482c11a2 100755 --- a/assembly.py +++ b/assembly.py @@ -356,7 +356,7 @@ def assemble_spades( kmer_sizes=(55,65), n_reads=10000000, outReads=None, - mask_errors=False, + always_succeed=False, max_kmer_sizes=1, mem_limit_gb=8, threads=None, @@ -379,7 +379,7 @@ def assemble_spades( contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted, contigs_out=out_fasta, filter_contigs=filter_contigs, min_contig_len=min_contig_len, - kmer_sizes=kmer_sizes, mask_errors=mask_errors, max_kmer_sizes=max_kmer_sizes, + kmer_sizes=kmer_sizes, always_succeed=always_succeed, max_kmer_sizes=max_kmer_sizes, spades_opts=spades_opts, mem_limit_gb=mem_limit_gb, threads=threads) except subprocess.CalledProcessError as e: @@ -403,6 +403,9 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()): parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file') parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true', help='only output contigs SPAdes is sure of (drop lesser-quality contigs from output)') + parser.add_argument('--alwaysSucceed', dest='always_succeed', default=False, action='store_true', + help='if assembly fails for any reason, output an empty contigs file, rather than failing with ' + 'an error code') parser.add_argument('--minContigLen', dest='min_contig_len', type=int, default=0, help='only output contigs longer than this many bp') parser.add_argument('--spadesOpts', dest='spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler') @@ -415,7 +418,7 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()): __commands__.append(('assemble_spades', parser_assemble_spades)) def gapfill_gap2seq(in_scaffold, in_bam, out_scaffold, threads=None, mem_limit_gb=4, time_soft_limit_minutes=60.0, - random_seed=0, gap2seq_opts='', mask_errors=False): + random_seed=0, gap2seq_opts='', always_succeed=False): ''' This step runs the Gap2Seq tool to close gaps between contigs in a scaffold. ''' try: @@ -423,7 +426,7 @@ def gapfill_gap2seq(in_scaffold, in_bam, out_scaffold, threads=None, mem_limit_g mem_limit_gb=mem_limit_gb, time_soft_limit_minutes=time_soft_limit_minutes, random_seed=random_seed) except Exception as e: - if not mask_errors: + if not always_succeed: raise log.warning('Gap-filling failed (%s); ignoring error, emulating successful run where simply no gaps were filled.') shutil.copyfile(in_scaffold, out_scaffold) @@ -436,7 +439,7 @@ def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gap parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4.0, help='Max memory to use, in gigabytes %(default)s') parser.add_argument('--timeSoftLimitMinutes', dest='time_soft_limit_minutes', default=60.0, help='Stop trying to close more gaps after this many minutes (default: %(default)s); this is a soft/advisory limit') - parser.add_argument('--maskErrors', dest='mask_errors', default=False, action='store_true', + parser.add_argument('--maskErrors', dest='always_succeed', default=False, action='store_true', help='In case of any error, just copy in_scaffold to out_scaffold, emulating a successful run that simply could not ' 'fill any gaps.') parser.add_argument('--gap2seqOpts', dest='gap2seq_opts', default='', help='(advanced) Extra command-line options to pass to Gap2Seq') diff --git a/pipes/Broad_UGER/cluster-submitter.py b/pipes/Broad_UGER/cluster-submitter.py index 82d4e1812..70b56303b 100755 --- a/pipes/Broad_UGER/cluster-submitter.py +++ b/pipes/Broad_UGER/cluster-submitter.py @@ -43,6 +43,8 @@ def hard_blacklist_node(node): if mem: # on UGER, memory requests are per-core (according to BITS as of Sept. 6, 2016) mem_per_core = round((int(mem)*1024)/int(threads), 2) + # increase memory requested to reflect new JVM-UGER changes requiring greater memory headroom + mem_per_core = round(mem_per_core*1.1,2) if blacklisted_nodes: # Pass h= as the hostname parameter; it accepts a regex, so # invert the match to blacklist hostnames diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl index 86f3a5aac..b1f388c39 100644 --- a/pipes/WDL/workflows/demux_plus.wdl +++ b/pipes/WDL/workflows/demux_plus.wdl @@ -30,7 +30,8 @@ workflow demux_plus { input: assembler = "spades", reads_unmapped_bam = deplete.cleaned_bam, - trim_clip_db = trim_clip_db + trim_clip_db = trim_clip_db, + always_succeed = true } } diff --git a/pipes/WDL/workflows/tasks/tasks_assembly.wdl b/pipes/WDL/workflows/tasks/tasks_assembly.wdl index 17080a21f..accf5dcb1 100644 --- a/pipes/WDL/workflows/tasks/tasks_assembly.wdl +++ b/pipes/WDL/workflows/tasks/tasks_assembly.wdl @@ -7,6 +7,7 @@ task assemble { Int? spades_min_contig_len=0 String? assembler="trinity" # trinity, spades, or trinity-spades + Boolean? always_succeed=false String cleaned_assembler = select_first([assembler, ""]) # workaround for https://gatkforums.broadinstitute.org/wdl/discussion/10462/string-type-in-output-section # do this in two steps in case the input doesn't actually have "taxfilt" in the name @@ -26,6 +27,7 @@ task assemble { ${trim_clip_db} \ ${sample_name}.assembly1-trinity.fasta \ ${'--n_reads=' + trinity_n_reads} \ + ${true='--alwaysSucceed' false="" always_succeed} \ --JVMmemory "$mem_in_mb"m \ --outReads=${sample_name}.subsamp.bam \ --loglevel=DEBUG @@ -36,6 +38,7 @@ task assemble { ${trim_clip_db} \ ${sample_name}.assembly1-spades.fasta \ ${'--nReads=' + spades_n_reads} \ + ${true="--alwaysSucceed" false="" always_succeed} \ ${'--minContigLen=' + spades_min_contig_len} \ --memLimitGb $mem_in_gb \ --outReads=${sample_name}.subsamp.bam \ @@ -49,6 +52,7 @@ task assemble { ${'--n_reads=' + trinity_n_reads} \ --JVMmemory "$mem_in_mb"m \ --outReads=${sample_name}.subsamp.bam \ + ${true='--always_succeed' false='' always_succeed} \ --loglevel=DEBUG assembly.py assemble_spades \ ${reads_unmapped_bam} \ @@ -56,6 +60,7 @@ task assemble { ${sample_name}.assembly1-spades.fasta \ --contigsUntrusted=${sample_name}.assembly1-trinity.fasta \ ${'--nReads=' + spades_n_reads} \ + ${true='--alwaysSucceed' false='' always_succeed} \ ${'--minContigLen=' + spades_min_contig_len} \ --memLimitGb $mem_in_gb \ --loglevel=DEBUG diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index 5fddb1e38..a32e0af5e 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -183,6 +183,15 @@ def test_empty_input_succeed(self): assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta) self.assertEqual(os.path.getsize(outFasta), 0) + def test_always_succeed(self): + inDir = util.file.get_test_input_path(self) + inBam = os.path.join(inDir, '..', 'almost-empty.bam') + clipDb = os.path.join(inDir, 'clipDb.fasta') + with util.file.tempfname('.fasta') as outFasta: + assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta, spades_opts='--bad-option', + always_succeed=True) + self.assertEqual(os.path.getsize(outFasta), 0) + class TestTrimRmdupSubsamp(TestCaseWithTmp): ''' Test the trim_rmdup_subsamp command ''' diff --git a/tools/spades.py b/tools/spades.py index f7981e527..dcc9b2329 100644 --- a/tools/spades.py +++ b/tools/spades.py @@ -42,7 +42,7 @@ def execute(self, args): # pylint: disable=W0221 subprocess.check_call(tool_cmd) def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, - contigs_untrusted=None, kmer_sizes=(55,65), mask_errors=False, max_kmer_sizes=1, + contigs_untrusted=None, kmer_sizes=(55,65), always_succeed=False, max_kmer_sizes=1, filter_contigs=False, min_contig_len=0, mem_limit_gb=8, threads=None, spades_opts=''): '''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs. @@ -56,7 +56,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti Params: kmer_sizes: if given, use these kmer sizes and combine the resulting contigs. kmer size of 0 or None means use size auto-selected by SPAdes based on read length. - mask_errors: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size + always_succeed: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size max_kmer_sizes: if this many kmer sizes succeed, do not try further ones filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs) min_contig_len: drop contigs shorter than this many bp @@ -101,12 +101,21 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti try: self.execute(args=args) except Exception as e: - if mask_errors: + if always_succeed: log.warning('SPAdes failed for k={}: {}'.format(kmer_size, e)) util.file.make_empty(transcripts_fname) else: raise + # work around the bug that spades may succeed yet not create the transcripts.fasta file + if not os.path.isfile(transcripts_fname): + msg = 'SPAdes failed to make transcripts.fasta for k={}'.format(kmer_size) + if always_succeed: + log.warning(msg) + util.file.make_empty(transcripts_fname) + else: + raise RuntimeError(msg) + if min_contig_len: transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta') transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]