diff --git a/assembly.py b/assembly.py index 37ca7ea47..1a031c4b5 100755 --- a/assembly.py +++ b/assembly.py @@ -57,7 +57,7 @@ def __init__(self, reason): super(DenovoAssemblyError, self).__init__(reason) -def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): +def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, trim_opts=None): ''' Take reads through Trimmomatic, Prinseq, and subsampling. This should probably move over to read_utils. ''' @@ -87,7 +87,8 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): trimfq[1], clipDb, unpairedOutFastq1=trimfq_unpaired[0], - unpairedOutFastq2=trimfq_unpaired[1] + unpairedOutFastq2=trimfq_unpaired[1], + **(trim_opts or {}) ) n_trim = max(map(util.file.count_fastq_reads, trimfq)) # count is pairs @@ -198,7 +199,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): n_final_individual_reads = samtools.count(outBam) - log.info("Pre-Trinity read filters: ") + log.info("Pre-DeNovoAssembly read filters: ") log.info(" {} read pairs at start ".format(n_input)) log.info( " {} read pairs after Trimmomatic {}".format( @@ -220,7 +221,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): else: log.info(" Paired read count sufficient to reach threshold ({})".format(n_reads)) log.info( - " {} individual reads for trinity ({}{})".format( + " {} individual reads for de novo assembly ({}{})".format( n_output, "paired subsampled {} -> {}".format(n_rmdup_paired, n_paired_subsamp) if not did_include_subsampled_unpaired_reads else "{} read pairs".format(n_rmdup_paired), " + unpaired subsampled {} -> {}".format(n_rmdup_unpaired, n_unpaired_subsamp) @@ -232,7 +233,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): if did_include_subsampled_unpaired_reads: if n_final_individual_reads < n_reads: log.warning( - "NOTE: Even with unpaired reads included, there are fewer unique trimmed reads than requested for Trinity input." + "NOTE: Even with unpaired reads included, there are fewer unique trimmed reads than requested for de novo assembly input." ) # clean up temp files @@ -340,72 +341,70 @@ def parser_assemble_trinity(parser=argparse.ArgumentParser()): __commands__.append(('assemble_trinity', parser_assemble_trinity)) -def gapfill_gap2seq(in_scaffold, in_bam, out_scaffold, gap2seq_opts='', threads=1, mem_limit_gb=4): - ''' This step runs the Gap2Seq tool to close gaps between contigs in a scaffold. - ''' - tools.gap2seq.Gap2SeqTool().gapfill(in_scaffold, in_bam, out_scaffold, gap2seq_opts=gap2seq_opts, threads=threads, - mem_limit_gb=mem_limit_gb) - -def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gaps between contigs in a scaffold')): - parser.add_argument('inScaffold', help='FASTA file containing the scaffold. Each FASTA record corresponds to one ' - 'segment (for multi-segment genomes). Contigs within each segment are separated by Ns.') - parser.add_argument('inBam', help='Input unaligned reads, BAM format.') - parser.add_argument('outScaffold', help='Output assembly.') - parser.add_argument('--threads', default=0, type=int, help='Number of threads (default: %(default)s); 0 means use all available cores') - parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4.0, help='Max memory to use, in gigabytes %(default)s') - parser.add_argument('--timeSoftLimitMinutes', dest='time_soft_limit_minutes', default=60.0, - help='Stop trying to close more gaps after this many minutes (default: %(default)s); this is a soft/advisory limit') - parser.add_argument('--gap2seqOpts', dest='gap2seq_opts', default='', help='(advanced) Extra command-line options to pass to Gap2Seq') - - util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) - util.cmd.attach_main(parser, gapfill_gap2seq, split_args=True) - return parser - - -__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq)) - - def assemble_spades( - inBam, - outFasta, + in_bam, + clip_db, + out_fasta, spades_opts='', - previously_assembled_contigs=None, - mem_limit_gb=4, + contigs_trusted=None, contigs_untrusted=None, + filter_contigs=False, + kmer_sizes=(55,65), + n_reads=10000000, + mask_errors=False, + max_kmer_sizes=1, + mem_limit_gb=8, threads=0, ): - ''' De novo RNA-seq assembly with the SPAdes assembler. + '''De novo RNA-seq assembly with the SPAdes assembler. Inputs: - inBam - reads to assemble. May include both paired and unpaired reads. - previously_assembled_contigs - (optional) already-assembled contigs from the same sample. + inBam: reads to assemble. May include both paired and unpaired reads. + clip_db: Trimmomatic clip DB + trusted_contigs, untrusted_contigs: (optional) already-assembled contigs from the same sample. trusted contigs must be + high-quality, untrusted_contigs may have some errors. Outputs: - outFasta - the assembled contigs. Note that, since this is RNA-seq assembly, for each assembled genomic region there may be + outFasta: the assembled contigs. Note that, since this is RNA-seq assembly, for each assembled genomic region there may be several contigs representing different variants of that region. Params: - mem_limit_gb - max memory to use - threads - number of threads to use (0 means use all available CPUs) - spades_opts - (advanced) custom command-line options to pass to the SPAdes assembler - + n_reads: before assembly, subsample the reads to at most this many + filter_contigs: drop lesser-quality contigs from output + mem_limit_gb: max memory to use + threads: number of threads to use (0 means use all available CPUs) + spades_opts: (advanced) custom command-line options to pass to the SPAdes assembler ''' - with tools.picard.SamToFastqTool().execute_tmp(inBam, includeUnpaired=True, - JVMmemory=str(mem_limit_gb)+'g') as (reads_fwd, reads_bwd, reads_unpaired): - try: - tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired, - contigs_untrusted=previously_assembled_contigs, - contigs_out=outFasta, spades_opts=spades_opts, mem_limit_gb=mem_limit_gb, - threads=threads) - except subprocess.CalledProcessError as e: - raise DenovoAssemblyError('SPAdes assembler failed: ' + str(e)) + with util.file.tempfname(suffix='.bam', prefix='trim_rmdup_for_spades') as trim_rmdup_bam: + trim_rmdup_subsamp_reads(in_bam, clip_db, trim_rmdup_bam, n_reads=n_reads, + trim_opts=dict(maxinfo_target_length=35, maxinfo_strictness=.2)) + + with tools.picard.SamToFastqTool().execute_tmp(trim_rmdup_bam, includeUnpaired=True, illuminaClipping=True, + JVMmemory=str(mem_limit_gb)+'g') as (reads_fwd, reads_bwd, reads_unpaired): + try: + tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired, + contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted, + contigs_out=out_fasta, filter_contigs=filter_contigs, + kmer_sizes=kmer_sizes, mask_errors=mask_errors, max_kmer_sizes=max_kmer_sizes, + spades_opts=spades_opts, mem_limit_gb=mem_limit_gb, + threads=threads) + except subprocess.CalledProcessError as e: + raise DenovoAssemblyError('SPAdes assembler failed: ' + str(e)) def parser_assemble_spades(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', help='Input unaligned reads, BAM format.') - parser.add_argument('outFasta', help='Output assembly.') - parser.add_argument('--previously_assembled_contigs', help='Contigs previously assembled from the same sample') - parser.add_argument('--spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler') - parser.add_argument('--mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)') + parser.add_argument('in_bam', help='Input unaligned reads, BAM format.') + parser.add_argument('clip_db', help='Trimmomatic clip db') + parser.add_argument('out_fasta', help='Output assembly.') + parser.add_argument('--contigsTrusted', dest='contigs_trusted', + help='Contigs of high quality previously assembled from the same sample') + parser.add_argument('--contigsUntrusted', dest='contigs_untrusted', + help='Contigs of high medium quality previously assembled from the same sample') + parser.add_argument('--nReads', dest='n_reads', type=int, default=10000000, + help='Before assembly, subsample the reads to at most this many') + parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true', + help='only output contigs SPAdes is sure of') + parser.add_argument('--spadesOpts', dest='spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler') + parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)') parser.add_argument('--threads', default=0, type=int, help='Number of threads, or 0 to use all CPUs (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, assemble_spades, split_args=True) @@ -415,6 +414,43 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()): __commands__.append(('assemble_spades', parser_assemble_spades)) +def gapfill_gap2seq(in_scaffold, in_bam, out_scaffold, threads=1, mem_limit_gb=4, time_soft_limit_minutes=60.0, + random_seed=0, gap2seq_opts='', mask_errors=False): + ''' This step runs the Gap2Seq tool to close gaps between contigs in a scaffold. + ''' + try: + tools.gap2seq.Gap2SeqTool().gapfill(in_scaffold, in_bam, out_scaffold, gap2seq_opts=gap2seq_opts, threads=threads, + mem_limit_gb=mem_limit_gb, time_soft_limit_minutes=time_soft_limit_minutes, + random_seed=random_seed) + except Exception as e: + if not mask_errors: + raise + log.warning('Gap-filling failed (%s); ignoring error, emulating successful run where simply no gaps were filled.') + shutil.copyfile(in_scaffold, out_scaffold) + +def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gaps between contigs in a scaffold')): + parser.add_argument('in_scaffold', help='FASTA file containing the scaffold. Each FASTA record corresponds to one ' + 'segment (for multi-segment genomes). Contigs within each segment are separated by Ns.') + parser.add_argument('in_bam', help='Input unaligned reads, BAM format.') + parser.add_argument('out_scaffold', help='Output assembly.') + parser.add_argument('--threads', default=0, type=int, help='Number of threads (default: %(default)s); 0 means use all available cores') + parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4.0, help='Max memory to use, in gigabytes %(default)s') + parser.add_argument('--timeSoftLimitMinutes', dest='time_soft_limit_minutes', default=60.0, + help='Stop trying to close more gaps after this many minutes (default: %(default)s); this is a soft/advisory limit') + parser.add_argument('--maskErrors', dest='mask_errors', default=False, action='store_true', + help='In case of any error, just copy in_scaffold to out_scaffold, emulating a successful run that simply could not ' + 'fill any gaps.') + parser.add_argument('--gap2seqOpts', dest='gap2seq_opts', default='', help='(advanced) Extra command-line options to pass to Gap2Seq') + parser.add_argument('--randomSeed', dest='random_seed', default=0, help='Random seed; 0 means use current time') + + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) + util.cmd.attach_main(parser, gapfill_gap2seq, split_args=True) + return parser + + +__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq)) + + def order_and_orient(inFasta, inReference, outFasta, outAlternateContigs=None, breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None, diff --git a/pipes/config.yaml b/pipes/config.yaml index 3b6bf692c..61fe82718 100644 --- a/pipes/config.yaml +++ b/pipes/config.yaml @@ -140,6 +140,10 @@ samples_per_run: "samples-runs.txt" # to insufficient input data, consider expanding the list of accessions given to lastal. accessions_file_for_lastal_db_build: "" +# De novo assembler to use. Current options are "trinity", "spades" and "trinity-spades" +# (the "trinity-spades" option gives contigs produced by Trinity as auxiliary input to SPAdes). +assembly_assembler: "trinity" + # The minimum length an assembled sequence must have # to be considered acceptible quality, specified as # a fraction of the length of the reference sequene. @@ -192,6 +196,10 @@ mafft_maxiters: 1000 # See: https://github.com/trinityrnaseq/trinityrnaseq/wiki/Trinity-FAQ#ques_why_so_many_transcripts trinity_n_reads: 250000 +# The number of reads to be subsampled from input bam files +# and used as input to assembly via SPAdes +spades_n_reads: 10000000 + # Minimum number of reads on each strand vphaser_min_reads_each: 5 @@ -205,6 +213,9 @@ vphaser_max_bins: 10 # allele freq > 0.005. If false, no filtering is performed. vcf_merge_naive_filter: false +# Random seed, for non-deterministic steps. 0 means use current time. +random_seed: 0 + # |----------------------- Data storage locations --------------------------- # The parent directory containing data sub-directories. diff --git a/pipes/rules/assembly.rules b/pipes/rules/assembly.rules index 159045317..5dbf30b02 100644 --- a/pipes/rules/assembly.rules +++ b/pipes/rules/assembly.rules @@ -40,7 +40,8 @@ rule assemble_trinity: bam = config["data_dir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.taxfilt.bam', clipDb = objectify_remote(config["trim_clip_db"]) output: - fasta = config["tmp_dir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta' + fasta = config["tmp_dir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta', + subsamp_bam = config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam' resources: mem = 7, threads = int(config.get("number_of_threads", 1)) @@ -49,13 +50,43 @@ rule assemble_trinity: UGER = config.get('UGER_queues', {}).get('short', '-l h_rt 04:00:00'), n_reads = str(config["trinity_n_reads"]), logid = "{sample}", - subsamp_bam = config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam' run: makedirs(expand("{dir}/{subdir}", dir=[config["data_dir"],config["tmp_dir"]], subdir=config["subdirs"]["assembly"])) - shell("{config[bin_dir]}/assembly.py assemble_trinity {input.bam} {input.clipDb} {output.fasta} --n_reads={params.n_reads} --outReads {params.subsamp_bam} --threads {resources.threads} --JVMmemory 5g") + shell("{config[bin_dir]}/assembly.py assemble_trinity {input.bam} {input.clipDb} {output.fasta} --n_reads={params.n_reads} --outReads {output.subsamp_bam} --threads {resources.threads} --JVMmemory 5g") + +rule assemble_spades: + ''' This step runs the SPAdes assembler. + ''' + input: taxfilt_reads=config["data_dir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.taxfilt.bam', + clipDb = objectify_remote(config["trim_clip_db"]) + output: contigs_spades=config["tmp_dir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-spades.fasta' + resources: + mem=12, + threads=int(config.get("number_of_threads", 1)) + params: n_reads=str(config["spades_n_reads"]), + logid="{sample}" + run: + shell("{config[bin_dir]}/assembly.py assemble_spades {input.taxfilt_reads} {input.clipDb} {output.contigs_spades} --nReads {params.n_reads} --threads {resources.threads} --memLimitGb {resources.mem}") + + +rule assemble_trinity_spades: + ''' This step runs the Trinity assembler then the SPAdes assembler. + ''' + input: taxfilt_reads=config["data_dir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.taxfilt.bam', + clipDb = objectify_remote(config["trim_clip_db"]), + contigs_trinity=config["tmp_dir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta' + output: contigs_trinity_spades=config["tmp_dir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity-spades.fasta' + resources: + mem=12, + threads=int(config.get("number_of_threads", 1)) + params: n_reads=str(config["spades_n_reads"]), + logid="{sample}" + run: + shell("{config[bin_dir]}/assembly.py assemble_spades {input.taxfilt_reads} {input.clipDb} {output.contigs_trinity_spades} --contigsUntrusted {input.contigs_trinity} --nReads {params.n_reads} --threads {resources.threads} --memLimitGb {resources.mem}") + rule orient_and_impute: ''' This step cleans up the Trinity assembly with a known reference genome. order_and_orient (MUMmer): take the de novo assembly, @@ -77,8 +108,9 @@ rule orient_and_impute: which will be captured in this output. ''' input: - fasta = config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta', - ref_genome = objectify_remote(expand( '{refDir}/'+'{ref_name}.fasta', refDir=config["ref_genome_dir"], ref_name="reference" )) + fasta = config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-'+config["assembly_assembler"]+'.fasta', + ref_genome = objectify_remote(expand( '{refDir}/'+'{ref_name}.fasta', refDir=config["ref_genome_dir"], ref_name="reference" )), + cleaned_reads = config["data_dir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam' output: fasta = config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly3-modify.fasta' resources: @@ -94,6 +126,7 @@ rule orient_and_impute: replace_length = str(config.get("assembly_replace_length", 55)), logid = "{sample}", scaffolded_fasta = config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly2-scaffolded.fasta', + gapfilled_fasta=config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly2-gapfilled.fasta', alternate_fasta = config["tmp_dir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly2-alternate_sequences.fasta' run: ono_extra = [] @@ -107,7 +140,8 @@ rule orient_and_impute: ono_extra.extend(["--min_pct_contig_aligned", str(config["scaffold_min_pct_contig_aligned"])]) ono_extra = " ".join(ono_extra) shell("{config[bin_dir]}/assembly.py order_and_orient {input.fasta} {input.ref_genome} {params.scaffolded_fasta} {ono_extra} --outAlternateContigs {params.alternate_fasta}") - shell("{config[bin_dir]}/assembly.py impute_from_reference {params.scaffolded_fasta} {input.ref_genome} {output.fasta} --newName {params.renamed_prefix}{wildcards.sample} --replaceLength {params.replace_length} --minLengthFraction {params.length} --minUnambig {params.min_unambig} --index") + shell("{config[bin_dir]}/assembly.py gapfill_gap2seq {params.scaffolded_fasta} {input.cleaned_reads} {params.gapfilled_fasta} --memLimitGb {resources[mem]} --maskErrors --randomSeed {config[random_seed]}") + shell("{config[bin_dir]}/assembly.py impute_from_reference {params.gapfilled_fasta} {input.ref_genome} {output.fasta} --newName {params.renamed_prefix}{wildcards.sample} --replaceLength {params.replace_length} --minLengthFraction {params.length} --minUnambig {params.min_unambig} --index") rule refine_assembly_1: ''' This a first pass refinement step where we take the scaffolded assembly, diff --git a/test/input/TestAssembleSpades/G3952.1.subsamp.bam b/test/input/TestAssembleSpades/G3952.1.subsamp.bam deleted file mode 100644 index 788c0f2d2..000000000 Binary files a/test/input/TestAssembleSpades/G3952.1.subsamp.bam and /dev/null differ diff --git a/test/input/TestAssembleSpades/clipDb.fasta b/test/input/TestAssembleSpades/clipDb.fasta deleted file mode 100644 index aeaabf18f..000000000 --- a/test/input/TestAssembleSpades/clipDb.fasta +++ /dev/null @@ -1,460 +0,0 @@ ->NuGEN -TCTATAGTTTAGGTAACTTTGTGTTTGA ->TruSeq3_IndexedAdapter -AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC ->TruSeq3_UniversalAdapter -AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA ->PrefixPE/1 -TACACTCTTTCCCTACACGACGCTCTTCCGATCT ->PrefixPE/2 -GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT ->PE1_rc -AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA ->PrefixNX/1 -AGATGTGTATAAGAGACAG ->PrefixNX/2 -AGATGTGTATAAGAGACAG ->Trans1 -TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG ->Trans1_rc -CTGTCTCTTATACACATCTGACGCTGCCGACGA ->Trans2 -GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG ->Trans2_rc -CTGTCTCTTATACACATCTCCGAGCCCACGAGAC ->TruSeq2_SE -AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG ->TruSeq2_PE_f -AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ->TruSeq2_PE_r -AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG ->PCR_Primer1_rc -AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT ->PCR_Primer2 -CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT ->PCR_Primer2_rc -AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG ->FlowCell1 -TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC ->FlowCell2 -TTTTTTTTTTCAAGCAGAAGACGGCATACGA ->TruSeq Universal Adapter -AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT ->TruSeq Adapter, Index 1 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 2 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 3 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 4 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 5 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 6 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 7 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 8 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 9 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 10 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 11 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG ->TruSeq Adapter, Index 12 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG ->Illumina Single End Adapter 1 -ACACTCTTTCCCTACACGACGCTGTTCCATCT ->Illumina Single End Adapter 2 -CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT ->Illumina Paired End Adapter 2 -CTCGGCATTCCTGCTGAACCGCTCTTCCGATCT ->Illumina Paried End Sequencing Primer 1 -ACACTCTTTCCCTACACGACGCTCTTCCGATCT ->Illumina Paired End Sequencing Primer 2 -CGGTCTCGGCATTCCTACTGAACCGCTCTTCCGATCT ->Illumina Multiplexing Adapter 1 -GATCGGAAGAGCACACGTCT ->Illumina Multiplexing Index Sequencing Primer -GATCGGAAGAGCACACGTCTGAACTCCAGTCAC ->Illumina Multiplexing Read2 Sequencing Primer -GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT ->Illumina PCR Primer Index 1 -CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC ->Illumina PCR Primer Index 2 -CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC ->Illumina PCR Primer Index 3 -CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC ->Illumina PCR Primer Index 4 -CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC ->Illumina PCR Primer Index 5 -CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC ->Illumina PCR Primer Index 6 -CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC ->Illumina PCR Primer Index 7 -CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC ->Illumina PCR Primer Index 8 -CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC ->Illumina PCR Primer Index 9 -CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC ->Illumina PCR Primer Index 10 -CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC ->Illumina PCR Primer Index 11 -CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC ->Illumina PCR Primer Index 12 -CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC ->RNA PCR Primer, Index 1 -CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 2 -CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 3 -CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 4 -CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 5 -CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 6 -CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 7 -CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 8 -CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 9 -CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 10 -CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 11 -CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 12 -CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 13 -CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 14 -CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 15 -CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 16 -CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 17 -CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 18 -CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 19 -CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 20 -CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 21 -CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 22 -CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 23 -CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 24 -CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 25 -CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 26 -CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 27 -CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 28 -CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 29 -CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 30 -CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 31 -CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 32 -CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 33 -CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 34 -CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 35 -CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 36 -CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 37 -CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 38 -CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 39 -CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 40 -CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 41 -CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 42 -CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 43 -CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 44 -CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 45 -CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 46 -CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 47 -CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->RNA PCR Primer, Index 48 -CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->Custom add, Illumina Paired End Adapter 2 -GATCGGAAGAGCGGTTCAGCAGGAATGC ->Nextera S501 -AATGATACGGCGACCACCGAGATCTACACTAGATCGCTCGTCGGCAGCGTC ->Nextera S502 -AATGATACGGCGACCACCGAGATCTACACCTCTCTATTCGTCGGCAGCGTC ->Nextera S503 -AATGATACGGCGACCACCGAGATCTACACTATCCTCTTCGTCGGCAGCGTC ->Nextera S504 -AATGATACGGCGACCACCGAGATCTACACAGAGTAGATCGTCGGCAGCGTC ->Nextera S505 -AATGATACGGCGACCACCGAGATCTACACGTAAGGAGTCGTCGGCAGCGTC ->Nextera S506 -AATGATACGGCGACCACCGAGATCTACACACTGCATATCGTCGGCAGCGTC ->Nextera S507 -AATGATACGGCGACCACCGAGATCTACACAAGGAGTATCGTCGGCAGCGTC ->Nextera S508 -AATGATACGGCGACCACCGAGATCTACACCTAAGCCTTCGTCGGCAGCGTC ->Nextera N711fw -CAAGCAGAAGACGGCATACGAGATAAGAGGCAGTCTCGTGGGCTCGG ->Nextera N703fw -CAAGCAGAAGACGGCATACGAGATAGGCAGAAGTCTCGTGGGCTCGG ->Nextera N708fw -CAAGCAGAAGACGGCATACGAGATCAGAGAGGGTCTCGTGGGCTCGG ->Nextera N710fw -CAAGCAGAAGACGGCATACGAGATCGAGGCTGGTCTCGTGGGCTCGG ->Nextera N702fw -CAAGCAGAAGACGGCATACGAGATCGTACTAGGTCTCGTGGGCTCGG ->Nextera N707fw -CAAGCAGAAGACGGCATACGAGATCTCTCTACGTCTCGTGGGCTCGG ->Nextera N709fw -CAAGCAGAAGACGGCATACGAGATGCTACGCTGTCTCGTGGGCTCGG ->Nextera N705fw -CAAGCAGAAGACGGCATACGAGATGGACTCCTGTCTCGTGGGCTCGG ->Nextera N712fw -CAAGCAGAAGACGGCATACGAGATGTAGAGGAGTCTCGTGGGCTCGG ->Nextera N701fw -CAAGCAGAAGACGGCATACGAGATTAAGGCGAGTCTCGTGGGCTCGG ->Nextera N706fw -CAAGCAGAAGACGGCATACGAGATTAGGCATGGTCTCGTGGGCTCGG ->Nextera N704fw -CAAGCAGAAGACGGCATACGAGATTCCTGAGCGTCTCGTGGGCTCGG ->Nextera N701 -CAAGCAGAAGACGGCATACGAGATTCGCCTTAGTCTCGTGGGCTCGG ->Nextera N702 -CAAGCAGAAGACGGCATACGAGATCTAGTACGGTCTCGTGGGCTCGG ->Nextera N703 -CAAGCAGAAGACGGCATACGAGATTTCTGCCTGTCTCGTGGGCTCGG ->Nextera N704 -CAAGCAGAAGACGGCATACGAGATGCTCAGGAGTCTCGTGGGCTCGG ->Nextera N705 -CAAGCAGAAGACGGCATACGAGATAGGAGTCCGTCTCGTGGGCTCGG ->Nextera N706 -CAAGCAGAAGACGGCATACGAGATCATGCCTAGTCTCGTGGGCTCGG ->Nextera N707 -CAAGCAGAAGACGGCATACGAGATGTAGAGAGGTCTCGTGGGCTCGG ->Nextera N708 -CAAGCAGAAGACGGCATACGAGATCCTCTCTGGTCTCGTGGGCTCGG ->Nextera N709 -CAAGCAGAAGACGGCATACGAGATAGCGTAGCGTCTCGTGGGCTCGG ->Nextera N710 -CAAGCAGAAGACGGCATACGAGATCAGCCTCGGTCTCGTGGGCTCGG ->Nextera N711 -CAAGCAGAAGACGGCATACGAGATTGCCTCTTGTCTCGTGGGCTCGG ->Nextera N712 -CAAGCAGAAGACGGCATACGAGATTCCTCTACGTCTCGTGGGCTCGG ->Broad Barcode tagged_57 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAAGTAGAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_100 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACACGATCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_473 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGCGCGGTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_373 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCATGATCGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_498 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTTACCAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_861 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCCTTGGTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_23 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAACGCATTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_109 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGGTATATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_218 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGGTAAGGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_3 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAACAATGGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_163 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGTATCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_220 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGGTCGCAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_375 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCATGCTTAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_726 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGTCCAGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_880 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCTGGCGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_214 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGGATCTAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_754 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCTGATGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_223 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGGTTATCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_309 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAACTCTCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_379 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCAACATTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_500 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTAACTCGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_291 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTCTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_504 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTACCAGGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_534 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTGCGGATATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_630 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCACATCTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_741 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTATAACAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_367 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCATAGCGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_579 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGACAGTAAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_938 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTACGCACATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_745 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCATCTAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_542 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTGTAATCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_655 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCGTCGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_732 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTAACATCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_567 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAAGGAAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_584 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGACCTAACATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_117 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACCAACTGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_908 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGCTCGACATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_954 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCGCTGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_357 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGGAGCCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_426 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCTATGCCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_959 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTGAATAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_438 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCTTCGCAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_778 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAAGCACAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_821 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTATCTGCCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_924 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGTAATCAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_868 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCGCTAGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_899 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGCAAGTAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_934 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAATCAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_190 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGCAATTCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_34 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAACTTGACATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_927 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGTCGGATATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_866 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCGCCTTGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_38 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAAGACACTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_875 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCTCGGTCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_78 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAATGTTCTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_151 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTAAGACATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_288 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTATCAAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_110 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTTGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_195 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGCATGGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_222 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGGTGCGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_236 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTGCTTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_332 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCACATCCTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_289 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTATGTTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_250 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACATAGCGTCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_352 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGCAAGGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_298 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTGTCTGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_355 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGCGGTAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_469 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGCCTTCCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_509 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTATGCGTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_320 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAATAGTCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_474 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGCTATGTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_544 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTGTGGCGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_393 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCAGTTAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_869 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCGGAATGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_422 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCTACCATATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_564 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAAGAAGTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_851 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCCAGCAAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_559 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAACCTAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_581 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGACCAGGAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_657 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCTAGCCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_747 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCACAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_583 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGACCGTTGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_616 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATATCCAATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_652 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCGCAACATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_960 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTGAGCCTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_52 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACAAGGATGTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_800 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACTTAGCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_293 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCTAGGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_388 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCAGAGCTATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_786 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAATGAACATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_818 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTATCCAGGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_910 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGCTGCTGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_968 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTGTCTATATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_878 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCTGCAAGATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_923 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGTAACTCATCTCGTATGCCGTCTTCTGCTTG ->Broad Barcode tagged_944 -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTATATCTATCTCGTATGCCGTCTTCTGCTTG \ No newline at end of file diff --git a/test/input/TestAssembleSpades/clipDb.fasta b/test/input/TestAssembleSpades/clipDb.fasta new file mode 120000 index 000000000..75f1e7ce2 --- /dev/null +++ b/test/input/TestAssembleSpades/clipDb.fasta @@ -0,0 +1 @@ +../TestAssembleTrinity/clipDb.fasta \ No newline at end of file diff --git a/test/input/TestTrimmomatic/expected1.maxinfo.fastq b/test/input/TestTrimmomatic/expected1.maxinfo.fastq new file mode 100644 index 000000000..5d3d22a2a --- /dev/null +++ b/test/input/TestTrimmomatic/expected1.maxinfo.fastq @@ -0,0 +1,8 @@ +@lowQuality-trim +AGTACATGCAGAGCAAGGACTGATACAATATCCAACAGCTTGGCAATCAGTAGGACACATGATGGTGA ++ +CCCFFFFFHHHHHJJJJJJJJJJJJJJJHIIIIJJJJHIJIIJJIJJFHGIIJJGHHHBDFDDDDDD9 +@barcodeAtEnd-trim +CGGGCTTTCACCATGTTGGCCAGGCTGGTCTCGAACTCCTGACTTCGTGATCTGCCCGCCTCGGCCTC ++ +CCCFFFFFHHHHHJJJJJJJJJJJJJJJHIIIIJJJJHIJIIJJIJJFHGIIJJGHHHBDFDDDDDDD diff --git a/test/input/TestTrimmomatic/expected2.maxinfo.fastq b/test/input/TestTrimmomatic/expected2.maxinfo.fastq new file mode 100644 index 000000000..a583c6822 --- /dev/null +++ b/test/input/TestTrimmomatic/expected2.maxinfo.fastq @@ -0,0 +1,8 @@ +@test1 +AGTACATGCAGAGCAAGGACTGATACAATATCCAACAGCTTGGCAATCAGTAGGACACATGATGGTGA ++ +CCCFFFFFHHHHHJJJJJJJJJJJJJJJHIIIIJJJJHIJIIJJIJJFHGIIJJGHHHBDFDDDDDD9 +@test3 +CGGGCTTTCACCATGTTGGCCAGGCTGGTCTCGAACTCCTGACTTCGTGATCTGCCCGCCTCGGCCTC ++ +CCCFFFFFHHHHHJJJJJJJJJJJJJJJHIIIIJJJJHIJIIJJIJJFHGIIJJGHHHBDFDDDDDDD diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index 23f4c82cd..f358a78d6 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -152,29 +152,33 @@ class TestAssembleSpades(TestCaseWithTmp): def test_assembly(self): inDir = util.file.get_test_input_path(self) inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam') + clipDb = os.path.join(inDir, 'clipDb.fasta') with util.file.tempfname('.fasta') as outFasta: - assembly.assemble_spades(inBam=inBam, outFasta=outFasta, threads=4) + assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta, threads=1) self.assertGreater(os.path.getsize(outFasta), 0) contig_lens = list(sorted(len(seq.seq) for seq in Bio.SeqIO.parse(outFasta, 'fasta'))) - self.assertEqual(contig_lens, [168, 177, 180, 183, 184, 187, 190, 191, 195, 197, 201, 211, 243, 244, 247, 294, 319, 328, 348]) + print('test_assembly_contigs_lens:', contig_lens) + self.assertEqual(contig_lens, [168, 170, 177, 180, 184, 187, 190, 191, 195, 197, 211, 243, 244, 247, 328, 348, 430]) - @pytest.mark.skip(reason="takes too long") def test_assembly_with_previously_assembled_contigs(self): inDir = util.file.get_test_input_path(self) inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam') + clipDb = os.path.join(inDir, 'clipDb.fasta') previously_assembled_contigs = os.path.join(inDir, 'trinity_contigs.fasta') with util.file.tempfname('.fasta') as outFasta: - assembly.assemble_spades(inBam=inBam, previously_assembled_contigs=previously_assembled_contigs, - outFasta=outFasta, threads=1, mem_limit_gb=1) + assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, contigs_untrusted=previously_assembled_contigs, + out_fasta=outFasta, threads=1, mem_limit_gb=1) self.assertGreater(os.path.getsize(outFasta), 0) contig_lens = list(sorted(len(seq.seq) for seq in Bio.SeqIO.parse(outFasta, 'fasta'))) - self.assertEqual(contig_lens, [111, 140, 184, 211, 243, 244, 294, 321, 328, 348, 430]) + print('test_assembly_with_previously_assembled_contigs_contigs_lens:', contig_lens) + self.assertEqual(contig_lens, [168, 170, 177, 180, 184, 187, 190, 191, 195, 197, 211, 243, 244, 321, 328, 348, 430]) def test_empty_input_succeed(self): inDir = util.file.get_test_input_path() inBam = os.path.join(inDir, 'empty.bam') + clipDb = os.path.join(inDir, 'clipDb.fasta') with util.file.tempfname('fasta') as outFasta: - assembly.assemble_spades(inBam=inBam, outFasta=outFasta, threads=4) + assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta) self.assertEqual(os.path.getsize(outFasta), 0) class TestTrimRmdupSubsamp(TestCaseWithTmp): @@ -353,7 +357,7 @@ def test_gapfill(self): in_scaffold = join(inDir, 'TestOrderAndOrient', 'expected.ebov.doublehit.fasta') with util.file.tempfname(suffix='.filled.fasta') as filled: assembly.gapfill_gap2seq(in_scaffold=in_scaffold, - in_bam=join(inDir, 'G5012.3.testreads.bam'), out_scaffold=filled) + in_bam=join(inDir, 'G5012.3.testreads.bam'), out_scaffold=filled, threads=1, random_seed=23923937) self.assertEqualContents(filled, join(inDir, 'TestGap2Seq', 'expected.ebov.doublehit.gapfill.fasta')) diff --git a/test/unit/test_tools_trimmomatic.py b/test/unit/test_tools_trimmomatic.py index b1d75a7a4..0eed3a98b 100644 --- a/test/unit/test_tools_trimmomatic.py +++ b/test/unit/test_tools_trimmomatic.py @@ -26,6 +26,23 @@ def test_trimmomatic_paired(self): assert_equal_contents(self, pairedOutFastq1, expected1Fastq) assert_equal_contents(self, pairedOutFastq2, expected2Fastq) + + def test_trimmomatic_paired_maxinfo(self): + myInputDir = util.file.get_test_input_path(self) + inFastq1 = os.path.join(myInputDir, 'in1.fastq') + inFastq2 = os.path.join(myInputDir, 'in2.fastq') + clipFasta = os.path.join(myInputDir, 'clip.fasta') + with util.file.tempfnames(('.out1.fastq', '.out2.fastq')) as (pairedOutFastq1, pairedOutFastq2): + tools.trimmomatic.TrimmomaticTool().execute(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta, + maxinfo_target_length=30, maxinfo_strictness=.3) + + # Check that results match expected + expected1Fastq = os.path.join(myInputDir, 'expected1.maxinfo.fastq') + expected2Fastq = os.path.join(myInputDir, 'expected2.maxinfo.fastq') + assert_equal_contents(self, pairedOutFastq1, expected1Fastq) + assert_equal_contents(self, pairedOutFastq2, expected2Fastq) + + def test_trimmomatic_single(self): myInputDir = util.file.get_test_input_path(self) inFastq1 = os.path.join(myInputDir, 'in1.fastq') diff --git a/tools/gap2seq.py b/tools/gap2seq.py index 745a2d9dd..ac3e393b4 100644 --- a/tools/gap2seq.py +++ b/tools/gap2seq.py @@ -56,8 +56,8 @@ def _run_gap2seq(self, reads, scaffolds, filled, *args, **kwargs): with util.file.pushd_popd(gap2seq_run_dir): self.execute(file_args+args+more_args) - def gapfill(self, in_scaffold, in_bam, out_scaffold, solid_kmer_thresholds=(3,2), kmer_sizes=(90, 80, 70, 60, 50, 40, 31), - min_gap_to_close=4, gap2seq_opts='', mem_limit_gb=4.0, threads=0, time_soft_limit_minutes=60.0): + def gapfill(self, in_scaffold, in_bam, out_scaffold, solid_kmer_thresholds=(3,), kmer_sizes=(90, 80, 70, 60, 50, 40, 31), + min_gap_to_close=4, gap2seq_opts='', mem_limit_gb=4.0, threads=0, time_soft_limit_minutes=60.0, random_seed=0): """Try to fill the gaps in the given scaffold, using the reads. Inputs: @@ -80,6 +80,7 @@ def gapfill(self, in_scaffold, in_bam, out_scaffold, solid_kmer_thresholds=(3,2) mem_limit_gb: max memory to use, in gigabytes threads: number of threads to use; 0 or None means use all available cores. time_soft_limit_minutes: stop trying to close more gaps after this many minutes (currently this is a soft/advisory limit) + random_seed: random seed for choosing random paths (0 to use current time) """ solid_kmer_thresholds = sorted(util.misc.make_seq(solid_kmer_thresholds), reverse=True) @@ -93,18 +94,18 @@ def gapfill(self, in_scaffold, in_bam, out_scaffold, solid_kmer_thresholds=(3,2) prev_scaffold = in_scaffold for solid_kmer_threshold, kmer_size in itertools.product(solid_kmer_thresholds, kmer_sizes): - filled_scaffold = os.path.join(gap2seq_dir, 'gap2seq-filled.s{}.k{}.fasta'.format(solid_kmer_threshold, kmer_size)) - self._run_gap2seq(reads, prev_scaffold, filled_scaffold, - *(['-all-upper', '-verbose']+shlex.split(gap2seq_opts)), - solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb) - - prev_scaffold = filled_scaffold - - if not any('N'*min_gap_to_close in str(rec.seq) for rec in Bio.SeqIO.parse(filled_scaffold, 'fasta')): + if not any('N'*min_gap_to_close in str(rec.seq) for rec in Bio.SeqIO.parse(prev_scaffold, 'fasta')): log.info('no gaps left, quittting gap2seq early') break if time.time() > stop_time: log.info('Time limit for gap closing reached') break + filled_scaffold = os.path.join(gap2seq_dir, 'gap2seq-filled.s{}.k{}.fasta'.format(solid_kmer_threshold, kmer_size)) + self._run_gap2seq(reads, prev_scaffold, filled_scaffold, + *(['-all-upper', '-verbose']+shlex.split(gap2seq_opts)), + solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb, randseed=random_seed) + + prev_scaffold = filled_scaffold + shutil.copyfile(src=prev_scaffold, dst=out_scaffold) diff --git a/tools/spades.py b/tools/spades.py index f9b2e3211..09251fbb5 100644 --- a/tools/spades.py +++ b/tools/spades.py @@ -1,6 +1,5 @@ ''' - SPAdes - St. Petersburg Assembler - + Tool wrapper for SPAdes, St. Petersburg Assembler ( http://cab.spbu.ru/software/spades/ ) ''' import logging @@ -36,56 +35,76 @@ def version(self): return TOOL_VERSION def execute(self, args): # pylint: disable=W0221 - tool_cmd = [self.install_and_get_path()] + args + tool_cmd = [self.install_and_get_path()] + list(map(str, args)) log.debug(' '.join(tool_cmd)) subprocess.check_call(tool_cmd) def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, - contigs_untrusted=None, mem_limit_gb=4, threads=0, spades_opts=''): - '''Assemble contigs from reads and (optionally) pre-existing contigs. + contigs_untrusted=None, kmer_sizes=(55,65), mask_errors=False, max_kmer_sizes=1, + filter_contigs=False, mem_limit_gb=8, threads=0, spades_opts=''): + '''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs. Inputs: - reads_fwd, reads_bwd - paired reads in fasta format - reads_unpaired - optionally, additional unpaired reads in fasta format - contigs_trusted - optionally, already-assembled contigs of high quality - contigs_untrusted - optionally, already-assembled contigs of average quality + Required: + reads_fwd, reads_bwd (fasta/q): paired reads + Optional: + reads_unpaired (fasta/q): optionally, additional unpaired reads + contigs_trusted (fasta/q): optionally, already-assembled contigs of high quality + contigs_untrusted (fasta/q): optionally, already-assembled contigs of average quality Params: - mem_limit_gb - max memory to use, in gigabytes - threads - number of threads to use (0 means use all available CPUs) - spades_opts - additional options to pass to spades + kmer_sizes: if given, use these kmer sizes and combine the resulting contigs. kmer size of 0 or None means use size auto-selected + by SPAdes based on read length. + mask_errors: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size + max_kmer_sizes: if this many kmer sizes succeed, do not try further ones + filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs) + mem_limit_gb: max memory to use, in gigabytes + threads: number of threads to use (0 means use all available CPUs) + spades_opts: additional options to pass to spades Outputs: - contigs_out - assembled contigs in fasta format. Note that, since we use the + contigs_out: assembled contigs in fasta format. Note that, since we use the RNA-seq assembly mode, for some genome regions we may get several contigs representing alternative transcripts. Fasta record name of each contig indicates its length, coverage, and the group of alternative transcripts to which it belongs. See details at - http://cab.spbu.ru/files/release3.10.1/rnaspades_manual.html#sec2.2 . + http://cab.spbu.ru/files/release3.11.1/rnaspades_manual.html#sec2.4 . ''' if not threads: threads = util.misc.available_cpu_count() - if (reads_fwd and reads_bwd - and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0 - or reads_unpaired and os.path.getsize(reads_unpaired) > 0): - - with util.file.tmp_dir('_spades') as spades_dir: - log.debug('spades_dir=' + spades_dir) - args = [] - if reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0: - args += ['-1', reads_fwd, '-2', reads_bwd ] - if reads_unpaired and os.path.getsize(reads_unpaired) > 0: - args += [ '--s1', reads_unpaired ] - if contigs_trusted: args += [ '--trusted-contigs', contigs_trusted ] - if contigs_untrusted: args += [ '--untrusted-contigs', contigs_untrusted ] - if spades_opts: args += shlex.split(spades_opts) - args += [ '--rna', '-m' + str(mem_limit_gb), '-t', str(threads), '-o', spades_dir ] - - self.execute(args=args) - - shutil.copyfile(src=os.path.join(spades_dir, 'transcripts.fasta'), dst=contigs_out) - - else: - # spades crashes on empty input, so just return empty output - util.file.make_empty(contigs_out) - return - + util.file.make_empty(contigs_out) + + if ((reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0) or + (reads_unpaired and os.path.getsize(reads_unpaired) > 0)): + + kmer_sizes_succeeded = 0 + for kmer_size in util.misc.make_seq(kmer_sizes): + + with util.file.tmp_dir('_spades') as spades_dir: + log.debug('spades_dir=' + spades_dir) + args = [] + if reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0: + args += ['-1', reads_fwd, '-2', reads_bwd ] + if reads_unpaired and os.path.getsize(reads_unpaired) > 0: + args += [ '--s1', reads_unpaired ] + if contigs_trusted: args += [ '--trusted-contigs', contigs_trusted ] + if contigs_untrusted: args += [ '--untrusted-contigs', contigs_untrusted ] + if kmer_size: args += [ '-k', kmer_size ] + if spades_opts: args += shlex.split(spades_opts) + args += [ '--rna', '-m' + str(mem_limit_gb), '-t', str(threads), '-o', spades_dir ] + + transcripts_fname = os.path.join(spades_dir, ('hard_filtered_' if filter_contigs else '') + 'transcripts.fasta') + + try: + self.execute(args=args) + except Exception as e: + if mask_errors: + log.warning('SPAdes failed for k={}: {}'.format(kmer_size, e)) + util.file.make_empty(transcripts_fname) + else: + raise + + util.file.concat(transcripts_fname, contigs_out, append=True) + if os.path.getsize(transcripts_fname): + kmer_sizes_succeeded += 1 + if kmer_sizes_succeeded >= max_kmer_sizes: + break diff --git a/tools/trimmomatic.py b/tools/trimmomatic.py index 41414ec9f..d33adc84e 100644 --- a/tools/trimmomatic.py +++ b/tools/trimmomatic.py @@ -32,7 +32,9 @@ def execute(self, trailing_q_cutoff=15, minlength_to_keep=30, sliding_window_size=4, - sliding_window_q_cutoff=25 + sliding_window_q_cutoff=25, + maxinfo_target_length=None, + maxinfo_strictness=None ): '''Trim read sequences with Trimmomatic.''' trimmomaticPath = self.install_and_get_path() @@ -62,6 +64,10 @@ def execute(self, 'SLIDINGWINDOW:{sliding_window_size}:{sliding_window_q_cutoff}'.format( sliding_window_size=sliding_window_size, sliding_window_q_cutoff=sliding_window_q_cutoff, + ) if not maxinfo_target_length else \ + 'MAXINFO:{maxinfo_target_length}:{maxinfo_strictness}'.format( + maxinfo_target_length=maxinfo_target_length, + maxinfo_strictness=maxinfo_strictness, ), 'MINLEN:{minlength_to_keep}'.format(minlength_to_keep=minlength_to_keep), 'ILLUMINACLIP:{clipFasta}:2:30:12'.format(clipFasta=clipFasta) diff --git a/util/file.py b/util/file.py index bff148bc4..8d96942f6 100644 --- a/util/file.py +++ b/util/file.py @@ -417,14 +417,15 @@ def bam_is_sorted(bam_file_path): raise KeyError("Could not locate the SO field in the SAM/BAM file header.") -def concat(inputFilePaths, outputFilePath): +def concat(inputFilePaths, outputFilePath, append=False): ''' This function creates an output file containing the - lines present in the input files, in the order specified - by the inputFilePaths list. + lines present in the input file(s), in the order specified + by the inputFilePaths list. If `append` is True, + appends to the output file instead of overwriting it. ''' - with open(outputFilePath, 'w') as outfile: - for filePath in inputFilePaths: + with open(outputFilePath, 'a' if append else 'w') as outfile: + for filePath in util.misc.make_seq(inputFilePaths): with open(filePath) as infile: for line in infile: outfile.write(line)