Skip to content

Commit

Permalink
Merge branch 'master' into hm-cleanup-krona-tmp-files
Browse files Browse the repository at this point in the history
  • Loading branch information
dpark01 committed Sep 1, 2017
2 parents 864f1cb + afdeb16 commit 663ed79
Show file tree
Hide file tree
Showing 91 changed files with 1,356 additions and 15,700 deletions.
1 change: 1 addition & 0 deletions .agignore
@@ -0,0 +1 @@
tools/binaries
94 changes: 85 additions & 9 deletions assembly.py
Expand Up @@ -32,11 +32,13 @@
import tools.samtools
import tools.gatk
import tools.novoalign
import tools.spades
import tools.trimmomatic
import tools.trinity
import tools.mafft
import tools.mummer
import tools.muscle
import tools.gap2seq

# third-party
import Bio.AlignIO
Expand All @@ -46,14 +48,13 @@
log = logging.getLogger(__name__)


class DenovoAssemblyError(Exception):
class DenovoAssemblyError(RuntimeError):

def __init__(self, n_start, n_trimmed, n_rmdup, n_output, n_subsamp, n_unpaired_subsamp):
super(DenovoAssemblyError, self).__init__(
'denovo assembly (Trinity) failed. {} reads at start. {} read pairs after Trimmomatic. {} read pairs after Prinseq rmdup. {} reads for trinity ({} pairs + {} unpaired).'.format(
n_start, n_trimmed, n_rmdup, n_output, n_subsamp, n_unpaired_subsamp
)
)
'''Indicates a failure of the de novo assembly step. Can indicate an internal error in the assembler, but also
insufficient input data (for assemblers which cannot report that condition separately).'''

def __init__(self, reason):
super(DenovoAssemblyError, self).__init__(reason)


def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
Expand Down Expand Up @@ -296,9 +297,10 @@ def assemble_trinity(
except subprocess.CalledProcessError as e:
if always_succeed:
log.warn("denovo assembly (Trinity) failed to assemble input, emitting empty output instead.")
util.file.touch(outFasta)
util.file.make_empty(outFasta)
else:
raise DenovoAssemblyError(*read_stats)
raise DenovoAssemblyError('denovo assembly (Trinity) failed. {} reads at start. {} read pairs after Trimmomatic. '
'{} read pairs after Prinseq rmdup. {} reads for trinity ({} pairs + {} unpaired).'.format(*read_stats))
os.unlink(subsampfq[0])
os.unlink(subsampfq[1])

Expand Down Expand Up @@ -338,6 +340,80 @@ def parser_assemble_trinity(parser=argparse.ArgumentParser()):

__commands__.append(('assemble_trinity', parser_assemble_trinity))

def gapfill_gap2seq(in_scaffold, in_bam, out_scaffold, gap2seq_opts='', threads=1, mem_limit_gb=4,
                    time_soft_limit_minutes=60.0):
    ''' This step runs the Gap2Seq tool to close gaps between contigs in a scaffold.

    Inputs:
        in_scaffold - FASTA scaffold; one record per segment, contigs separated by Ns.
        in_bam - unaligned reads, BAM format.
    Outputs:
        out_scaffold - the gap-filled scaffold.
    Params:
        gap2seq_opts - (advanced) extra command-line options passed through to Gap2Seq
        threads - number of threads to use (0 means use all available cores)
        mem_limit_gb - max memory to use, in gigabytes
        time_soft_limit_minutes - soft/advisory limit: stop trying to close more gaps
            after this many minutes
    '''
    # parser_gapfill_gap2seq defines --timeSoftLimitMinutes (dest=time_soft_limit_minutes)
    # and attach_main(split_args=True) passes every dest as a keyword argument, so this
    # function must accept it or the command fails with a TypeError.
    # NOTE(review): assumes Gap2SeqTool.gapfill accepts time_soft_limit_minutes — confirm
    # against tools/gap2seq.py.
    tools.gap2seq.Gap2SeqTool().gapfill(in_scaffold, in_bam, out_scaffold,
                                        gap2seq_opts=gap2seq_opts, threads=threads,
                                        mem_limit_gb=mem_limit_gb,
                                        time_soft_limit_minutes=time_soft_limit_minutes)

def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gaps between contigs in a scaffold')):
    '''Build the argparse parser for the gapfill_gap2seq command.'''
    parser.add_argument('inScaffold', help='FASTA file containing the scaffold. Each FASTA record corresponds to one '
                        'segment (for multi-segment genomes). Contigs within each segment are separated by Ns.')
    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
    parser.add_argument('outScaffold', help='Output assembly.')
    parser.add_argument('--threads', default=0, type=int, help='Number of threads (default: %(default)s); 0 means use all available cores')
    # type=float so the value arrives as a number rather than a CLI string
    parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4.0, type=float,
                        help='Max memory to use, in gigabytes (default: %(default)s)')
    parser.add_argument('--timeSoftLimitMinutes', dest='time_soft_limit_minutes', default=60.0, type=float,
                        help='Stop trying to close more gaps after this many minutes (default: %(default)s); this is a soft/advisory limit')
    parser.add_argument('--gap2seqOpts', dest='gap2seq_opts', default='', help='(advanced) Extra command-line options to pass to Gap2Seq')

    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, gapfill_gap2seq, split_args=True)
    return parser


__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq))


def assemble_spades(
    inBam,
    outFasta,
    spades_opts='',
    previously_assembled_contigs=None,
    mem_limit_gb=4,
    threads=0,
):
    ''' De novo RNA-seq assembly with the SPAdes assembler.

    Inputs:
        inBam - reads to assemble; paired and unpaired reads are both used.
        previously_assembled_contigs - (optional) contigs already assembled from
            this sample, passed to SPAdes as untrusted contigs.
    Outputs:
        outFasta - assembled contigs. Because this is RNA-seq assembly, a single
            genomic region may be represented by several contigs (variants).
    Params:
        mem_limit_gb - max memory to use, in GB (also sizes the Picard JVM)
        threads - number of threads (0 means use all available CPUs)
        spades_opts - (advanced) custom command-line options for SPAdes
    '''
    sam_to_fastq = tools.picard.SamToFastqTool()
    jvm_mem = '{}g'.format(mem_limit_gb)
    with sam_to_fastq.execute_tmp(inBam, includeUnpaired=True,
                                  JVMmemory=jvm_mem) as (fq_fwd, fq_bwd, fq_unpaired):
        try:
            tools.spades.SpadesTool().assemble(
                reads_fwd=fq_fwd,
                reads_bwd=fq_bwd,
                reads_unpaired=fq_unpaired,
                contigs_untrusted=previously_assembled_contigs,
                contigs_out=outFasta,
                spades_opts=spades_opts,
                mem_limit_gb=mem_limit_gb,
                threads=threads,
            )
        except subprocess.CalledProcessError as e:
            # assembler failure is surfaced as a DenovoAssemblyError so callers
            # can treat all de novo assembler failures uniformly
            raise DenovoAssemblyError('SPAdes assembler failed: ' + str(e))

def parser_assemble_spades(parser=argparse.ArgumentParser()):
    '''Build the argparse parser for the assemble_spades command.'''
    # positional arguments
    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
    parser.add_argument('outFasta', help='Output assembly.')
    # optional tuning parameters
    parser.add_argument('--previously_assembled_contigs',
                        help='Contigs previously assembled from the same sample')
    parser.add_argument('--spades_opts', default='',
                        help='(advanced) Extra flags to pass to the SPAdes assembler')
    parser.add_argument('--mem_limit_gb', default=4, type=int,
                        help='Max memory to use, in GB (default: %(default)s)')
    parser.add_argument('--threads', default=0, type=int,
                        help='Number of threads, or 0 to use all CPUs (default: %(default)s)')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, assemble_spades, split_args=True)
    return parser


__commands__.append(('assemble_spades', parser_assemble_spades))


def order_and_orient(inFasta, inReference, outFasta,
outAlternateContigs=None,
Expand Down
76 changes: 69 additions & 7 deletions read_utils.py
Expand Up @@ -22,6 +22,7 @@
import util.misc
from util.file import mkstempfname
import tools.bwa
import tools.cdhit
import tools.picard
import tools.samtools
import tools.mvicuna
Expand Down Expand Up @@ -658,6 +659,67 @@ def mvicuna_fastqs_to_readlist(inFastq1, inFastq2, readList):
os.unlink(outFastq2)


def rmdup_cdhit_bam(inBam, outBam, max_mismatches=None, jvm_memory=None):
    ''' Remove duplicate reads from BAM file using cd-hit-dup.

    The input is split by library (duplicates are only meaningful within one
    library), each library is deduplicated separately with cd-hit-dup, and the
    per-library results are merged back under the original BAM header.

    Inputs:
        inBam - input reads, BAM format
    Outputs:
        outBam - deduplicated reads, BAM format
    Params:
        max_mismatches - max mismatches allowed between duplicates
            (cd-hit-dup -e option; default 4)
        jvm_memory - JVM memory size for the Picard steps
    '''
    # explicit None check: "max_mismatches or 4" would wrongly override an explicit 0
    if max_mismatches is None:
        max_mismatches = 4
    tmp_dir = tempfile.mkdtemp()

    tools.picard.SplitSamByLibraryTool().execute(inBam, tmp_dir)

    s2fq_tool = tools.picard.SamToFastqTool()
    cdhit = tools.cdhit.CdHit()
    out_bams = []
    for f in os.listdir(tmp_dir):
        library_sam = os.path.join(tmp_dir, f)

        in_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')

        s2fq_tool.execute(library_sam, in_fastqs[0], in_fastqs[1])
        if not os.path.getsize(in_fastqs[0]) > 0 and not os.path.getsize(in_fastqs[1]) > 0:
            # empty library: nothing to deduplicate; skip before creating the
            # per-library output BAM so the merge list gets no stray empty file
            continue

        out_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')
        options = {
            '-e': max_mismatches,
        }
        # mkstempfname creates the file, so only a size check is meaningful here
        if os.path.getsize(in_fastqs[1]) > 10:
            options['-i2'] = in_fastqs[1]
            options['-o2'] = out_fastqs[1]

        log.info("executing cd-hit-dup on library " + library_sam)
        # cd-hit-dup cannot operate on piped fastq input because it reads twice
        cdhit.execute('cd-hit-dup', in_fastqs[0], out_fastqs[0], options=options, background=True)

        # create and register the per-library BAM only once it is actually produced
        out_bam = mkstempfname('.bam')
        tools.picard.FastqToSamTool().execute(out_fastqs[0], out_fastqs[1], f, out_bam, JVMmemory=jvm_memory)
        out_bams.append(out_bam)
        for fn in in_fastqs + out_fastqs:
            os.unlink(fn)

    with util.file.fifo(name='merged.sam') as merged_bam:
        merge_opts = ['SORT_ORDER=queryname']
        # merge runs in the background writing into the fifo while
        # ReplaceSamHeader reads from it
        tools.picard.MergeSamFilesTool().execute(out_bams, merged_bam, picardOptions=merge_opts, JVMmemory=jvm_memory, background=True)
        tools.picard.ReplaceSamHeaderTool().execute(merged_bam, inBam, outBam, JVMmemory=jvm_memory)

    # clean up per-library BAMs and the split-by-library work dir
    for out_bam in out_bams:
        os.unlink(out_bam)
    shutil.rmtree(tmp_dir, ignore_errors=True)


def parser_rmdup_cdhit_bam(parser=argparse.ArgumentParser()):
    '''Build the argparse parser for the rmdup_cdhit_bam command.'''
    # positional arguments
    for arg_name, arg_help in (('inBam', 'Input reads, BAM format.'),
                               ('outBam', 'Output reads, BAM format.')):
        parser.add_argument(arg_name, help=arg_help)
    # JVM sizing for the Picard steps
    parser.add_argument(
        '--JVMmemory',
        dest='jvm_memory',
        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)',
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, rmdup_cdhit_bam, split_args=True)
    return parser


__commands__.append(('rmdup_cdhit_bam', parser_rmdup_cdhit_bam))


def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
''' Remove duplicate reads from BAM file using M-Vicuna. The
primary advantage to this approach over Picard's MarkDuplicates tool
Expand Down Expand Up @@ -909,7 +971,7 @@ def main_gatk_realign(args):
tools.gatk.GATKTool(path=args.GATK_PATH).local_realign(
args.inBam, args.refFasta,
args.outBam, JVMmemory=args.JVMmemory,
threads=args.threads,
threads=args.threads,
)
return 0

Expand Down Expand Up @@ -945,7 +1007,7 @@ def align_and_fix(
shutil.copyfile(refFasta, refFastaCopy)

tools.picard.CreateSequenceDictionaryTool().execute(refFastaCopy, overwrite=True)
tools.samtools.SamtoolsTool().faidx(refFastaCopy, overwrite=True)
tools.samtools.SamtoolsTool().faidx(refFastaCopy, overwrite=True)

if aligner_options is None:
if aligner=="novoalign":
Expand All @@ -955,7 +1017,7 @@ def align_and_fix(

bam_aligned = mkstempfname('.aligned.bam')
if aligner=="novoalign":

tools.novoalign.NovoalignTool(license_path=novoalign_license_path).index_fasta(refFastaCopy)

tools.novoalign.NovoalignTool(license_path=novoalign_license_path).execute(
Expand Down Expand Up @@ -1003,21 +1065,21 @@ def parser_align_and_fix(parser=argparse.ArgumentParser()):
'--outBamAll',
default=None,
help='''Aligned, sorted, and indexed reads. Unmapped and duplicate reads are
retained. By default, duplicate reads are marked. If "--skipMarkDupes"
retained. By default, duplicate reads are marked. If "--skipMarkDupes"
is specified duplicate reads are included in outout without being marked.'''
)
parser.add_argument(
'--outBamFiltered',
default=None,
help='''Aligned, sorted, and indexed reads. Unmapped reads are removed from this file,
as well as any marked duplicate reads. Note that if "--skipMarkDupes" is provided,
help='''Aligned, sorted, and indexed reads. Unmapped reads are removed from this file,
as well as any marked duplicate reads. Note that if "--skipMarkDupes" is provided,
duplicates will be not be marked and will be included in the output.'''
)
parser.add_argument('--aligner_options', default=None, help='aligner options (default for novoalign: "-r Random", bwa: "-T 30"')
parser.add_argument('--aligner', choices=['novoalign', 'bwa'], default='novoalign', help='aligner (default: %(default)s)')
parser.add_argument('--JVMmemory', default='4g', help='JVM virtual memory size (default: %(default)s)')
parser.add_argument('--threads', default=1, help='Number of threads (default: %(default)s)')
parser.add_argument('--skipMarkDupes',
parser.add_argument('--skipMarkDupes',
help='If specified, duplicate reads will not be marked in the resulting output file.',
dest="skip_mark_dupes",
action='store_true')
Expand Down
4 changes: 4 additions & 0 deletions requirements-conda.txt
@@ -1,12 +1,15 @@
blast=2.6.0
bmtagger=3.101
bwa=0.7.15
cd-hit=4.6.8
cd-hit-auxtools=4.6.8
diamond=0.8.36
gatk=3.6
kraken-all=0.10.6_fork2
krona=2.7
last=719
fastqc=0.11.5
gap2seq=2.1
mafft=7.221
mummer=3.23
muscle=3.8.1551
Expand All @@ -16,6 +19,7 @@ picard=2.9.0
prinseq=0.20.4
samtools=1.3.1
snpeff=4.1l
spades=3.10.1
tbl2asn=25.3
trimmomatic=0.36
trinity=date.2011_11_26
Expand Down
Binary file not shown.

0 comments on commit 663ed79

Please sign in to comment.