Skip to content

Commit

Permalink
Merge branch 'master' into ct-common-barcodes-and-other-fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
dpark01 committed Jul 8, 2016
2 parents 9c23a12 + f15cfa9 commit f9545c2
Show file tree
Hide file tree
Showing 11 changed files with 165 additions and 58 deletions.
2 changes: 2 additions & 0 deletions metagenomics.py
Expand Up @@ -507,6 +507,8 @@ def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
'''
tmp_fastq = util.file.mkstempfname('.fastq')
tmp_fastq2 = util.file.mkstempfname('.fastq')
# do not convert this to samtools bam2fq unless we can figure out how to replicate
# the clipping functionality of Picard SamToFastq
picard = tools.picard.SamToFastqTool()
picard_opts = {
'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
Expand Down
2 changes: 1 addition & 1 deletion pipes/config.yaml
Expand Up @@ -37,7 +37,7 @@ blast_db_remove: "metag_v3.ncRNA.mRNA.mitRNA.consensus"
trim_clip_db: "/idi/sabeti-scratch/kandersen/references/depletion_databases/contaminants.fasta"

# a fasta file containing sequences to report downstream as spike-ins
spikeins_db: "/idi/sabeti-scratch/kandersen/references/other/ercc_spike-ins.fasta"
spikeins_db: "/idi/sabeti-scratch/genomes/other/ercc_spike-ins.fasta"

# Directory of kraken database for metagenomics identification
# For pre-built hosted databases, you may download:
Expand Down
2 changes: 1 addition & 1 deletion pipes/rules/reports.rules
Expand Up @@ -56,7 +56,7 @@ if config.get("spikeins_db"):
run:
makedirs(os.path.join(config["reports_dir"], 'spike_count'))
makedirs(os.path.join(config["tmp_dir"], config["subdirs"]["depletion"]))
shell("{config[bin_dir]}/read_utils.py align_and_count_hits {input} {params.spikeins_db} {output}")
shell("{config[bin_dir]}/read_utils.py bwamem_and_idxstats {input} {params.spikeins_db} {output}")

rule consolidate_spike_count:
input: expand("{{dir}}/spike_count/{sample}.spike_count.txt", \
Expand Down
50 changes: 19 additions & 31 deletions read_utils.py
Expand Up @@ -21,6 +21,7 @@
import util.file
import util.misc
from util.file import mkstempfname
import tools.bwa
import tools.picard
import tools.samtools
import tools.mvicuna
Expand Down Expand Up @@ -1019,48 +1020,35 @@ def parser_align_and_fix(parser=argparse.ArgumentParser()):

# =========================

def align_and_count_hits(inBam, refFasta, outCounts, includeZeros=False,
JVMmemory=None):
''' Take reads, align to reference with Novoalign and return aligned
read counts for each reference sequence.
def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None):
''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.
'''

bam_aligned = mkstempfname('.aligned.bam')
tools.novoalign.NovoalignTool().execute(
inBam,
refFasta,
bam_aligned,
options=['-r', 'Random'],
JVMmemory=JVMmemory)
if outBam is None:
bam_aligned = mkstempfname('.aligned.bam')
else:
bam_aligned = outBam

samtools = tools.samtools.SamtoolsTool()
seqs = list(dict(x.split(':', 1) for x in row[1:])['SN']
for row in samtools.getHeader(bam_aligned)
if row[0]=='@SQ')
bwa = tools.bwa.Bwa()

with util.file.open_or_gzopen(outCounts, 'w') as outf:
for seq in seqs:
n = samtools.count(bam_aligned, regions=[seq])
if n>0 or includeZeros:
outf.write("{}\t{}\n".format(seq, n))
bwa.mem(inBam, refFasta, bam_aligned)

os.unlink(bam_aligned)
if outStats is not None:
samtools.idxstats(bam_aligned, outStats)

if outBam is None:
os.unlink(bam_aligned)

def parser_align_and_count_hits(parser=argparse.ArgumentParser()):
def parser_bwamem_idxstats(parser=argparse.ArgumentParser()):
parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.')
parser.add_argument('outCounts', help='Output counts file')
parser.add_argument("--includeZeros",
help="Output lines with no hits (default: %(default)s)",
default=False,
action="store_true",
dest="includeZeros")
parser.add_argument('--JVMmemory', default='4g', help='JVM virtual memory size (default: %(default)s)')
parser.add_argument('outBam', help='Output aligned, indexed BAM file', default=None)
parser.add_argument('outStats', help='Output idxstats file', default=None)
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, align_and_count_hits, split_args=True)
util.cmd.attach_main(parser, bwamem_idxstats, split_args=True)
return parser

__commands__.append(('align_and_count_hits', parser_align_and_count_hits))
__commands__.append(('bwamem_idxstats', parser_bwamem_idxstats))

# =========================

Expand Down
4 changes: 2 additions & 2 deletions reports.py
Expand Up @@ -359,8 +359,8 @@ def consolidate_spike_count(inDir, outFile):
s = s[:-len('.spike_count.txt')]
with open(fn, 'rt') as inf:
for line in inf:
if not line.startswith('Input bam'):
spike, count = line.strip().split('\t')
if not line.startswith('Input bam') and not line.startswith('*'):
spike, count = line.strip().split('\t')[0,2]
outf.write('\t'.join([s, spike, count]) + '\n')


Expand Down
3 changes: 2 additions & 1 deletion requirements-conda.txt
@@ -1,5 +1,6 @@
blast=2.2.31
bmtagger=3.101
bwa=0.7.15
diamond=0.7.10=boost1.60_1
kraken-all=0.10.6_eaf8fb68
last=719
Expand All @@ -10,7 +11,7 @@ mvicuna=1.0
novoalign=3.03.02
picard=1.126
prinseq=0.20.4
samtools=1.2
samtools=1.3.1
snpeff=4.1l
trimmomatic=0.36
trinity=date.2011_11_26
Expand Down
8 changes: 4 additions & 4 deletions taxon_filter.py
Expand Up @@ -379,7 +379,7 @@ def filter_lastal_bam(
# convert BAM to paired FASTQ
inReads1 = mkstempfname('.1.fastq')
inReads2 = mkstempfname('.2.fastq')
tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2)
tools.samtools.SamtoolsTool().bam2fq(inBam, inReads1, inReads2)

# look for hits in inReads1 and inReads2
hitList1 = mkstempfname('.1.hits')
Expand Down Expand Up @@ -509,7 +509,7 @@ def deplete_bmtagger_bam(inBam, db, outBam, threads=None, JVMmemory=None):

inReads1 = mkstempfname('.1.fastq')
inReads2 = mkstempfname('.2.fastq')
tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2)
tools.samtools.SamtoolsTool().bam2fq(inBam, inReads1, inReads2)

bmtaggerConf = mkstempfname('.bmtagger.conf')
with open(bmtaggerConf, 'w') as f:
Expand Down Expand Up @@ -974,7 +974,7 @@ def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=
blastOutFile = mkstempfname('.hits.txt')

# Initial BAM -> FASTQ pair
tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2)
tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1, fastq2)

# Find BLAST hits against FASTQ1
read_utils.fastq_to_fasta(fastq1, fasta)
Expand All @@ -996,7 +996,7 @@ def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=
tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

# Depleted BAM -> FASTQ pair
tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2)
tools.samtools.SamtoolsTool().bam2fq(halfBam, fastq1, fastq2)

# Find BLAST hits against FASTQ2 (which is already smaller than before)
read_utils.fastq_to_fasta(fastq2, fasta)
Expand Down
48 changes: 43 additions & 5 deletions test/unit/test_read_utils.py
Expand Up @@ -3,15 +3,17 @@
__author__ = "irwin@broadinstitute.org"

import unittest
import os
import tempfile
import argparse
import filecmp
import util
import util.file
import os
import read_utils
import shutil
import tempfile
import tools
import tools.bwa
import tools.samtools
import util
import util.file
from test import TestCaseWithTmp


Expand Down Expand Up @@ -59,6 +61,27 @@ def test_purge_unmated_sra(self):
self.assertEqualContents(outFastq2, expected2Fastq)


class TestBwamemIdxstats(TestCaseWithTmp):

def setUp(self):
TestCaseWithTmp.setUp(self)
self.tempDir = tempfile.mkdtemp()
self.ebolaRef = util.file.mkstempfname('.fasta', directory=self.tempDir)
shutil.copyfile(os.path.join(util.file.get_test_input_path(), 'G5012.3.fasta'), self.ebolaRef)
self.bwa = tools.bwa.Bwa()
self.samtools = tools.samtools.SamtoolsTool()
self.bwa.index(self.ebolaRef)

def test_bwamem_idxstats(self):
inBam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')
outBam = util.file.mkstempfname('.bam', directory=self.tempDir)
outStats = util.file.mkstempfname('.stats.txt', directory=self.tempDir)
read_utils.bwamem_idxstats(inBam, self.ebolaRef, outBam, outStats)
with open(outStats, 'rt') as inf:
actual_count = int(inf.readline().strip().split('\t')[2])
self.assertEqual(actual_count, self.samtools.count(outBam, opts=['-F', '4']))
self.assertGreater(actual_count, 18000)

class TestFastqToFasta(TestCaseWithTmp):

def test_fastq_to_fasta(self):
Expand Down Expand Up @@ -94,6 +117,7 @@ def test_fastq_bam(self):
outFastq1 = util.file.mkstempfname('.fastq')
outFastq2 = util.file.mkstempfname('.fastq')
outHeader = util.file.mkstempfname('.txt')
outHeaderFix = util.file.mkstempfname('.fix.txt')

# in1.fastq, in2.fastq -> out.bam; header params from command-line
parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
Expand All @@ -110,6 +134,12 @@ def test_fastq_bam(self):
'SEQUENCING_CENTER=KareemAbdul-Jabbar',])
args.func_main(args)

# Note for developers: if you're fixing the tests to handle non-bugs
# (ie our testing here is too brittle), let's just replace a lot of this
# in the future with code that just reads the header, sorts it, and
# tests for equality of sorted values in the RG line (and stricter
# equality in the non-header lines). This is kind of hacky.

# samtools view for out.sam and compare to expected
samtools = tools.samtools.SamtoolsTool()
samtools.view(['-h'], outBamCmd, outSam)
Expand Down Expand Up @@ -142,10 +172,18 @@ def test_fastq_bam(self):
'READ1_TRIM=1',])
args.func_main(args)

# filter out any "PG" lines from header for testing purposes
# I don't like this... let's replace later.
with open(outHeader, 'rt') as inf:
with open(outHeaderFix, 'wt') as outf:
for line in inf:
if not line.startswith('@PG'):
outf.write(line)

# compare to out1.fastq, out2.fastq, outHeader.txt to in and expected
self.assertEqualContents(outFastq1, expectedFastq1) # 1 base trimmed
self.assertEqualContents(outFastq2, inFastq2)
self.assertEqualContents(outHeader, inHeader)
self.assertEqualContents(outHeaderFix, inHeader)


class TestSplitReads(TestCaseWithTmp):
Expand Down
63 changes: 63 additions & 0 deletions tools/bwa.py
@@ -0,0 +1,63 @@
'''
The BWA aligner.
'''

import logging
import os
import os.path
import subprocess
import tools
import tools.samtools
import util.file
import util.misc

TOOL_NAME = 'bwa'
TOOL_VERSION = '0.7.15'

log = logging.getLogger(__name__)


class Bwa(tools.Tool):

def __init__(self, install_methods=None):
if install_methods is None:
install_methods = [tools.CondaPackage(TOOL_NAME, version=TOOL_VERSION)]
tools.Tool.__init__(self, install_methods=install_methods)

def version(self):
return TOOL_VERSION

def execute(self, command, args, stdout=None): # pylint: disable=W0221
tool_cmd = [self.install_and_get_path(), command] + args
log.debug(' '.join(tool_cmd))
if stdout:
stdout = open(stdout, 'w')
subprocess.check_call(tool_cmd, stdout=stdout)
if stdout:
stdout.close()

def index(self, inFasta, algorithm=None):
cmd = []
if algorithm is not None:
if algorithm not in ('is', 'bwtsw'):
raise NameError(algorithm + " is not a recognized algorithm")
cmd.extend(('-a', algorithm))
cmd.append(inFasta)
self.execute('index', cmd)

def mem(self, inReads, refDb, outAlign, threads=None):
threads = threads or util.misc.available_cpu_count()
samtools = tools.samtools.SamtoolsTool()
fq1 = util.file.mkstempfname('.1.fastq')
fq2 = util.file.mkstempfname('.2.fastq')
aln_sam = util.file.mkstempfname('.sam')
samtools.bam2fq(inReads, fq1, fq2)
self.execute('mem', ['-t', str(threads), refDb, fq1, fq2], stdout=aln_sam)
os.unlink(fq1)
os.unlink(fq2)
samtools.sort(aln_sam, outAlign)
os.unlink(aln_sam)
samtools.index(outAlign)


2 changes: 2 additions & 0 deletions tools/kraken.py
Expand Up @@ -69,6 +69,8 @@ def classify(self, inBam, db, outReads, numThreads=None):
return
tmp_fastq1 = util.file.mkstempfname('.1.fastq')
tmp_fastq2 = util.file.mkstempfname('.2.fastq')
# do not convert this to samtools bam2fq unless we can figure out how to replicate
# the clipping functionality of Picard SamToFastq
picard = tools.picard.SamToFastqTool()
picard_opts = {
'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
Expand Down

0 comments on commit f9545c2

Please sign in to comment.