Merge remote-tracking branch 'origin/dp-lastal-simple' into ct-build-fixes
tomkinsc committed Oct 12, 2017
2 parents 51793dc + e9d8869 commit c713e15
Showing 10 changed files with 100 additions and 297 deletions.
12 changes: 2 additions & 10 deletions read_utils.py
@@ -78,6 +78,8 @@ def parser_purge_unmated(parser=argparse.ArgumentParser()):
def fastq_to_fasta(inFastq, outFasta):
''' Convert from fastq format to fasta format.
Warning: output reads might be split onto multiple lines.
Note: this is only used by read_utils.deplete_blastn_bam currently.
'''

# Do this with biopython rather than prinseq, because if the latter fails
@@ -92,16 +94,6 @@ def fastq_to_fasta(inFastq, outFasta):
return 0


def parser_fastq_to_fasta(parser=argparse.ArgumentParser()):
parser.add_argument('inFastq', help='Input fastq file.')
parser.add_argument('outFasta', help='Output fasta file.')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, fastq_to_fasta, split_args=True)
return parser


__commands__.append(('fastq_to_fasta', parser_fastq_to_fasta))

# ===============================
# *** index_fasta_samtools ***
# ===============================
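Aside (not part of the diff): fastq_to_fasta stays in read_utils.py but loses its standalone CLI command; per the new docstring note it is only used internally now, and the updated unit test below calls it directly. A minimal sketch of the direct call, with hypothetical file paths:

import read_utils

read_utils.fastq_to_fasta('reads.fastq', 'reads.fasta')  # output reads may be split onto multiple lines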
6 changes: 3 additions & 3 deletions requirements-conda.txt
@@ -5,9 +5,9 @@ cd-hit=4.6.8
cd-hit-auxtools=4.6.8
diamond=0.8.36
gatk=3.6
kraken-all=0.10.6_fork2
kraken=0.10.6_fork3
krona=2.7
last=719
last=876
fastqc=0.11.5
gap2seq=2.1
mafft=7.221
@@ -34,4 +34,4 @@ pysam=0.9.1
pybedtools=0.7.8
Jinja2==2.8
PyYAML=3.11
six=1.10.0
six=1.10.0
170 changes: 29 additions & 141 deletions taxon_filter.py
@@ -156,115 +156,47 @@ def bmtagger_wrapper(inBam, db, outBam, threads, JVMmemory=None):
# =======================


def lastal_chunked_fastq(
inFastq,
def filter_lastal_bam(
inBam,
db,
outFastq,
outBam,
max_gapless_alignments_per_position=1,
min_length_for_initial_matches=5,
max_length_for_initial_matches=50,
max_initial_matches_per_position=100,
chunk_size=100000
JVMmemory=None
):
''' Restrict input reads to those that align to the given
reference database using LASTAL.
'''

lastal_path = tools.last.Lastal().install_and_get_path()
mafsort_path = tools.last.MafSort().install_and_get_path()
mafconvert_path = tools.last.MafConvert().install_and_get_path()
no_blast_like_hits_path = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')
with util.file.tmp_dir('-lastal_db') as tmp_db_dir:
# auto build db if needed
if not all(os.path.exists(db + x)
for x in ('.bck', '.des', '.prj', '.sds', '.ssp', '.suf', '.tis')):
db = tools.last.Lastdb().build_database(db, os.path.join(os.path.abspath(tmp_db_dir), 'lastal_db'))

filtered_fastq_files = []
with open(inFastq, "rt") as fastqFile:
record_iter = SeqIO.parse(fastqFile, "fastq")
for batch in util.misc.batch_iterator(record_iter, chunk_size):
with util.file.tempfname('.read_ids.txt') as hitList:

chunk_fastq = mkstempfname('.fastq')
with open(chunk_fastq, "wt") as handle:
SeqIO.write(batch, handle, "fastq")
batch = None
# look for lastal hits in BAM and write to temp file
with open(hitList, 'wt') as outf:
for read_id in tools.last.Lastal().get_hits(
inBam, db,
max_gapless_alignments_per_position,
min_length_for_initial_matches,
max_length_for_initial_matches,
max_initial_matches_per_position
):
outf.write(read_id + '\n')

lastal_out = mkstempfname('.lastal')
with open(lastal_out, 'wt') as outf:
cmd = [lastal_path, '-Q1', '-P0']
cmd.extend(
[
'-n', max_gapless_alignments_per_position, '-l', min_length_for_initial_matches, '-L',
max_length_for_initial_matches, '-m', max_initial_matches_per_position
]
)
cmd = [str(x) for x in cmd]
cmd.extend([db, chunk_fastq])
log.debug(' '.join(cmd) + ' > ' + lastal_out)
util.misc.run_and_save(cmd, outf=outf)
# everything below this point in this method should be replaced with
# our own code that just reads lastal output and makes a list of read names

mafsort_out = mkstempfname('.mafsort')
with open(mafsort_out, 'wt') as outf:
with open(lastal_out, 'rt') as inf:
cmd = [mafsort_path, '-n2']
log.debug('cat ' + lastal_out + ' | ' + ' '.join(cmd) + ' > ' + mafsort_out)
subprocess.check_call(cmd, stdin=inf, stdout=outf)
os.unlink(lastal_out)

mafconvert_out = mkstempfname('.mafconvert')
with open(mafconvert_out, 'wt') as outf:
cmd = ["python", mafconvert_path, 'tab', mafsort_out]
log.debug(' '.join(cmd) + ' > ' + mafconvert_out)
subprocess.check_call(cmd, stdout=outf)
os.unlink(mafsort_out)

filtered_fastq_chunk = mkstempfname('.filtered.fastq')
with open(filtered_fastq_chunk, 'wt') as outf:
cmd = [no_blast_like_hits_path, '-b', mafconvert_out, '-r', chunk_fastq, '-m', 'hit']
log.debug(' '.join(cmd) + ' > ' + filtered_fastq_chunk)
subprocess.check_call(cmd, stdout=outf)
filtered_fastq_files.append(filtered_fastq_chunk)
os.unlink(mafconvert_out)

# concatenate filtered fastq files to outFastq
util.file.concat(filtered_fastq_files, outFastq)

# remove temp fastq files
for tempfastq in filtered_fastq_files:
os.unlink(tempfastq)


def lastal_get_hits(
inFastq,
db,
outList,
max_gapless_alignments_per_position=1,
min_length_for_initial_matches=5,
max_length_for_initial_matches=50,
max_initial_matches_per_position=100
):
filteredFastq = mkstempfname('.filtered.fastq')
lastal_chunked_fastq(
inFastq,
db,
filteredFastq,
max_gapless_alignments_per_position=max_gapless_alignments_per_position,
min_length_for_initial_matches=min_length_for_initial_matches,
max_length_for_initial_matches=max_length_for_initial_matches,
max_initial_matches_per_position=max_initial_matches_per_position
)

with open(outList, 'wt') as outf:
with open(filteredFastq, 'rt') as inf:
line_num = 0
for line in inf:
if (line_num % 4) == 0:
seq_id = line.rstrip('\n\r')[1:]
if seq_id.endswith('/1') or seq_id.endswith('/2'):
seq_id = seq_id[:-2]
outf.write(seq_id + '\n')
line_num += 1
# filter original BAM file against keep list
tools.picard.FilterSamReadsTool().execute(inBam, False, hitList, outBam, JVMmemory=JVMmemory)

os.unlink(filteredFastq)


def parser_lastal_generic(parser=argparse.ArgumentParser()):
# max_gapless_alignments_per_position, min_length_for_initial_matches, max_length_for_initial_matches, max_initial_matches_per_position
def parser_filter_lastal_bam(parser=argparse.ArgumentParser()):
parser.add_argument("inBam", help="Input reads")
parser.add_argument("db", help="Database of taxa we keep")
parser.add_argument("outBam", help="Output reads, filtered to refDb")
parser.add_argument(
'-n',
dest="max_gapless_alignments_per_position",
@@ -293,50 +225,6 @@ def parser_lastal_generic(parser=argparse.ArgumentParser()):
type=int,
default=100
)
return parser


def filter_lastal_bam(
inBam,
db,
outBam,
max_gapless_alignments_per_position=1,
min_length_for_initial_matches=5,
max_length_for_initial_matches=50,
max_initial_matches_per_position=100,
JVMmemory=None
):
''' Restrict input reads to those that align to the given
reference database using LASTAL.
'''
# convert BAM to paired FASTQ
inReads = mkstempfname('.all.fastq')
tools.samtools.SamtoolsTool().bam2fq(inBam, inReads)

# look for hits in FASTQ
hitList1 = mkstempfname('.hits')
lastal_get_hits(
inReads, db, hitList1, max_gapless_alignments_per_position, min_length_for_initial_matches,
max_length_for_initial_matches, max_initial_matches_per_position
)
os.unlink(inReads)

# merge & uniqify hits
hitList = mkstempfname('.hits')
with open(hitList, 'wt') as outf:
subprocess.check_call(['sort', '-u', hitList1], stdout=outf)
os.unlink(hitList1)

# filter original BAM file against keep list
tools.picard.FilterSamReadsTool().execute(inBam, False, hitList, outBam, JVMmemory=JVMmemory)
os.unlink(hitList)


def parser_filter_lastal_bam(parser=argparse.ArgumentParser()):
parser = parser_lastal_generic(parser)
parser.add_argument("inBam", help="Input reads")
parser.add_argument("db", help="Database of taxa we keep")
parser.add_argument("outBam", help="Output reads, filtered to refDb")
parser.add_argument(
'--JVMmemory',
default=tools.picard.FilterSamReadsTool.jvmMemDefault,
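Aside (not part of the diff): the rewritten filter_lastal_bam replaces the old FASTQ-chunking pipeline (lastal | maf-sort | maf-convert | noBlastLikeHits.py, then sort -u and Picard) with a single pass: it builds a temporary LAST index when the db argument lacks the .bck/.des/.prj/.sds/.ssp/.suf/.tis files, collects hit read IDs from tools.last.Lastal().get_hits(), and keeps those reads with Picard FilterSamReads. A minimal usage sketch with hypothetical paths; the keyword values shown are the defaults from the signature above:

import taxon_filter

taxon_filter.filter_lastal_bam(
    'reads.bam',                               # input reads (BAM)
    'refs/NC_002058.ffn',                      # LAST db prefix, or a FASTA indexed on the fly
    'reads.lastal.bam',                        # output: only reads with LASTAL hits to the db
    max_gapless_alignments_per_position=1,     # lastal -n
    min_length_for_initial_matches=5,          # lastal -l
    max_length_for_initial_matches=50,         # lastal -L
    max_initial_matches_per_position=100,      # lastal -m
)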
3 changes: 2 additions & 1 deletion test/input/TestLastalDbBuild/expected/TestLastalDbBuild.prj
@@ -1,4 +1,4 @@
version=719
version=876
alphabet=ACGT
numofsequences=20
numofletters=378200
@@ -7,6 +7,7 @@ maxunsortedinterval=0
keeplowercase=1
masklowercase=0
numofindexes=1
integersize=32
totallength=378222
specialcharacters=22
prefixlength=13
4 changes: 1 addition & 3 deletions test/unit/test_read_utils.py
@@ -96,9 +96,7 @@ def test_fastq_to_fasta(self):
myInputDir = util.file.get_test_input_path(self)
inFastq = os.path.join(myInputDir, 'in.fastq')
outFasta = util.file.mkstempfname('.fasta')
parser = read_utils.parser_fastq_to_fasta(argparse.ArgumentParser())
args = parser.parse_args([inFastq, outFasta])
args.func_main(args)
read_utils.fastq_to_fasta(inFastq, outFasta)

# Check that results match expected
expectedFasta = os.path.join(myInputDir, 'expected.fasta')
15 changes: 13 additions & 2 deletions test/unit/test_taxon_filter.py
@@ -35,12 +35,12 @@ class TestFilterLastal(TestCaseWithTmp):

def setUp(self):
TestCaseWithTmp.setUp(self)
polio_fasta = os.path.join(
self.polio_fasta = os.path.join(
util.file.get_test_input_path(),
'TestMetagenomicsViralMix', 'db', 'library', 'Viruses', 'Poliovirus_uid15288', 'NC_002058.ffn'
)
dbDir = tempfile.mkdtemp()
self.lastdb_path = tools.last.Lastdb().build_database(polio_fasta, os.path.join(dbDir, 'NC_002058'))
self.lastdb_path = tools.last.Lastdb().build_database(self.polio_fasta, os.path.join(dbDir, 'NC_002058'))

def test_filter_lastal_bam_polio(self):
inBam = os.path.join(util.file.get_test_input_path(), 'TestDepleteHuman', 'expected', 'test-reads.blastn.bam')
@@ -72,6 +72,17 @@ def test_lastal_empty_output(self):
)
assert_equal_bam_reads(self, outBam, empty_bam)

def test_lastal_unbuilt_db(self):
empty_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
in_bam = os.path.join(util.file.get_test_input_path(), 'TestDepleteHuman', 'test-reads-human.bam')
outBam = util.file.mkstempfname('-out-taxfilt.bam')
taxon_filter.filter_lastal_bam(
in_bam,
self.polio_fasta,
outBam
)
assert_equal_bam_reads(self, outBam, empty_bam)


class TestBmtagger(TestCaseWithTmp):
'''
4 changes: 2 additions & 2 deletions tools/kraken.py
@@ -18,8 +18,8 @@
import util.misc
from builtins import super

TOOL_NAME = 'kraken-all'
TOOL_VERSION = '0.10.6_fork2'
TOOL_NAME = 'kraken'
TOOL_VERSION = '0.10.6_fork3'

log = logging.getLogger(__name__)

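Aside (not part of the diff): the renamed TOOL_NAME and bumped TOOL_VERSION track the kraken pin updated in requirements-conda.txt above; presumably both feed the same conda package spec, roughly:

KRAKEN_SPEC = '{}={}'.format(TOOL_NAME, TOOL_VERSION)  # -> 'kraken=0.10.6_fork3', matching the requirements-conda.txt pin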