diff --git a/read_utils.py b/read_utils.py index 161a4a631..d4b9bbb5c 100755 --- a/read_utils.py +++ b/read_utils.py @@ -78,6 +78,8 @@ def parser_purge_unmated(parser=argparse.ArgumentParser()): def fastq_to_fasta(inFastq, outFasta): ''' Convert from fastq format to fasta format. Warning: output reads might be split onto multiple lines. + + Note: this is only used by read_utils.deplete_blastn_bam currently. ''' # Do this with biopython rather than prinseq, because if the latter fails @@ -92,16 +94,6 @@ def fastq_to_fasta(inFastq, outFasta): return 0 -def parser_fastq_to_fasta(parser=argparse.ArgumentParser()): - parser.add_argument('inFastq', help='Input fastq file.') - parser.add_argument('outFasta', help='Output fasta file.') - util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) - util.cmd.attach_main(parser, fastq_to_fasta, split_args=True) - return parser - - -__commands__.append(('fastq_to_fasta', parser_fastq_to_fasta)) - # =============================== # *** index_fasta_samtools *** # =============================== diff --git a/requirements-conda.txt b/requirements-conda.txt index 38497f6a1..a6c51d5af 100644 --- a/requirements-conda.txt +++ b/requirements-conda.txt @@ -5,9 +5,9 @@ cd-hit=4.6.8 cd-hit-auxtools=4.6.8 diamond=0.8.36 gatk=3.6 -kraken-all=0.10.6_fork2 +kraken=0.10.6_fork3 krona=2.7 -last=719 +last=876 fastqc=0.11.5 gap2seq=2.1 mafft=7.221 @@ -34,4 +34,4 @@ pysam=0.9.1 pybedtools=0.7.8 Jinja2==2.8 PyYAML=3.11 -six=1.10.0 \ No newline at end of file +six=1.10.0 diff --git a/taxon_filter.py b/taxon_filter.py index 0731e263c..e0997e45b 100755 --- a/taxon_filter.py +++ b/taxon_filter.py @@ -156,115 +156,47 @@ def bmtagger_wrapper(inBam, db, outBam, threads, JVMmemory=None): # ======================= -def lastal_chunked_fastq( - inFastq, +def filter_lastal_bam( + inBam, db, - outFastq, + outBam, max_gapless_alignments_per_position=1, min_length_for_initial_matches=5, max_length_for_initial_matches=50, max_initial_matches_per_position=100, - chunk_size=100000 + JVMmemory=None ): + ''' Restrict input reads to those that align to the given + reference database using LASTAL. 
+ ''' - lastal_path = tools.last.Lastal().install_and_get_path() - mafsort_path = tools.last.MafSort().install_and_get_path() - mafconvert_path = tools.last.MafConvert().install_and_get_path() - no_blast_like_hits_path = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py') + with util.file.tmp_dir('-lastal_db') as tmp_db_dir: + # auto build db if needed + if not all(os.path.exists(db + x) + for x in ('.bck', '.des', '.prj', '.sds', '.ssp', '.suf', '.tis')): + db = tools.last.Lastdb().build_database(db, os.path.join(os.path.abspath(tmp_db_dir), 'lastal_db')) - filtered_fastq_files = [] - with open(inFastq, "rt") as fastqFile: - record_iter = SeqIO.parse(fastqFile, "fastq") - for batch in util.misc.batch_iterator(record_iter, chunk_size): + with util.file.tempfname('.read_ids.txt') as hitList: - chunk_fastq = mkstempfname('.fastq') - with open(chunk_fastq, "wt") as handle: - SeqIO.write(batch, handle, "fastq") - batch = None + # look for lastal hits in BAM and write to temp file + with open(hitList, 'wt') as outf: + for read_id in tools.last.Lastal().get_hits( + inBam, db, + max_gapless_alignments_per_position, + min_length_for_initial_matches, + max_length_for_initial_matches, + max_initial_matches_per_position + ): + outf.write(read_id + '\n') - lastal_out = mkstempfname('.lastal') - with open(lastal_out, 'wt') as outf: - cmd = [lastal_path, '-Q1', '-P0'] - cmd.extend( - [ - '-n', max_gapless_alignments_per_position, '-l', min_length_for_initial_matches, '-L', - max_length_for_initial_matches, '-m', max_initial_matches_per_position - ] - ) - cmd = [str(x) for x in cmd] - cmd.extend([db, chunk_fastq]) - log.debug(' '.join(cmd) + ' > ' + lastal_out) - util.misc.run_and_save(cmd, outf=outf) - # everything below this point in this method should be replaced with - # our own code that just reads lastal output and makes a list of read names - - mafsort_out = mkstempfname('.mafsort') - with open(mafsort_out, 'wt') as outf: - with open(lastal_out, 'rt') as inf: - cmd = [mafsort_path, '-n2'] - log.debug('cat ' + lastal_out + ' | ' + ' '.join(cmd) + ' > ' + mafsort_out) - subprocess.check_call(cmd, stdin=inf, stdout=outf) - os.unlink(lastal_out) - - mafconvert_out = mkstempfname('.mafconvert') - with open(mafconvert_out, 'wt') as outf: - cmd = ["python", mafconvert_path, 'tab', mafsort_out] - log.debug(' '.join(cmd) + ' > ' + mafconvert_out) - subprocess.check_call(cmd, stdout=outf) - os.unlink(mafsort_out) - - filtered_fastq_chunk = mkstempfname('.filtered.fastq') - with open(filtered_fastq_chunk, 'wt') as outf: - cmd = [no_blast_like_hits_path, '-b', mafconvert_out, '-r', chunk_fastq, '-m', 'hit'] - log.debug(' '.join(cmd) + ' > ' + filtered_fastq_chunk) - subprocess.check_call(cmd, stdout=outf) - filtered_fastq_files.append(filtered_fastq_chunk) - os.unlink(mafconvert_out) - - # concatenate filtered fastq files to outFastq - util.file.concat(filtered_fastq_files, outFastq) - - # remove temp fastq files - for tempfastq in filtered_fastq_files: - os.unlink(tempfastq) - - -def lastal_get_hits( - inFastq, - db, - outList, - max_gapless_alignments_per_position=1, - min_length_for_initial_matches=5, - max_length_for_initial_matches=50, - max_initial_matches_per_position=100 -): - filteredFastq = mkstempfname('.filtered.fastq') - lastal_chunked_fastq( - inFastq, - db, - filteredFastq, - max_gapless_alignments_per_position=max_gapless_alignments_per_position, - min_length_for_initial_matches=min_length_for_initial_matches, - max_length_for_initial_matches=max_length_for_initial_matches, - 
max_initial_matches_per_position=max_initial_matches_per_position - ) - - with open(outList, 'wt') as outf: - with open(filteredFastq, 'rt') as inf: - line_num = 0 - for line in inf: - if (line_num % 4) == 0: - seq_id = line.rstrip('\n\r')[1:] - if seq_id.endswith('/1') or seq_id.endswith('/2'): - seq_id = seq_id[:-2] - outf.write(seq_id + '\n') - line_num += 1 + # filter original BAM file against keep list + tools.picard.FilterSamReadsTool().execute(inBam, False, hitList, outBam, JVMmemory=JVMmemory) - os.unlink(filteredFastq) - -def parser_lastal_generic(parser=argparse.ArgumentParser()): - # max_gapless_alignments_per_position, min_length_for_initial_matches, max_length_for_initial_matches, max_initial_matches_per_position +def parser_filter_lastal_bam(parser=argparse.ArgumentParser()): + parser.add_argument("inBam", help="Input reads") + parser.add_argument("db", help="Database of taxa we keep") + parser.add_argument("outBam", help="Output reads, filtered to refDb") parser.add_argument( '-n', dest="max_gapless_alignments_per_position", @@ -293,50 +225,6 @@ def parser_lastal_generic(parser=argparse.ArgumentParser()): type=int, default=100 ) - return parser - - -def filter_lastal_bam( - inBam, - db, - outBam, - max_gapless_alignments_per_position=1, - min_length_for_initial_matches=5, - max_length_for_initial_matches=50, - max_initial_matches_per_position=100, - JVMmemory=None -): - ''' Restrict input reads to those that align to the given - reference database using LASTAL. - ''' - # convert BAM to paired FASTQ - inReads = mkstempfname('.all.fastq') - tools.samtools.SamtoolsTool().bam2fq(inBam, inReads) - - # look for hits in FASTQ - hitList1 = mkstempfname('.hits') - lastal_get_hits( - inReads, db, hitList1, max_gapless_alignments_per_position, min_length_for_initial_matches, - max_length_for_initial_matches, max_initial_matches_per_position - ) - os.unlink(inReads) - - # merge & uniqify hits - hitList = mkstempfname('.hits') - with open(hitList, 'wt') as outf: - subprocess.check_call(['sort', '-u', hitList1], stdout=outf) - os.unlink(hitList1) - - # filter original BAM file against keep list - tools.picard.FilterSamReadsTool().execute(inBam, False, hitList, outBam, JVMmemory=JVMmemory) - os.unlink(hitList) - - -def parser_filter_lastal_bam(parser=argparse.ArgumentParser()): - parser = parser_lastal_generic(parser) - parser.add_argument("inBam", help="Input reads") - parser.add_argument("db", help="Database of taxa we keep") - parser.add_argument("outBam", help="Output reads, filtered to refDb") parser.add_argument( '--JVMmemory', default=tools.picard.FilterSamReadsTool.jvmMemDefault, diff --git a/test/input/TestLastalDbBuild/expected/TestLastalDbBuild.prj b/test/input/TestLastalDbBuild/expected/TestLastalDbBuild.prj index 4adc40b77..b3a33eb30 100644 --- a/test/input/TestLastalDbBuild/expected/TestLastalDbBuild.prj +++ b/test/input/TestLastalDbBuild/expected/TestLastalDbBuild.prj @@ -1,4 +1,4 @@ -version=719 +version=876 alphabet=ACGT numofsequences=20 numofletters=378200 @@ -7,6 +7,7 @@ maxunsortedinterval=0 keeplowercase=1 masklowercase=0 numofindexes=1 +integersize=32 totallength=378222 specialcharacters=22 prefixlength=13 diff --git a/test/unit/test_read_utils.py b/test/unit/test_read_utils.py index a85169496..1f4fd1537 100644 --- a/test/unit/test_read_utils.py +++ b/test/unit/test_read_utils.py @@ -96,9 +96,7 @@ def test_fastq_to_fasta(self): myInputDir = util.file.get_test_input_path(self) inFastq = os.path.join(myInputDir, 'in.fastq') outFasta = util.file.mkstempfname('.fasta') - 
parser = read_utils.parser_fastq_to_fasta(argparse.ArgumentParser()) - args = parser.parse_args([inFastq, outFasta]) - args.func_main(args) + read_utils.fastq_to_fasta(inFastq, outFasta) # Check that results match expected expectedFasta = os.path.join(myInputDir, 'expected.fasta') diff --git a/test/unit/test_taxon_filter.py b/test/unit/test_taxon_filter.py index 1a8daa9fc..364b44caf 100644 --- a/test/unit/test_taxon_filter.py +++ b/test/unit/test_taxon_filter.py @@ -35,12 +35,12 @@ class TestFilterLastal(TestCaseWithTmp): def setUp(self): TestCaseWithTmp.setUp(self) - polio_fasta = os.path.join( + self.polio_fasta = os.path.join( util.file.get_test_input_path(), 'TestMetagenomicsViralMix', 'db', 'library', 'Viruses', 'Poliovirus_uid15288', 'NC_002058.ffn' ) dbDir = tempfile.mkdtemp() - self.lastdb_path = tools.last.Lastdb().build_database(polio_fasta, os.path.join(dbDir, 'NC_002058')) + self.lastdb_path = tools.last.Lastdb().build_database(self.polio_fasta, os.path.join(dbDir, 'NC_002058')) def test_filter_lastal_bam_polio(self): inBam = os.path.join(util.file.get_test_input_path(), 'TestDepleteHuman', 'expected', 'test-reads.blastn.bam') @@ -72,6 +72,17 @@ def test_lastal_empty_output(self): ) assert_equal_bam_reads(self, outBam, empty_bam) + def test_lastal_unbuilt_db(self): + empty_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam') + in_bam = os.path.join(util.file.get_test_input_path(), 'TestDepleteHuman', 'test-reads-human.bam') + outBam = util.file.mkstempfname('-out-taxfilt.bam') + taxon_filter.filter_lastal_bam( + in_bam, + self.polio_fasta, + outBam + ) + assert_equal_bam_reads(self, outBam, empty_bam) + class TestBmtagger(TestCaseWithTmp): ''' diff --git a/tools/kraken.py b/tools/kraken.py index 53cc9a791..eff09cf1c 100644 --- a/tools/kraken.py +++ b/tools/kraken.py @@ -18,8 +18,8 @@ import util.misc from builtins import super -TOOL_NAME = 'kraken-all' -TOOL_VERSION = '0.10.6_fork2' +TOOL_NAME = 'kraken' +TOOL_VERSION = '0.10.6_fork3' log = logging.getLogger(__name__) diff --git a/tools/last.py b/tools/last.py index 22f13d140..5d5ff399b 100644 --- a/tools/last.py +++ b/tools/last.py @@ -7,12 +7,14 @@ # within this module import util.file +import util.misc import tools +import tools.samtools _log = logging.getLogger(__name__) TOOL_NAME = "last" -TOOL_VERSION = "719" +TOOL_VERSION = "876" class LastTools(tools.Tool): @@ -35,11 +37,41 @@ class Lastal(LastTools): subtool_name = 'lastal' subtool_name_on_broad = 'lastal' - -class MafSort(LastTools): - """ wrapper for maf-sort subtool """ - subtool_name = 'maf-sort' - subtool_name_on_broad = 'scripts/maf-sort.sh' + def get_hits(self, inBam, db, + max_gapless_alignments_per_position=1, + min_length_for_initial_matches=5, + max_length_for_initial_matches=50, + max_initial_matches_per_position=100 + ): + + # convert BAM to interleaved FASTQ + fastq_pipe = tools.samtools.SamtoolsTool().bam2fq_pipe(inBam) + + # run lastal and emit list of read IDs + # -P 0 = use threads = core count + # -N 1 = report at most one alignment per query sequence + # -i 1G = perform work in batches of at most 1GB of query sequence at a time + # -f tab = write output in tab format instead of maf format + cmd = [self.install_and_get_path(), + '-n', max_gapless_alignments_per_position, + '-l', min_length_for_initial_matches, + '-L', max_length_for_initial_matches, + '-m', max_initial_matches_per_position, + '-Q', '1', '-P', '0', '-N', '1', '-i', '1G', '-f', 'tab', + db, + ] + cmd = [str(x) for x in cmd] + _log.debug('| ' + ' '.join(cmd) + ' |') + 
lastal_pipe = subprocess.Popen(cmd, stdin=fastq_pipe, stdout=subprocess.PIPE) + + # strip tab output to just query read ID names and emit + for line in lastal_pipe.stdout: + line = line.decode('UTF-8').rstrip('\n\r') + if not line.startswith('#'): + read_id = line.split('\t')[6] + if read_id.endswith('/1') or read_id.endswith('/2'): + read_id = read_id[:-2] + yield read_id class Lastdb(LastTools): @@ -88,28 +120,17 @@ def execute(self, inputFasta, outputDirectory, outputFilePrefix): # pylint: d if not os.path.exists(outputDirectory): os.makedirs(outputDirectory) - # store the cwd because we will be changing it to the file destination - cwd_before_lastdb = os.getcwd() - # append the prefix given to files created by lastdb tool_cmd.append(outputFilePrefix) # append the input filepath tool_cmd.append(os.path.realpath(inputFasta)) + # execute the lastdb command # lastdb writes files to the current working directory, so we need to set # it to the desired output location - os.chdir(os.path.realpath(outputDirectory)) - - # execute the lastdb command - _log.debug(" ".join(tool_cmd)) - subprocess.check_call(tool_cmd) - - # restore cwd - os.chdir(cwd_before_lastdb) + with util.file.pushd_popd(os.path.realpath(outputDirectory)): + _log.debug(" ".join(tool_cmd)) + subprocess.check_call(tool_cmd) -class MafConvert(LastTools): - """ wrapper for maf-convert subtool """ - subtool_name = 'maf-convert' - subtool_name_on_broad = 'scripts/maf-convert.py' diff --git a/tools/samtools.py b/tools/samtools.py index 0ef310edc..307ce0be3 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -82,6 +82,12 @@ def bam2fq(self, inBam, outFq1, outFq2=None): else: self.execute('bam2fq', ['-1', outFq1, '-2', outFq2, inBam]) + def bam2fq_pipe(self, inBam): + tool_cmd = [self.install_and_get_path(), 'bam2fq', '-n', inBam] + log.debug(' '.join(tool_cmd) + ' |') + p = subprocess.Popen(tool_cmd, stdout=subprocess.PIPE) + return p.stdout + def bam2fa(self, inBam, outFa1, outFa2=None, outFa0=None): args=['-1', outFa1] if outFa2: args += ['-2', outFa2] diff --git a/tools/scripts/noBlastLikeHits.py b/tools/scripts/noBlastLikeHits.py deleted file mode 100755 index d974756d9..000000000 --- a/tools/scripts/noBlastLikeHits.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python -'''Written by Kristian Andersen.''' - -import argparse -import sys - -parser = argparse.ArgumentParser(description='This program outputs to stdout reads that have no blast hits') -parser.add_argument('-b', action="store", dest="blastPath", required=True, help="path to the blast-like hits file") -parser.add_argument('-r', action="store", dest="readsPath", required=True, help="path to the reads file") -parser.add_argument('-m', - action="store", - dest="hit", - required=True, - help="hit => output reads with hits, nohit => output reads with no hits", - choices=['hit', 'nohit']) -args = parser.parse_args() - -# finds the nth and n+1th occurrence of a substring - - -def find_nth(str, substr, n): - pos = list() - i = 0 - for j in range(n + 1): - i = str.find(substr, i + len(substr)) - if (j == n - 1): - pos.append(i) - elif (j == n): - pos.append(i) - break - return pos - - -blastReads = {} -blastFile = open(args.blastPath, 'r') -for line in blastFile: - if line.find('#') != 0: - pos = find_nth(line, '\t', 6) - blastReads[line[pos.pop(0) + 1:pos.pop(0)]] = True -blastFile.close() - -FILE = open(args.readsPath, 'r') -count = 0 -char = FILE.readline()[0:1] -FILE.seek(0) -offsets = list() -# fasta format -if char == '>': - while True: - line = FILE.readline() - if 
not line: - break - if line[0:1] == char: - count += 1 - offsets.append(FILE.tell() - len(line)) -# fastq format -elif char == '@': - while True: - offsets.append(FILE.tell()) - line = FILE.readline() - if not line: - offsets.pop(count) - break - count += 1 - linesToPlus = 0 - while FILE.readline()[0:1] != '+': - linesToPlus += 1 - for i in range(0, linesToPlus): - FILE.readline() -else: - print("Your file does not appear to be fasta or fastq") - sys.exit() - -if args.hit == "nohit": - for i in range(0, len(offsets)): - FILE.seek(offsets[i]) - line = FILE.readline() - if not (line.split()[0][1:] in blastReads): - if i == count - 1: - FILE.seek(offsets[i]) - while True: - line = FILE.readline() - if not line: - break - sys.stdout.write(line) - else: - curOffset = 0 - targetOffset = offsets[i + 1] - offsets[i] - FILE.seek(offsets[i]) - while curOffset != targetOffset: - line = FILE.readline() - curOffset += len(line) - sys.stdout.write(line) -else: - for i in range(0, len(offsets)): - FILE.seek(offsets[i]) - line = FILE.readline() - if line.split()[0][1:] in blastReads: - if i == count - 1: - FILE.seek(offsets[i]) - while True: - line = FILE.readline() - if not line: - break - sys.stdout.write(line) - else: - curOffset = 0 - targetOffset = offsets[i + 1] - offsets[i] - FILE.seek(offsets[i]) - while curOffset != targetOffset: - line = FILE.readline() - curOffset += len(line) - sys.stdout.write(line) -FILE.close()
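
Note on the new streaming path (illustration only, not part of the patch): the refactor above replaces the chunked FASTQ / maf-sort / maf-convert / noBlastLikeHits.py pipeline with a single pass that pipes samtools bam2fq straight into lastal's tab output and collects the query read IDs for Picard FilterSamReads. A minimal standalone sketch of that flow, assuming samtools and lastal are on the PATH and that lastal_db is an already-built LAST database prefix (names and defaults here are illustrative, not the project's API):

import subprocess

def lastal_hit_ids(in_bam, lastal_db,
                   samtools_path='samtools', lastal_path='lastal'):
    """Yield IDs of reads in in_bam that have a LAST hit against lastal_db.

    Mirrors tools.last.Lastal.get_hits: BAM reads are streamed through
    samtools bam2fq into lastal, and only the query-name column of the
    tab-format output is kept.
    """
    # stream interleaved FASTQ out of the BAM (-n: no /1 or /2 suffixes)
    bam2fq = subprocess.Popen(
        [samtools_path, 'bam2fq', '-n', in_bam],
        stdout=subprocess.PIPE)

    lastal = subprocess.Popen(
        [lastal_path,
         '-n', '1', '-l', '5', '-L', '50', '-m', '100',  # seeding limits (same defaults as above)
         '-Q', '1',    # fastq-sanger input
         '-P', '0',    # threads = core count
         '-N', '1',    # at most one alignment per query
         '-i', '1G',   # query batch size
         '-f', 'tab',  # tab output: query name is column 7
         lastal_db],
        stdin=bam2fq.stdout,
        stdout=subprocess.PIPE)
    bam2fq.stdout.close()  # let samtools see SIGPIPE if lastal exits early

    seen = set()  # optional de-duplication (the removed code used `sort -u`)
    for line in lastal.stdout:
        line = line.decode('utf-8').rstrip('\r\n')
        if not line or line.startswith('#'):
            continue
        read_id = line.split('\t')[6]
        if read_id.endswith('/1') or read_id.endswith('/2'):
            read_id = read_id[:-2]
        if read_id not in seen:
            seen.add(read_id)
            yield read_id

    lastal.wait()
    bam2fq.wait()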
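
The Lastdb.execute change leans on util.file.pushd_popd, which is not shown in this diff. A typical implementation of such a helper looks roughly like the following (an assumption, not the project's actual code); the practical gain over the removed os.chdir()/restore pair is that the original working directory is restored even if the lastdb subprocess raises:

import contextlib
import os

@contextlib.contextmanager
def pushd_popd(path):
    """Temporarily change the working directory, restoring it afterwards."""
    prev = os.getcwd()
    os.chdir(path)
    try:
        yield path
    finally:
        # runs on both success and exception, unlike a bare os.chdir() pair
        os.chdir(prev)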