From d40d51170c2cce442c18372270b8ce8a20070461 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 16 Jun 2016 13:01:16 -0400 Subject: [PATCH] moved count_fastq_reads() to util/file.py; refactored to use grep counting on non-gzip files moved count_fastq_reads() to util/file.py; refactored to use grep counting on non-gzip files which seems to be faster on Pythons <3.5. changed fasta_length() to use grep_count() which is also used by count_fastq_reads() --- assembly.py | 17 +++++----------- util/file.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/assembly.py b/assembly.py index 44f7dbe17..7cf48cfbd 100755 --- a/assembly.py +++ b/assembly.py @@ -54,13 +54,6 @@ def __init__(self, n_start, n_trimmed, n_rmdup, n_purge, n_subsamp): n_start, n_trimmed, n_rmdup, n_purge, n_subsamp)) -def count_fastq_reads(inFastq): - ''' Maybe move this to util.file one day ''' - with util.file.open_or_gzopen(inFastq, 'rt') as inf: - n = sum(1 for line in inf if line.startswith('@')) - return n - - def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): ''' Take reads through Trimmomatic, Prinseq, and subsampling. This should probably move over to read_utils or taxon_filter. @@ -69,7 +62,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): # BAM -> fastq infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq'])) tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1]) - n_input = count_fastq_reads(infq[0]) + n_input = util.file.count_fastq_reads(infq[0]) # Trimmomatic trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq'])) @@ -80,7 +73,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb) os.unlink(infq[0]) os.unlink(infq[1]) - n_trim = count_fastq_reads(trimfq[0]) + n_trim = util.file.count_fastq_reads(trimfq[0]) # Prinseq rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq'])) @@ -91,7 +84,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): read_utils.rmdup_prinseq_fastq(trimfq[i], rmdupfq[i]) os.unlink(trimfq[0]) os.unlink(trimfq[1]) - n_rmdup = count_fastq_reads(rmdupfq[0]) + n_rmdup = util.file.count_fastq_reads(rmdupfq[0]) # Purge unmated purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq'])) @@ -102,7 +95,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1]) os.unlink(rmdupfq[0]) os.unlink(rmdupfq[1]) - n_purge = count_fastq_reads(purgefq[0]) + n_purge = util.file.count_fastq_reads(purgefq[0]) # Log count log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n_purge) @@ -127,7 +120,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): util.misc.run_and_print(cmd, check=True) os.unlink(purgefq[0]) os.unlink(purgefq[1]) - n_subsamp = count_fastq_reads(subsampfq[0]) + n_subsamp = util.file.count_fastq_reads(subsampfq[0]) # Fastq -> BAM # Note: this destroys RG IDs! We should instead frun the BAM->fastq step in a way diff --git a/util/file.py b/util/file.py index 6392b5b38..ee098dafe 100644 --- a/util/file.py +++ b/util/file.py @@ -390,13 +390,60 @@ def string_to_file_name(string_value): return string_value -def fasta_length(fasta_path): - if not os.path.isfile(fasta_path) or os.path.getsize(fasta_path)==0: +def grep_count(file_path, to_match, additional_flags=None, fixed_mode=True, starts_with=False): + ''' + This uses grep for fast counting of strings in a file + ''' + if not os.path.isfile(file_path) or os.path.getsize(file_path)==0: return 0 env = os.environ.copy() env['LC_ALL'] = 'C' #use C locale rather than UTF8 for faster grep - - # grep for record start characters, in fixed-string mode (no regex) - number_of_seqs = util.misc.run_and_print(["grep", "-cF", ">", fasta_path], silent=False, check=True, env=env) + + cmd = ["grep"] + # '-c' returns the match count + cmd.append("-c") + if additional_flags: + cmd.extend(additional_flags) + + # fixed mode cannot be used with starts_with, since it does not match regular expressions + # only add the fixed_mode flag if we're not using starts_with + if not starts_with: + if fixed_mode: + cmd.append("-F") + cmd.append(to_match) + else: + cmd.append("^"+to_match) + + cmd.append(file_path) + + number_of_seqs = util.misc.run_and_print(cmd, silent=False, check=True, env=env) return int(number_of_seqs.stdout.decode("utf-8").rstrip(os.linesep)) + +def count_str_in_file(in_file, query_str, starts_with=False): + if not os.path.isfile(in_file) or os.path.getsize(in_file)==0: + return 0 + + if in_file.endswith('.gz'): + n = 0 + with gzip.open(in_file, 'rt') as inf: + if starts_with: + n = sum(1 for line in inf if line[0]==query_str) + else: + n = sum(1 for line in inf if query_str in line) + return n + # use grep count for non-gzip files since it seems to be faster than native on Pythons <3.5 + else: + return grep_count(in_file, query_str, starts_with=starts_with) + +def fasta_length(fasta_path): + ''' + Count number of records in fasta file + ''' + return count_str_in_file(fasta_path, '>', starts_with=True) + +def count_fastq_reads(inFastq): + ''' + Count number of reads in fastq file + ''' + return count_str_in_file(inFastq, '@', starts_with=True)