moved count_fastq_reads() to util/file.py; refactored to use grep cou…

…nting on non-gzip files moved count_fastq_reads() to util/file.py; refactored to use grep counting on non-gzip files which seems to be faster on Pythons <3.5. changed fasta_length() to use grep_count() which is also used by count_fastq_reads()
broadinstitute · Jun 16, 2016 · d40d511 · d40d511
1 parent c98f1e9
commit d40d511
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 17 deletions.
diff --git a/assembly.py b/assembly.py
@@ -54,13 +54,6 @@ def __init__(self, n_start, n_trimmed, n_rmdup, n_purge, n_subsamp):
                 n_start, n_trimmed, n_rmdup, n_purge, n_subsamp))
 
 
-def count_fastq_reads(inFastq):
-    ''' Maybe move this to util.file one day '''
-    with util.file.open_or_gzopen(inFastq, 'rt') as inf:
-        n = sum(1 for line in inf if line.startswith('@'))
-    return n
-
-
 def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
     ''' Take reads through Trimmomatic, Prinseq, and subsampling.
         This should probably move over to read_utils or taxon_filter.
@@ -69,7 +62,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
     # BAM -> fastq
     infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq']))
     tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1])
-    n_input = count_fastq_reads(infq[0])
+    n_input = util.file.count_fastq_reads(infq[0])
 
     # Trimmomatic
     trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq']))
@@ -80,7 +73,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
         taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb)
     os.unlink(infq[0])
     os.unlink(infq[1])
-    n_trim = count_fastq_reads(trimfq[0])
+    n_trim = util.file.count_fastq_reads(trimfq[0])
 
     # Prinseq
     rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq']))
@@ -91,7 +84,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
             read_utils.rmdup_prinseq_fastq(trimfq[i], rmdupfq[i])
     os.unlink(trimfq[0])
     os.unlink(trimfq[1])
-    n_rmdup = count_fastq_reads(rmdupfq[0])
+    n_rmdup = util.file.count_fastq_reads(rmdupfq[0])
 
     # Purge unmated
     purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq']))
@@ -102,7 +95,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
         read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
     os.unlink(rmdupfq[0])
     os.unlink(rmdupfq[1])
-    n_purge = count_fastq_reads(purgefq[0])
+    n_purge = util.file.count_fastq_reads(purgefq[0])
 
     # Log count
     log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n_purge)
@@ -127,7 +120,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
         util.misc.run_and_print(cmd, check=True)
     os.unlink(purgefq[0])
     os.unlink(purgefq[1])
-    n_subsamp = count_fastq_reads(subsampfq[0])
+    n_subsamp = util.file.count_fastq_reads(subsampfq[0])
 
     # Fastq -> BAM
     # Note: this destroys RG IDs! We should instead frun the BAM->fastq step in a way

diff --git a/util/file.py b/util/file.py
@@ -390,13 +390,60 @@ def string_to_file_name(string_value):
 
     return string_value
 
-def fasta_length(fasta_path):
-    if not os.path.isfile(fasta_path) or os.path.getsize(fasta_path)==0:
+def grep_count(file_path, to_match, additional_flags=None, fixed_mode=True, starts_with=False):
+    '''
+        This uses grep for fast counting of strings in a file
+    '''
+    if not os.path.isfile(file_path) or os.path.getsize(file_path)==0:
         return 0
 
     env = os.environ.copy()
     env['LC_ALL'] = 'C' #use C locale rather than UTF8 for faster grep
-
-    # grep for record start characters, in fixed-string mode (no regex)
-    number_of_seqs = util.misc.run_and_print(["grep", "-cF", ">", fasta_path], silent=False, check=True, env=env)
+
+    cmd = ["grep"]
+    # '-c' returns the match count
+    cmd.append("-c")
+    if additional_flags:
+        cmd.extend(additional_flags)
+
+    # fixed mode cannot be used with starts_with, since it does not match regular expressions
+    # only add the fixed_mode flag if we're not using starts_with
+    if not starts_with:
+        if fixed_mode:
+            cmd.append("-F")
+        cmd.append(to_match)
+    else:
+        cmd.append("^"+to_match)
+
+    cmd.append(file_path)
+
+    number_of_seqs = util.misc.run_and_print(cmd, silent=False, check=True, env=env)
     return int(number_of_seqs.stdout.decode("utf-8").rstrip(os.linesep))
+
+def count_str_in_file(in_file, query_str, starts_with=False):
+    if not os.path.isfile(in_file) or os.path.getsize(in_file)==0:
+        return 0
+
+    if in_file.endswith('.gz'):
+        n = 0
+        with gzip.open(in_file, 'rt') as inf:
+            if starts_with:
+                n = sum(1 for line in inf if line[0]==query_str)
+            else:
+                n = sum(1 for line in inf if query_str in line)
+        return n
+    # use grep count for non-gzip files since it seems to be faster than native on Pythons <3.5
+    else:
+        return grep_count(in_file, query_str, starts_with=starts_with)
+
+def fasta_length(fasta_path):
+    '''
+        Count number of records in fasta file
+    '''
+    return count_str_in_file(fasta_path, '>', starts_with=True)
+
+def count_fastq_reads(inFastq):
+    '''
+        Count number of reads in fastq file
+    '''
+    return count_str_in_file(inFastq, '@', starts_with=True)