Made a number of minor improvements:

- Permit input from a file provided as an argument, maintaining the ability to input via STDIN
- Transparently permit input of a gzipped FASTQ, when provided with a FASTQ filename ending in '.gz'
- Add useful '__future__' imports
- Refactor print statements to use the new syntax and use '.format'
- Enforce PEP8 compliance
- Change exit code upon error to a non-zero value (1)
- Minor refactoring, removing redundant traces
- Add an appropriate shebang
- Revise internal documentation, adding common use-cases
brentp · Jul 12, 2017 · 95adaa7 · 95adaa7 · david-wb · Aug 3, 2017
1 parent 845c3e1
commit 95adaa7
Showing 1 changed file with 58 additions and 15 deletions.
diff --git a/reads-utils/guess-encoding.py b/reads-utils/guess-encoding.py
@@ -1,10 +1,21 @@
+#!/usr/bin/env python
+
 """
-   awk 'NR % 4 == 0' your.fastq | python %prog [options]
+Guess the encoding of a stream of qual lines.
+
+Accepts only quality scores as input, either on STDIN or
+from a file provided as an argument.
 
-guess the encoding of a stream of qual lines.
+Use cases: `awk 'NR % 4 == 0' <FASTQ> | %prog [options]`,
+           `%prog [options] <quality scores file>`,
+           `samtools view <BAM file> | cut -f 5 | %prog [options]`
 """
-import sys
+
+from __future__ import with_statement, division, print_function
+
+import fileinput
 import optparse
+import sys
 
 #  Note that the theoretical maximum for all encodings is 126.
 #  The upper limits below are for "typical" data only.
@@ -26,13 +37,15 @@ def get_qual_range(qual_str):
     vals = [ord(c) for c in qual_str]
     return min(vals), max(vals)
 
+
 def get_encodings_in_range(rmin, rmax, ranges=RANGES):
     valid_encodings = []
     for encoding, (emin, emax) in ranges.items():
         if rmin >= emin and rmax <= emax:
             valid_encodings.append(encoding)
     return valid_encodings
 
+
 def main():
     p = optparse.OptionParser(__doc__)
     p.add_option("-n", dest="n", help="number of qual lines to test default:-1"
@@ -41,30 +54,60 @@ def main():
                  type='int', default=-1)
 
     opts, args = p.parse_args()
-    print >>sys.stderr, "# reading qualities from stdin"
-    gmin, gmax  = 99, 0
+
+    if len(args) > 1:
+        print("Only a single input file is supported.", file=sys.stderr)
+        sys.exit(1)
+
+    gmin = 99
+    gmax = 0
     valid = []
-    for i, line in enumerate(sys.stdin):
+
+    err_exit = False
+
+    input_file = fileinput.input(args, openhook=fileinput.hook_compressed)
+
+    for i, line in enumerate(input_file):
+        if i == 0:
+            input_filename_for_disp = fileinput.filename()
+
+            if fileinput.isstdin():
+                input_filename_for_disp = 'STDIN'
+
+            print("# reading qualities from "
+                  "{}".format(input_filename_for_disp), file=sys.stderr)
+
         lmin, lmax = get_qual_range(line.rstrip())
+
         if lmin < gmin or lmax > gmax:
             gmin, gmax = min(lmin, gmin), max(lmax, gmax)
             valid = get_encodings_in_range(gmin, gmax)
+
             if len(valid) == 0:
-                print >>sys.stderr, "no encodings for range: %s" % str((gmin, gmax))
-                sys.exit()
+                print("no encodings for range: "
+                      "{}".format((gmin, gmax)), file=sys.stderr)
+                err_exit = True
+                break
+
             if len(valid) == 1 and opts.n == -1:
-                print "\t".join(valid) + "\t" + str((gmin, gmax))
-                sys.exit()
+                # parsed entire file and found unique guess
+                break
 
         if opts.n > 0 and i > opts.n:
-            print "\t".join(valid) + "\t" + str((gmin, gmax))
-            sys.exit()
+            # parsed up to specified portion; return current guess(es)
+            break
+
+    input_file.close()
 
-    print "\t".join(valid) + "\t" + str((gmin, gmax))
+    if err_exit:
+        sys.exit(1)
+    else:
+        print("{}\t{}\t{}".format(valid, gmin, gmax),
+              file=sys.stderr)
 
 
 if __name__ == "__main__":
     import doctest
-    if doctest.testmod(optionflags=doctest.ELLIPSIS |\
-                                   doctest.NORMALIZE_WHITESPACE).failed == 0:
+    if doctest.testmod(optionflags=doctest.ELLIPSIS |
+                       doctest.NORMALIZE_WHITESPACE).failed == 0:
         main()