Permalink
Browse files

add skip_while

  • Loading branch information...
1 parent a0ff1f4 commit 5fae59063ad40ba8adfd2a60a562e47d95f7203b @brentp committed Nov 1, 2013
Showing with 36 additions and 1 deletion.
  1. +5 −0 NEWS.txt
  2. +6 −0 README.rst
  3. +12 −1 toolshed/files.py
  4. +13 −0 toolshed/tests/files_tests.py
View
5 NEWS.txt
@@ -1,5 +1,10 @@
News
====
+0.3.2
+-----
+add skip_while kwarg to reader(). It takes a function that accepts the list
+of tokens from a line; leading lines are skipped for as long as it returns True.
+
0.3.0
-----
utilities for working with multiprocessing pools.
View
6 README.rst
@@ -64,6 +64,12 @@ you may need to send stdin to a proc:
{'number': '1'}
{'number': '3'}
+In addition, you can skip the leading lines of a file by passing a ``skip_while`` function like::
+
+ skipper = lambda toks: toks[0].startswith('#')
+ for d in reader('file-with-extra-header.txt', skip_while=skipper):
+ do_stuff(d)
+
Pools
-----
View
13 toolshed/files.py
@@ -138,12 +138,16 @@ def xls_reader(f, sheet=0):
for irow in range(ws.nrows):
yield map(str, ws.row_values(irow))
-def reader(fname, header=True, sep="\t"):
+def reader(fname, header=True, sep="\t", skip_while=None):
r"""
for each row in the file `fname` generate dicts if `header` is True
or lists if `header` is False. The dict keys are drawn from the first
line. If `header` is a list of names, those will be used as the dict
keys.
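+ skip_while is a function that takes the list of tokens from a line and
+ returns True for as long as leading lines should be skipped; consumption
+ starts at the first line for which it returns False. This could be
+ something like: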
+
+ skip_while = lambda toks: toks[0].startswith('#')
>>> from StringIO import StringIO
>>> get_str = lambda : StringIO("a\tb\tname\n1\t2\tfred\n11\t22\tjane")
@@ -180,6 +184,13 @@ def _re_line_gen(f, sep):
yield sep.split(line.rstrip("\r\n"))
line_gen = _re_line_gen(fname, sep)
+ if skip_while:
+ from itertools import chain
+ l = line_gen.next()
+ while skip_while(l):
+ l = line_gen.next()
+ line_gen = chain.from_iterable(([l], line_gen))
+
# they sent in a class or function that accepts the toks.
if callable(header):
for toks in line_gen:
View
13 toolshed/tests/files_tests.py
@@ -22,6 +22,19 @@ def test_split_None():
assert toks == ['asdf', '123', 'abc'], toks
os.unlink('tt.tmp')
+def test_skip_until():
+
+ with open('tta.tmp', 'w') as fh:
+ print >>fh, """#a\n#b\n#c\na\tb"""
+
+ fiter = reader('tta.tmp', header=False, sep="\t",
+ skip_while=lambda toks: toks[0].startswith("#"))
+ f = list(fiter)
+ assert len(f) == 1, f
+ assert f[0] == ["a", "b"], f
+ #os.unlink('tta.tmp')
+
+
def test_split_regex():
fh = open('tt.tmp', 'w')
print >>fh, "asdf123abc"

0 comments on commit 5fae590

Please sign in to comment.