Merge pull request #527 from rainwoodman/bigfile-ds-attr
Aggregate attrs of header and the main datasets.
rainwoodman committed Oct 3, 2018
2 parents 1ce197c + e940631 commit 35609f8
Showing 2 changed files with 60 additions and 26 deletions.
65 changes: 44 additions & 21 deletions nbodykit/io/bigfile.py
@@ -8,6 +8,8 @@
from six import string_types
import json
from nbodykit.utils import JSONDecoder
+from fnmatch import fnmatch


class Automatic: pass

@@ -31,10 +33,14 @@ class BigFile(FileType):
    exclude : list of str, optional
        the data sets to exclude from loading within bigfile; default
        is the header. If any list is given, the name of the header column
-        must be given too if it is not part of the data set.
-    header : str, optional
+        must be given too if it is not part of the data set. The names
+        are shell glob patterns.
+    header : str, or list, optional
        the path to the header; default is to use a column 'Header'.
+        It is relative to the file, not the dataset.
+        If a list is provided, the attributes are updated from the first entry to the last.
    dataset : str
        finding columns from a specific dataset in the bigfile;
        the default is to start looking for columns from the root.
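
For orientation, a minimal usage sketch of the options documented above. The file path, the '1/Extra*' block name, and the import path are assumptions; exclude patterns follow shell-glob rules, header entries are applied first to last, and dataset selects the sub-tree to read columns from:

    # hypothetical snapshot with a 'Header' block and a '1/' dataset
    from nbodykit.io.bigfile import BigFile

    f = BigFile('snapshot.bigfile',
                dataset='1',                  # read columns from the '1/' sub-dataset
                header=['Header', 'header'],  # attrs merged from first entry to last
                exclude=['1/Extra*'])         # shell-glob patterns of blocks to skip
    print(f.size, f.dtype.names, sorted(f.attrs))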
@@ -53,47 +59,64 @@ def __init__(self, path, exclude=None, header=Automatic, dataset='./'):

        # the file path
        with bigfile.BigFile(filename=path) as ff:
-            columns = ff[self.dataset].blocks
-            header = self._find_header(header, ff)
+            columns = [block for block in ff[self.dataset].blocks]
+            headers = self._find_headers(header, dataset, ff)

            if exclude is None:
                # by default exclude header only.
-                exclude = [header]
+                exclude = headers

            if not isinstance(exclude, (list, tuple)):
                exclude = [exclude]

-            columns = list(set(columns) - set(exclude))
+            columns = [
+                column
+                for column in set(columns) if not any(fnmatch(column, e) for e in exclude)
+            ]

            ds = bigfile.BigData(ff[self.dataset], columns)

            # set the data type and size
            self.dtype = ds.dtype
            self.size = ds.size

-            header = ff[header]
-            attrs = header.attrs
+            headers = [ff[header] for header in headers]
+            all_attrs = [ header.attrs for header in headers ]
+            for attrs in all_attrs:
+                # copy over the attrs
+                for k in attrs.keys():

-            # copy over the attrs
-            for k in attrs.keys():
+                    # load a JSON representation if str starts with json://
+                    if isinstance(attrs[k], string_types) and attrs[k].startswith('json://'):
+                        self.attrs[k] = json.loads(attrs[k][7:], cls=JSONDecoder)
+                    # copy over an array
+                    else:
+                        self.attrs[k] = numpy.array(attrs[k], copy=True)

-                # load a JSON representation if str starts with json://
-                if isinstance(attrs[k], string_types) and attrs[k].startswith('json://'):
-                    self.attrs[k] = json.loads(attrs[k][7:], cls=JSONDecoder)
-                # copy over an array
-                else:
-                    self.attrs[k] = numpy.array(attrs[k], copy=True)

-    def _find_header(self, header, ff):
+    def _find_headers(self, header, dataset, ff):
        """ Find header from the file block by default. """
        if header is Automatic:
-            for header in ['Header', 'header', '.']:
-                if header in ff.blocks: break
+            header = ['Header', 'header', '.']
+
+        if not isinstance(header, (tuple, list)):
+            header = [header]
+
+        r = []
+        for h in header:
+            if h in ff.blocks:
+                if h not in r:
+                    r.append(h)
+
+        # append the dataset itself
+        r.append(dataset.strip('/') + '/.')

        # shall not make the assertion here because header can be nested deep.
        # then not shown in ff.blocks. try catch may work better.
        #if not header in ff.blocks:
        #    raise KeyError("header block `%s` is not defined in the bigfile. Candidates can be `%s`"
        #        % (header, str(ff.blocks))

-        return header
+        return r

    def read(self, columns, start, stop, step=1):
        """
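As a reference for the glob-based filtering introduced above, a self-contained sketch (the column names are made up). fnmatch does not treat '/' specially, so a pattern like '1/*' matches every block under the '1/' dataset:

    from fnmatch import fnmatch

    columns = ['Position', 'Velocity', 'Header', '1/Position', '1/Velocity']
    exclude = ['Header', '1/*']  # same shell-glob semantics as the new exclude argument

    kept = [c for c in columns if not any(fnmatch(c, e) for e in exclude)]
    print(kept)  # ['Position', 'Velocity']
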
21 changes: 16 additions & 5 deletions nbodykit/io/tests/test_bigfile.py
@@ -22,8 +22,18 @@ def temporary_data():
            with tmpff.create("Velocity", dtype=('f4', 3), size=1024) as bb:
                bb.write(0, data['Velocity'])
            with tmpff.create("Header") as bb:
-                bb.attrs['Size'] = 1024.
-
+                bb.attrs['Size'] = 1024
+                bb.attrs['Over'] = 0
+
+            with tmpff.create('1/.') as bb:
+                bb.attrs['Size'] = 1024
+                bb.attrs['Over'] = 1024
+
+            with tmpff.create("1/Position", dtype=('f4', 3), size=1024) as bb:
+                bb.write(0, data['Position'])
+            with tmpff.create("1/Velocity", dtype=('f4', 3), size=1024) as bb:
+                bb.write(0, data['Velocity'])
+
        yield (data, tmpdir)
    except:
        raise
@@ -35,7 +45,7 @@ def test_data(comm):

    with temporary_data() as (data, tmpfile):
        # read
-        ff = BigFile(tmpfile, header='Header')
+        ff = BigFile(tmpfile, header='Header', dataset='1')

        # check size
        assert ff.attrs['Size'] == 1024
@@ -47,18 +57,19 @@ def test_data(comm):
@MPITest([1])
def test_data_auto_header(comm):
    with temporary_data() as (data, tmpfile):
-        ff = BigFile(tmpfile)
+        ff = BigFile(tmpfile, dataset='1')

        # check size
        assert ff.attrs['Size'] == 1024
+        assert ff.attrs['Over'] == 1024

@MPITest([1])
def test_pickle(comm):

    with temporary_data() as (data, tmpfile):

        # read
-        ff = BigFile(tmpfile, header='Header')
+        ff = BigFile(tmpfile, header='Header', exclude=['1/*', 'Header'])

        # pickle
        s = pickle.dumps(ff)
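The updated tests exercise the aggregation order: attrs are copied header by header, first to last, and _find_headers appends the dataset's own '.' block last, so its values win. A plain-dict sketch of that order, with attribute values mirroring the test fixture above:

    attrs = {}
    for block_attrs in [{'Size': 1024, 'Over': 0},       # 'Header'
                        {'Size': 1024, 'Over': 1024}]:   # '1/.' -- the dataset itself
        attrs.update(block_attrs)
    print(attrs['Over'])  # 1024, matching test_data_auto_header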
