Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up writing of FITS tables with string columns #6920

Merged
merged 12 commits into from Dec 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGES.rst
Expand Up @@ -47,6 +47,9 @@ astropy.io.fits
in particular for cases where the tables contain one or more string columns
and when done through ``Table.read``. [#6821]

- The performance for writing tables from ``Table.write`` has now been
significantly improved for tables containing one or more string columns. [#6920]

- The ``Table.read`` now supports a ``memmap=`` keyword argument to control
whether or not to use memory mapping when reading the table. [#6821]

Expand All @@ -56,6 +59,12 @@ astropy.io.fits
the same columns are decoded to Unicode strings (Numpy type U) which uses more
memory. [#6821]

- The ``table_to_hdu`` function and the ``BinTableHDU.from_columns`` and
``FITS_rec.from_columns`` methods now include a ``character_as_bytes``
keyword argument - if set to `True`, then when string columns are accessed,
byte columns will be returned, which can provide significantly improved
performance. [#6920]

astropy.io.misc
^^^^^^^^^^^^^^^

Expand Down
2 changes: 1 addition & 1 deletion astropy/io/fits/connect.py
Expand Up @@ -236,7 +236,7 @@ def write_table_fits(input, output, overwrite=False):
Whether to overwrite any existing file without warning.
"""

table_hdu = table_to_hdu(input)
table_hdu = table_to_hdu(input, character_as_bytes=True)
Copy link
Member Author

@astrofrog astrofrog Dec 2, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just FYI, this is deliberate - there are no situations I can think of under which we should expose character_as_bytes as a keyword argument in write_table_fits (unlike for reading)


# Check if output file already exists
if isinstance(output, str) and os.path.exists(output):
Expand Down
10 changes: 7 additions & 3 deletions astropy/io/fits/convenience.py
Expand Up @@ -423,7 +423,7 @@ def writeto(filename, data, header=None, output_verify='exception',
checksum=checksum)


def table_to_hdu(table):
def table_to_hdu(table, character_as_bytes=False):
"""
Convert an `~astropy.table.Table` object to a FITS
`~astropy.io.fits.BinTableHDU`.
Expand All @@ -432,6 +432,10 @@ def table_to_hdu(table):
----------
table : astropy.table.Table
The table to convert.
character_as_bytes : bool
Whether to return bytes for string columns when accessed from the HDU.
By default this is `False` and (unicode) strings are returned, but for
large tables this may use up a lot of memory.

Returns
-------
Expand Down Expand Up @@ -474,7 +478,7 @@ def table_to_hdu(table):

# TODO: it might be better to construct the FITS table directly from
# the Table columns, rather than go via a structured array.
table_hdu = BinTableHDU.from_columns(np.array(table.filled()), header=hdr)
table_hdu = BinTableHDU.from_columns(np.array(table.filled()), header=hdr, character_as_bytes=True)
for col in table_hdu.columns:
# Binary FITS tables support TNULL *only* for integer data columns
# TODO: Determine a schema for handling non-integer masked columns
Expand All @@ -491,7 +495,7 @@ def table_to_hdu(table):

col.null = fill_value.astype(table[col.name].dtype)
else:
table_hdu = BinTableHDU.from_columns(np.array(table.filled()), header=hdr)
table_hdu = BinTableHDU.from_columns(np.array(table.filled()), header=hdr, character_as_bytes=character_as_bytes)

# Set units for output HDU
for col in table_hdu.columns:
Expand Down
20 changes: 3 additions & 17 deletions astropy/io/fits/fitsrec.py
Expand Up @@ -15,7 +15,7 @@
from .column import (ASCIITNULL, FITS2NUMPY, ASCII2NUMPY, ASCII2STR, ColDefs,
_AsciiColDefs, _FormatX, _FormatP, _VLF, _get_index,
_wrapx, _unwrapx, _makep, Delayed)
from .util import decode_ascii, encode_ascii
from .util import decode_ascii, encode_ascii, _rstrip_inplace
from ...utils import lazyproperty


Expand Down Expand Up @@ -267,7 +267,7 @@ def _init(self):
self._uint = False

@classmethod
def from_columns(cls, columns, nrows=0, fill=False):
def from_columns(cls, columns, nrows=0, fill=False, character_as_bytes=False):
"""
Given a `ColDefs` object of unknown origin, initialize a new `FITS_rec`
object.
Expand Down Expand Up @@ -329,6 +329,7 @@ def from_columns(cls, columns, nrows=0, fill=False):
raw_data = np.empty(columns.dtype.itemsize * nrows, dtype=np.uint8)
raw_data.fill(ord(columns._padding_byte))
data = np.recarray(nrows, dtype=columns.dtype, buf=raw_data).view(cls)
data._character_as_bytes = character_as_bytes

# Make sure the data is a listener for changes to the columns
columns._add_listener(data)
Expand Down Expand Up @@ -1283,21 +1284,6 @@ def _get_recarray_field(array, key):
return field


def _rstrip_inplace(array, chars=None):
"""
Performs an in-place rstrip operation on string arrays.
This is necessary since the built-in `np.char.rstrip` in Numpy does not
perform an in-place calculation. This can be removed if ever
https://github.com/numpy/numpy/issues/6303 is implemented (however, for
the purposes of this module the only in-place vectorized string functions
we need are rstrip and encode).
"""

for item in np.nditer(array, flags=['zerosize_ok'],
op_flags=['readwrite']):
item[...] = item.item().rstrip(chars)


class _UnicodeArrayEncodeError(UnicodeEncodeError):
def __init__(self, encoding, object_, start, end, reason, index):
super().__init__(encoding, object_, start, end, reason)
Expand Down
12 changes: 9 additions & 3 deletions astropy/io/fits/hdu/table.py
Expand Up @@ -71,7 +71,7 @@ def match_header(cls, header):

@classmethod
def from_columns(cls, columns, header=None, nrows=0, fill=False,
**kwargs):
character_as_bytes=False, **kwargs):
"""
Given either a `ColDefs` object, a sequence of `Column` objects,
or another table HDU or table data (a `FITS_rec` or multi-field
Expand Down Expand Up @@ -111,6 +111,11 @@ def from_columns(cls, columns, header=None, nrows=0, fill=False,
copy the data from input, undefined cells will still be filled with
zeros/blanks.

character_as_bytes : bool
Whether to return bytes for string columns when accessed from the
HDU. By default this is `False` and (unicode) strings are returned,
but for large tables this may use up a lot of memory.

Notes
-----

Expand All @@ -119,8 +124,9 @@ def from_columns(cls, columns, header=None, nrows=0, fill=False,
"""

coldefs = cls._columns_type(columns)
data = FITS_rec.from_columns(coldefs, nrows=nrows, fill=fill)
hdu = cls(data=data, header=header, **kwargs)
data = FITS_rec.from_columns(coldefs, nrows=nrows, fill=fill,
character_as_bytes=character_as_bytes)
hdu = cls(data=data, header=header, character_as_bytes=character_as_bytes, **kwargs)
coldefs._add_listener(hdu)
return hdu

Expand Down
37 changes: 36 additions & 1 deletion astropy/io/fits/tests/test_util.py
Expand Up @@ -7,6 +7,7 @@

import pytest
import numpy as np
from numpy.testing import assert_equal

try:
from PIL import Image
Expand All @@ -16,7 +17,7 @@

from ....tests.helper import catch_warnings
from .. import util
from ..util import ignore_sigint
from ..util import ignore_sigint, _rstrip_inplace
from .._numpy_hacks import realign_dtype

from . import FitsTestCase
Expand Down Expand Up @@ -148,3 +149,37 @@ def test_mode_normalization(self):
filename = self.temp('test3{0}.dat'.format(num))
with open(filename, mode) as fileobj:
assert util.fileobj_mode(fileobj) == res


def test_rstrip_inplace():

# Incorrect type
s = np.array([1, 2, 3])
with pytest.raises(TypeError) as exc:
_rstrip_inplace(s)
assert exc.value.args[0] == 'This function can only be used on string arrays'

# Bytes array
s = np.array(['a ', ' b', ' c c '], dtype='S6')
_rstrip_inplace(s)
assert_equal(s, np.array(['a', ' b', ' c c'], dtype='S6'))

# Unicode array
s = np.array(['a ', ' b', ' c c '], dtype='U6')
_rstrip_inplace(s)
assert_equal(s, np.array(['a', ' b', ' c c'], dtype='U6'))

# 2-dimensional array
s = np.array([['a ', ' b'], [' c c ', ' a ']], dtype='S6')
_rstrip_inplace(s)
assert_equal(s, np.array([['a', ' b'], [' c c', ' a']], dtype='S6'))

# 3-dimensional array
s = np.repeat(' a a ', 24).reshape((2, 3, 4))
_rstrip_inplace(s)
assert_equal(s, ' a a')

# 3-dimensional non-contiguous array
s = np.repeat(' a a ', 1000).reshape((10, 10, 10))[:2, :3, :4]
_rstrip_inplace(s)
assert_equal(s, ' a a')
50 changes: 50 additions & 0 deletions astropy/io/fits/util.py
Expand Up @@ -873,3 +873,53 @@ def get_testdata_filepath(filename):
"""
return data.get_pkg_data_filename(
'io/fits/tests/data/{}'.format(filename), 'astropy')


def _rstrip_inplace(array):
"""
Performs an in-place rstrip operation on string arrays. This is necessary
since the built-in `np.char.rstrip` in Numpy does not perform an in-place
calculation.
"""

# The following implementation convert the string to unsigned integers of
# the right length. Trailing spaces (which are represented as 32) are then
# converted to null characters (represented as zeros). To avoid creating
# large temporary mask arrays, we loop over chunks (attempting to do that
# on a 1-D version of the array; large memory may still be needed in the
# unlikely case that a string array has small first dimension and cannot
# be represented as a contiguous 1-D array in memory).

dt = array.dtype

if dt.kind not in 'SU':
raise TypeError("This function can only be used on string arrays")
# View the array as appropriate integers. The last dimension will
# equal the number of characters in each string.
bpc = 1 if dt.kind == 'S' else 4
dt_int = "{0}{1}u{2}".format(dt.itemsize // bpc, dt.byteorder, bpc)
b = array.view(dt_int, np.ndarray)
# For optimal speed, work in chunks of the internal ufunc buffer size.
bufsize = np.getbufsize()
# Attempt to have the strings as a 1-D array to give the chunk known size.
# Note: the code will work if this fails; the chunks will just be larger.
if b.ndim > 2:
try:
b.shape = -1, b.shape[-1]
except AttributeError: # can occur for non-contiguous arrays
pass
for j in range(0, b.shape[0], bufsize):
c = b[j:j + bufsize]
# Mask which will tell whether we're in a sequence of trailing spaces.
mask = np.ones(c.shape[:-1], dtype=bool)
# Loop over the characters in the strings, in reverse order. We process
# the i-th character of all strings in the chunk at the same time. If
# the character is 32, this corresponds to a space, and we then change
# this to 0. We then construct a new mask to find rows where the
# i-th character is 0 (null) and the i-1-th is 32 (space) and repeat.
for i in range(-1, -c.shape[-1], -1):
mask &= c[..., i] == 32
c[..., i][mask] = 0
mask = c[..., i] == 0

return array