From c7061a81002a73ad43b226d262e96ad69c4a0e31 Mon Sep 17 00:00:00 2001 From: Sam Lee Date: Mon, 12 Feb 2024 10:42:49 -0800 Subject: [PATCH] Consistently use 64-bit integers in `io.ascii` (including Windows) (#16005) * ensure 64 bit int columns remain 64 bit int type on Windows (Python path) * ensure 64 bit in columns read correctly, C code path * Fixed tests to reflect changes from reading 64bit integers as ints * C path int parsing: use int64_t instead of long long * update the test to reflect the policy of defaulting to 64bit ints in all cases. * update changelog to stress the 64-bit by default behavior. * forward port ipac test from #15992 * small optimization: inline int64 string parsing C function * Tweak changelog wording and add what's new entry --- astropy/io/ascii/core.py | 9 ++++- astropy/io/ascii/cparser.pyx | 18 ++++++---- astropy/io/ascii/src/tokenizer.c | 26 ++++++++++++-- astropy/io/ascii/src/tokenizer.h | 3 +- astropy/io/ascii/tests/test_c_reader.py | 6 ++-- astropy/io/ascii/tests/test_read.py | 6 ++-- astropy/io/ascii/tests/test_types.py | 46 +++++++++++++++++++++++++ docs/changes/io.ascii/16005.bugfix.rst | 8 +++++ docs/whatsnew/6.1.rst | 12 +++++++ 9 files changed, 116 insertions(+), 18 deletions(-) create mode 100644 docs/changes/io.ascii/16005.bugfix.rst diff --git a/astropy/io/ascii/core.py b/astropy/io/ascii/core.py index c8d0acfa92e..e67a911aa68 100644 --- a/astropy/io/ascii/core.py +++ b/astropy/io/ascii/core.py @@ -1175,7 +1175,14 @@ class TableOutputter(BaseOutputter): Output the table as an astropy.table.Table object. """ - default_converters = [convert_numpy(int), convert_numpy(float), convert_numpy(str)] + default_converters = [ + # Use `np.int64` to ensure large integers can be read as ints + # on platforms such as Windows + # https://github.com/astropy/astropy/issues/5744 + convert_numpy(np.int64), + convert_numpy(float), + convert_numpy(str), + ] def __call__(self, cols, meta): # Sets col.data to numpy array and col.type to io.ascii Type class (e.g. diff --git a/astropy/io/ascii/cparser.pyx b/astropy/io/ascii/cparser.pyx index 85eebd1e470..85aa19b9d66 100644 --- a/astropy/io/ascii/cparser.pyx +++ b/astropy/io/ascii/cparser.pyx @@ -22,6 +22,7 @@ from cpython.buffer cimport ( PyObject_GetBuffer, ) from libc cimport stdio +from libc.stdint cimport int64_t from astropy.utils.data import get_readable_fileobj from astropy.utils.exceptions import AstropyWarning @@ -92,7 +93,7 @@ cdef extern from "src/tokenizer.h": void delete_tokenizer(tokenizer_t *tokenizer) int skip_lines(tokenizer_t *self, int offset, int header) int tokenize(tokenizer_t *self, int end, int header, int num_cols) - long str_to_long(tokenizer_t *self, char *str) + int64_t str_to_int64_t(tokenizer_t *self, char *str) double fast_str_to_double(tokenizer_t *self, char *str) double str_to_double(tokenizer_t *self, char *str) void start_iteration(tokenizer_t *self, int col) @@ -673,10 +674,13 @@ cdef class CParser: if nrows != -1: num_rows = nrows # initialize ndarray - cdef np.ndarray col = np.empty(num_rows, dtype=np.int_) - cdef long converted + # use `int64_t` for integers to ensure large integers are converted correctly + # on some platforms, e.g. Windows (where `long` is 32 bits) + # https://github.com/astropy/astropy/issues/5744 + cdef np.ndarray col = np.empty(num_rows, dtype=np.int64) + cdef int64_t converted cdef int row = 0 - cdef long *data = col.data # pointer to raw data + cdef int64_t *data = col.data # pointer to raw data cdef char *field cdef char *empty_field = t.buf # memory address of designated empty buffer cdef bytes new_value @@ -706,12 +710,12 @@ cdef class CParser: mask.add(row) new_value = str(replace_info[0]).encode('ascii') # try converting the new value - converted = str_to_long(t, new_value) + converted = str_to_int64_t(t, new_value) else: - converted = str_to_long(t, field) + converted = str_to_int64_t(t, field) else: # convert the field to long (widest integer type) - converted = str_to_long(t, field) + converted = str_to_int64_t(t, field) if t.code in (CONVERSION_ERROR, OVERFLOW_ERROR): # no dice diff --git a/astropy/io/ascii/src/tokenizer.c b/astropy/io/ascii/src/tokenizer.c index 834ea692424..dadedd0152a 100644 --- a/astropy/io/ascii/src/tokenizer.c +++ b/astropy/io/ascii/src/tokenizer.c @@ -590,12 +590,32 @@ static int ascii_strncasecmp(const char *str1, const char *str2, size_t n) } -long str_to_long(tokenizer_t *self, char *str) +static inline int64_t strtoi64(const char *nptr, char **endptr, int base) +{ + // Adapted from: https://stackoverflow.com/a/66046867 + errno = 0; + long long v = strtoll(nptr, endptr, base); + + #if LLONG_MIN < INT64_MIN || LLONG_MAX > INT64_MAX + if (v < INT64_MIN) { + v = INT64_MIN; + errno = ERANGE; + } else if (v > INT64_MAX) { + v = INT64_MAX; + errno = ERANGE; + } + #endif + + return (int64_t) v; +} + + +int64_t str_to_int64_t(tokenizer_t *self, char *str) { char *tmp; - long ret; + int64_t ret; errno = 0; - ret = strtol(str, &tmp, 10); + ret = strtoi64(str, &tmp, 10); if (tmp == str || *tmp != '\0') self->code = CONVERSION_ERROR; diff --git a/astropy/io/ascii/src/tokenizer.h b/astropy/io/ascii/src/tokenizer.h index 0b30a409c0c..3b0cb6e231c 100644 --- a/astropy/io/ascii/src/tokenizer.h +++ b/astropy/io/ascii/src/tokenizer.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #ifdef _MSC_VER @@ -101,7 +102,7 @@ void resize_col(tokenizer_t *self, int index); void resize_comments(tokenizer_t *self); int skip_lines(tokenizer_t *self, int offset, int header); int tokenize(tokenizer_t *self, int end, int header, int num_cols); -long str_to_long(tokenizer_t *self, char *str); +int64_t str_to_int64_t(tokenizer_t *self, char *str); double str_to_double(tokenizer_t *self, char *str); double xstrtod(const char *str, char **endptr, char decimal, char expchar, char tsep, int skip_trailing); diff --git a/astropy/io/ascii/tests/test_c_reader.py b/astropy/io/ascii/tests/test_c_reader.py index 677df8c78d6..0d5437b2095 100644 --- a/astropy/io/ascii/tests/test_c_reader.py +++ b/astropy/io/ascii/tests/test_c_reader.py @@ -1442,8 +1442,8 @@ def test_int_out_of_range(parallel, guess): Integer numbers outside int range shall be returned as string columns consistent with the standard (Python) parser (no 'upcasting' to float). """ - imin = np.iinfo(int).min + 1 - imax = np.iinfo(int).max - 1 + imin = np.iinfo(np.int64).min + 1 + imax = np.iinfo(np.int64).max - 1 huge = f"{imax+2:d}" text = f"P M S\n {imax:d} {imin:d} {huge:s}" @@ -1484,7 +1484,7 @@ def test_int_out_of_order(guess): shows up first, it will produce a string column - with both readers. Broken with the parallel fast_reader. """ - imax = np.iinfo(int).max - 1 + imax = np.iinfo(np.int64).max - 1 text = f"A B\n 12.3 {imax:d}0\n {imax:d}0 45.6e7" expected = Table([[12.3, 10.0 * imax], [f"{imax:d}0", "45.6e7"]], names=("A", "B")) diff --git a/astropy/io/ascii/tests/test_read.py b/astropy/io/ascii/tests/test_read.py index 794f46864c3..cf10e7a4ae3 100644 --- a/astropy/io/ascii/tests/test_read.py +++ b/astropy/io/ascii/tests/test_read.py @@ -320,7 +320,7 @@ def test_daophot_types(): assert ( table["PIER"].dtype.char in "US" ) # string (data values are consistent with int) - assert table["ID"].dtype.char in "il" # int or long + assert table["ID"].dtype.kind == "i" # int types: int, long, int64 def test_daophot_header_keywords(): @@ -1924,8 +1924,8 @@ def test_include_names_rdb_fast(): lines[0] = "a\ta_2\ta_1\ta_3\ta_4" dat = ascii.read(lines, fast_reader="force", include_names=["a", "a_2", "a_3"]) assert len(dat) == 2 - assert dat["a"].dtype == int - assert dat["a_2"].dtype == int + assert dat["a"].dtype.kind == "i" + assert dat["a_2"].dtype.kind == "i" @pytest.mark.parametrize("fast_reader", [False, "force"]) diff --git a/astropy/io/ascii/tests/test_types.py b/astropy/io/ascii/tests/test_types.py index 6e53d671f6f..b24757554f1 100644 --- a/astropy/io/ascii/tests/test_types.py +++ b/astropy/io/ascii/tests/test_types.py @@ -4,6 +4,7 @@ from io import StringIO import numpy as np +import pytest from astropy.io import ascii @@ -53,6 +54,51 @@ def test_ipac_read_types(): assert_equal(col.type, expected_type) +def test_ipac_read_long_columns(): + """Test for https://github.com/astropy/astropy/issues/15989""" + test_data = """\ +| oid|expid|cadence| +| long| l| int| + 90000000000000001 123 1 +""" + dat = ascii.read(test_data, format="ipac") + + # assert oid, as a long column, is int64 + oid = dat["oid"] + assert oid[0] == 90000000000000001 + assert oid.dtype.kind == "i" + assert oid.dtype.itemsize == 8 + + # expid is declared as a long column, + # the type needs to be int64, even though all + # the values are within int32 range + expid = dat["expid"] + assert expid[0] == 123 + assert expid.dtype.kind == "i" + assert expid.dtype.itemsize == 8 + + +@pytest.mark.parametrize("fast", [False, "force"]) +def test_large_integers(fast): + # case https://github.com/astropy/astropy/issues/5744 + dat = ascii.read( + """\ +num,cadence +110000000000000001,1 +12345,2 +""", + fast_reader=fast, + ) + # assert `num`` is exactly 64bit int + assert dat["num"].dtype.kind == "i" + assert dat["num"].dtype.itemsize == 8 + assert dat["num"][0] == 110000000000000001 + + # assert `cadence` is exactly 64bit int, even though the values are small numbers. + assert dat["cadence"].dtype.kind == "i" + assert dat["cadence"].dtype.itemsize == 8 + + def test_col_dtype_in_custom_class(): """Test code in BaseOutputter._convert_vals to handle Column.dtype attribute. See discussion in #11895.""" diff --git a/docs/changes/io.ascii/16005.bugfix.rst b/docs/changes/io.ascii/16005.bugfix.rst new file mode 100644 index 00000000000..e7db72b6b76 --- /dev/null +++ b/docs/changes/io.ascii/16005.bugfix.rst @@ -0,0 +1,8 @@ +The ``io.ascii`` Python and C table readers were updated to use a 64-bit integer field by +default when reading a column of integer numeric data. This changes the default behavior +on Windows and potentially 32-bit architectures. Previously on those platforms, table +columns with any long integers which overflowed the 32-bit integer would be returned +as string columns. The new default behavior is consistent with ``numpy`` v2 and ``pandas``. + +The new behavior also fixed the error in reading IPAC tables with ``long`` column type +on some platforms, e.g., Windows. diff --git a/docs/whatsnew/6.1.rst b/docs/whatsnew/6.1.rst index a7e1ef4d72a..a692fcbc3fd 100644 --- a/docs/whatsnew/6.1.rst +++ b/docs/whatsnew/6.1.rst @@ -23,6 +23,18 @@ By the numbers: * X distinct people have contributed code +.. _whatsnew-6.1-ascii-default-int-columns-as-int64: + +``io.ascii`` uses 64-integers by default for integer columns +============================================================ + +:mod:`~astropy.io.ascii` now uses a 64-bit integer field by +default when reading a column of integer numeric data. This changes the default behavior +on Windows and potentially 32-bit architectures. Previously on those platforms, table +columns with any long integers which overflowed the 32-bit integer would be returned +as string columns. The new default behavior is consistent with ``numpy`` v2 and ``pandas``. + + Full change log ===============