Consistently use 64-bit integers in io.ascii (including Windows) (#…

…16005) * ensure 64 bit int columns remain 64 bit int type on Windows (Python path) * ensure 64 bit in columns read correctly, C code path * Fixed tests to reflect changes from reading 64bit integers as ints * C path int parsing: use int64_t instead of long long * update the test to reflect the policy of defaulting to 64bit ints in all cases. * update changelog to stress the 64-bit by default behavior. * forward port ipac test from #15992 * small optimization: inline int64 string parsing C function * Tweak changelog wording and add what's new entry
astropy · Feb 12, 2024 · c7061a8 · c7061a8
1 parent ebbfc48
commit c7061a8
Show file tree

Hide file tree

Showing 9 changed files with 116 additions and 18 deletions.
diff --git a/astropy/io/ascii/core.py b/astropy/io/ascii/core.py
@@ -1175,7 +1175,14 @@ class TableOutputter(BaseOutputter):
     Output the table as an astropy.table.Table object.
     """
 
-    default_converters = [convert_numpy(int), convert_numpy(float), convert_numpy(str)]
+    default_converters = [
+        # Use `np.int64` to ensure large integers can be read as ints
+        # on platforms such as Windows
+        # https://github.com/astropy/astropy/issues/5744
+        convert_numpy(np.int64),
+        convert_numpy(float),
+        convert_numpy(str),
+    ]
 
     def __call__(self, cols, meta):
         # Sets col.data to numpy array and col.type to io.ascii Type class (e.g.

diff --git a/astropy/io/ascii/cparser.pyx b/astropy/io/ascii/cparser.pyx
@@ -22,6 +22,7 @@ from cpython.buffer cimport (
     PyObject_GetBuffer,
 )
 from libc cimport stdio
+from libc.stdint cimport int64_t
 
 from astropy.utils.data import get_readable_fileobj
 from astropy.utils.exceptions import AstropyWarning
@@ -92,7 +93,7 @@ cdef extern from "src/tokenizer.h":
     void delete_tokenizer(tokenizer_t *tokenizer)
     int skip_lines(tokenizer_t *self, int offset, int header)
     int tokenize(tokenizer_t *self, int end, int header, int num_cols)
-    long str_to_long(tokenizer_t *self, char *str)
+    int64_t str_to_int64_t(tokenizer_t *self, char *str)
     double fast_str_to_double(tokenizer_t *self, char *str)
     double str_to_double(tokenizer_t *self, char *str)
     void start_iteration(tokenizer_t *self, int col)
@@ -673,10 +674,13 @@ cdef class CParser:
         if nrows != -1:
             num_rows = nrows
         # initialize ndarray
-        cdef np.ndarray col = np.empty(num_rows, dtype=np.int_)
-        cdef long converted
+        # use `int64_t` for integers to ensure large integers are converted correctly
+        # on some platforms, e.g. Windows (where `long` is 32 bits)
+        # https://github.com/astropy/astropy/issues/5744
+        cdef np.ndarray col = np.empty(num_rows, dtype=np.int64)
+        cdef int64_t converted
         cdef int row = 0
-        cdef long *data = <long *> col.data # pointer to raw data
+        cdef int64_t *data = <int64_t*> col.data # pointer to raw data
         cdef char *field
         cdef char *empty_field = t.buf # memory address of designated empty buffer
         cdef bytes new_value
@@ -706,12 +710,12 @@ cdef class CParser:
                     mask.add(row)
                     new_value = str(replace_info[0]).encode('ascii')
                     # try converting the new value
-                    converted = str_to_long(t, new_value)
+                    converted = str_to_int64_t(t, new_value)
                 else:
-                    converted = str_to_long(t, field)
+                    converted = str_to_int64_t(t, field)
             else:
                 # convert the field to long (widest integer type)
-                converted = str_to_long(t, field)
+                converted = str_to_int64_t(t, field)
 
             if t.code in (CONVERSION_ERROR, OVERFLOW_ERROR):
                 # no dice

diff --git a/astropy/io/ascii/src/tokenizer.c b/astropy/io/ascii/src/tokenizer.c
@@ -590,12 +590,32 @@ static int ascii_strncasecmp(const char *str1, const char *str2, size_t n)
 }
 
 
-long str_to_long(tokenizer_t *self, char *str)
+static inline int64_t strtoi64(const char *nptr, char **endptr, int base)
+{
+    // Adapted from: https://stackoverflow.com/a/66046867
+    errno = 0;
+    long long v = strtoll(nptr, endptr, base);
+
+    #if LLONG_MIN < INT64_MIN || LLONG_MAX > INT64_MAX
+    if (v < INT64_MIN) {
+        v = INT64_MIN;
+        errno = ERANGE;
+    } else if (v > INT64_MAX) {
+        v = INT64_MAX;
+        errno = ERANGE;
+    }
+    #endif
+
+    return (int64_t) v;
+}
+
+
+int64_t str_to_int64_t(tokenizer_t *self, char *str)
 {
     char *tmp;
-    long ret;
+    int64_t ret;
     errno = 0;
-    ret = strtol(str, &tmp, 10);
+    ret = strtoi64(str, &tmp, 10);
 
     if (tmp == str || *tmp != '\0')
         self->code = CONVERSION_ERROR;

diff --git a/astropy/io/ascii/src/tokenizer.h b/astropy/io/ascii/src/tokenizer.h
@@ -10,6 +10,7 @@
 #include <math.h>
 #include <float.h>
 #include <ctype.h>
+#include <stdint.h>
 #include <sys/types.h>
 
 #ifdef _MSC_VER
@@ -101,7 +102,7 @@ void resize_col(tokenizer_t *self, int index);
 void resize_comments(tokenizer_t *self);
 int skip_lines(tokenizer_t *self, int offset, int header);
 int tokenize(tokenizer_t *self, int end, int header, int num_cols);
-long str_to_long(tokenizer_t *self, char *str);
+int64_t str_to_int64_t(tokenizer_t *self, char *str);
 double str_to_double(tokenizer_t *self, char *str);
 double xstrtod(const char *str, char **endptr, char decimal,
                char expchar, char tsep, int skip_trailing);

diff --git a/astropy/io/ascii/tests/test_c_reader.py b/astropy/io/ascii/tests/test_c_reader.py
@@ -1442,8 +1442,8 @@ def test_int_out_of_range(parallel, guess):
     Integer numbers outside int range shall be returned as string columns
     consistent with the standard (Python) parser (no 'upcasting' to float).
     """
-    imin = np.iinfo(int).min + 1
-    imax = np.iinfo(int).max - 1
+    imin = np.iinfo(np.int64).min + 1
+    imax = np.iinfo(np.int64).max - 1
     huge = f"{imax+2:d}"
 
     text = f"P M S\n {imax:d} {imin:d} {huge:s}"
@@ -1484,7 +1484,7 @@ def test_int_out_of_order(guess):
     shows up first, it will produce a string column - with both readers.
     Broken with the parallel fast_reader.
     """
-    imax = np.iinfo(int).max - 1
+    imax = np.iinfo(np.int64).max - 1
     text = f"A B\n 12.3 {imax:d}0\n {imax:d}0 45.6e7"
     expected = Table([[12.3, 10.0 * imax], [f"{imax:d}0", "45.6e7"]], names=("A", "B"))
 

diff --git a/astropy/io/ascii/tests/test_read.py b/astropy/io/ascii/tests/test_read.py
@@ -320,7 +320,7 @@ def test_daophot_types():
     assert (
         table["PIER"].dtype.char in "US"
     )  # string (data values are consistent with int)
-    assert table["ID"].dtype.char in "il"  # int or long
+    assert table["ID"].dtype.kind == "i"  # int types: int, long, int64
 
 
 def test_daophot_header_keywords():
@@ -1924,8 +1924,8 @@ def test_include_names_rdb_fast():
     lines[0] = "a\ta_2\ta_1\ta_3\ta_4"
     dat = ascii.read(lines, fast_reader="force", include_names=["a", "a_2", "a_3"])
     assert len(dat) == 2
-    assert dat["a"].dtype == int
-    assert dat["a_2"].dtype == int
+    assert dat["a"].dtype.kind == "i"
+    assert dat["a_2"].dtype.kind == "i"
 
 
 @pytest.mark.parametrize("fast_reader", [False, "force"])

diff --git a/astropy/io/ascii/tests/test_types.py b/astropy/io/ascii/tests/test_types.py
@@ -4,6 +4,7 @@
 from io import StringIO
 
 import numpy as np
+import pytest
 
 from astropy.io import ascii
 
@@ -53,6 +54,51 @@ def test_ipac_read_types():
         assert_equal(col.type, expected_type)
 
 
+def test_ipac_read_long_columns():
+    """Test for https://github.com/astropy/astropy/issues/15989"""
+    test_data = """\
+|              oid|expid|cadence|
+|             long|    l|    int|
+ 90000000000000001   123       1
+"""
+    dat = ascii.read(test_data, format="ipac")
+
+    # assert oid, as a long column, is int64
+    oid = dat["oid"]
+    assert oid[0] == 90000000000000001
+    assert oid.dtype.kind == "i"
+    assert oid.dtype.itemsize == 8
+
+    # expid is declared as a long column,
+    # the type needs to be int64, even though all
+    # the values are within int32 range
+    expid = dat["expid"]
+    assert expid[0] == 123
+    assert expid.dtype.kind == "i"
+    assert expid.dtype.itemsize == 8
+
+
+@pytest.mark.parametrize("fast", [False, "force"])
+def test_large_integers(fast):
+    # case https://github.com/astropy/astropy/issues/5744
+    dat = ascii.read(
+        """\
+num,cadence
+110000000000000001,1
+12345,2
+""",
+        fast_reader=fast,
+    )
+    # assert `num`` is exactly 64bit int
+    assert dat["num"].dtype.kind == "i"
+    assert dat["num"].dtype.itemsize == 8
+    assert dat["num"][0] == 110000000000000001
+
+    # assert `cadence` is exactly 64bit int, even though the values are small numbers.
+    assert dat["cadence"].dtype.kind == "i"
+    assert dat["cadence"].dtype.itemsize == 8
+
+
 def test_col_dtype_in_custom_class():
     """Test code in BaseOutputter._convert_vals to handle Column.dtype
     attribute. See discussion in #11895."""

diff --git a/docs/changes/io.ascii/16005.bugfix.rst b/docs/changes/io.ascii/16005.bugfix.rst
@@ -0,0 +1,8 @@
+The ``io.ascii`` Python and C table readers were updated to use a 64-bit integer field by
+default when reading a column of integer numeric data. This changes the default behavior
+on Windows and potentially 32-bit architectures. Previously on those platforms, table
+columns with any long integers which overflowed the 32-bit integer would be returned
+as string columns. The new default behavior is consistent with ``numpy`` v2 and ``pandas``.
+
+The new behavior also fixed the error in reading IPAC tables with ``long`` column type
+on some platforms, e.g., Windows.
diff --git a/docs/whatsnew/6.1.rst b/docs/whatsnew/6.1.rst
@@ -23,6 +23,18 @@ By the numbers:
 * X distinct people have contributed code
 
 
+.. _whatsnew-6.1-ascii-default-int-columns-as-int64:
+
+``io.ascii`` uses 64-integers by default for integer columns
+============================================================
+
+:mod:`~astropy.io.ascii` now uses a 64-bit integer field by
+default when reading a column of integer numeric data. This changes the default behavior
+on Windows and potentially 32-bit architectures. Previously on those platforms, table
+columns with any long integers which overflowed the 32-bit integer would be returned
+as string columns. The new default behavior is consistent with ``numpy`` v2 and ``pandas``.
+
+
 Full change log
 ===============