Skip to content

Commit

Permalink
Consistently use 64-bit integers in io.ascii (including Windows) (#…
Browse files Browse the repository at this point in the history
…16005)

* ensure 64 bit int columns remain 64 bit int type on Windows (Python path)

* ensure 64 bit in columns read correctly, C code path

* Fixed tests to reflect changes from reading 64bit integers as ints

* C path int parsing: use int64_t instead of long long

* update the test to reflect the policy of defaulting to 64bit ints in all cases.

* update changelog to stress  the 64-bit by default behavior.

* forward port ipac test from #15992

* small optimization: inline int64 string parsing C function

* Tweak changelog wording and add what's new entry
  • Loading branch information
orionlee committed Feb 12, 2024
1 parent ebbfc48 commit c7061a8
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 18 deletions.
9 changes: 8 additions & 1 deletion astropy/io/ascii/core.py
Expand Up @@ -1175,7 +1175,14 @@ class TableOutputter(BaseOutputter):
Output the table as an astropy.table.Table object.
"""

default_converters = [convert_numpy(int), convert_numpy(float), convert_numpy(str)]
default_converters = [
# Use `np.int64` to ensure large integers can be read as ints
# on platforms such as Windows
# https://github.com/astropy/astropy/issues/5744
convert_numpy(np.int64),
convert_numpy(float),
convert_numpy(str),
]

def __call__(self, cols, meta):
# Sets col.data to numpy array and col.type to io.ascii Type class (e.g.
Expand Down
18 changes: 11 additions & 7 deletions astropy/io/ascii/cparser.pyx
Expand Up @@ -22,6 +22,7 @@ from cpython.buffer cimport (
PyObject_GetBuffer,
)
from libc cimport stdio
from libc.stdint cimport int64_t

from astropy.utils.data import get_readable_fileobj
from astropy.utils.exceptions import AstropyWarning
Expand Down Expand Up @@ -92,7 +93,7 @@ cdef extern from "src/tokenizer.h":
void delete_tokenizer(tokenizer_t *tokenizer)
int skip_lines(tokenizer_t *self, int offset, int header)
int tokenize(tokenizer_t *self, int end, int header, int num_cols)
long str_to_long(tokenizer_t *self, char *str)
int64_t str_to_int64_t(tokenizer_t *self, char *str)
double fast_str_to_double(tokenizer_t *self, char *str)
double str_to_double(tokenizer_t *self, char *str)
void start_iteration(tokenizer_t *self, int col)
Expand Down Expand Up @@ -673,10 +674,13 @@ cdef class CParser:
if nrows != -1:
num_rows = nrows
# initialize ndarray
cdef np.ndarray col = np.empty(num_rows, dtype=np.int_)
cdef long converted
# use `int64_t` for integers to ensure large integers are converted correctly
# on some platforms, e.g. Windows (where `long` is 32 bits)
# https://github.com/astropy/astropy/issues/5744
cdef np.ndarray col = np.empty(num_rows, dtype=np.int64)
cdef int64_t converted
cdef int row = 0
cdef long *data = <long *> col.data # pointer to raw data
cdef int64_t *data = <int64_t*> col.data # pointer to raw data
cdef char *field
cdef char *empty_field = t.buf # memory address of designated empty buffer
cdef bytes new_value
Expand Down Expand Up @@ -706,12 +710,12 @@ cdef class CParser:
mask.add(row)
new_value = str(replace_info[0]).encode('ascii')
# try converting the new value
converted = str_to_long(t, new_value)
converted = str_to_int64_t(t, new_value)
else:
converted = str_to_long(t, field)
converted = str_to_int64_t(t, field)
else:
# convert the field to long (widest integer type)
converted = str_to_long(t, field)
converted = str_to_int64_t(t, field)

if t.code in (CONVERSION_ERROR, OVERFLOW_ERROR):
# no dice
Expand Down
26 changes: 23 additions & 3 deletions astropy/io/ascii/src/tokenizer.c
Expand Up @@ -590,12 +590,32 @@ static int ascii_strncasecmp(const char *str1, const char *str2, size_t n)
}


long str_to_long(tokenizer_t *self, char *str)
static inline int64_t strtoi64(const char *nptr, char **endptr, int base)
{
// Adapted from: https://stackoverflow.com/a/66046867
errno = 0;
long long v = strtoll(nptr, endptr, base);

#if LLONG_MIN < INT64_MIN || LLONG_MAX > INT64_MAX
if (v < INT64_MIN) {
v = INT64_MIN;
errno = ERANGE;
} else if (v > INT64_MAX) {
v = INT64_MAX;
errno = ERANGE;
}
#endif

return (int64_t) v;
}


int64_t str_to_int64_t(tokenizer_t *self, char *str)
{
char *tmp;
long ret;
int64_t ret;
errno = 0;
ret = strtol(str, &tmp, 10);
ret = strtoi64(str, &tmp, 10);

if (tmp == str || *tmp != '\0')
self->code = CONVERSION_ERROR;
Expand Down
3 changes: 2 additions & 1 deletion astropy/io/ascii/src/tokenizer.h
Expand Up @@ -10,6 +10,7 @@
#include <math.h>
#include <float.h>
#include <ctype.h>
#include <stdint.h>
#include <sys/types.h>

#ifdef _MSC_VER
Expand Down Expand Up @@ -101,7 +102,7 @@ void resize_col(tokenizer_t *self, int index);
void resize_comments(tokenizer_t *self);
int skip_lines(tokenizer_t *self, int offset, int header);
int tokenize(tokenizer_t *self, int end, int header, int num_cols);
long str_to_long(tokenizer_t *self, char *str);
int64_t str_to_int64_t(tokenizer_t *self, char *str);
double str_to_double(tokenizer_t *self, char *str);
double xstrtod(const char *str, char **endptr, char decimal,
char expchar, char tsep, int skip_trailing);
Expand Down
6 changes: 3 additions & 3 deletions astropy/io/ascii/tests/test_c_reader.py
Expand Up @@ -1442,8 +1442,8 @@ def test_int_out_of_range(parallel, guess):
Integer numbers outside int range shall be returned as string columns
consistent with the standard (Python) parser (no 'upcasting' to float).
"""
imin = np.iinfo(int).min + 1
imax = np.iinfo(int).max - 1
imin = np.iinfo(np.int64).min + 1
imax = np.iinfo(np.int64).max - 1
huge = f"{imax+2:d}"

text = f"P M S\n {imax:d} {imin:d} {huge:s}"
Expand Down Expand Up @@ -1484,7 +1484,7 @@ def test_int_out_of_order(guess):
shows up first, it will produce a string column - with both readers.
Broken with the parallel fast_reader.
"""
imax = np.iinfo(int).max - 1
imax = np.iinfo(np.int64).max - 1
text = f"A B\n 12.3 {imax:d}0\n {imax:d}0 45.6e7"
expected = Table([[12.3, 10.0 * imax], [f"{imax:d}0", "45.6e7"]], names=("A", "B"))

Expand Down
6 changes: 3 additions & 3 deletions astropy/io/ascii/tests/test_read.py
Expand Up @@ -320,7 +320,7 @@ def test_daophot_types():
assert (
table["PIER"].dtype.char in "US"
) # string (data values are consistent with int)
assert table["ID"].dtype.char in "il" # int or long
assert table["ID"].dtype.kind == "i" # int types: int, long, int64


def test_daophot_header_keywords():
Expand Down Expand Up @@ -1924,8 +1924,8 @@ def test_include_names_rdb_fast():
lines[0] = "a\ta_2\ta_1\ta_3\ta_4"
dat = ascii.read(lines, fast_reader="force", include_names=["a", "a_2", "a_3"])
assert len(dat) == 2
assert dat["a"].dtype == int
assert dat["a_2"].dtype == int
assert dat["a"].dtype.kind == "i"
assert dat["a_2"].dtype.kind == "i"


@pytest.mark.parametrize("fast_reader", [False, "force"])
Expand Down
46 changes: 46 additions & 0 deletions astropy/io/ascii/tests/test_types.py
Expand Up @@ -4,6 +4,7 @@
from io import StringIO

import numpy as np
import pytest

from astropy.io import ascii

Expand Down Expand Up @@ -53,6 +54,51 @@ def test_ipac_read_types():
assert_equal(col.type, expected_type)


def test_ipac_read_long_columns():
"""Test for https://github.com/astropy/astropy/issues/15989"""
test_data = """\
| oid|expid|cadence|
| long| l| int|
90000000000000001 123 1
"""
dat = ascii.read(test_data, format="ipac")

# assert oid, as a long column, is int64
oid = dat["oid"]
assert oid[0] == 90000000000000001
assert oid.dtype.kind == "i"
assert oid.dtype.itemsize == 8

# expid is declared as a long column,
# the type needs to be int64, even though all
# the values are within int32 range
expid = dat["expid"]
assert expid[0] == 123
assert expid.dtype.kind == "i"
assert expid.dtype.itemsize == 8


@pytest.mark.parametrize("fast", [False, "force"])
def test_large_integers(fast):
# case https://github.com/astropy/astropy/issues/5744
dat = ascii.read(
"""\
num,cadence
110000000000000001,1
12345,2
""",
fast_reader=fast,
)
# assert `num`` is exactly 64bit int
assert dat["num"].dtype.kind == "i"
assert dat["num"].dtype.itemsize == 8
assert dat["num"][0] == 110000000000000001

# assert `cadence` is exactly 64bit int, even though the values are small numbers.
assert dat["cadence"].dtype.kind == "i"
assert dat["cadence"].dtype.itemsize == 8


def test_col_dtype_in_custom_class():
"""Test code in BaseOutputter._convert_vals to handle Column.dtype
attribute. See discussion in #11895."""
Expand Down
8 changes: 8 additions & 0 deletions docs/changes/io.ascii/16005.bugfix.rst
@@ -0,0 +1,8 @@
The ``io.ascii`` Python and C table readers were updated to use a 64-bit integer field by
default when reading a column of integer numeric data. This changes the default behavior
on Windows and potentially 32-bit architectures. Previously on those platforms, table
columns with any long integers which overflowed the 32-bit integer would be returned
as string columns. The new default behavior is consistent with ``numpy`` v2 and ``pandas``.

The new behavior also fixed the error in reading IPAC tables with ``long`` column type
on some platforms, e.g., Windows.
12 changes: 12 additions & 0 deletions docs/whatsnew/6.1.rst
Expand Up @@ -23,6 +23,18 @@ By the numbers:
* X distinct people have contributed code


.. _whatsnew-6.1-ascii-default-int-columns-as-int64:

``io.ascii`` uses 64-integers by default for integer columns
============================================================

:mod:`~astropy.io.ascii` now uses a 64-bit integer field by
default when reading a column of integer numeric data. This changes the default behavior
on Windows and potentially 32-bit architectures. Previously on those platforms, table
columns with any long integers which overflowed the 32-bit integer would be returned
as string columns. The new default behavior is consistent with ``numpy`` v2 and ``pandas``.


Full change log
===============

Expand Down

0 comments on commit c7061a8

Please sign in to comment.