Skip to content

Commit

Permalink
Adds support for flexible symbol buffer threshold. (#238)
Browse files Browse the repository at this point in the history
  • Loading branch information
cheqianh committed Feb 10, 2023
1 parent 91bbbdd commit 3bf6a73
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 29 deletions.
13 changes: 10 additions & 3 deletions amazon/ion/ioncmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static char _err_msg[ERR_MSG_MAX_LEN];
#define _FAILWITHMSG(x, msg) { err = x; snprintf(_err_msg, ERR_MSG_MAX_LEN, msg); goto fail; }

#define IONC_BYTES_FORMAT "y#"
#define IONC_READ_ARGS_FORMAT "OO"
#define IONC_READ_ARGS_FORMAT "OOO"

static PyObject* _math_module;

Expand Down Expand Up @@ -1468,9 +1468,11 @@ PyObject* ionc_read(PyObject* self, PyObject *args, PyObject *kwds) {
iENTER;
PyObject *py_file = NULL; // TextIOWrapper
PyObject *emit_bare_values;
PyObject *text_buffer_size_limit;
ionc_read_Iterator *iterator = NULL;
static char *kwlist[] = {"file", "emit_bare_values", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, IONC_READ_ARGS_FORMAT, kwlist, &py_file, &emit_bare_values)) {
static char *kwlist[] = {"file", "emit_bare_values", "text_buffer_size_limit", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, IONC_READ_ARGS_FORMAT, kwlist, &py_file,
&emit_bare_values, &text_buffer_size_limit)) {
FAILWITH(IERR_INVALID_ARG);
}

Expand All @@ -1490,6 +1492,11 @@ PyObject* ionc_read(PyObject* self, PyObject *args, PyObject *kwds) {
memset(&iterator->reader, 0, sizeof(iterator->reader));
memset(&iterator->_reader_options, 0, sizeof(iterator->_reader_options));
iterator->_reader_options.decimal_context = &dec_context;
if (text_buffer_size_limit != Py_None) {
    // PyLong_AsLong returns -1 with a Python exception set when the argument
    // is not an integer or does not fit in a C long; do not use the value then.
    long symbol_threshold = PyLong_AsLong(text_buffer_size_limit);
    if (symbol_threshold == -1 && PyErr_Occurred()) {
        FAILWITH(IERR_INVALID_ARG);
    }
    iterator->_reader_options.symbol_threshold = (int)symbol_threshold;
    // text_buffer_size_limit was parsed with the "O" format, which yields a
    // borrowed reference, so it must NOT be DECREF'd here.
}

IONCHECK(ion_reader_open_stream(
&iterator->reader,
Expand Down
77 changes: 51 additions & 26 deletions amazon/ion/simpleion.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,19 @@
except ModuleNotFoundError:
c_ext = False


_ION_CONTAINER_END_EVENT = IonEvent(IonEventType.CONTAINER_END)
_IVM = b'\xe0\x01\x00\xea'
_TEXT_TYPES = (TextIOBase, io.StringIO)


def dump_python(obj, fp, imports=None, binary=True, sequence_as_stream=False, skipkeys=False, ensure_ascii=True,
check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, encoding='utf-8', default=None,
use_decimal=True, namedtuple_as_object=True, tuple_as_array=True, bigint_as_string=False, sort_keys=False,
item_sort_key=None, for_json=None, ignore_nan=False, int_as_string_bitcount=None, iterable_as_array=False,
tuple_as_sexp=False, omit_version_marker=False, **kw):
check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, encoding='utf-8',
default=None,
use_decimal=True, namedtuple_as_object=True, tuple_as_array=True, bigint_as_string=False,
sort_keys=False,
item_sort_key=None, for_json=None, ignore_nan=False, int_as_string_bitcount=None,
iterable_as_array=False,
tuple_as_sexp=False, omit_version_marker=False, **kw):
"""Serialize ``obj`` as an Ion-formatted stream to ``fp`` (a file-like object), using the following conversion
table::
+-------------------+-------------------+
Expand Down Expand Up @@ -214,10 +216,10 @@ def _dump(obj, writer, from_type, field=None, in_struct=False, depth=0):
writer.send(event)
if ion_type is IonType.STRUCT:
for field, val in iter(obj.items()):
_dump(val, writer, from_type, field, in_struct=True, depth=depth+1)
_dump(val, writer, from_type, field, in_struct=True, depth=depth + 1)
else:
for elem in obj:
_dump(elem, writer, from_type, depth=depth+1)
_dump(elem, writer, from_type, depth=depth + 1)
event = _ION_CONTAINER_END_EVENT
else:
# obj is a scalar value
Expand All @@ -228,7 +230,8 @@ def _dump(obj, writer, from_type, field=None, in_struct=False, depth=0):
writer.send(event)


def dumps(obj, imports=None, binary=True, sequence_as_stream=False, skipkeys=False, ensure_ascii=True, check_circular=True,
def dumps(obj, imports=None, binary=True, sequence_as_stream=False, skipkeys=False, ensure_ascii=True,
check_circular=True,
allow_nan=True, cls=None, indent=None, separators=None, encoding='utf-8', default=None, use_decimal=True,
namedtuple_as_object=True, tuple_as_array=True, bigint_as_string=False, sort_keys=False, item_sort_key=None,
for_json=None, ignore_nan=False, int_as_string_bitcount=None, iterable_as_array=False, tuple_as_sexp=False,
Expand Down Expand Up @@ -293,7 +296,8 @@ def dumps(obj, imports=None, binary=True, sequence_as_stream=False, skipkeys=Fal


def load_python(fp, catalog=None, single_value=True, encoding='utf-8', cls=None, object_hook=None, parse_float=None,
parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True, **kw):
parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True,
**kw):
"""Deserialize ``fp`` (a file-like object), which contains a text or binary Ion stream, to a Python object using the
following conversion table::
+-------------------+-------------------+
Expand Down Expand Up @@ -397,6 +401,7 @@ def load_python(fp, catalog=None, single_value=True, encoding='utf-8', cls=None,
IonPyDict
]


def _load_iteratively(reader, end_type=IonEventType.STREAM_END):
event = reader.send(NEXT_EVENT)
while event.event_type is not end_type:
Expand All @@ -413,8 +418,8 @@ def _load_iteratively(reader, end_type=IonEventType.STREAM_END):
yield scalar
event = reader.send(NEXT_EVENT)

def _load(out, reader, end_type=IonEventType.STREAM_END, in_struct=False):

def _load(out, reader, end_type=IonEventType.STREAM_END, in_struct=False):
def add(obj):
if in_struct:
out.add_item(event.field_name.text, obj)
Expand All @@ -438,7 +443,8 @@ def add(obj):


def loads(ion_str, catalog=None, single_value=True, encoding='utf-8', cls=None, object_hook=None, parse_float=None,
parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True, **kw):
parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True,
text_buffer_size_limit=None, **kw):
"""Deserialize ``ion_str``, which is a string representation of an Ion object, to a Python object using the
conversion table used by load (above).
Expand All @@ -451,6 +457,9 @@ def loads(ion_str, catalog=None, single_value=True, encoding='utf-8', cls=None,
``sequence_as_stream=True``, it must be loaded using ``single_value=False``. Default: True.
parse_eagerly: (Optional[True|False]) Used in conjunction with ``single_value=False`` to return the result as list
or an iterator
text_buffer_size_limit (int): The maximum byte size allowed for text values when the C extension is enabled
(default: 512 bytes). This option only has an effect when the C extension is enabled (and it is enabled by
default). When the C extension is disabled, there is no limit on the size of text values.
encoding: NOT IMPLEMENTED
cls: NOT IMPLEMENTED
object_hook: NOT IMPLEMENTED
Expand All @@ -477,7 +486,8 @@ def loads(ion_str, catalog=None, single_value=True, encoding='utf-8', cls=None,

return load(ion_buffer, catalog=catalog, single_value=single_value, encoding=encoding, cls=cls,
object_hook=object_hook, parse_float=parse_float, parse_int=parse_int, parse_constant=parse_constant,
object_pairs_hook=object_pairs_hook, use_decimal=use_decimal, parse_eagerly=parse_eagerly)
object_pairs_hook=object_pairs_hook, use_decimal=use_decimal, parse_eagerly=parse_eagerly,
text_buffer_size_limit=text_buffer_size_limit)


def dump_extension(obj, fp, binary=True, sequence_as_stream=False, tuple_as_sexp=False, omit_version_marker=False):
Expand All @@ -489,8 +499,21 @@ def dump_extension(obj, fp, binary=True, sequence_as_stream=False, tuple_as_sexp
fp.write(res)


def load_extension(fp, single_value=True, parse_eagerly=True):
iterator = ionc.ionc_read(fp, emit_bare_values=False)
def load_extension(fp, single_value=True, parse_eagerly=True, text_buffer_size_limit=None):
"""
Args:
fp (file-like object): A file-like object containing a text or binary Ion stream.
single_value (Optional[True|False]): When True, the data in ``fp`` is interpreted as a single Ion value,
and will be returned without an enclosing container. If True and there are multiple top-level values in
the Ion stream, IonException will be raised. NOTE: this means that when data is dumped using
``sequence_as_stream=True``, it must be loaded using ``single_value=False``. Default: True.
parse_eagerly: (Optional[True|False]) Used in conjunction with ``single_value=False`` to return the result as list
or an iterator
text_buffer_size_limit (int): The maximum byte size allowed for text values when the C extension is enabled
(default: 512 bytes). This option only has an effect when the C extension is enabled (and it is enabled by
default). When the C extension is disabled, there is no limit on the size of text values.
"""
iterator = ionc.ionc_read(fp, emit_bare_values=False, text_buffer_size_limit=text_buffer_size_limit)
if single_value:
try:
value = next(iterator)
Expand All @@ -517,21 +540,23 @@ def dump(obj, fp, imports=None, binary=True, sequence_as_stream=False, skipkeys=
tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker)
else:
return dump_python(obj, fp, imports=imports, binary=binary, sequence_as_stream=sequence_as_stream,
skipkeys=skipkeys, ensure_ascii=ensure_ascii,check_circular=check_circular,
allow_nan=allow_nan, cls=cls, indent=indent, separators=separators, encoding=encoding,
default=default, use_decimal=use_decimal, namedtuple_as_object=namedtuple_as_object,
tuple_as_array=tuple_as_array, bigint_as_string=bigint_as_string, sort_keys=sort_keys,
item_sort_key=item_sort_key, for_json=for_json, ignore_nan=ignore_nan,
int_as_string_bitcount=int_as_string_bitcount, iterable_as_array=iterable_as_array,
tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker, **kw)
skipkeys=skipkeys, ensure_ascii=ensure_ascii, check_circular=check_circular,
allow_nan=allow_nan, cls=cls, indent=indent, separators=separators, encoding=encoding,
default=default, use_decimal=use_decimal, namedtuple_as_object=namedtuple_as_object,
tuple_as_array=tuple_as_array, bigint_as_string=bigint_as_string, sort_keys=sort_keys,
item_sort_key=item_sort_key, for_json=for_json, ignore_nan=ignore_nan,
int_as_string_bitcount=int_as_string_bitcount, iterable_as_array=iterable_as_array,
tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker, **kw)


def load(fp, catalog=None, single_value=True, encoding='utf-8', cls=None, object_hook=None, parse_float=None,
parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True, **kw):
parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True,
text_buffer_size_limit=None, **kw):
if c_ext and catalog is None:
return load_extension(fp, parse_eagerly=parse_eagerly, single_value=single_value)
return load_extension(fp, parse_eagerly=parse_eagerly, single_value=single_value,
text_buffer_size_limit=text_buffer_size_limit)
else:
return load_python(fp, catalog=catalog, single_value=single_value, encoding=encoding, cls=cls,
object_hook=object_hook, parse_float=parse_float, parse_int=parse_int,
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook,
use_decimal=use_decimal, parse_eagerly=parse_eagerly, **kw)
object_hook=object_hook, parse_float=parse_float, parse_int=parse_int,
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook,
use_decimal=use_decimal, parse_eagerly=parse_eagerly, **kw)
23 changes: 23 additions & 0 deletions tests/test_simpleion.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,3 +713,26 @@ def test_loads_unicode_utf8_conversion():
# Loads API should convert it to UTF-8 without illegal bytes number read exception.
loads(data, parse_eagerly=True)


# See issue https://github.com/amazon-ion/ion-python/issues/232
def test_loads_large_string():
    """Check that ``text_buffer_size_limit`` lets ``loads`` handle a large text value with the C extension."""
    # The limit is only passed to the C extension, so there is nothing to verify without it.
    if not c_ext:
        return

    data = "a" * 100000

    # Without text_buffer_size_limit, it should fail due to BUFFER_TOO_SMALL.
    try:
        loads(data)
    except Exception:
        pass
    else:
        assert False, "loads() should fail on a large text value when text_buffer_size_limit is not set"

    # With text_buffer_size_limit set above the value's size, loading must succeed.
    # Any exception is left to propagate so the failure report shows the real cause
    # instead of a bare AssertionError.
    loads(data, text_buffer_size_limit=200000)

0 comments on commit 3bf6a73

Please sign in to comment.