Changes bytes read size to avoid unicode/UTF-8 conversion issue. (#216)

amazon-ion · Aug 19, 2022 · ee43553 · ee43553
1 parent 36e3728
commit ee43553
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 5 deletions.
diff --git a/amazon/ion/ioncmodule.c b/amazon/ion/ioncmodule.c
@@ -21,7 +21,8 @@
 #define FIELD_NAME_MAX_LEN 1000
 #define ANNOTATION_MAX_LEN 50
 
-#define IONC_STREAM_READ_BUFFER_SIZE 1024
+#define IONC_STREAM_READ_BUFFER_SIZE 1024*32
+#define IONC_STREAM_BYTES_READ_SIZE PyLong_FromLong(IONC_STREAM_READ_BUFFER_SIZE/4)
 
 static char _err_msg[ERR_MSG_MAX_LEN];
 
@@ -87,8 +88,6 @@ static PyObject* _py_symboltoken_constructor;
 static PyObject* _exception_module;
 static PyObject* _ion_exception_cls;
 static decContext dec_context;
-static PyObject *_arg_read_size;
-
 
 typedef struct {
     PyObject *py_file; // a TextIOWrapper-like object
@@ -1400,7 +1399,7 @@ iERR ion_read_file_stream_handler(struct _ion_user_stream *pstream) {
     Py_ssize_t size;
     _ION_READ_STREAM_HANDLE *stream_handle = (_ION_READ_STREAM_HANDLE *) pstream->handler_state;
     PyObject *py_buffer_as_bytes = NULL;
-    PyObject *py_buffer = PyObject_CallMethod(stream_handle->py_file, "read", "O", _arg_read_size);
+    PyObject *py_buffer = PyObject_CallMethod(stream_handle->py_file, "read", "O", IONC_STREAM_BYTES_READ_SIZE);
 
     if (py_buffer == NULL) {
         pstream->limit = NULL;
@@ -1669,7 +1668,6 @@ PyObject* ionc_init_module(void) {
 
     decContextDefault(&dec_context, DEC_INIT_DECQUAD);  //The writer already had one of these, but it's private.
 
-    _arg_read_size = PyLong_FromLong(IONC_STREAM_READ_BUFFER_SIZE);
     return m;
 }
 

diff --git a/tests/test_simpleion.py b/tests/test_simpleion.py
@@ -704,3 +704,13 @@ def test_dumps_omit_version_marker():
     assert dumps(v) == b'\xe0\x01\x00\xea\x21\x05'
     assert dumps(v, omit_version_marker=True) == b'\xe0\x01\x00\xea\x21\x05'
 
+
+# See issue https://github.com/amzn/ion-python/issues/213
+def test_loads_unicode_utf8_conversion():
+    # Generate test data that is larger than 1024*32 bytes.
+    data = "[ '''\u2013''',"
+    data += 'test,' * 100000
+    data += "]"
+    # The loads API should convert it to UTF-8 without raising an illegal-number-of-bytes-read exception.
+    loads(data, parse_eagerly=True)
+