From fa677cc287d3ae0ba935d91495f2d5aeda151742 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 16 Oct 2017 09:31:36 -0400 Subject: [PATCH 1/3] Only write necessary bytes from null bitmap in Feather writer Change-Id: I7f9ebc0ed95c26bda6ae6c072d6d791df071476c --- cpp/src/arrow/ipc/feather.cc | 6 +++--- python/pyarrow/tests/test_feather.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 9d244f11544..b392a0e00e6 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -554,12 +554,12 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { if (values.null_count() > 0) { // We assume there is one bit for each value in values.nulls, aligned on a // byte boundary, and we write this much data into the stream + int64_t null_bitmap_size = BitUtil::BytesForBits(values.length()); if (values.null_bitmap()) { RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(), - values.null_bitmap()->size(), &bytes_written)); + null_bitmap_size, &bytes_written)); } else { - RETURN_NOT_OK(WritePaddedBlank( - stream_.get(), BitUtil::BytesForBits(values.length()), &bytes_written)); + RETURN_NOT_OK(WritePaddedBlank(stream_.get(), null_bitmap_size, &bytes_written)); } meta->total_bytes += bytes_written; } diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index a7013ba5998..22d428a5cb5 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -249,6 +249,23 @@ def test_boolean_nulls(self): result = read_feather(path) assert_frame_equal(result, ex_frame) + def test_buffer_bounds_error(self): + # ARROW-1676 + values = pa.array([None] + list(range(128)), type=pa.float64()) + + path = random_path() + self.test_files.append(path) + + writer = FeatherWriter() + writer.open(path) + + writer.write_array('arr', values) + writer.close() + + result = read_feather(path) + expected = pd.DataFrame({'arr': values.to_pandas()}) + assert_frame_equal(result, expected) + def test_boolean_object_nulls(self): repeats = 100 arr = np.array([False, None, True] * repeats, dtype=object) From 540f79d3b5063457951f791257a93e7c3211b2c3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 16 Oct 2017 09:39:33 -0400 Subject: [PATCH 2/3] Ensure proper padding Change-Id: I3db578a41c68790393f7ebf63621b7f5f6e6b5df --- cpp/src/arrow/ipc/feather.cc | 2 +- python/pyarrow/tests/test_feather.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b392a0e00e6..97ed601adb5 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -554,7 +554,7 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { if (values.null_count() > 0) { // We assume there is one bit for each value in values.nulls, aligned on a // byte boundary, and we write this much data into the stream - int64_t null_bitmap_size = BitUtil::BytesForBits(values.length()); + int64_t null_bitmap_size = GetOutputLength(BitUtil::BytesForBits(values.length())); if (values.null_bitmap()) { RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(), null_bitmap_size, &bytes_written)); diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 22d428a5cb5..a826bbc9dd4 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -266,6 +266,8 @@ def test_buffer_bounds_error(self): expected = pd.DataFrame({'arr': values.to_pandas()}) assert_frame_equal(result, expected) + self._check_pandas_roundtrip(expected, null_counts=[1]) + def test_boolean_object_nulls(self): repeats = 100 arr = np.array([False, None, True] * repeats, dtype=object) From 60a2ed55993d36141b0cf9879cf57e53d2280fd2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 16 Oct 2017 09:43:14 -0400 Subject: [PATCH 3/3] Fuzz test more array lengths Change-Id: Ibe10ff32f44be87a42b9d5d865c28be921e913a3 --- python/pyarrow/tests/test_feather.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index a826bbc9dd4..76f0844fa48 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -251,22 +251,23 @@ def test_boolean_nulls(self): def test_buffer_bounds_error(self): # ARROW-1676 - values = pa.array([None] + list(range(128)), type=pa.float64()) - path = random_path() self.test_files.append(path) - writer = FeatherWriter() - writer.open(path) + for i in range(16, 256): + values = pa.array([None] + list(range(i)), type=pa.float64()) - writer.write_array('arr', values) - writer.close() + writer = FeatherWriter() + writer.open(path) - result = read_feather(path) - expected = pd.DataFrame({'arr': values.to_pandas()}) - assert_frame_equal(result, expected) + writer.write_array('arr', values) + writer.close() + + result = read_feather(path) + expected = pd.DataFrame({'arr': values.to_pandas()}) + assert_frame_equal(result, expected) - self._check_pandas_roundtrip(expected, null_counts=[1]) + self._check_pandas_roundtrip(expected, null_counts=[1]) def test_boolean_object_nulls(self): repeats = 100