From a5efaafcf51e810eb31d41a144d0aef671b504e0 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Fri, 2 Mar 2018 16:16:34 -0800 Subject: [PATCH 1/5] added support for variable length binary conversion from pandas to arrow --- cpp/src/arrow/python/numpy_to_arrow.cc | 36 +++++++++++++++++---- python/pyarrow/tests/test_convert_pandas.py | 8 ++++- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 71bf69fc1ed2..180faddf9829 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -195,16 +195,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { } // namespace -/// Append as many string objects from NumPy arrays to a `StringBuilder` as we +/// Append as many string objects from NumPy arrays to a `BinaryBuilder` as we /// can fit /// /// \param[in] offset starting offset for appending /// \param[out] end_offset ending offset where we stopped appending. Will /// be length of arr if fully consumed -/// \param[out] have_bytes true if we encountered any PyBytes object static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask, int64_t offset, BinaryBuilder* builder, - int64_t* end_offset, bool* have_bytes) { + int64_t* end_offset) { PyObject* obj; Ndarray1DIndexer objects(arr); @@ -506,6 +505,7 @@ class NumPyConverter { Status ConvertBooleans(); Status ConvertObjectStrings(); Status ConvertObjectFloats(); + Status ConvertObjectBytes(); Status ConvertObjectFixedWidthBytes(const std::shared_ptr& type); Status ConvertObjectIntegers(); Status ConvertLists(const std::shared_ptr& type); @@ -988,6 +988,29 @@ Status NumPyConverter::ConvertObjectIntegers() { return PushBuilderResult(&builder); } +Status NumPyConverter::ConvertObjectBytes() { + PyAcquireGIL lock; + + BinaryBuilder builder(type_, pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + if (length_ == 0) { + // Produce an empty chunk + std::shared_ptr chunk; + RETURN_NOT_OK(builder.Finish(&chunk)); + out_arrays_.emplace_back(std::move(chunk)); + } else { + int64_t offset = 0; + while (offset < length_) { + RETURN_NOT_OK(AppendObjectBinaries(arr_, mask_, offset, &builder, &offset)); + std::shared_ptr chunk; + RETURN_NOT_OK(builder.Finish(&chunk)); + out_arrays_.emplace_back(std::move(chunk)); + } + } + return Status::OK(); +} + Status NumPyConverter::ConvertObjectFixedWidthBytes( const std::shared_ptr& type) { PyAcquireGIL lock; @@ -1154,6 +1177,8 @@ Status NumPyConverter::ConvertObjects() { switch (type_->id()) { case Type::STRING: return ConvertObjectStrings(); + case Type::BINARY: + return ConvertObjectBytes(); case Type::FIXED_SIZE_BINARY: return ConvertObjectFixedWidthBytes(type_); case Type::BOOL: @@ -1339,8 +1364,6 @@ template <> inline Status NumPyConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { PyAcquireGIL lock; - // TODO: If there are bytes involed, convert to Binary representation - bool have_bytes = false; Ndarray1DIndexer mask_values; @@ -1363,8 +1386,7 @@ inline Status NumPyConverter::ConvertTypedLists( RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); int64_t offset = 0; - RETURN_NOT_OK(AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, &offset, - &have_bytes)); + RETURN_NOT_OK(AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, &offset)); if (offset < PyArray_SIZE(numpy_array)) { return Status::Invalid("Array cell value exceeded 2GB"); } diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 45ec66dab106..1eb8a056447e 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1099,6 +1099,12 @@ def test_fixed_size_bytes_does_not_accept_varying_lengths(self): with pytest.raises(pa.ArrowInvalid): pa.Table.from_pandas(df, schema=schema) + def test_variable_size_bytes(self): + s = pd.Series([b'123', b'', b'a', None]) + arr = pa.Array.from_pandas(s, type=pa.binary()) + assert arr.type == pa.binary() + _check_series_roundtrip(s, type_=pa.binary()) + def test_table_empty_str(self): values = ['', '', '', '', ''] df = pd.DataFrame({'strings': values}) @@ -1693,7 +1699,7 @@ class TestConvertMisc(object): # XXX unsupported # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])), (np.object, pa.string()), - # (np.object, pa.binary()), # XXX unsupported + (np.object, pa.binary()), (np.object, pa.binary(10)), (np.object, pa.list_(pa.int64())), ] From 610888c6e9579d2c3ad3bbab05be15d4e62169d1 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 6 Mar 2018 09:54:46 -0800 Subject: [PATCH 2/5] fix formatting issues --- cpp/src/arrow/python/numpy_to_arrow.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 180faddf9829..21adb209408d 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -1178,7 +1178,7 @@ Status NumPyConverter::ConvertObjects() { case Type::STRING: return ConvertObjectStrings(); case Type::BINARY: - return ConvertObjectBytes(); + return ConvertObjectBytes(); case Type::FIXED_SIZE_BINARY: return ConvertObjectFixedWidthBytes(type_); case Type::BOOL: @@ -1386,7 +1386,8 @@ inline Status NumPyConverter::ConvertTypedLists( RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); int64_t offset = 0; - RETURN_NOT_OK(AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, &offset)); + RETURN_NOT_OK(AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, + &offset)); if (offset < PyArray_SIZE(numpy_array)) { return Status::Invalid("Array cell value exceeded 2GB"); } From 86b934be878804f59f90b848ab2e0300f895aceb Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 6 Mar 2018 13:14:21 -0800 Subject: [PATCH 3/5] added support for converting from bytearrays --- cpp/src/arrow/python/numpy_to_arrow.cc | 28 +++++++++++++-------- python/pyarrow/tests/test_convert_pandas.py | 10 ++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 21adb209408d..7d89a519d51d 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -221,18 +221,24 @@ static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask, if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) { RETURN_NOT_OK(builder->AppendNull()); continue; - } else if (!PyBytes_Check(obj)) { + } else if (PyBytes_Check(obj)) { + const int32_t length = static_cast(PyBytes_GET_SIZE(obj)); + if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { + break; + } + RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length)); + } else if (PyByteArray_Check(obj)) { + const int32_t length = static_cast(PyByteArray_GET_SIZE(obj)); + if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { + break; + } + RETURN_NOT_OK(builder->Append(PyByteArray_AS_STRING(obj), length)); + } else { std::stringstream ss; ss << "Error converting from Python objects to bytes: "; - RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss)); + RETURN_NOT_OK(InvalidConversion(obj, "str, bytes, bytearray", &ss)); return Status::Invalid(ss.str()); } - - const int32_t length = static_cast(PyBytes_GET_SIZE(obj)); - if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { - break; - } - RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length)); } // If we consumed the whole array, this will be the length of arr @@ -991,7 +997,7 @@ Status NumPyConverter::ConvertObjectIntegers() { Status NumPyConverter::ConvertObjectBytes() { PyAcquireGIL lock; - BinaryBuilder builder(type_, pool_); + BinaryBuilder builder(binary(), pool_); RETURN_NOT_OK(builder.Resize(length_)); if (length_ == 0) { @@ -1111,6 +1117,8 @@ Status NumPyConverter::ConvertObjectsInfer() { return ConvertTimes(); } else if (PyObject_IsInstance(obj, decimal_type_.obj()) == 1) { return ConvertDecimals(); + } else if (PyByteArray_Check(obj)) { + return ConvertObjectBytes(); } else if (PyList_Check(obj)) { std::shared_ptr inferred_type; RETURN_NOT_OK(InferArrowType(obj, &inferred_type)); @@ -1128,7 +1136,7 @@ Status NumPyConverter::ConvertObjectsInfer() { return ConvertLists(inferred_type); } else { const std::string supported_types = - "string, bool, float, int, date, time, decimal, list, array"; + "string, bool, float, int, date, time, decimal, bytearray, list, array"; std::stringstream ss; ss << "Error inferring Arrow type for Python object array. "; RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss)); diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 1eb8a056447e..9ecc73c129d2 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1105,6 +1105,16 @@ def test_variable_size_bytes(self): assert arr.type == pa.binary() _check_series_roundtrip(s, type_=pa.binary()) + def test_binary_from_bytearray(self): + s = pd.Series([bytearray(b'123'), bytearray(b''), bytearray(b'a')]) + # Explicitly set type + arr = pa.Array.from_pandas(s, type=pa.binary()) + assert arr.type == pa.binary() + # Infer type from bytearrays + arr = pa.Array.from_pandas(s) + assert arr.type == pa.binary() + _check_series_roundtrip(s, type_=pa.binary()) + def test_table_empty_str(self): values = ['', '', '', '', ''] df = pd.DataFrame({'strings': values}) From f2c09568590e702bab91c3439ea47ec3dd7fd312 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 6 Mar 2018 13:29:47 -0800 Subject: [PATCH 4/5] fix lint issue --- cpp/src/arrow/python/numpy_to_arrow.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 7d89a519d51d..20a83e009b50 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -223,13 +223,15 @@ static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask, continue; } else if (PyBytes_Check(obj)) { const int32_t length = static_cast(PyBytes_GET_SIZE(obj)); - if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { + if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > + kBinaryMemoryLimit)) { break; } RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length)); } else if (PyByteArray_Check(obj)) { const int32_t length = static_cast(PyByteArray_GET_SIZE(obj)); - if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { + if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > + kBinaryMemoryLimit)) { break; } RETURN_NOT_OK(builder->Append(PyByteArray_AS_STRING(obj), length)); From 4e707a6cde38189d0c3783cb5572298baea7878c Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 2 Apr 2018 15:01:35 -0700 Subject: [PATCH 5/5] fix format error --- cpp/src/arrow/python/numpy_to_arrow.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 20a83e009b50..eb0af8b08cf5 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -1396,8 +1396,8 @@ inline Status NumPyConverter::ConvertTypedLists( RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); int64_t offset = 0; - RETURN_NOT_OK(AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, - &offset)); + RETURN_NOT_OK( + AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, &offset)); if (offset < PyArray_SIZE(numpy_array)) { return Status::Invalid("Array cell value exceeded 2GB"); }