Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 49 additions & 16 deletions cpp/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -195,16 +195,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {

} // namespace

/// Append as many string objects from NumPy arrays to a `StringBuilder` as we
/// Append as many string objects from NumPy arrays to a `BinaryBuilder` as we
/// can fit
///
/// \param[in] offset starting offset for appending
/// \param[out] end_offset ending offset where we stopped appending. Will
/// be length of arr if fully consumed
/// \param[out] have_bytes true if we encountered any PyBytes object
static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask,
int64_t offset, BinaryBuilder* builder,
int64_t* end_offset, bool* have_bytes) {
int64_t* end_offset) {
PyObject* obj;

Ndarray1DIndexer<PyObject*> objects(arr);
Expand All @@ -222,18 +221,26 @@ static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask,
if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
continue;
} else if (!PyBytes_Check(obj)) {
} else if (PyBytes_Check(obj)) {
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cast is entirely wrong :-(

if (ARROW_PREDICT_FALSE(builder->value_data_length() + length >
kBinaryMemoryLimit)) {
break;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have trouble parsing this... If we don't know how to convert this, we should fail, not simply break from the loop, no?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After reading the code a bit more carefully, I understand... though there is still a problem: what if length is larger than kBinaryMemoryLimit?

}
RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
} else if (PyByteArray_Check(obj)) {
const int32_t length = static_cast<int32_t>(PyByteArray_GET_SIZE(obj));
if (ARROW_PREDICT_FALSE(builder->value_data_length() + length >
kBinaryMemoryLimit)) {
break;
}
RETURN_NOT_OK(builder->Append(PyByteArray_AS_STRING(obj), length));
} else {
std::stringstream ss;
ss << "Error converting from Python objects to bytes: ";
RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
RETURN_NOT_OK(InvalidConversion(obj, "str, bytes, bytearray", &ss));
return Status::Invalid(ss.str());
}

const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
break;
}
RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
}

// If we consumed the whole array, this will be the length of arr
Expand Down Expand Up @@ -506,6 +513,7 @@ class NumPyConverter {
Status ConvertBooleans();
Status ConvertObjectStrings();
Status ConvertObjectFloats();
Status ConvertObjectBytes();
Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
Status ConvertObjectIntegers();
Status ConvertLists(const std::shared_ptr<DataType>& type);
Expand Down Expand Up @@ -988,6 +996,29 @@ Status NumPyConverter::ConvertObjectIntegers() {
return PushBuilderResult(&builder);
}

Status NumPyConverter::ConvertObjectBytes() {
PyAcquireGIL lock;

BinaryBuilder builder(binary(), pool_);
RETURN_NOT_OK(builder.Resize(length_));

if (length_ == 0) {
// Produce an empty chunk
std::shared_ptr<Array> chunk;
RETURN_NOT_OK(builder.Finish(&chunk));
out_arrays_.emplace_back(std::move(chunk));
} else {
int64_t offset = 0;
while (offset < length_) {
RETURN_NOT_OK(AppendObjectBinaries(arr_, mask_, offset, &builder, &offset));
std::shared_ptr<Array> chunk;
RETURN_NOT_OK(builder.Finish(&chunk));
out_arrays_.emplace_back(std::move(chunk));
}
}
return Status::OK();
}

Status NumPyConverter::ConvertObjectFixedWidthBytes(
const std::shared_ptr<DataType>& type) {
PyAcquireGIL lock;
Expand Down Expand Up @@ -1088,6 +1119,8 @@ Status NumPyConverter::ConvertObjectsInfer() {
return ConvertTimes();
} else if (PyObject_IsInstance(obj, decimal_type_.obj()) == 1) {
return ConvertDecimals();
} else if (PyByteArray_Check(obj)) {
return ConvertObjectBytes();
} else if (PyList_Check(obj)) {
std::shared_ptr<DataType> inferred_type;
RETURN_NOT_OK(InferArrowType(obj, &inferred_type));
Expand All @@ -1105,7 +1138,7 @@ Status NumPyConverter::ConvertObjectsInfer() {
return ConvertLists(inferred_type);
} else {
const std::string supported_types =
"string, bool, float, int, date, time, decimal, list, array";
"string, bool, float, int, date, time, decimal, bytearray, list, array";
std::stringstream ss;
ss << "Error inferring Arrow type for Python object array. ";
RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss));
Expand Down Expand Up @@ -1154,6 +1187,8 @@ Status NumPyConverter::ConvertObjects() {
switch (type_->id()) {
case Type::STRING:
return ConvertObjectStrings();
case Type::BINARY:
return ConvertObjectBytes();
case Type::FIXED_SIZE_BINARY:
return ConvertObjectFixedWidthBytes(type_);
case Type::BOOL:
Expand Down Expand Up @@ -1339,8 +1374,6 @@ template <>
inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, BinaryType>(
const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
PyAcquireGIL lock;
// TODO: If there are bytes involed, convert to Binary representation
bool have_bytes = false;

Ndarray1DIndexer<uint8_t> mask_values;

Expand All @@ -1363,8 +1396,8 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, BinaryType>(
RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));

int64_t offset = 0;
RETURN_NOT_OK(AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, &offset,
&have_bytes));
RETURN_NOT_OK(
AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, &offset));
if (offset < PyArray_SIZE(numpy_array)) {
return Status::Invalid("Array cell value exceeded 2GB");
}
Expand Down
18 changes: 17 additions & 1 deletion python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,22 @@ def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
with pytest.raises(pa.ArrowInvalid):
pa.Table.from_pandas(df, schema=schema)

def test_variable_size_bytes(self):
s = pd.Series([b'123', b'', b'a', None])
arr = pa.Array.from_pandas(s, type=pa.binary())
assert arr.type == pa.binary()
_check_series_roundtrip(s, type_=pa.binary())

def test_binary_from_bytearray(self):
s = pd.Series([bytearray(b'123'), bytearray(b''), bytearray(b'a')])
# Explicitly set type
arr = pa.Array.from_pandas(s, type=pa.binary())
assert arr.type == pa.binary()
# Infer type from bytearrays
arr = pa.Array.from_pandas(s)
assert arr.type == pa.binary()
_check_series_roundtrip(s, type_=pa.binary())

def test_table_empty_str(self):
values = ['', '', '', '', '']
df = pd.DataFrame({'strings': values})
Expand Down Expand Up @@ -1693,7 +1709,7 @@ class TestConvertMisc(object):
# XXX unsupported
# (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
(np.object, pa.string()),
# (np.object, pa.binary()), # XXX unsupported
(np.object, pa.binary()),
(np.object, pa.binary(10)),
(np.object, pa.list_(pa.int64())),
]
Expand Down