Skip to content

Commit

Permalink
ARROW-1689: [Python] Implement zero-copy conversions for DictionaryArray
Browse files Browse the repository at this point in the history
This PR closes [ARROW-1689](https://issues.apache.org/jira/browse/ARROW-1689).
I want to add the zero-copy option after #1233 merged.

Author: Licht-T <licht-t@outlook.jp>
Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #1237 from Licht-T/feature-categorical-index-zerocopy and squashes the following commits:

53342e8 [Wes McKinney] Use the PyCapsule API to preserve base references to C++ objects when no PyObject* is available to set as zero-copy ndarray base
0b847d1 [Wes McKinney] Fix flakes
4270b5d [Licht-T] Fix C++ lint issues
ddc6b84 [Licht-T] TST: Add test_zero_copy_dictionaries
e0561dc [Licht-T] ENH: Add zero_copy_only option check
de4ed3e [Licht-T] ENH: Implement Categorical Block Zero-Copy
  • Loading branch information
Licht-T authored and xhochy committed Oct 28, 2017
1 parent cc03a45 commit 74a934a
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 29 deletions.
120 changes: 99 additions & 21 deletions cpp/src/arrow/python/arrow_to_pandas.cc
Expand Up @@ -109,6 +109,20 @@ static inline bool ListTypeSupported(const DataType& type) {
}
return false;
}
// ----------------------------------------------------------------------
// PyCapsule code for setting ndarray base to reference C++ object

struct ArrowCapsule {
std::shared_ptr<Array> array;
};

namespace {

void ArrowCapsule_Destructor(PyObject* capsule) {
delete reinterpret_cast<ArrowCapsule*>(PyCapsule_GetPointer(capsule, "arrow"));
}

} // namespace

// ----------------------------------------------------------------------
// pandas 0.x DataFrame conversion internals
Expand Down Expand Up @@ -957,23 +971,40 @@ class CategoricalBlock : public PandasBlock {
using TRAITS = internal::arrow_traits<ARROW_INDEX_TYPE>;
using T = typename TRAITS::T;
constexpr int npy_type = TRAITS::npy_type;
RETURN_NOT_OK(AllocateNDArray(npy_type, 1));

// No relative placement offset because a single column
T* out_values = reinterpret_cast<T*>(block_data_);

const ChunkedArray& data = *col->data().get();

for (int c = 0; c < data.num_chunks(); c++) {
const std::shared_ptr<Array> arr = data.chunk(c);
const auto& dict_arr = static_cast<const DictionaryArray&>(*arr);
// Sniff the first chunk
const std::shared_ptr<Array> arr_first = data.chunk(0);
const auto& dict_arr_first = static_cast<const DictionaryArray&>(*arr_first);
const auto& indices_first =
static_cast<const PrimitiveArray&>(*dict_arr_first.indices());

if (data.num_chunks() == 1 && indices_first.null_count() == 0) {
RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
} else {
if (options_.zero_copy_only) {
std::stringstream ss;
ss << "Needed to copy " << data.num_chunks() << " chunks with "
<< indices_first.null_count() << " indices nulls, but zero_copy_only was True";
return Status::Invalid(ss.str());
}
RETURN_NOT_OK(AllocateNDArray(npy_type, 1));

// No relative placement offset because a single column
T* out_values = reinterpret_cast<T*>(block_data_);

const auto& indices = static_cast<const PrimitiveArray&>(*dict_arr.indices());
auto in_values = reinterpret_cast<const T*>(indices.raw_values());
for (int c = 0; c < data.num_chunks(); c++) {
const std::shared_ptr<Array> arr = data.chunk(c);
const auto& dict_arr = static_cast<const DictionaryArray&>(*arr);

// Null is -1 in CategoricalBlock
for (int i = 0; i < arr->length(); ++i) {
*out_values++ = indices.IsNull(i) ? -1 : in_values[i];
const auto& indices = static_cast<const PrimitiveArray&>(*dict_arr.indices());
auto in_values = reinterpret_cast<const T*>(indices.raw_values());

// Null is -1 in CategoricalBlock
for (int i = 0; i < arr->length(); ++i) {
*out_values++ = indices.IsNull(i) ? -1 : in_values[i];
}
}
}

Expand Down Expand Up @@ -1043,6 +1074,43 @@ class CategoricalBlock : public PandasBlock {
PyObject* dictionary() const { return dictionary_.obj(); }

protected:
template <typename T>
Status AllocateNDArrayFromIndices(int npy_type, const PrimitiveArray& indices) {
npy_intp block_dims[1] = {num_rows_};

auto in_values = reinterpret_cast<const T*>(indices.raw_values());
void* data = const_cast<T*>(in_values);

PyAcquireGIL lock;

PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
if (descr == nullptr) {
// Error occurred, trust error state is set
return Status::OK();
}

PyObject* block_arr = PyArray_NewFromDescr(&PyArray_Type, descr, 1, block_dims,
nullptr, data, NPY_ARRAY_CARRAY, nullptr);

npy_intp placement_dims[1] = {num_columns_};
PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
if (placement_arr == NULL) {
// TODO(wesm): propagating Python exception
return Status::OK();
}

block_arr_.reset(block_arr);
placement_arr_.reset(placement_arr);

block_data_ = reinterpret_cast<uint8_t*>(
PyArray_DATA(reinterpret_cast<PyArrayObject*>(block_arr)));

placement_data_ = reinterpret_cast<int64_t*>(
PyArray_DATA(reinterpret_cast<PyArrayObject*>(placement_arr)));

return Status::OK();
}

MemoryPool* pool_;
OwnedRef dictionary_;
bool ordered_;
Expand Down Expand Up @@ -1369,12 +1437,26 @@ class ArrowDeserializer {
return Status::OK();
}

if (PyArray_SetBaseObject(arr_, py_ref_) == -1) {
PyObject* base;
if (py_ref_ == nullptr) {
ArrowCapsule* capsule = new ArrowCapsule;
capsule->array = arr;
base = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow",
&ArrowCapsule_Destructor);
if (base == nullptr) {
delete capsule;
RETURN_IF_PYERROR();
}
} else {
base = py_ref_;
}

if (PyArray_SetBaseObject(arr_, base) == -1) {
// Error occurred, trust that SetBaseObject set the error state
return Status::OK();
} else {
// PyArray_SetBaseObject steals our reference to py_ref_
Py_INCREF(py_ref_);
// PyArray_SetBaseObject steals our reference to base
Py_INCREF(base);
}

// Arrow data is immutable.
Expand All @@ -1399,7 +1481,7 @@ class ArrowDeserializer {
typedef typename traits::T T;
int npy_type = traits::npy_type;

if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
if (data_.num_chunks() == 1 && data_.null_count() == 0) {
return ConvertValuesZeroCopy<TYPE>(options_, npy_type, data_.chunk(0));
} else if (options_.zero_copy_only) {
std::stringstream ss;
Expand Down Expand Up @@ -1462,7 +1544,7 @@ class ArrowDeserializer {

typedef typename traits::T T;

if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
if (data_.num_chunks() == 1 && data_.null_count() == 0) {
return ConvertValuesZeroCopy<TYPE>(options_, traits::npy_type, data_.chunk(0));
} else if (options_.zero_copy_only) {
std::stringstream ss;
Expand Down Expand Up @@ -1566,10 +1648,6 @@ class ArrowDeserializer {
}

Status Visit(const DictionaryType& type) {
if (options_.zero_copy_only) {
return Status::Invalid("DictionaryType needs copies, but zero_copy_only was True");
}

auto block = std::make_shared<CategoricalBlock>(options_, nullptr, col_->length());
RETURN_NOT_OK(block->Write(col_, 0, 0));

Expand Down
19 changes: 11 additions & 8 deletions python/pyarrow/tests/test_convert_pandas.py
Expand Up @@ -217,6 +217,17 @@ def test_zero_copy_success(self):
result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
npt.assert_array_equal(result, [0, 1, 2])

def test_zero_copy_dictionaries(self):
arr = pa.DictionaryArray.from_arrays(
np.array([0, 0]),
np.array([5]))

result = arr.to_pandas(zero_copy_only=True)
values = pd.Categorical([5, 5])

tm.assert_series_equal(pd.Series(result), pd.Series(values),
check_names=False)

def test_zero_copy_failure_on_object_types(self):
with pytest.raises(pa.ArrowException):
pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True)
Expand Down Expand Up @@ -245,14 +256,6 @@ def test_zero_copy_failure_on_timestamp_types(self):
with pytest.raises(pa.ArrowException):
pa.array(arr).to_pandas(zero_copy_only=True)

def test_zero_copy_dictionaries(self):
arr = pa.DictionaryArray.from_arrays(
np.array([0, 0]),
np.array(['A']))

with pytest.raises(pa.ArrowException):
arr.to_pandas(zero_copy_only=True)

def test_float_nulls(self):
num_values = 100

Expand Down

0 comments on commit 74a934a

Please sign in to comment.