GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636)
### Rationale for this change

`PyArray_DescrFromScalar` returns a new reference, so we should be careful to decref it once we no longer need it.
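
For illustration, here is a minimal sketch of the ownership pattern this change adopts (it mirrors the new `NumPyScalarToArrowDataType` helper added in `numpy_convert.cc`; the function name `TypeFromNumPyScalar` is only for the example):

```cpp
// Sketch only: PyArray_DescrFromScalar returns a *new* reference, so the
// descriptor is handed to Arrow's OwnedRef RAII guard, which Py_XDECREFs it
// when it goes out of scope, on both the success and the error path.
#include "arrow/python/common.h"         // arrow::py::OwnedRef
#include "arrow/python/numpy_convert.h"  // arrow::py::NumPyDtypeToArrow
#include "arrow/python/numpy_interop.h"  // NumPy C API (PyArray_DescrFromScalar)

arrow::Result<std::shared_ptr<arrow::DataType>> TypeFromNumPyScalar(PyObject* scalar) {
  PyArray_Descr* descr = PyArray_DescrFromScalar(scalar);  // new reference
  arrow::py::OwnedRef descr_ref(reinterpret_cast<PyObject*>(descr));  // released on scope exit
  return arrow::py::NumPyDtypeToArrow(descr);
}
```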

### Are these changes tested?

No.

### Are there any user-facing changes?

No.
* Closes: #39599

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
pitrou authored and raulcd committed Feb 20, 2024
1 parent 34b54f0 commit fc586a0
Showing 8 changed files with 48 additions and 68 deletions.
3 changes: 1 addition & 2 deletions python/pyarrow/array.pxi
@@ -66,8 +66,7 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values,
dtype = values.dtype

if type is None and dtype != object:
with nogil:
check_status(NumPyDtypeToArrow(dtype, &c_type))
c_type = GetResultValue(NumPyDtypeToArrow(dtype))

if type is not None:
c_type = type.sp_type
2 changes: 1 addition & 1 deletion python/pyarrow/includes/libarrow_python.pxd
@@ -73,7 +73,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
object obj, object mask, const PyConversionOptions& options,
CMemoryPool* pool)

CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
CResult[shared_ptr[CDataType]] NumPyDtypeToArrow(object dtype)

CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
c_bool from_pandas,
5 changes: 1 addition & 4 deletions python/pyarrow/src/arrow/python/inference.cc
@@ -468,10 +468,7 @@ class TypeInferrer {
if (numpy_dtype_count_ > 0) {
// All NumPy scalars and Nones/nulls
if (numpy_dtype_count_ + none_count_ == total_count_) {
std::shared_ptr<DataType> type;
RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type));
*out = type;
return Status::OK();
return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out);
}

// The "bad path": data contains a mix of NumPy scalars and
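A side note on the `.Value(out)` call in the hunk above: it is the usual idiom for bridging a `Result`-returning function into a `Status`-returning one that still uses an out-parameter. A minimal sketch with hypothetical names:

```cpp
// Sketch only (ComputeAnswer/FillAnswer are illustrative, not Arrow code).
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Result<int> ComputeAnswer() { return 42; }

arrow::Status FillAnswer(int* out) {
  // Result<T>::Value(out) moves the value into *out on success, or returns
  // the embedded error Status without touching *out on failure.
  return ComputeAnswer().Value(out);
}
```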
77 changes: 33 additions & 44 deletions python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -59,12 +59,11 @@ NumPyBuffer::~NumPyBuffer() {

#define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \
case NPY_##NPY_NAME: \
*out = FACTORY(); \
break;
return FACTORY();

namespace {

Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out) {
Result<std::shared_ptr<DataType>> GetTensorType(PyObject* dtype) {
if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
return Status::TypeError("Did not pass numpy.dtype object");
}
@@ -84,11 +83,8 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out) {
TO_ARROW_TYPE_CASE(FLOAT16, float16);
TO_ARROW_TYPE_CASE(FLOAT32, float32);
TO_ARROW_TYPE_CASE(FLOAT64, float64);
default: {
return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
}
}
return Status::OK();
return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
}

Status GetNumPyType(const DataType& type, int* type_num) {
@@ -120,15 +116,21 @@ Status GetNumPyType(const DataType& type, int* type_num) {

} // namespace

Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) {
PyArray_Descr* descr = PyArray_DescrFromScalar(scalar);
OwnedRef descr_ref(reinterpret_cast<PyObject*>(descr));
return NumPyDtypeToArrow(descr);
}

Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
return Status::TypeError("Did not pass numpy.dtype object");
}
PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
return NumPyDtypeToArrow(descr, out);
return NumPyDtypeToArrow(descr);
}

Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
int type_num = fix_numpy_type_num(descr->type_num);

switch (type_num) {
@@ -151,20 +153,15 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
switch (date_dtype->meta.base) {
case NPY_FR_s:
*out = timestamp(TimeUnit::SECOND);
break;
return timestamp(TimeUnit::SECOND);
case NPY_FR_ms:
*out = timestamp(TimeUnit::MILLI);
break;
return timestamp(TimeUnit::MILLI);
case NPY_FR_us:
*out = timestamp(TimeUnit::MICRO);
break;
return timestamp(TimeUnit::MICRO);
case NPY_FR_ns:
*out = timestamp(TimeUnit::NANO);
break;
return timestamp(TimeUnit::NANO);
case NPY_FR_D:
*out = date32();
break;
return date32();
case NPY_FR_GENERIC:
return Status::NotImplemented("Unbound or generic datetime64 time unit");
default:
@@ -176,29 +173,22 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
switch (timedelta_dtype->meta.base) {
case NPY_FR_s:
*out = duration(TimeUnit::SECOND);
break;
return duration(TimeUnit::SECOND);
case NPY_FR_ms:
*out = duration(TimeUnit::MILLI);
break;
return duration(TimeUnit::MILLI);
case NPY_FR_us:
*out = duration(TimeUnit::MICRO);
break;
return duration(TimeUnit::MICRO);
case NPY_FR_ns:
*out = duration(TimeUnit::NANO);
break;
return duration(TimeUnit::NANO);
case NPY_FR_GENERIC:
return Status::NotImplemented("Unbound or generic timedelta64 time unit");
default:
return Status::NotImplemented("Unsupported timedelta64 time unit");
}
} break;
default: {
return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
}
}

return Status::OK();
return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
}

#undef TO_ARROW_TYPE_CASE
@@ -230,9 +220,8 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
strides[i] = array_strides[i];
}

std::shared_ptr<DataType> type;
RETURN_NOT_OK(
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
ARROW_ASSIGN_OR_RAISE(
auto type, GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray))));
*out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
return Status::OK();
}
@@ -435,9 +424,9 @@ Status NdarraysToSparseCOOTensor(MemoryPool* pool, PyObject* data_ao, PyObject*

PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
std::shared_ptr<DataType> type_data;
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
&type_data));
ARROW_ASSIGN_OR_RAISE(
auto type_data,
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));

std::shared_ptr<Tensor> coords;
RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
@@ -462,9 +451,9 @@ Status NdarraysToSparseCSXMatrix(MemoryPool* pool, PyObject* data_ao, PyObject*

PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
std::shared_ptr<DataType> type_data;
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
&type_data));
ARROW_ASSIGN_OR_RAISE(
auto type_data,
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));

std::shared_ptr<Tensor> indptr, indices;
RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
@@ -491,9 +480,9 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject*
const int ndim = static_cast<const int>(shape.size());
PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
std::shared_ptr<DataType> type_data;
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
&type_data));
ARROW_ASSIGN_OR_RAISE(
auto type_data,
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));

std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
std::vector<std::shared_ptr<Tensor>> indices(ndim);
6 changes: 4 additions & 2 deletions python/pyarrow/src/arrow/python/numpy_convert.h
@@ -49,9 +49,11 @@ class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
};

ARROW_PYTHON_EXPORT
Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype);
ARROW_PYTHON_EXPORT
Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out);
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr);
ARROW_PYTHON_EXPORT
Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar);

ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
const std::vector<std::string>& dim_names,
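For callers, migrating from the out-parameter overloads to `Result` typically looks like the following sketch (`ExampleCaller` is hypothetical; real call sites appear in `numpy_to_arrow.cc` below):

```cpp
// Sketch only: how a Status-returning caller consumes the new Result-based API.
#include "arrow/python/numpy_convert.h"
#include "arrow/python/numpy_interop.h"  // PyArray_Descr
#include "arrow/result.h"

arrow::Status ExampleCaller(PyArray_Descr* descr) {
  // Old style (removed by this commit):
  //   std::shared_ptr<arrow::DataType> type;
  //   RETURN_NOT_OK(arrow::py::NumPyDtypeToArrow(descr, &type));
  // New style: unwrap the Result, propagating any error Status to the caller.
  ARROW_ASSIGN_OR_RAISE(auto type, arrow::py::NumPyDtypeToArrow(descr));
  // ... use `type` (a std::shared_ptr<arrow::DataType>) ...
  (void)type;
  return arrow::Status::OK();
}
```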
11 changes: 5 additions & 6 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -462,8 +462,7 @@ template <typename ArrowType>
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
RETURN_NOT_OK(PrepareInputData<ArrowType>(data));

std::shared_ptr<DataType> input_type;
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
ARROW_ASSIGN_OR_RAISE(auto input_type, NumPyDtypeToArrow(dtype_));

if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_,
@@ -490,15 +489,15 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
RETURN_NOT_OK(s);
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
if (!input_type->Equals(*type_)) {
// The null bitmap was already computed in VisitNative()
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
type_, cast_options_, pool_, data));
}
}
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
type_, cast_options_, pool_, data));
@@ -531,15 +530,15 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
}
*data = std::move(result);
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
if (!input_type->Equals(*type_)) {
// The null bitmap was already computed in VisitNative()
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
type_, cast_options_, pool_, data));
}
}
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
type_, cast_options_, pool_, data));
6 changes: 2 additions & 4 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -386,8 +386,7 @@ class PyValue {
}
} else if (PyArray_CheckAnyScalarExact(obj)) {
// validate that the numpy scalar has np.datetime64 dtype
std::shared_ptr<DataType> numpy_type;
RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
if (!numpy_type->Equals(*type)) {
return Status::NotImplemented("Expected np.datetime64 but got: ",
numpy_type->ToString());
@@ -466,8 +465,7 @@ class PyValue {
}
} else if (PyArray_CheckAnyScalarExact(obj)) {
// validate that the numpy scalar has np.datetime64 dtype
std::shared_ptr<DataType> numpy_type;
RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
if (!numpy_type->Equals(*type)) {
return Status::NotImplemented("Expected np.timedelta64 but got: ",
numpy_type->ToString());
6 changes: 1 addition & 5 deletions python/pyarrow/types.pxi
@@ -5140,12 +5140,8 @@ def from_numpy_dtype(object dtype):
>>> pa.from_numpy_dtype(np.str_)
DataType(string)
"""
cdef shared_ptr[CDataType] c_type
dtype = np.dtype(dtype)
with nogil:
check_status(NumPyDtypeToArrow(dtype, &c_type))

return pyarrow_wrap_data_type(c_type)
return pyarrow_wrap_data_type(GetResultValue(NumPyDtypeToArrow(dtype)))


def is_boolean_value(object obj):
