Skip to content

Commit

Permalink
ARROW-838: [Python] Expand pyarrow.array to handle NumPy arrays not o…
Browse files Browse the repository at this point in the history
…riginating in pandas

This unifies the ingest path for 1D data into `pyarrow.array`. I added the argument `from_pandas` to turn null sentinel checking on or off:

```
In [8]: arr = np.random.randn(10000000)

In [9]: arr[::3] = np.nan

In [10]: arr2 = pa.array(arr)

In [11]: arr2.null_count
Out[11]: 0

In [12]: %timeit arr2 = pa.array(arr)
The slowest run took 5.43 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 68.4 µs per loop

In [13]: arr2 = pa.array(arr, from_pandas=True)

In [14]: arr2.null_count
Out[14]: 3333334

In [15]: %timeit arr2 = pa.array(arr, from_pandas=True)
1 loop, best of 3: 228 ms per loop
```

When the data is contiguous, the conversion is always zero-copy. However, when `from_pandas=True` and no null mask is passed, a null bitmap is constructed and populated.

This also permits sequence reads into integers smaller than int64:

```
In [17]: pa.array([1, 2, 3, 4], type='i1')
Out[17]:
<pyarrow.lib.Int8Array object at 0x7ffa1c1c65e8>
[
  1,
  2,
  3,
  4
]
```

Oh, I also added NumPy-like string type aliases:

```
In [18]: pa.int32() == 'i4'
Out[18]: True
```

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #1146 from wesm/expand-py-array-method and squashes the following commits:

1570e52 [Wes McKinney] Code review comments
d3bbb3c [Wes McKinney] Handle type aliases in cast, too
797f015 [Wes McKinney] Allow null checking to be skipped with from_pandas=False in pyarrow.array
f2802fc [Wes McKinney] Cleaner codepath for numpy->arrow conversions
587c575 [Wes McKinney] Add direct types sequence converters for more data types
cf40b76 [Wes McKinney] Add type aliases, some unit tests
7b530e4 [Wes McKinney] Consolidate both sequence and ndarray/Series/Index conversion in pyarrow.Array
  • Loading branch information
wesm committed Sep 30, 2017
1 parent 7c61611 commit 796129b
Show file tree
Hide file tree
Showing 16 changed files with 698 additions and 455 deletions.
4 changes: 2 additions & 2 deletions cpp/src/arrow/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ set(ARROW_PYTHON_SRCS
init.cc
io.cc
numpy_convert.cc
pandas_to_arrow.cc
numpy_to_arrow.cc
python_to_arrow.cc
pyarrow.cc
)
Expand Down Expand Up @@ -100,7 +100,7 @@ install(FILES
io.h
numpy_convert.h
numpy_interop.h
pandas_to_arrow.h
numpy_to_arrow.h
python_to_arrow.h
platform.h
pyarrow.h
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/python/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include "arrow/python/helpers.h"
#include "arrow/python/io.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/pandas_to_arrow.h"
#include "arrow/python/numpy_to_arrow.h"
#include "arrow/python/python_to_arrow.h"

#endif // ARROW_PYTHON_API_H
223 changes: 152 additions & 71 deletions cpp/src/arrow/python/builtin_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <datetime.h>

#include <algorithm>
#include <limits>
#include <sstream>
#include <string>

Expand Down Expand Up @@ -359,7 +360,11 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
if (PySequence_Check(obj)) {
for (int64_t i = 0; i < size; ++i) {
OwnedRef ref(PySequence_GetItem(obj, i));
RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
if (ref.obj() == Py_None) {
RETURN_NOT_OK(this->typed_builder_->AppendNull());
} else {
RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
}
}
} else if (PyObject_HasAttrString(obj, "__iter__")) {
PyObject* iter = PyObject_GetIter(obj);
Expand All @@ -370,7 +375,11 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
// consuming at size.
while ((item = PyIter_Next(iter)) && i < size) {
OwnedRef ref(item);
RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
if (ref.obj() == Py_None) {
RETURN_NOT_OK(this->typed_builder_->AppendNull());
} else {
RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
}
++i;
}
if (size != i) {
Expand All @@ -388,80 +397,154 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> {
public:
inline Status AppendItem(const OwnedRef& item) {
if (item.obj() == Py_None) {
return typed_builder_->AppendNull();
} else {
return Status::Invalid("NullConverter: passed non-None value");
}
return Status::Invalid("NullConverter: passed non-None value");
}
};

class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter> {
public:
inline Status AppendItem(const OwnedRef& item) {
if (item.obj() == Py_None) {
return typed_builder_->AppendNull();
} else {
if (item.obj() == Py_True) {
return typed_builder_->Append(true);
} else {
return typed_builder_->Append(false);
}
return typed_builder_->Append(item.obj() == Py_True);
}
};

/// Sequence converter that appends Python integers to an Int8Builder.
class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> {
 public:
  /// Read `item` through PyLong_AsLongLong and append it as an int8_t.
  ///
  /// Returns Status::Invalid when the converted value lies outside the int8_t
  /// range; otherwise returns whatever RETURN_IF_PYERROR or the builder's
  /// Append reports.
  inline Status AppendItem(const OwnedRef& item) {
    constexpr int64_t kLowest = std::numeric_limits<int8_t>::min();
    constexpr int64_t kHighest = std::numeric_limits<int8_t>::max();

    const int64_t converted = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));

    const bool fits = converted >= kLowest && converted <= kHighest;
    if (ARROW_PREDICT_FALSE(!fits)) {
      return Status::Invalid(
          "Cannot coerce values to array type that would "
          "lose data");
    }
    // The error check deliberately follows the range test: CPython documents
    // -1 as PyLong_AsLongLong's failure value, and -1 is inside the int8_t
    // range, so a failed conversion still reaches this point.
    RETURN_IF_PYERROR();
    return typed_builder_->Append(static_cast<int8_t>(converted));
  }
};

/// Sequence converter that appends Python integers to an Int16Builder.
class Int16Converter : public TypedConverterVisitor<Int16Builder, Int16Converter> {
 public:
  /// Read `item` through PyLong_AsLongLong and append it as an int16_t.
  ///
  /// Returns Status::Invalid when the converted value lies outside the
  /// int16_t range; otherwise returns whatever RETURN_IF_PYERROR or the
  /// builder's Append reports.
  inline Status AppendItem(const OwnedRef& item) {
    using Limits = std::numeric_limits<int16_t>;

    const int64_t converted = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));

    const bool too_small = converted < Limits::min();
    const bool too_large = converted > Limits::max();
    if (ARROW_PREDICT_FALSE(too_large || too_small)) {
      return Status::Invalid(
          "Cannot coerce values to array type that would "
          "lose data");
    }
    // A failed conversion yields -1 (per the CPython docs), which passes the
    // range test above, so the pending Python error is examined here.
    RETURN_IF_PYERROR();
    return typed_builder_->Append(static_cast<int16_t>(converted));
  }
};

/// Sequence converter that appends Python integers to an Int32Builder.
class Int32Converter : public TypedConverterVisitor<Int32Builder, Int32Converter> {
 public:
  /// Read `item` through PyLong_AsLongLong and append it as an int32_t.
  ///
  /// Returns Status::Invalid when the converted value lies outside the
  /// int32_t range; otherwise returns whatever RETURN_IF_PYERROR or the
  /// builder's Append reports.
  inline Status AppendItem(const OwnedRef& item) {
    PyObject* py_value = item.obj();
    const int64_t wide = static_cast<int64_t>(PyLong_AsLongLong(py_value));

    if (ARROW_PREDICT_FALSE(wide > std::numeric_limits<int32_t>::max())) {
      return Status::Invalid(
          "Cannot coerce values to array type that would "
          "lose data");
    }
    if (ARROW_PREDICT_FALSE(wide < std::numeric_limits<int32_t>::min())) {
      return Status::Invalid(
          "Cannot coerce values to array type that would "
          "lose data");
    }
    // CPython's failure value for PyLong_AsLongLong is -1, which is a valid
    // int32_t, so conversion errors are only detected at this check.
    RETURN_IF_PYERROR();
    return typed_builder_->Append(static_cast<int32_t>(wide));
  }
};

class Int64Converter : public TypedConverterVisitor<Int64Builder, Int64Converter> {
public:
inline Status AppendItem(const OwnedRef& item) {
int64_t val;
if (item.obj() == Py_None) {
return typed_builder_->AppendNull();
} else {
val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
RETURN_IF_PYERROR();
return typed_builder_->Append(val);
int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
RETURN_IF_PYERROR();
return typed_builder_->Append(val);
}
};

/// Sequence converter that appends Python integers to a UInt8Builder.
class UInt8Converter : public TypedConverterVisitor<UInt8Builder, UInt8Converter> {
 public:
  /// Read `item` through PyLong_AsLongLong and append it as a uint8_t.
  ///
  /// Returns the status produced by RETURN_IF_PYERROR when the conversion
  /// itself fails, Status::Invalid when the value is negative or larger than
  /// the uint8_t maximum, and otherwise the builder's Append status.
  inline Status AppendItem(const OwnedRef& item) {
    // Keep the value signed. Casting the result straight to uint64_t would
    // turn PyLong_AsLongLong's -1 failure value into UINT64_MAX, which trips
    // the range check and returns a misleading "lose data" status without
    // ever looking at the pending Python error.
    const int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
    RETURN_IF_PYERROR();

    constexpr int64_t kMax = std::numeric_limits<uint8_t>::max();
    if (ARROW_PREDICT_FALSE(val < 0 || val > kMax)) {
      return Status::Invalid(
          "Cannot coerce values to array type that would "
          "lose data");
    }
    return typed_builder_->Append(static_cast<uint8_t>(val));
  }
};

class DateConverter : public TypedConverterVisitor<Date64Builder, DateConverter> {
class UInt16Converter : public TypedConverterVisitor<UInt16Builder, UInt16Converter> {
public:
inline Status AppendItem(const OwnedRef& item) {
if (item.obj() == Py_None) {
return typed_builder_->AppendNull();
} else {
PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
return typed_builder_->Append(PyDate_to_ms(pydate));
uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));

if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint16_t>::max() ||
val < std::numeric_limits<uint16_t>::min())) {
return Status::Invalid(
"Cannot coerce values to array type that would "
"lose data");
}
RETURN_IF_PYERROR();
return typed_builder_->Append(static_cast<uint16_t>(val));
}
};

/// Sequence converter that appends Python integers to a UInt32Builder.
class UInt32Converter : public TypedConverterVisitor<UInt32Builder, UInt32Converter> {
 public:
  /// Read `item` through PyLong_AsLongLong and append it as a uint32_t.
  ///
  /// Returns the status produced by RETURN_IF_PYERROR when the conversion
  /// itself fails, Status::Invalid when the value is negative or larger than
  /// the uint32_t maximum, and otherwise the builder's Append status.
  inline Status AppendItem(const OwnedRef& item) {
    // Keep the value signed. Casting the result straight to uint64_t would
    // turn PyLong_AsLongLong's -1 failure value into UINT64_MAX, which trips
    // the range check and returns a misleading "lose data" status without
    // ever looking at the pending Python error.
    const int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
    RETURN_IF_PYERROR();

    // The uint32_t maximum fits comfortably in int64_t, so compare as signed.
    constexpr int64_t kMax = std::numeric_limits<uint32_t>::max();
    if (ARROW_PREDICT_FALSE(val < 0 || val > kMax)) {
      return Status::Invalid(
          "Cannot coerce values to array type that would "
          "lose data");
    }
    return typed_builder_->Append(static_cast<uint32_t>(val));
  }
};

/// Sequence converter that appends Python integers to a UInt64Builder.
class UInt64Converter : public TypedConverterVisitor<UInt64Builder, UInt64Converter> {
 public:
  /// Read `item` through PyLong_AsLongLong and append it as a uint64_t.
  ///
  /// Returns the status produced by RETURN_IF_PYERROR when the conversion
  /// fails, Status::Invalid for negative values, and otherwise the builder's
  /// Append status.
  inline Status AppendItem(const OwnedRef& item) {
    const int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
    RETURN_IF_PYERROR();

    // A negative value would silently wrap to a huge unsigned number when
    // appended; reject it the same way the narrower unsigned converters do.
    if (ARROW_PREDICT_FALSE(val < 0)) {
      return Status::Invalid(
          "Cannot coerce values to array type that would "
          "lose data");
    }
    // NOTE(review): values above INT64_MAX cannot be read through
    // PyLong_AsLongLong (CPython documents an OverflowError for them), so they
    // surface via RETURN_IF_PYERROR above instead of being appended.
    return typed_builder_->Append(static_cast<uint64_t>(val));
  }
};

/// Sequence converter that appends Python date objects to a Date64Builder.
class DateConverter : public TypedConverterVisitor<Date64Builder, DateConverter> {
 public:
  /// Reinterpret `item` as a PyDateTime_Date and append PyDate_to_ms of it.
  ///
  /// No type check is performed on `item` here; the pointer is cast as-is.
  inline Status AppendItem(const OwnedRef& item) {
    return typed_builder_->Append(
        PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(item.obj())));
  }
};

class TimestampConverter
: public TypedConverterVisitor<Date64Builder, TimestampConverter> {
public:
inline Status AppendItem(const OwnedRef& item) {
if (item.obj() == Py_None) {
return typed_builder_->AppendNull();
} else {
PyDateTime_DateTime* pydatetime =
reinterpret_cast<PyDateTime_DateTime*>(item.obj());
return typed_builder_->Append(PyDateTime_to_us(pydatetime));
}
auto pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj());
return typed_builder_->Append(PyDateTime_to_us(pydatetime));
}
};

class DoubleConverter : public TypedConverterVisitor<DoubleBuilder, DoubleConverter> {
public:
inline Status AppendItem(const OwnedRef& item) {
double val;
if (item.obj() == Py_None) {
return typed_builder_->AppendNull();
} else {
val = PyFloat_AsDouble(item.obj());
RETURN_IF_PYERROR();
return typed_builder_->Append(val);
}
double val = PyFloat_AsDouble(item.obj());
RETURN_IF_PYERROR();
return typed_builder_->Append(val);
}
};

Expand All @@ -473,10 +556,7 @@ class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverte
Py_ssize_t length;
OwnedRef tmp;

if (item.obj() == Py_None) {
RETURN_NOT_OK(typed_builder_->AppendNull());
return Status::OK();
} else if (PyUnicode_Check(item.obj())) {
if (PyUnicode_Check(item.obj())) {
tmp.reset(PyUnicode_AsUTF8String(item.obj()));
RETURN_IF_PYERROR();
bytes_obj = tmp.obj();
Expand Down Expand Up @@ -504,10 +584,7 @@ class FixedWidthBytesConverter
Py_ssize_t expected_length =
std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type())
->byte_width();
if (item.obj() == Py_None) {
RETURN_NOT_OK(typed_builder_->AppendNull());
return Status::OK();
} else if (PyUnicode_Check(item.obj())) {
if (PyUnicode_Check(item.obj())) {
tmp.reset(PyUnicode_AsUTF8String(item.obj()));
RETURN_IF_PYERROR();
bytes_obj = tmp.obj();
Expand Down Expand Up @@ -535,9 +612,7 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter>
Py_ssize_t length;

PyObject* obj = item.obj();
if (obj == Py_None) {
return typed_builder_->AppendNull();
} else if (PyBytes_Check(obj)) {
if (PyBytes_Check(obj)) {
tmp.reset(
PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj)));
RETURN_IF_PYERROR();
Expand Down Expand Up @@ -565,14 +640,10 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> {
Status Init(ArrayBuilder* builder) override;

inline Status AppendItem(const OwnedRef& item) override {
if (item.obj() == Py_None) {
return typed_builder_->AppendNull();
} else {
RETURN_NOT_OK(typed_builder_->Append());
PyObject* item_obj = item.obj();
int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj));
return value_converter_->AppendData(item_obj, list_size);
}
RETURN_NOT_OK(typed_builder_->Append());
PyObject* item_obj = item.obj();
int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj));
return value_converter_->AppendData(item_obj, list_size);
}

protected:
Expand All @@ -584,16 +655,12 @@ class DecimalConverter
public:
inline Status AppendItem(const OwnedRef& item) {
/// TODO(phillipc): Check for nan?
if (item.obj() != Py_None) {
std::string string;
RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string));

Decimal128 value;
RETURN_NOT_OK(Decimal128::FromString(string, &value));
return typed_builder_->Append(value);
}
std::string string;
RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string));

return typed_builder_->AppendNull();
Decimal128 value;
RETURN_NOT_OK(Decimal128::FromString(string, &value));
return typed_builder_->Append(value);
}
};

Expand All @@ -604,8 +671,22 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
return std::make_shared<NullConverter>();
case Type::BOOL:
return std::make_shared<BoolConverter>();
case Type::INT8:
return std::make_shared<Int8Converter>();
case Type::INT16:
return std::make_shared<Int16Converter>();
case Type::INT32:
return std::make_shared<Int32Converter>();
case Type::INT64:
return std::make_shared<Int64Converter>();
case Type::UINT8:
return std::make_shared<UInt8Converter>();
case Type::UINT16:
return std::make_shared<UInt16Converter>();
case Type::UINT32:
return std::make_shared<UInt32Converter>();
case Type::UINT64:
return std::make_shared<UInt64Converter>();
case Type::DATE64:
return std::make_shared<DateConverter>();
case Type::TIMESTAMP:
Expand Down

0 comments on commit 796129b

Please sign in to comment.