Skip to content

Commit

Permalink
ARROW-7913: [C++][Python][R] C++ implementation of C data interface
Browse files Browse the repository at this point in the history
Closes #6483 from pitrou/ARROW-7913-c-data-interface-impl and squashes the following commits:

d18ec1d <Antoine Pitrou> Make internal Parse functions return Result<>
2a912a0 <Antoine Pitrou> Add issue number beside TODO
5e6d306 <Antoine Pitrou> Apply review comments
a65924c <Antoine Pitrou> Remove dead code
1705066 <Antoine Pitrou> Try a blind fix for the buildbot failure
ceeded8 <Antoine Pitrou> ARROW-7913:  C++ implementation of C data interface

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
  • Loading branch information
pitrou authored and bkietz committed Mar 2, 2020
1 parent e667230 commit 24ce242
Show file tree
Hide file tree
Showing 54 changed files with 5,944 additions and 328 deletions.
2 changes: 1 addition & 1 deletion c_glib/test/test-array.rb
Expand Up @@ -154,7 +154,7 @@ def test_diff
def test_different_type
array = build_string_array(["Start", "Shutdown", "Reboot"])
other_array = build_int8_array([2, 3, 6, 10])
assert_equal("# Array types differed: string vs int8",
assert_equal("# Array types differed: string vs int8\n",
array.diff_unified(other_array))
end
end
Expand Down
1 change: 1 addition & 0 deletions ci/conda_env_python.yml
Expand Up @@ -16,6 +16,7 @@
# under the License.

# don't add pandas here, because it is not a mandatory test dependency
cffi
cython
cloudpickle
hypothesis
Expand Down
1 change: 1 addition & 0 deletions ci/conda_env_r.yml
Expand Up @@ -32,6 +32,7 @@ r-covr
r-hms
r-lubridate
r-rcmdcheck
r-reticulate
r-rmarkdown
r-testthat
r-tibble
2 changes: 1 addition & 1 deletion ci/cpp-msvc-build-main.bat
Expand Up @@ -103,7 +103,7 @@ popd

pushd python

pip install -r requirements.txt pickle5
pip install -r requirements.txt

set PYARROW_CXXFLAGS=%ARROW_CXXFLAGS%
set PYARROW_CMAKE_GENERATOR=%GENERATOR%
Expand Down
1 change: 1 addition & 0 deletions ci/docker/conda-r.dockerfile
Expand Up @@ -48,5 +48,6 @@ ENV ARROW_BUILD_STATIC=OFF \
ARROW_ORC=OFF \
ARROW_PARQUET=ON \
ARROW_PLASMA=OFF \
ARROW_USE_CCACHE=ON \
ARROW_USE_GLOG=OFF \
LC_ALL=en_US.UTF-8
21 changes: 20 additions & 1 deletion ci/docker/linux-apt-r.dockerfile
Expand Up @@ -47,7 +47,11 @@ RUN apt-get update -y && \
# R CMD CHECK --as-cran needs pdflatex to build the package manual
texlive-latex-base \
# Need locales so we can set UTF-8
locales && \
locales \
# Need Python to check py-to-r bridge
python3 \
python3-pip \
python3-dev && \
locale-gen en_US.UTF-8 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Expand All @@ -63,6 +67,19 @@ COPY ci/scripts/r_deps.sh /arrow/ci/scripts/
COPY r/DESCRIPTION /arrow/r/
RUN /arrow/ci/scripts/r_deps.sh /arrow

# Set up Python 3 and its dependencies
RUN ln -s /usr/bin/python3 /usr/local/bin/python && \
ln -s /usr/bin/pip3 /usr/local/bin/pip

COPY python/requirements.txt \
python/requirements-test.txt \
/arrow/python/

RUN pip install \
-r arrow/python/requirements.txt \
cython \
setuptools

ENV \
ARROW_BUILD_STATIC=OFF \
ARROW_BUILD_TESTS=OFF \
Expand All @@ -74,5 +91,7 @@ ENV \
ARROW_ORC=OFF \
ARROW_PARQUET=ON \
ARROW_PLASMA=OFF \
ARROW_PYTHON=ON \
ARROW_USE_CCACHE=ON \
ARROW_USE_GLOG=OFF \
LC_ALL=en_US.UTF-8
2 changes: 2 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Expand Up @@ -150,6 +150,7 @@ set(ARROW_SRCS
tensor.cc
type.cc
visitor.cc
c/bridge.cc
io/buffered.cc
io/compressed.cc
io/file.cc
Expand Down Expand Up @@ -550,6 +551,7 @@ add_arrow_benchmark(type_benchmark)
add_subdirectory(testing)

add_subdirectory(array)
add_subdirectory(c)
add_subdirectory(io)
add_subdirectory(util)
add_subdirectory(vendored)
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array.cc
Expand Up @@ -738,6 +738,12 @@ Result<std::shared_ptr<StructArray>> StructArray::Make(
if (offset > length) {
return Status::IndexError("Offset greater than length of child arrays");
}
if (null_bitmap == nullptr) {
if (null_count > 0) {
return Status::Invalid("null_count = ", null_count, " but no null bitmap given");
}
null_count = 0;
}
return std::make_shared<StructArray>(struct_(fields), length - offset, children,
null_bitmap, null_count, offset);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/array/builder_primitive.h
Expand Up @@ -83,15 +83,15 @@ class NumericBuilder : public ArrayBuilder {
/// uninitialized memory access
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, static_cast<value_type>(0));
data_builder_.UnsafeAppend(length, value_type{}); // zero
UnsafeSetNull(length);
return Status::OK();
}

/// \brief Append a single null element
Status AppendNull() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(static_cast<value_type>(0));
data_builder_.UnsafeAppend(value_type{}); // zero
UnsafeAppendToBitmap(false);
return Status::OK();
}
Expand Down Expand Up @@ -243,7 +243,7 @@ class NumericBuilder : public ArrayBuilder {

void UnsafeAppendNull() {
ArrayBuilder::UnsafeAppendToBitmap(false);
data_builder_.UnsafeAppend(0);
data_builder_.UnsafeAppend(value_type{}); // zero
}

std::shared_ptr<DataType> type() const override { return type_; }
Expand Down
41 changes: 6 additions & 35 deletions cpp/src/arrow/array/builder_time.h
Expand Up @@ -21,52 +21,23 @@

#include <memory>

#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type_traits.h"
#include "arrow/util/macros.h"

namespace arrow {

class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder {
// TODO(ARROW-7938): this class is untested

class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
public:
using TypeClass = DayTimeIntervalType;
using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;

explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
: DayTimeIntervalBuilder(day_time_interval(), pool) {}

DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
: ArrayBuilder(pool), builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {}

void Reset() override { builder_.Reset(); }
Status Resize(int64_t capacity) override { return builder_.Resize(capacity); }
Status Append(DayMilliseconds day_millis) {
return builder_.Append(reinterpret_cast<uint8_t*>(&day_millis));
}
void UnsafeAppend(DayMilliseconds day_millis) {
builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&day_millis));
}
using ArrayBuilder::UnsafeAppendNull;
Status AppendNull() override { return builder_.AppendNull(); }
Status AppendNulls(int64_t length) override { return builder_.AppendNulls(length); }
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
auto result = builder_.FinishInternal(out);
if (*out != NULLPTR) {
(*out)->type = type();
}
return result;
}

std::shared_ptr<DataType> type() const override { return day_time_interval(); }

private:
FixedSizeBinaryBuilder builder_;
explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
: NumericBuilder<DayTimeIntervalType>(type, pool) {}
};

} // namespace arrow
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/diff_test.cc
Expand Up @@ -152,7 +152,7 @@ TEST_F(DiffTest, Errors) {
ASSERT_RAISES(TypeError, Diff(*base_, *target_, default_memory_pool()));

ASSERT_FALSE(base_->Equals(*target_, EqualOptions().diff_sink(&formatted)));
ASSERT_EQ(formatted.str(), R"(# Array types differed: int32 vs string)");
ASSERT_EQ(formatted.str(), "# Array types differed: int32 vs string\n");
}

template <typename ArrowType>
Expand Down
16 changes: 10 additions & 6 deletions cpp/src/arrow/array/validate.cc
Expand Up @@ -18,6 +18,7 @@
#include "arrow/array/validate.h"

#include "arrow/array.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/int_util.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
Expand All @@ -41,11 +42,13 @@ struct ValidateArrayVisitor {
ARROW_RETURN_IF(array.data()->buffers.size() != 2,
Status::Invalid("number of buffers is != 2"));

if (array.length() > 0 && array.data()->buffers[1] == nullptr) {
return Status::Invalid("values buffer is null");
}
if (array.length() > 0 && array.values() == nullptr) {
return Status::Invalid("values is null");
if (array.length() > 0) {
if (array.data()->buffers[1] == nullptr) {
return Status::Invalid("values buffer is null");
}
if (array.values() == nullptr) {
return Status::Invalid("values is null");
}
}
return Status::OK();
}
Expand Down Expand Up @@ -265,7 +268,8 @@ struct ValidateArrayVisitor {

auto value_offsets = array.value_offsets();
if (value_offsets == nullptr) {
if (array.length() != 0) {
// For length 0, an empty offsets array seems accepted as a special case (ARROW-544)
if (array.length() > 0) {
return Status::Invalid("non-empty array but value_offsets_ is null");
}
return Status::OK();
Expand Down
22 changes: 22 additions & 0 deletions cpp/src/arrow/c/CMakeLists.txt
@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Build targets for the arrow/c directory.
# NOTE(review): add_arrow_test, add_arrow_benchmark and arrow_install_all_headers
# are project macros defined elsewhere; the notes below describe only what the
# arguments visibly name.

# Registers the "bridge_test" test; PREFIX presumably namespaces the resulting
# test/executable name as "arrow-c-..." -- confirm against the macro definition.
add_arrow_test(bridge_test PREFIX "arrow-c")

# Registers the "bridge_benchmark" benchmark.
add_arrow_benchmark(bridge_benchmark)

# Presumably installs every header of this directory under "arrow/c".
arrow_install_all_headers("arrow/c")
65 changes: 65 additions & 0 deletions cpp/src/arrow/c/abi.h
@@ -0,0 +1,65 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Flag constants with distinct single-bit values (1, 2, 4).
// NOTE(review): presumably meant to be OR-ed together into ArrowSchema::flags --
// confirm against the C data interface specification.
#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4

// Plain C description of an array's type, exchanged across the C data interface.
// The field order and types below are the ABI: do not reorder or resize them.
struct ArrowSchema {
// Array type description
// NOTE(review): `format`, `name` and `metadata` are raw C pointers; their
// encodings and nullability are not visible in this header -- consult the
// C data interface specification before reading or writing them.
const char* format;
const char* name;
const char* metadata;
// Presumably a bitwise OR of the ARROW_FLAG_* constants defined above.
int64_t flags;
// Presumably the number of entries pointed to by `children`.
int64_t n_children;
// Pointer to an array of pointers to child ArrowSchema structures.
struct ArrowSchema** children;
// Pointer to another ArrowSchema; presumably describes the dictionary values
// for dictionary-encoded types (TODO confirm, including when it may be null).
struct ArrowSchema* dictionary;

// Release callback
// Function pointer receiving a pointer to this same struct type.
// NOTE(review): who calls it and what state it must leave behind is defined
// by the specification, not by this header.
void (*release)(struct ArrowSchema*);
// Opaque producer-specific data
void* private_data;
};

// Plain C description of an array's data, exchanged across the C data interface.
// The field order and types below are the ABI: do not reorder or resize them.
struct ArrowArray {
// Array data description
// All counts and positions are signed 64-bit integers.
// NOTE(review): units (presumably elements, not bytes) and any sentinel
// values for `null_count` are not visible here -- confirm against the spec.
int64_t length;
int64_t null_count;
int64_t offset;
// Presumably the number of entries pointed to by `buffers`.
int64_t n_buffers;
// Presumably the number of entries pointed to by `children`.
int64_t n_children;
// Pointer to an array of untyped, read-only buffer pointers.
const void** buffers;
// Pointer to an array of pointers to child ArrowArray structures.
struct ArrowArray** children;
// Pointer to another ArrowArray; presumably holds the dictionary values for
// dictionary-encoded arrays (TODO confirm, including when it may be null).
struct ArrowArray* dictionary;

// Release callback
// Function pointer receiving a pointer to this same struct type.
// NOTE(review): who calls it and what state it must leave behind is defined
// by the specification, not by this header.
void (*release)(struct ArrowArray*);
// Opaque producer-specific data
void* private_data;
};

#ifdef __cplusplus
}
#endif

0 comments on commit 24ce242

Please sign in to comment.