Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-37448: [MATLAB] Add arrow.array.ChunkedArray class #37525

Merged
merged 26 commits into from
Sep 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
6b2a785
Move getArrayProxyIDs from arrow.tabular.internal package to
sgilmore10 Aug 30, 2023
4629e7f
Add MATLAB class arrow.array.ChunkedArray
sgilmore10 Aug 30, 2023
60d5a85
Change field name from ArrowProxyIDs to ArrayProxyIDs
sgilmore10 Aug 30, 2023
73d4c54
Fix typo in get.Length
sgilmore10 Aug 30, 2023
35690a8
Update call to getArrayProxyIDs in recordBatch to reflect the changed
sgilmore10 Aug 30, 2023
a9dc4a8
Add C++ proxy ChunkedArray class
sgilmore10 Aug 30, 2023
9e09f82
Add getChunk method to proxy ChunkedArray class in C++
sgilmore10 Aug 31, 2023
50f008b
Register getChunk proxy method
sgilmore10 Aug 31, 2023
b4b2133
Fix error in chunk() method
sgilmore10 Aug 31, 2023
2b2037b
Fix error in chunk()
sgilmore10 Aug 31, 2023
7bfa195
Update getChunk() and getType() proxy methods to return struct arrays
sgilmore10 Aug 31, 2023
1e5255c
Implement isequal ChunkedArray
sgilmore10 Aug 31, 2023
a804056
Remove debug print statements from ChunkedArray::make()
sgilmore10 Sep 1, 2023
97f985a
Implement the isequal method for arrow.array.ChunkedArray
sgilmore10 Sep 1, 2023
f86fd45
Add unit tests for arrow.array.ChunkedArray
sgilmore10 Sep 1, 2023
afd7744
Add support for creating ChunkedArrays from zero arrays to fromArrays
sgilmore10 Sep 1, 2023
ca5ab89
Add tests for ChunkedArrays constructed from zero chunks
sgilmore10 Sep 1, 2023
2d63260
Add negative tests for chunk() method
sgilmore10 Sep 1, 2023
7d9deb1
Remove arrow.array.ChunkedArray as one of the classes createAllSuppor…
sgilmore10 Sep 1, 2023
99f6e09
Rename type to getType, valid to getValid, and length to getLength
sgilmore10 Sep 2, 2023
7c7eaef
Rename columnNames to getColumnNames and numColumns to getNumColumns
sgilmore10 Sep 2, 2023
a20c0d2
Rename typeID to getTypeID and numFields to getNumFields
sgilmore10 Sep 2, 2023
0b137fa
Rename bitWidth to getBitWidth
sgilmore10 Sep 2, 2023
d28a51a
Rename type to getType and name to getName
sgilmore10 Sep 2, 2023
3d68cd9
Update all getType proxy methods to return struct arrays with two
sgilmore10 Sep 2, 2023
ae1ee41
Fix comment indentation
sgilmore10 Sep 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions matlab/src/cpp/arrow/matlab/array/proxy/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ namespace arrow::matlab::array::proxy {
// Register Proxy methods.
REGISTER_METHOD(Array, toString);
REGISTER_METHOD(Array, toMATLAB);
REGISTER_METHOD(Array, length);
REGISTER_METHOD(Array, valid);
REGISTER_METHOD(Array, type);
REGISTER_METHOD(Array, getLength);
REGISTER_METHOD(Array, getValid);
REGISTER_METHOD(Array, getType);
REGISTER_METHOD(Array, isEqual);

}
Expand All @@ -51,13 +51,13 @@ namespace arrow::matlab::array::proxy {
context.outputs[0] = str_mda;
}

void Array::length(libmexclass::proxy::method::Context& context) {
void Array::getLength(libmexclass::proxy::method::Context& context) {
::matlab::data::ArrayFactory factory;
auto length_mda = factory.createScalar(array->length());
context.outputs[0] = length_mda;
}

void Array::valid(libmexclass::proxy::method::Context& context) {
void Array::getValid(libmexclass::proxy::method::Context& context) {
auto array_length = static_cast<size_t>(array->length());

// If the Arrow array has no null values, then return a MATLAB
Expand All @@ -77,7 +77,7 @@ namespace arrow::matlab::array::proxy {
context.outputs[0] = valid_elements_mda;
}

void Array::type(libmexclass::proxy::method::Context& context) {
void Array::getType(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;

mda::ArrayFactory factory;
Expand All @@ -87,11 +87,13 @@ namespace arrow::matlab::array::proxy {
context,
error::ARRAY_FAILED_TO_CREATE_TYPE_PROXY);

auto type_id = type_proxy->unwrap()->id();
auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(type_proxy);
const auto type_id = static_cast<int32_t>(type_proxy->unwrap()->id());
const auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(type_proxy);

context.outputs[0] = factory.createScalar(proxy_id);
context.outputs[1] = factory.createScalar(static_cast<int64_t>(type_id));
mda::StructArray output = factory.createStructArray({1, 1}, {"ProxyID", "TypeID"});
output[0]["ProxyID"] = factory.createScalar(proxy_id);
output[0]["TypeID"] = factory.createScalar(type_id);
context.outputs[0] = output;
}

void Array::isEqual(libmexclass::proxy::method::Context& context) {
Expand Down
6 changes: 3 additions & 3 deletions matlab/src/cpp/arrow/matlab/array/proxy/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ class Array : public libmexclass::proxy::Proxy {

void toString(libmexclass::proxy::method::Context& context);

void length(libmexclass::proxy::method::Context& context);
void getLength(libmexclass::proxy::method::Context& context);

void valid(libmexclass::proxy::method::Context& context);
void getValid(libmexclass::proxy::method::Context& context);

void type(libmexclass::proxy::method::Context& context);
void getType(libmexclass::proxy::method::Context& context);

virtual void toMATLAB(libmexclass::proxy::method::Context& context) = 0;

Expand Down
187 changes: 187 additions & 0 deletions matlab/src/cpp/arrow/matlab/array/proxy/chunked_array.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/util/utf8.h"

#include "arrow/matlab/array/proxy/chunked_array.h"
#include "arrow/matlab/array/proxy/array.h"
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/type/proxy/wrap.h"
#include "arrow/matlab/array/proxy/wrap.h"

#include "libmexclass/proxy/ProxyManager.h"

namespace arrow::matlab::array::proxy {

namespace {
// Builds the error reported when a numeric chunk index is used on a
// chunked array that has zero chunks.
libmexclass::error::Error makeEmptyChunkedArrayError() {
    return libmexclass::error::Error{
        error::CHUNKED_ARRAY_NUMERIC_INDEX_WITH_EMPTY_CHUNKED_ARRAY,
        std::string{"Numeric indexing using the chunk method is not supported for chunked arrays with zero chunks."}};
}

// Builds the error reported when a 1-based chunk index falls outside
// [1, num_chunks]. The message echoes both the offending index and the
// number of chunks.
libmexclass::error::Error makeInvalidNumericIndexError(const int32_t matlab_index, const int32_t num_chunks) {
    std::stringstream msg;
    msg << "Invalid chunk index: " << matlab_index
        << ". Chunk index must be between 1 and the number of chunks (" << num_chunks
        << ").";
    return libmexclass::error::Error{error::CHUNKED_ARRAY_INVALID_NUMERIC_CHUNK_INDEX, msg.str()};
}
}

// Takes shared ownership of the given arrow::ChunkedArray and registers
// the proxy methods of this class.
ChunkedArray::ChunkedArray(std::shared_ptr<arrow::ChunkedArray> chunked_array)
    : chunked_array{std::move(chunked_array)} {
    // Size / structure queries.
    REGISTER_METHOD(ChunkedArray, getLength);
    REGISTER_METHOD(ChunkedArray, getNumChunks);
    // Accessors that hand back new proxies (chunk array, data type).
    REGISTER_METHOD(ChunkedArray, getChunk);
    REGISTER_METHOD(ChunkedArray, getType);
    // Comparison.
    REGISTER_METHOD(ChunkedArray, isEqual);
}


// Factory used to construct a ChunkedArray proxy.
//
// constructor_arguments[0] is read as a struct with two fields:
//   - "ArrayProxyIDs": uint64 IDs of Array proxies that become the chunks.
//   - "TypeProxyID":   uint64 ID of the Type proxy giving the chunk data type
//                      (only its first element is used).
//
// Returns the new proxy, or an error with ID CHUNKED_ARRAY_MAKE_FAILED if
// arrow::ChunkedArray::Make fails.
libmexclass::proxy::MakeResult ChunkedArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
    namespace mda = ::matlab::data;

    mda::StructArray opts = constructor_arguments[0];
    const mda::TypedArray<uint64_t> array_proxy_ids = opts[0]["ArrayProxyIDs"];
    const mda::TypedArray<uint64_t> type_proxy_id = opts[0]["TypeProxyID"];

    // Retrieve all of the Array Proxy instances from the libmexclass ProxyManager.
    // The final size is known up front, so allocate once.
    std::vector<std::shared_ptr<arrow::Array>> arrays;
    arrays.reserve(array_proxy_ids.getNumberOfElements());
    for (const auto& array_proxy_id : array_proxy_ids) {
        auto base_proxy = libmexclass::proxy::ProxyManager::getProxy(array_proxy_id);
        // NOTE(review): unchecked downcast — assumes every ID refers to an
        // Array proxy; confirm the MATLAB caller guarantees this.
        auto array_proxy = std::static_pointer_cast<proxy::Array>(base_proxy);
        arrays.push_back(array_proxy->unwrap());
    }

    // NOTE(review): same assumption as above — the ID must refer to a Type proxy.
    auto base_type_proxy = libmexclass::proxy::ProxyManager::getProxy(type_proxy_id[0]);
    auto type_proxy = std::static_pointer_cast<type::proxy::Type>(base_type_proxy);
    auto type = type_proxy->unwrap();

    // Make() takes its arguments by value, so hand over the locals instead
    // of copying them.
    MATLAB_ASSIGN_OR_ERROR(auto chunked_array,
                           arrow::ChunkedArray::Make(std::move(arrays), std::move(type)),
                           error::CHUNKED_ARRAY_MAKE_FAILED);

    return std::make_unique<proxy::ChunkedArray>(std::move(chunked_array));
}

std::shared_ptr<arrow::ChunkedArray> ChunkedArray::unwrap() {
return chunked_array;
}

// Proxy method: writes the length of the wrapped chunked array to
// context.outputs[0] as a MATLAB scalar.
void ChunkedArray::getLength(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;
    mda::ArrayFactory factory;
    const auto total_length = chunked_array->length();
    context.outputs[0] = factory.createScalar(total_length);
}

// Proxy method: writes the number of chunks in the wrapped chunked array
// to context.outputs[0] as a MATLAB scalar.
void ChunkedArray::getNumChunks(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;
    mda::ArrayFactory factory;
    const auto num_chunks = chunked_array->num_chunks();
    context.outputs[0] = factory.createScalar(num_chunks);
}

// Proxy method: returns the chunk at a 1-based index.
//
// Input:  context.inputs[0] is read as a struct whose "Index" field holds
//         the 1-based (MATLAB) chunk index as an int32.
// Output: context.outputs[0] is a 1x1 struct with fields
//         "ProxyID" (ID of a newly managed Array proxy for the chunk) and
//         "TypeID"  (the chunk's Arrow type ID as an int64).
// Errors: sets context.error and returns early when the chunked array has
//         zero chunks, when the index is outside [1, num_chunks], or when
//         no Array proxy can be created for the chunk's type.
void ChunkedArray::getChunk(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;
    mda::ArrayFactory factory;

    mda::StructArray args = context.inputs[0];
    const mda::TypedArray<int32_t> index_mda = args[0]["Index"];
    const auto matlab_index = static_cast<int32_t>(index_mda[0]);
    const auto num_chunks = chunked_array->num_chunks();

    if (num_chunks == 0) {
        context.error = makeEmptyChunkedArrayError();
        return;
    }

    if (matlab_index < 1 || matlab_index > num_chunks) {
        context.error = makeInvalidNumericIndexError(matlab_index, num_chunks);
        return;
    }

    // Note: MATLAB uses 1-based indexing, so subtract 1. The subtraction is
    // done only after the range check above, so it cannot overflow for
    // extreme inputs such as INT32_MIN.
    // arrow::ChunkedArray::chunk does not do any bounds checking.
    const int32_t index = matlab_index - 1;

    const auto array = chunked_array->chunk(index);
    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto array_proxy,
                                        arrow::matlab::array::proxy::wrap(array),
                                        context,
                                        error::UNKNOWN_PROXY_FOR_ARRAY_TYPE);

    const auto array_proxy_id = libmexclass::proxy::ProxyManager::manageProxy(array_proxy);
    const auto type_id = static_cast<int64_t>(array->type_id());

    mda::StructArray output = factory.createStructArray({1, 1}, {"ProxyID", "TypeID"});
    output[0]["ProxyID"] = factory.createScalar(array_proxy_id);
    output[0]["TypeID"] = factory.createScalar(type_id);
    context.outputs[0] = output;
}


// Proxy method: returns the data type of the wrapped chunked array.
//
// Output: context.outputs[0] is a 1x1 struct with fields
//         "ProxyID" (ID of a newly managed Type proxy) and
//         "TypeID"  (the Arrow type ID as an int32).
// Errors: sets context.error with ID ARRAY_FAILED_TO_CREATE_TYPE_PROXY when
//         a Type proxy cannot be created for the chunked array's type.
void ChunkedArray::getType(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;

    mda::ArrayFactory factory;

    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto type_proxy,
                                        type::proxy::wrap(chunked_array->type()),
                                        context,
                                        error::ARRAY_FAILED_TO_CREATE_TYPE_PROXY);

    const auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(type_proxy);
    const auto type_id = static_cast<int32_t>(type_proxy->unwrap()->id());

    mda::StructArray output = factory.createStructArray({1, 1}, {"ProxyID", "TypeID"});
    output[0]["ProxyID"] = factory.createScalar(proxy_id);
    output[0]["TypeID"] = factory.createScalar(type_id);
    context.outputs[0] = output;
}

// Proxy method: compares the wrapped chunked array against every chunked
// array whose proxy ID appears in context.inputs[0] (a uint64 array).
// Writes a logical scalar to context.outputs[0]: true only if all of them
// compare equal (and therefore true when no IDs are supplied).
void ChunkedArray::isEqual(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;

    const mda::TypedArray<uint64_t> chunked_array_proxy_ids = context.inputs[0];

    bool all_equal = true;
    for (const auto& other_proxy_id : chunked_array_proxy_ids) {
        // Look up the ChunkedArray proxy registered under this ID.
        auto other_base_proxy = libmexclass::proxy::ProxyManager::getProxy(other_proxy_id);
        auto other_proxy = std::static_pointer_cast<proxy::ChunkedArray>(other_base_proxy);
        auto other_chunked_array = other_proxy->unwrap();

        // Deliberately call the Equals(const ChunkedArray&) overload rather
        // than the one taking a std::shared_ptr<ChunkedArray>, so that two
        // handles to the same object are not assumed equal based on their
        // address alone. This keeps NaNs comparing as not equal by default.
        all_equal = chunked_array->Equals(*other_chunked_array);
        if (!all_equal) {
            break;
        }
    }

    mda::ArrayFactory factory;
    context.outputs[0] = factory.createScalar(all_equal);
}
}
51 changes: 51 additions & 0 deletions matlab/src/cpp/arrow/matlab/array/proxy/chunked_array.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/chunked_array.h"

#include "libmexclass/proxy/Proxy.h"

namespace arrow::matlab::array::proxy {

class ChunkedArray : public libmexclass::proxy::Proxy {
public:
ChunkedArray(std::shared_ptr<arrow::ChunkedArray> chunked_array);

~ChunkedArray() {}

std::shared_ptr<arrow::ChunkedArray> unwrap();

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

protected:

void getLength(libmexclass::proxy::method::Context& context);

void getNumChunks(libmexclass::proxy::method::Context& context);

void getChunk(libmexclass::proxy::method::Context& context);

void getType(libmexclass::proxy::method::Context& context);

void isEqual(libmexclass::proxy::method::Context& context);

std::shared_ptr<arrow::ChunkedArray> chunked_array;
};

}
4 changes: 4 additions & 0 deletions matlab/src/cpp/arrow/matlab/error/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,5 +190,9 @@ namespace arrow::matlab::error {
static const char* FEATHER_VERSION_UNKNOWN = "arrow:io:feather:FeatherVersionUnknown";
static const char* FEATHER_FAILED_TO_READ_TABLE = "arrow:io:feather:FailedToReadTable";
static const char* FEATHER_FAILED_TO_READ_RECORD_BATCH = "arrow:io:feather:FailedToReadRecordBatch";
static const char* CHUNKED_ARRAY_MAKE_FAILED = "arrow:chunkedarray:MakeFailed";
static const char* CHUNKED_ARRAY_NUMERIC_INDEX_WITH_EMPTY_CHUNKED_ARRAY = "arrow:chunkedarray:NumericIndexWithEmptyChunkedArray";
static const char* CHUNKED_ARRAY_INVALID_NUMERIC_CHUNK_INDEX = "arrow:chunkedarray:InvalidNumericChunkIndex";


}
2 changes: 2 additions & 0 deletions matlab/src/cpp/arrow/matlab/proxy/factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "arrow/matlab/array/proxy/timestamp_array.h"
#include "arrow/matlab/array/proxy/time32_array.h"
#include "arrow/matlab/array/proxy/time64_array.h"
#include "arrow/matlab/array/proxy/chunked_array.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/tabular/proxy/schema.h"
#include "arrow/matlab/error/error.h"
Expand Down Expand Up @@ -55,6 +56,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name,
REGISTER_PROXY(arrow.array.proxy.Time32Array , arrow::matlab::array::proxy::NumericArray<arrow::Time32Type>);
REGISTER_PROXY(arrow.array.proxy.Time64Array , arrow::matlab::array::proxy::NumericArray<arrow::Time64Type>);
REGISTER_PROXY(arrow.array.proxy.Date32Array , arrow::matlab::array::proxy::NumericArray<arrow::Date32Type>);
REGISTER_PROXY(arrow.array.proxy.ChunkedArray , arrow::matlab::array::proxy::ChunkedArray);
REGISTER_PROXY(arrow.tabular.proxy.RecordBatch , arrow::matlab::tabular::proxy::RecordBatch);
REGISTER_PROXY(arrow.tabular.proxy.Schema , arrow::matlab::tabular::proxy::Schema);
REGISTER_PROXY(arrow.type.proxy.Field , arrow::matlab::type::proxy::Field);
Expand Down
8 changes: 4 additions & 4 deletions matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ namespace arrow::matlab::tabular::proxy {

RecordBatch::RecordBatch(std::shared_ptr<arrow::RecordBatch> record_batch) : record_batch{record_batch} {
REGISTER_METHOD(RecordBatch, toString);
REGISTER_METHOD(RecordBatch, numColumns);
REGISTER_METHOD(RecordBatch, columnNames);
REGISTER_METHOD(RecordBatch, getNumColumns);
REGISTER_METHOD(RecordBatch, getColumnNames);
REGISTER_METHOD(RecordBatch, getColumnByIndex);
REGISTER_METHOD(RecordBatch, getColumnByName);
REGISTER_METHOD(RecordBatch, getSchema);
Expand Down Expand Up @@ -104,15 +104,15 @@ namespace arrow::matlab::tabular::proxy {
return record_batch_proxy;
}

void RecordBatch::numColumns(libmexclass::proxy::method::Context& context) {
void RecordBatch::getNumColumns(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
mda::ArrayFactory factory;
const auto num_columns = record_batch->num_columns();
auto num_columns_mda = factory.createScalar(num_columns);
context.outputs[0] = num_columns_mda;
}

void RecordBatch::columnNames(libmexclass::proxy::method::Context& context) {
void RecordBatch::getColumnNames(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
mda::ArrayFactory factory;
const int num_columns = record_batch->num_columns();
Expand Down
4 changes: 2 additions & 2 deletions matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ namespace arrow::matlab::tabular::proxy {

protected:
void toString(libmexclass::proxy::method::Context& context);
void numColumns(libmexclass::proxy::method::Context& context);
void columnNames(libmexclass::proxy::method::Context& context);
void getNumColumns(libmexclass::proxy::method::Context& context);
void getColumnNames(libmexclass::proxy::method::Context& context);
void getColumnByIndex(libmexclass::proxy::method::Context& context);
void getColumnByName(libmexclass::proxy::method::Context& context);
void getSchema(libmexclass::proxy::method::Context& context);
Expand Down
Loading