diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 4b64849bcef4d..2b3009d51eb5a 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -181,6 +181,8 @@ namespace arrow::matlab::error { static const char* UNKNOWN_PROXY_FOR_ARRAY_TYPE = "arrow:array:UnknownProxyForArrayType"; static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH = "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch"; static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:recordbatch:InvalidNumericColumnIndex"; + static const char* TABLE_NUMERIC_INDEX_WITH_EMPTY_TABLE = "arrow:tabular:table:NumericIndexWithEmptyTable"; + static const char* TABLE_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:table:InvalidNumericColumnIndex"; static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite"; static const char* FAILED_TO_OPEN_FILE_FOR_READ = "arrow:io:FailedToOpenFileForRead"; static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable"; diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 593e8ffbb6f01..4035725f2b382 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -23,6 +23,7 @@ #include "arrow/matlab/array/proxy/time64_array.h" #include "arrow/matlab/array/proxy/chunked_array.h" #include "arrow/matlab/tabular/proxy/record_batch.h" +#include "arrow/matlab/tabular/proxy/table.h" #include "arrow/matlab/tabular/proxy/schema.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/type/proxy/primitive_ctype.h" @@ -60,6 +61,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.array.proxy.Date64Array , arrow::matlab::array::proxy::NumericArray); REGISTER_PROXY(arrow.array.proxy.ChunkedArray , 
arrow::matlab::array::proxy::ChunkedArray); REGISTER_PROXY(arrow.tabular.proxy.RecordBatch , arrow::matlab::tabular::proxy::RecordBatch); + REGISTER_PROXY(arrow.tabular.proxy.Table , arrow::matlab::tabular::proxy::Table); REGISTER_PROXY(arrow.tabular.proxy.Schema , arrow::matlab::tabular::proxy::Schema); REGISTER_PROXY(arrow.type.proxy.Field , arrow::matlab::type::proxy::Field); REGISTER_PROXY(arrow.type.proxy.Float32Type , arrow::matlab::type::proxy::PrimitiveCType); diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc new file mode 100644 index 0000000000000..228e28dad9e9c --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "libmexclass/proxy/ProxyManager.h" + +#include "arrow/matlab/array/proxy/array.h" +#include "arrow/matlab/array/proxy/chunked_array.h" +#include "arrow/matlab/array/proxy/wrap.h" + +#include "arrow/matlab/error/error.h" +#include "arrow/matlab/tabular/proxy/table.h" +#include "arrow/matlab/tabular/proxy/schema.h" +#include "arrow/type.h" +#include "arrow/util/utf8.h" + +#include "libmexclass/proxy/ProxyManager.h" +#include "libmexclass/error/Error.h" + +namespace arrow::matlab::tabular::proxy { + + namespace { + libmexclass::error::Error makeEmptyTableError() { + const std::string error_msg = "Numeric indexing using the column method is not supported for tables with no columns."; + return libmexclass::error::Error{error::TABLE_NUMERIC_INDEX_WITH_EMPTY_TABLE, error_msg}; + } + + libmexclass::error::Error makeInvalidNumericIndexError(const int32_t matlab_index, const int32_t num_columns) { + std::stringstream error_message_stream; + error_message_stream << "Invalid column index: "; + error_message_stream << matlab_index; + error_message_stream << ". 
Column index must be between 1 and the number of columns ("; + error_message_stream << num_columns; + error_message_stream << ")."; + return libmexclass::error::Error{error::TABLE_INVALID_NUMERIC_COLUMN_INDEX, error_message_stream.str()}; + } + } + + Table::Table(std::shared_ptr table) : table{table} { + REGISTER_METHOD(Table, toString); + REGISTER_METHOD(Table, getNumRows); + REGISTER_METHOD(Table, getNumColumns); + REGISTER_METHOD(Table, getColumnNames); + REGISTER_METHOD(Table, getSchema); + REGISTER_METHOD(Table, getColumnByIndex); + REGISTER_METHOD(Table, getColumnByName); + } + + std::shared_ptr Table::unwrap() { + return table; + } + + void Table::toString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_string, arrow::util::UTF8StringToUTF16(table->ToString()), context, error::UNICODE_CONVERSION_ERROR_ID); + mda::ArrayFactory factory; + auto str_mda = factory.createScalar(utf16_string); + context.outputs[0] = str_mda; + } + + libmexclass::proxy::MakeResult Table::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + using ArrayProxy = arrow::matlab::array::proxy::Array; + using TableProxy = arrow::matlab::tabular::proxy::Table; + namespace mda = ::matlab::data; + mda::StructArray opts = constructor_arguments[0]; + const mda::TypedArray arrow_array_proxy_ids = opts[0]["ArrayProxyIDs"]; + const mda::StringArray column_names = opts[0]["ColumnNames"]; + + std::vector> arrow_arrays; + // Retrieve all of the Arrow Array Proxy instances from the libmexclass ProxyManager. 
+ for (const auto& arrow_array_proxy_id : arrow_array_proxy_ids) { + auto proxy = libmexclass::proxy::ProxyManager::getProxy(arrow_array_proxy_id); + auto arrow_array_proxy = std::static_pointer_cast(proxy); + auto arrow_array = arrow_array_proxy->unwrap(); + arrow_arrays.push_back(arrow_array); + } + + std::vector> fields; + for (size_t i = 0; i < arrow_arrays.size(); ++i) { + const auto type = arrow_arrays[i]->type(); + const auto column_name_utf16 = std::u16string(column_names[i]); + MATLAB_ASSIGN_OR_ERROR(const auto column_name_utf8, arrow::util::UTF16StringToUTF8(column_name_utf16), error::UNICODE_CONVERSION_ERROR_ID); + fields.push_back(std::make_shared(column_name_utf8, type)); + } + + arrow::SchemaBuilder schema_builder; + MATLAB_ERROR_IF_NOT_OK(schema_builder.AddFields(fields), error::SCHEMA_BUILDER_ADD_FIELDS_ERROR_ID); + MATLAB_ASSIGN_OR_ERROR(const auto schema, schema_builder.Finish(), error::SCHEMA_BUILDER_FINISH_ERROR_ID); + const auto num_rows = arrow_arrays.size() == 0 ? 
0 : arrow_arrays[0]->length(); + const auto table = arrow::Table::Make(schema, arrow_arrays, num_rows); + auto table_proxy = std::make_shared(table); + + return table_proxy; + } + + void Table::getNumRows(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + const auto num_rows = table->num_rows(); + auto num_rows_mda = factory.createScalar(num_rows); + context.outputs[0] = num_rows_mda; + } + + void Table::getNumColumns(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + const auto num_columns = table->num_columns(); + auto num_columns_mda = factory.createScalar(num_columns); + context.outputs[0] = num_columns_mda; + } + + void Table::getColumnNames(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + const int num_columns = table->num_columns(); + + std::vector column_names; + const auto schema = table->schema(); + const auto field_names = schema->field_names(); + for (int i = 0; i < num_columns; ++i) { + const auto column_name_utf8 = field_names[i]; + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto column_name_utf16, arrow::util::UTF8StringToUTF16(column_name_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); + const mda::MATLABString matlab_string = mda::MATLABString(std::move(column_name_utf16)); + column_names.push_back(matlab_string); + } + auto column_names_mda = factory.createArray({size_t{1}, static_cast(num_columns)}, column_names.begin(), column_names.end()); + context.outputs[0] = column_names_mda; + } + + void Table::getSchema(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + using SchemaProxy = arrow::matlab::tabular::proxy::Schema; + mda::ArrayFactory factory; + + const auto schema = table->schema(); + const auto schema_proxy = std::make_shared(std::move(schema)); + const auto schema_proxy_id = 
ProxyManager::manageProxy(schema_proxy); + const auto schema_proxy_id_mda = factory.createScalar(schema_proxy_id); + + context.outputs[0] = schema_proxy_id_mda; + } + + void Table::getColumnByIndex(libmexclass::proxy::method::Context& context) { + using ChunkedArrayProxy = arrow::matlab::array::proxy::ChunkedArray; + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::TypedArray index_mda = args[0]["Index"]; + const auto matlab_index = int32_t(index_mda[0]); + + // Note: MATLAB uses 1-based indexing, so subtract 1. + // arrow::Schema::field does not do any bounds checking. + const int32_t index = matlab_index - 1; + const auto num_columns = table->num_columns(); + + if (num_columns == 0) { + context.error = makeEmptyTableError(); + return; + } + + if (matlab_index < 1 || matlab_index > num_columns) { + context.error = makeInvalidNumericIndexError(matlab_index, num_columns); + return; + } + + const auto chunked_array = table->column(index); + const auto chunked_array_proxy = std::make_shared(chunked_array); + + const auto chunked_array_proxy_id = ProxyManager::manageProxy(chunked_array_proxy); + const auto chunked_array_proxy_id_mda = factory.createScalar(chunked_array_proxy_id); + + context.outputs[0] = chunked_array_proxy_id_mda; + } + + void Table::getColumnByName(libmexclass::proxy::method::Context& context) { + using ChunkedArrayProxy = arrow::matlab::array::proxy::ChunkedArray; + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::StringArray name_mda = args[0]["Name"]; + const auto name_utf16 = std::u16string(name_mda[0]); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto name, arrow::util::UTF16StringToUTF8(name_utf16), context, error::UNICODE_CONVERSION_ERROR_ID); + + const std::vector names = {name}; + const auto& schema = table->schema(); + 
MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(schema->CanReferenceFieldsByNames(names), context, error::ARROW_TABULAR_SCHEMA_AMBIGUOUS_FIELD_NAME); + + const auto chunked_array = table->GetColumnByName(name); + const auto chunked_array_proxy = std::make_shared(chunked_array); + + const auto chunked_array_proxy_id = ProxyManager::manageProxy(chunked_array_proxy); + const auto chunked_array_proxy_id_mda = factory.createScalar(chunked_array_proxy_id); + + context.outputs[0] = chunked_array_proxy_id_mda; + } + +} diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h new file mode 100644 index 0000000000000..dae86a180b7a6 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/table.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::tabular::proxy { + + class Table : public libmexclass::proxy::Proxy { + public: + Table(std::shared_ptr table); + + virtual ~Table() {} + + std::shared_ptr unwrap(); + + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void toString(libmexclass::proxy::method::Context& context); + void getNumRows(libmexclass::proxy::method::Context& context); + void getNumColumns(libmexclass::proxy::method::Context& context); + void getColumnNames(libmexclass::proxy::method::Context& context); + void getSchema(libmexclass::proxy::method::Context& context); + void getColumnByIndex(libmexclass::proxy::method::Context& context); + void getColumnByName(libmexclass::proxy::method::Context& context); + + std::shared_ptr table; + }; + +} diff --git a/matlab/src/matlab/+arrow/+tabular/Table.m b/matlab/src/matlab/+arrow/+tabular/Table.m new file mode 100644 index 0000000000000..d9eb4d8409733 --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/Table.m @@ -0,0 +1,145 @@ +%TABLE A tabular data structure representing a set of +% arrow.array.ChunkedArray objects with a fixed schema. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. 
See the License for the specific language governing
+% permissions and limitations under the License.
+
+classdef Table < matlab.mixin.CustomDisplay & matlab.mixin.Scalar
+
+    % Read-only properties computed on demand by querying the Proxy.
+    properties (Dependent, SetAccess=private, GetAccess=public)
+        NumRows
+        NumColumns
+        ColumnNames
+        Schema
+    end
+
+    % Handle to the "arrow.tabular.proxy.Table" proxy backing this object.
+    properties (Hidden, SetAccess=private, GetAccess=public)
+        Proxy
+    end
+
+    methods
+
+        function obj = Table(proxy)
+            % Construct a Table around an existing
+            % "arrow.tabular.proxy.Table" proxy.
+            arguments
+                proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.tabular.proxy.Table")}
+            end
+            import arrow.internal.proxy.validate
+            obj.Proxy = proxy;
+        end
+
+        function n = get.NumColumns(obj)
+            % Number of columns, as reported by the proxy.
+            n = obj.Proxy.getNumColumns();
+        end
+
+        function n = get.NumRows(obj)
+            % Number of rows, as reported by the proxy.
+            n = obj.Proxy.getNumRows();
+        end
+
+        function names = get.ColumnNames(obj)
+            % Column names, as reported by the proxy.
+            names = obj.Proxy.getColumnNames();
+        end
+
+        function schema = get.Schema(obj)
+            % Wrap the Schema proxy ID returned by the proxy in an
+            % arrow.tabular.Schema object.
+            schemaProxy = libmexclass.proxy.Proxy(Name="arrow.tabular.proxy.Schema", ID=obj.Proxy.getSchema());
+            schema = arrow.tabular.Schema(schemaProxy);
+        end
+
+        function chunkedArray = column(obj, idx)
+            % Return the column selected by IDX as an
+            % arrow.array.ChunkedArray. IDX is validated by
+            % index.numericOrString; a numeric result is looked up by
+            % index and anything else is looked up by name.
+            import arrow.internal.validate.*
+
+            idx = index.numericOrString(idx, "int32", AllowNonScalar=false);
+
+            if isnumeric(idx)
+                proxyID = obj.Proxy.getColumnByIndex(struct(Index=idx));
+            else
+                proxyID = obj.Proxy.getColumnByName(struct(Name=idx));
+            end
+
+            columnProxy = libmexclass.proxy.Proxy(Name="arrow.array.proxy.ChunkedArray", ID=proxyID);
+            chunkedArray = arrow.array.ChunkedArray(columnProxy);
+        end
+
+        function T = table(obj)
+            % Convert to a MATLAB table: each column is converted with
+            % toMATLAB, and the variable and dimension names are derived
+            % from ColumnNames via makeValidVariableNames and
+            % makeValidDimensionNames.
+            import arrow.tabular.internal.*
+
+            columnCount = obj.NumColumns;
+            columnData = cell(1, columnCount);
+
+            for k = 1:columnCount
+                columnData{k} = toMATLAB(obj.column(k));
+            end
+
+            variableNames = makeValidVariableNames(obj.ColumnNames);
+            dimensionNames = makeValidDimensionNames(variableNames);
+
+            T = table(columnData{:}, ...
+                VariableNames=variableNames, ...
+                DimensionNames=dimensionNames);
+        end
+
+        function T = toMATLAB(obj)
+            % Same conversion as the table method.
+            T = obj.table();
+        end
+
+    end
+
+    methods (Access = private)
+
+        function str = toString(obj)
+            % Textual representation produced by the proxy.
+            str = obj.Proxy.toString();
+        end
+
+    end
+
+    methods (Access=protected)
+
+        function displayScalarObject(obj)
+            % Custom display: print the proxy's textual representation.
+            disp(obj.toString());
+        end
+
+    end
+
+    methods (Static, Access=public)
+
+        function arrowTable = fromArrays(arrowArrays, opts)
+            % Create an arrow.tabular.Table from a comma-separated list of
+            % arrow.array.Array objects. ColumnNames defaults to
+            % "Column1", ..., "ColumnN".
+            arguments(Repeating)
+                arrowArrays(1, 1) arrow.array.Array
+            end
+            arguments
+                opts.ColumnNames(1, :) string {mustBeNonmissing} = compose("Column%d", 1:numel(arrowArrays))
+            end
+
+            import arrow.tabular.internal.validateArrayLengths
+            import arrow.tabular.internal.validateColumnNames
+            import arrow.array.internal.getArrayProxyIDs
+
+            % Validate before any proxy is created.
+            validateArrayLengths(arrowArrays);
+            validateColumnNames(opts.ColumnNames, numel(arrowArrays));
+
+            proxyIDs = getArrayProxyIDs(arrowArrays);
+            args = struct(ArrayProxyIDs=proxyIDs, ColumnNames=opts.ColumnNames);
+            proxy = arrow.internal.proxy.create("arrow.tabular.proxy.Table", args);
+            arrowTable = arrow.tabular.Table(proxy);
+        end
+
+    end
+
+end
diff --git a/matlab/src/matlab/+arrow/table.m b/matlab/src/matlab/+arrow/table.m
new file mode 100644
index 0000000000000..1f54481433b3b
--- /dev/null
+++ b/matlab/src/matlab/+arrow/table.m
@@ -0,0 +1,33 @@
+%TABLE Creates an arrow.tabular.Table from a MATLAB table.
+
+% Licensed to the Apache Software Foundation (ASF) under one or more
+% contributor license agreements. See the NOTICE file distributed with
+% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version
+% 2.0 (the "License"); you may not use this file except in compliance
+% with the License.
You may obtain a copy of the License at
+%
+% http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+% implied. See the License for the specific language governing
+% permissions and limitations under the License.
+function arrowTable = table(matlabTable)
+    arguments
+        % Use istable instead of the table type specifier here to avoid
+        % ambiguous name parsing issue with MATLAB table type and arrow.table.
+        % NOTE(review): argument validation functions are expected to
+        % throw on failure, whereas istable returns a logical - confirm
+        % that non-table input is actually rejected (here or downstream).
+        matlabTable {istable} = table.empty(0, 0)
+    end
+
+    % Split the MATLAB table into one Arrow array per variable and collect
+    % the IDs of their proxies.
+    columnArrays = arrow.tabular.internal.decompose(matlabTable);
+    proxyIDs = arrow.array.internal.getArrayProxyIDs(columnArrays);
+
+    % The MATLAB variable names become the Arrow column names.
+    names = string(matlabTable.Properties.VariableNames);
+
+    args = struct(ArrayProxyIDs=proxyIDs, ColumnNames=names);
+    tableProxy = arrow.internal.proxy.create("arrow.tabular.proxy.Table", args);
+
+    arrowTable = arrow.tabular.Table(tableProxy);
+end
diff --git a/matlab/test/arrow/tabular/tTable.m b/matlab/test/arrow/tabular/tTable.m
new file mode 100644
index 0000000000000..8c6b9aae73752
--- /dev/null
+++ b/matlab/test/arrow/tabular/tTable.m
@@ -0,0 +1,622 @@
+% Tests for the arrow.tabular.Table class and the associated arrow.table
+% construction function.
+
+% Licensed to the Apache Software Foundation (ASF) under one or more
+% contributor license agreements. See the NOTICE file distributed with
+% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version
+% 2.0 (the "License"); you may not use this file except in compliance
+% with the License.
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tTable < matlab.unittest.TestCase + + methods(Test) + + function Basic(testCase) + % Verify that an arrow.tabular.Table can be created + % from a MATLAB table using the arrow.table construction + % function. + matlabTable = table(... + [1, 2, 3]', ... + ["A", "B", "C"]', ... + [true, false, true]' ... + ); + arrowTable = arrow.table(matlabTable); + testCase.verifyInstanceOf(arrowTable, "arrow.tabular.Table"); + end + + function SupportedTypes(testCase) + % Verify that a MATLAB table containing all types + % supported for conversion to Arrow Arrays can be round-tripped + % from an arrow.tabular.Table to a MATLAB table and back. + import arrow.internal.test.tabular.createTableWithSupportedTypes + import arrow.type.traits.traits + + matlabTable = createTableWithSupportedTypes(); + arrowTable = arrow.table(matlabTable); + expectedColumnNames = string(matlabTable.Properties.VariableNames); + + % For each variable in the input MATLAB table, look up the + % corresponding Arrow Type of the corresponding ChunkedArray using type traits. + expectedChunkedArrayTypes = varfun(@(var) traits(string(class(var))).TypeClassName, ... + matlabTable, OutputFormat="uniform"); + testCase.verifyTable(arrowTable, expectedColumnNames, expectedChunkedArrayTypes, matlabTable); + end + + function ToMATLAB(testCase) + % Verify that the toMATLAB method converts + % an arrow.tabular.Table to a MATLAB table as expected. 
+            expectedMatlabTable = table([1, 2, 3]');
+            arrowTable = arrow.table(expectedMatlabTable);
+            actualMatlabTable = arrowTable.toMATLAB();
+            testCase.verifyEqual(actualMatlabTable, expectedMatlabTable);
+        end
+
+        function Table(testCase)
+            % Verify that the table method converts
+            % an arrow.tabular.Table to a MATLAB table as expected.
+            % The object under test must be an arrow.tabular.Table
+            % (created with arrow.table), not an arrow RecordBatch.
+            TOriginal = table([1, 2, 3]');
+            arrowTable = arrow.table(TOriginal);
+            TConverted = table(arrowTable);
+            testCase.verifyEqual(TOriginal, TConverted);
+        end
+
+        function NumRows(testCase)
+            % Verify that the NumRows property of arrow.tabular.Table
+            % returns the expected number of rows.
+            numRows = int64([1, 5, 100]);
+
+            for expectedNumRows = numRows
+                matlabTable = array2table(ones(expectedNumRows, 1));
+                arrowTable = arrow.table(matlabTable);
+                testCase.verifyEqual(arrowTable.NumRows, expectedNumRows);
+            end
+
+        end
+
+        function NumColumns(testCase)
+            % Verify that the NumColumns property of arrow.tabular.Table
+            % returns the expected number of columns.
+            numColumns = int32([1, 5, 100]);
+
+            for expectedNumColumns = numColumns
+                matlabTable = array2table(ones(1, expectedNumColumns));
+                arrowTable = arrow.table(matlabTable);
+                testCase.verifyEqual(arrowTable.NumColumns, expectedNumColumns);
+            end
+
+        end
+
+        function ColumnNames(testCase)
+            % Verify that the ColumnNames property of arrow.tabular.Table
+            % returns the expected string array of column names.
+            columnNames = ["A", "B", "C"];
+            matlabTable = table(1, 2, 3, VariableNames=columnNames);
+            arrowTable = arrow.table(matlabTable);
+            testCase.verifyEqual(arrowTable.ColumnNames, columnNames);
+        end
+
+        function UnicodeColumnNames(testCase)
+            % Verify that an arrow.tabular.Table can be created from
+            % a MATLAB table with Unicode VariableNames.
+ smiley = "😀"; + tree = "🌲"; + mango = "🥭"; + columnNames = [smiley, tree, mango]; + matlabTable = table(1, 2, 3, VariableNames=columnNames); + arrowTable = arrow.table(matlabTable); + testCase.verifyEqual(arrowTable.ColumnNames, columnNames); + end + + function EmptyTable(testCase) + % Verify that an arrow.tabular.Table can be created from an + % empty MATLAB table. + matlabTable = table.empty(0, 0); + arrowTable = arrow.table(matlabTable); + testCase.verifyEqual(arrowTable.NumRows, int64(0)); + testCase.verifyEqual(arrowTable.NumColumns, int32(0)); + testCase.verifyEqual(arrowTable.ColumnNames, string.empty(1, 0)); + testCase.verifyEqual(toMATLAB(arrowTable), matlabTable); + + matlabTable = table.empty(1, 0); + arrowTable = arrow.table(matlabTable); + testCase.verifyEqual(arrowTable.NumRows, int64(0)); + testCase.verifyEqual(arrowTable.NumColumns, int32(0)); + testCase.verifyEqual(arrowTable.ColumnNames, string.empty(1, 0)); + + matlabTable = table.empty(0, 1); + arrowTable = arrow.table(matlabTable); + testCase.verifyEqual(arrowTable.NumRows, int64(0)); + testCase.verifyEqual(arrowTable.NumColumns, int32(1)); + testCase.verifyEqual(arrowTable.ColumnNames, "Var1"); + end + + function EmptyTableColumnIndexError(tc) + % Verify that an "arrow:tabular:table:NumericIndexWithEmptyTable" error + % is thrown when calling the column method on an empty Table. + matlabTable = table(); + arrowTable = arrow.table(matlabTable); + fcn = @() arrowTable.column(1); + tc.verifyError(fcn, "arrow:tabular:table:NumericIndexWithEmptyTable"); + end + + function InvalidNumericIndexError(tc) + % Verify that an "arrow:tabular:table:InvalidNumericColumnIndex" error + % is thrown when providing an index to the column + % method that is outside the range of valid column indices + % (e.g. greater than the number of columns). 
+ matlabTable = table(1, 2, 3); + arrowTable = arrow.table(matlabTable); + fcn = @() arrowTable.column(4); + tc.verifyError(fcn, "arrow:tabular:table:InvalidNumericColumnIndex"); + end + + function UnsupportedColumnIndexType(tc) + % Verify that an "arrow:badsubscript:UnsupportedIndexType" error + % is thrown when providing an index to the column + % method that is not a positive scalar integer. + matlabTable = table(1, 2, 3); + arrowTable = arrow.table(matlabTable); + fcn = @() arrowTable.column(datetime(2022, 1, 3)); + tc.verifyError(fcn, "arrow:badsubscript:UnsupportedIndexType"); + end + + function ErrorIfIndexIsNonScalar(tc) + % Verify that an "arrow:badsubscript:NonScalar" error + % is thrown when providing a non-scalar index to the column + % method. + matlabtable = table(1, 2, 3); + arrowTable = arrow.table(matlabtable); + fcn = @() arrowTable.column([1 2]); + tc.verifyError(fcn, "arrow:badsubscript:NonScalar"); + end + + function ErrorIfIndexIsNonPositive(tc) + % Verify that an "arrow:badsubscript:NonPositive" error + % is thrown when providing a non-positive index to the column + % method. + matlabTable = table(1, 2, 3); + arrowTable = arrow.table(matlabTable); + fcn = @() arrowTable.column(-1); + tc.verifyError(fcn, "arrow:badsubscript:NonPositive"); + end + + function GetColumnByName(testCase) + % Verify that columns can be accessed by name. + matlabArray1 = [1; 2; 3]; + matlabArray2 = ["A"; "B"; "C"]; + matlabArray3 = [true; false; true]; + + arrowArray1 = arrow.array(matlabArray1); + arrowArray2 = arrow.array(matlabArray2); + arrowArray3 = arrow.array(matlabArray3); + + arrowTable = arrow.tabular.Table.fromArrays(... + arrowArray1, ... + arrowArray2, ... + arrowArray3, ... + ColumnNames=["A", "B", "C"] ... + ); + + column = arrowTable.column("A"); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.float64(); + testCase.verifyChunkedArray(column, ... + matlabArray1, ... + expectedNumChunks, ... + expectedLength, ... 
+ expectedArrowType); + + column = arrowTable.column("B"); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.string(); + testCase.verifyChunkedArray(column, ... + matlabArray2, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + + column = arrowTable.column("C"); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.boolean(); + testCase.verifyChunkedArray(column, ... + matlabArray3, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + end + + function GetColumnByNameWithEmptyString(testCase) + % Verify that a column whose name is the empty string ("") + % can be accessed using the column() method. + matlabArray1 = [1; 2; 3]; + matlabArray2 = ["A"; "B"; "C"]; + matlabArray3 = [true; false; true]; + + arrowArray1 = arrow.array(matlabArray1); + arrowArray2 = arrow.array(matlabArray2); + arrowArray3 = arrow.array(matlabArray3); + + arrowTable = arrow.tabular.Table.fromArrays(... + arrowArray1, ... + arrowArray2, ... + arrowArray3, ... + ColumnNames=["A", "", "C"] ... + ); + + column = arrowTable.column(""); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.string(); + testCase.verifyChunkedArray(column, ... + matlabArray2, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + end + + function GetColumnByNameWithWhitespace(testCase) + % Verify that a column whose name contains only whitespace + % characters can be accessed using the column() method. + matlabArray1 = [1; 2; 3]; + matlabArray2 = ["A"; "B"; "C"]; + matlabArray3 = [true; false; true]; + + arrowArray1 = arrow.array(matlabArray1); + arrowArray2 = arrow.array(matlabArray2); + arrowArray3 = arrow.array(matlabArray3); + + arrowTable = arrow.tabular.Table.fromArrays(... + arrowArray1, ... + arrowArray2, ... + arrowArray3, ... + ColumnNames=[" ", " ", " "] ... 
+ ); + + column = arrowTable.column(" "); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.float64(); + testCase.verifyChunkedArray(column, ... + matlabArray1, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + + column = arrowTable.column(" "); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.string(); + testCase.verifyChunkedArray(column, ... + matlabArray2, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + + column = arrowTable.column(" "); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.boolean(); + testCase.verifyChunkedArray(column, ... + matlabArray3, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + end + + function ErrorIfColumnNameDoesNotExist(testCase) + % Verify that an error is thrown when trying to access a column + % with a name that is not part of the Schema of the Table. + matlabArray1 = [1; 2; 3]; + matlabArray2 = ["A"; "B"; "C"]; + matlabArray3 = [true; false; true]; + + arrowArray1 = arrow.array(matlabArray1); + arrowArray2 = arrow.array(matlabArray2); + arrowArray3 = arrow.array(matlabArray3); + + arrowTable = arrow.tabular.Table.fromArrays(... + arrowArray1, ... + arrowArray2, ... + arrowArray3, ... + ColumnNames=["A", "B", "C"] ... + ); + + % Matching should be case sensitive. 
+ name = "a"; + testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); + + name = "aA"; + testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); + + name = "D"; + testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); + + name = ""; + testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); + + name = " "; + testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); + end + + function ErrorIfAmbiguousColumnName(testCase) + % Verify that an error is thrown when trying to access a column + % with a name that is ambiguous / occurs more than once in the + % Schema of the Table. + arrowTable = arrow.tabular.Table.fromArrays(... + arrow.array([1, 2, 3]), ... + arrow.array(["A", "B", "C"]), ... + arrow.array([true, false, true]), ... + arrow.array([days(1), days(2), days(3)]), ... + ColumnNames=["A", "A", "B", "B"] ... + ); + + name = "A"; + testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); + + name = "B"; + testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); + end + + function GetColumnByNameWithChar(testCase) + % Verify that the column method works when supplied a char + % vector as input. + matlabArray1 = [1; 2; 3]; + matlabArray2 = ["A"; "B"; "C"]; + matlabArray3 = [true; false; true]; + + arrowArray1 = arrow.array(matlabArray1); + arrowArray2 = arrow.array(matlabArray2); + arrowArray3 = arrow.array(matlabArray3); + + arrowTable = arrow.tabular.Table.fromArrays(... + arrowArray1, ... + arrowArray2, ... + arrowArray3, ... + ColumnNames=["", "B", "123"] ... + ); + + % Should match the first column whose name is the + % empty string (""). 
+ name = char.empty(0, 0); + column = arrowTable.column(name); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.float64(); + testCase.verifyChunkedArray(column, ... + matlabArray1, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + + name = char.empty(0, 1); + column = arrowTable.column(name); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.float64(); + testCase.verifyChunkedArray(column, ... + matlabArray1, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + + name = char.empty(1, 0); + column = arrowTable.column(name); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.float64(); + testCase.verifyChunkedArray(column, ... + matlabArray1, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + + % Should match the second column whose name is "B". + name = 'B'; + column = arrowTable.column(name); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.string(); + testCase.verifyChunkedArray(column, ... + matlabArray2, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + + % Should match the third column whose name is "123". + name = '123'; + column = arrowTable.column(name); + expectedNumChunks = int32(1); + expectedLength = int64(3); + expectedArrowType = arrow.boolean(); + testCase.verifyChunkedArray(column, ... + matlabArray3, ... + expectedNumChunks, ... + expectedLength, ... + expectedArrowType); + end + + function ErrorIfColumnNameIsNonScalar(testCase) + % Verify that an error is thrown if a nonscalar string array is + % specified as a column name to the column method. + arrowTable = arrow.tabular.Table.fromArrays(... + arrow.array([1, 2, 3]), ... + arrow.array(["A", "B", "C"]), ... + arrow.array([true, false, true]), ... + ColumnNames=["A", "B", "C"] ... 
+ ); + + name = ["A", "B", "C"]; + testCase.verifyError(@() arrowTable.column(name), "arrow:badsubscript:NonScalar"); + + name = ["A"; "B"; "C"]; + testCase.verifyError(@() arrowTable.column(name), "arrow:badsubscript:NonScalar"); + end + + function FromArraysWithNoColumnNames(testCase) + % Verify arrow.tabular.Table.fromArrays creates the expected + % Table when given a comma-separated list of arrow.array.Array values. + import arrow.tabular.Table + import arrow.internal.test.tabular.createAllSupportedArrayTypes + + [arrowArrays, matlabData] = createAllSupportedArrayTypes(); + matlabTable = table(matlabData{:}); + + arrowTable = Table.fromArrays(arrowArrays{:}); + expectedColumnNames = compose("Column%d", 1:width(matlabTable)); + testCase.verifyEqual(arrowTable.ColumnNames, expectedColumnNames) + end + + function FromArraysWithColumnNames(testCase) + % Verify arrow.tabular.Table.fromArrays creates the expected + % Table when given a comma-separated list of arrow.array.Array values + % and when the ColumnNames nv-pair is provided. + import arrow.tabular.Table + import arrow.internal.test.tabular.createAllSupportedArrayTypes + + [arrowArrays, ~] = createAllSupportedArrayTypes(); + + expectedColumnNames = compose("MyVar%d", 1:numel(arrowArrays)); + arrowTable = Table.fromArrays(arrowArrays{:}, ColumnNames=expectedColumnNames); + testCase.verifyEqual(arrowTable.ColumnNames, expectedColumnNames) + end + + function FromArraysUnequalArrayLengthsError(testCase) + % Verify arrow.tabular.Table.fromArrays throws an error whose + % identifier is "arrow:tabular:UnequalArrayLengths" if the arrays + % provided don't all have the same length. 
+ import arrow.tabular.Table + + A1 = arrow.array([1, 2]); + A2 = arrow.array(["A", "B", "C"]); + fcn = @() Table.fromArrays(A1, A2); + testCase.verifyError(fcn, "arrow:tabular:UnequalArrayLengths"); + end + + function FromArraysWrongNumberColumnNamesError(testCase) + % Verify arrow.tabular.Table.fromArrays throws an error whose + % identifier is "arrow:tabular:WrongNumberColumnNames" if the + % ColumnNames provided doesn't have one element per array. + import arrow.tabular.Table + + A1 = arrow.array([1, 2]); + A2 = arrow.array(["A", "B"]); + fcn = @() Table.fromArrays(A1, A2, ColumnNames=["A", "B", "C"]); + testCase.verifyError(fcn, "arrow:tabular:WrongNumberColumnNames"); + end + + function FromArraysColumnNamesHasMissingString(testCase) + % Verify arrow.tabular.Table.fromArrays throws an error whose + % identifier is "MATLAB:validators:mustBeNonmissing" if the + % ColumnNames provided has a missing string value. + import arrow.tabular.Table + + A1 = arrow.array([1, 2]); + A2 = arrow.array(["A", "B"]); + fcn = @() Table.fromArrays(A1, A2, ColumnNames=["A", missing]); + testCase.verifyError(fcn, "MATLAB:validators:mustBeNonmissing"); + end + + function FromArraysNoInputs(testCase) + % Verify that an empty Table is returned when calling fromArrays + % with no input arguments. + arrowTable = arrow.tabular.Table.fromArrays(); + testCase.verifyEqual(arrowTable.NumRows, int64(0)); + testCase.verifyEqual(arrowTable.NumColumns, int32(0)); + testCase.verifyEqual(arrowTable.ColumnNames, string.empty(1, 0)); + end + + function ConstructionFunctionNoInputs(testCase) + % Verify that an empty Table is returned when calling + % the arrow.table construction function with no inputs. 
+ arrowTable = arrow.table(); + testCase.verifyEqual(arrowTable.NumRows, int64(0)); + testCase.verifyEqual(arrowTable.NumColumns, int32(0)); + testCase.verifyEqual(arrowTable.ColumnNames, string.empty(1, 0)); + end + + function Schema(testCase) + % Verify that the public Schema property returns an appropriate + % instance of arrow.tabular.Schema. + matlabTable = table(... + ["A"; "B"; "C"], ... + [1; 2; 3], ... + [true; false; true], ... + VariableNames=["A", "B", "C"] ... + ); + arrowTable = arrow.table(matlabTable); + schema = arrowTable.Schema; + testCase.verifyEqual(schema.NumFields, int32(3)); + testCase.verifyEqual(schema.field(1).Type.ID, arrow.type.ID.String); + testCase.verifyEqual(schema.field(1).Name, "A"); + testCase.verifyEqual(schema.field(2).Type.ID, arrow.type.ID.Float64); + testCase.verifyEqual(schema.field(2).Name, "B"); + testCase.verifyEqual(schema.field(3).Type.ID, arrow.type.ID.Boolean); + testCase.verifyEqual(schema.field(3).Name, "C"); + end + + function NoColumnsNoSetter(testCase) + % Verify that trying to set the value of the public NumColumns property + % results in an error of type "MATLAB:class:SetProhibited". + matlabTable = table([1; 2; 3]); + arrowTable = arrow.table(matlabTable); + testCase.verifyError(@() setfield(arrowTable, "NumColumns", int32(100)), ... + "MATLAB:class:SetProhibited"); + end + + function SchemaNoSetter(testCase) + % Verify that trying to set the value of the public Schema property + % results in an error of type "MATLAB:class:SetProhibited". + matlabTable = table([1; 2; 3]); + arrowTable = arrow.table(matlabTable); + testCase.verifyError(@() setfield(arrowTable, "Schema", "Value"), ... + "MATLAB:class:SetProhibited"); + end + + function ColumnNamesNoSetter(testCase) + % Verify that trying to set the value of the public ColumnNames property + % results in an error of type "MATLAB:class:SetProhibited". 
+ matlabTable = table([1; 2; 3]); + arrowTable = arrow.table(matlabTable); + testCase.verifyError(@() setfield(arrowTable, "ColumnNames", "Value"), ... + "MATLAB:class:SetProhibited"); + end + + end + + methods + + function verifyTable(testCase, arrowTable, expectedColumnNames, expectedArrayClasses, expectedMatlabTable) + testCase.verifyEqual(arrowTable.NumColumns, int32(width(expectedMatlabTable))); + testCase.verifyEqual(arrowTable.ColumnNames, expectedColumnNames); + matlabTable = table(arrowTable); + testCase.verifyEqual(matlabTable, expectedMatlabTable); + for ii = 1:arrowTable.NumColumns + column = arrowTable.column(ii); + testCase.verifyEqual(column.toMATLAB(), expectedMatlabTable{:, ii}); + testCase.verifyInstanceOf(column.Type, expectedArrayClasses(ii)); + end + end + + function verifyChunkedArray(testCase, chunkedArray, expectedMatlabData, expectedNumChunks, expectedLength, expectedArrowType) + testCase.verifyInstanceOf(chunkedArray, "arrow.array.ChunkedArray"); + testCase.verifyEqual(toMATLAB(chunkedArray), expectedMatlabData); + testCase.verifyEqual(chunkedArray.NumChunks, expectedNumChunks) + testCase.verifyEqual(chunkedArray.Length, expectedLength); + testCase.verifyEqual(chunkedArray.Type, expectedArrowType); + end + + end + +end diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 2d95682bc2081..a5c0b079b34a6 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -50,6 +50,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/chunked_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/wrap.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/table.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/schema.cc" 
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/pack.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/unpack.cc"