Skip to content

Commit

Permalink
GH-37042: [MATLAB] Implement Feather V1 Writer using new MATLAB Inter…
Browse files Browse the repository at this point in the history
…face APIs (#37043)

### Rationale for this change

Now that we have the basic building blocks for tabular IO in the MATLAB Interface (`Array`, `Schema`, `RecordBatch`), we can implement a Feather V1 writer in terms of the new APIs.

This is the first in a series of pull requests in which we will work on replacing the legacy feather V1 infrastructure with a new implementation that uses the MATLAB Interface APIs. A side effect of doing this work is that we can eventually delete a lot of legacy build infrastructure and code.

### What changes are included in this PR?

1. Added a new class called `arrow.internal.io.feather.Writer` which can be used to write feather V1 files. It has one public property named `Filename` and one public method `write`. 

Below is an example of its usage:

```matlab
>> T = table([1; 2; 3], single([10; 11; 12]));

T =

  3×2 table

    Var1    Var2
    ____    ____

     1       10 
     2       11 
     3       12 

>> filename = "/tmp/table.feather";
>> writer = arrow.internal.io.feather.Writer(filename)

writer = 

  Writer with properties:

    Filename: "/tmp/table.feather"

>> writer.write(T);

```

2. Added an `unwrap` method to `proxy::RecordBatch` so that the `FeatherWriter::write` method can access the underlying `RecordBatch` from the proxy.
3.  Changed the `SetAccess` and `GetAccess` of the `Proxy` property on `arrow.tabular.RecordBatch` to `private` and `public`, respectively. 

### Are these changes tested?

Yes, added a new test file called `tRoundTrip.m` in the `matlab/test/arrow/io/feather` folder. 

### Are there any user-facing changes?

No. 

### Future Directions

1. Add a new class for reading feather V1 files (See #37041).
2. Integrate this class in the public `featherwrite` function. 
3. Once this class is integrated with `featherwrite`, we can delete the legacy build infrastructure and source code.
* Closes: #37042 

Authored-by: Sarah Gilmore <sgilmore@mathworks.com>
Signed-off-by: Kevin Gurney <kgurney@mathworks.com>
  • Loading branch information
sgilmore10 committed Aug 7, 2023
1 parent 3c00b08 commit 71329ce
Show file tree
Hide file tree
Showing 10 changed files with 247 additions and 2 deletions.
4 changes: 4 additions & 0 deletions matlab/src/cpp/arrow/matlab/error/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,8 @@ namespace arrow::matlab::error {
static const char* UNKNOWN_PROXY_FOR_ARRAY_TYPE = "arrow:array:UnknownProxyForArrayType";
static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH = "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch";
static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:recordbatch:InvalidNumericColumnIndex";
// Error ID used when an output file cannot be opened for writing
// (see FeatherWriter::write, which passes it alongside FileOutputStream::Open).
static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite";
// Error ID used when writing an arrow::Table to a Feather file fails.
static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable";
// Error ID used when an arrow::Table cannot be built from record batches
// (see FeatherWriter::write, which passes it alongside Table::FromRecordBatches).
static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch";

}
90 changes: 90 additions & 0 deletions matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/matlab/io/feather/proxy/feather_writer.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/error/error.h"

#include "arrow/result.h"
#include "arrow/table.h"
#include "arrow/util/utf8.h"

#include "arrow/io/file.h"
#include "arrow/ipc/feather.h"

#include "libmexclass/proxy/ProxyManager.h"

namespace arrow::matlab::io::feather::proxy {

// Builds a writer bound to `filename` (stored as a UTF-8 std::string) and
// registers the two proxy methods, getFilename and write, via REGISTER_METHOD
// (presumably making them callable by name from MATLAB -- confirm against libmexclass).
FeatherWriter::FeatherWriter(const std::string& filename)
    : filename(filename) {
    REGISTER_METHOD(FeatherWriter, getFilename);
    REGISTER_METHOD(FeatherWriter, write);
}

// Factory used to create a FeatherWriter proxy.
//
// Reads the "Filename" field from the first element of the struct passed as
// the first constructor argument, converts it from UTF-16 to UTF-8, and
// returns a FeatherWriter bound to that path. If the conversion fails, the
// MATLAB_ASSIGN_OR_ERROR macro handles it using UNICODE_CONVERSION_ERROR_ID.
libmexclass::proxy::MakeResult FeatherWriter::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
    namespace mda = ::matlab::data;

    mda::StructArray args = constructor_arguments[0];
    const mda::StringArray filename_array = args[0]["Filename"];
    const std::u16string utf16_filename(filename_array[0]);

    MATLAB_ASSIGN_OR_ERROR(const auto utf8_filename,
                           arrow::util::UTF16StringToUTF8(utf16_filename),
                           error::UNICODE_CONVERSION_ERROR_ID);

    return std::make_shared<FeatherWriter>(utf8_filename);
}

// Proxy method: places the stored filename into context.outputs[0] as a
// scalar created from its UTF-16 form. A failed UTF-8 -> UTF-16 conversion is
// handled by the macro using UNICODE_CONVERSION_ERROR_ID.
void FeatherWriter::getFilename(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;

    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto filename_utf16,
                                        arrow::util::UTF8StringToUTF16(filename),
                                        context,
                                        error::UNICODE_CONVERSION_ERROR_ID);

    mda::ArrayFactory array_factory;
    context.outputs[0] = array_factory.createScalar(filename_utf16);
}

// Proxy method: writes a RecordBatch to `filename` as a Feather V1 file.
//
// Input (context.inputs[0]): a struct whose "RecordBatchProxyID" field holds
// the uint64 ID of an arrow::matlab::tabular::proxy::RecordBatch proxy.
//
// Failures are handled by the MATLAB_*_WITH_CONTEXT macros using these IDs:
//   - TABLE_FROM_RECORD_BATCH        : Table::FromRecordBatches failed
//   - FAILED_TO_OPEN_FILE_FOR_WRITE  : the output file could not be opened
//   - FEATHER_FAILED_TO_WRITE_TABLE  : writing the table or closing the file failed
void FeatherWriter::write(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;
    mda::StructArray opts = context.inputs[0];
    const mda::TypedArray<uint64_t> record_batch_proxy_id_mda = opts[0]["RecordBatchProxyID"];
    const uint64_t record_batch_proxy_id = record_batch_proxy_id_mda[0];

    // NOTE(review): unchecked downcast -- assumes the ID refers to a live
    // RecordBatch proxy; confirm how ProxyManager::getProxy reports unknown IDs.
    auto proxy = libmexclass::proxy::ProxyManager::getProxy(record_batch_proxy_id);
    auto record_batch_proxy = std::static_pointer_cast<arrow::matlab::tabular::proxy::RecordBatch>(proxy);
    auto record_batch = record_batch_proxy->unwrap();

    // The Feather writer API used below takes a Table, so wrap the single batch.
    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table,
                                        arrow::Table::FromRecordBatches({record_batch}),
                                        context,
                                        error::TABLE_FROM_RECORD_BATCH);

    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(std::shared_ptr<arrow::io::OutputStream> output_stream,
                                        arrow::io::FileOutputStream::Open(filename),
                                        context,
                                        error::FAILED_TO_OPEN_FILE_FOR_WRITE);

    // Specify the feather file format version as V1
    arrow::ipc::feather::WriteProperties write_props;
    write_props.version = arrow::ipc::feather::kFeatherV1Version;

    MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(arrow::ipc::feather::WriteTable(*table, output_stream.get(), write_props),
                                        context,
                                        error::FEATHER_FAILED_TO_WRITE_TABLE);

    // Close explicitly so that a flush/close failure is reported to MATLAB
    // instead of being silently dropped when the stream is destroyed.
    MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(output_stream->Close(),
                                        context,
                                        error::FEATHER_FAILED_TO_WRITE_TABLE);
}
}
41 changes: 41 additions & 0 deletions matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/status.h"

#include "libmexclass/proxy/Proxy.h"

namespace arrow::matlab::io::feather::proxy {

class FeatherWriter : public libmexclass::proxy::Proxy {
public:
FeatherWriter(const std::string& filename);

~FeatherWriter() {}

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

protected:
void getFilename(libmexclass::proxy::method::Context& context);
void write(libmexclass::proxy::method::Context& context);

private:
const std::string filename;
};
}
2 changes: 2 additions & 0 deletions matlab/src/cpp/arrow/matlab/proxy/factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "arrow/matlab/type/proxy/string_type.h"
#include "arrow/matlab/type/proxy/timestamp_type.h"
#include "arrow/matlab/type/proxy/field.h"
#include "arrow/matlab/io/feather/proxy/feather_writer.h"

#include "factory.h"

Expand Down Expand Up @@ -60,6 +61,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name,
REGISTER_PROXY(arrow.type.proxy.BooleanType , arrow::matlab::type::proxy::PrimitiveCType<bool>);
REGISTER_PROXY(arrow.type.proxy.StringType , arrow::matlab::type::proxy::StringType);
REGISTER_PROXY(arrow.type.proxy.TimestampType , arrow::matlab::type::proxy::TimestampType);
REGISTER_PROXY(arrow.io.feather.proxy.FeatherWriter , arrow::matlab::io::feather::proxy::FeatherWriter);

return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name};
};
Expand Down
4 changes: 4 additions & 0 deletions matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ namespace arrow::matlab::tabular::proxy {
REGISTER_METHOD(RecordBatch, getColumnByIndex);
}

std::shared_ptr<arrow::RecordBatch> RecordBatch::unwrap() {
return record_batch;
}

void RecordBatch::toString(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_string, arrow::util::UTF8StringToUTF16(record_batch->ToString()), context, error::UNICODE_CONVERSION_ERROR_ID);
Expand Down
2 changes: 2 additions & 0 deletions matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ namespace arrow::matlab::tabular::proxy {

virtual ~RecordBatch() {}

// Returns the underlying arrow::RecordBatch wrapped by this proxy.
std::shared_ptr<arrow::RecordBatch> unwrap();

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

protected:
Expand Down
48 changes: 48 additions & 0 deletions matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
%WRITER Class for writing feather V1 files.

% Licensed to the Apache Software Foundation (ASF) under one or more
% contributor license agreements. See the NOTICE file distributed with
% this work for additional information regarding copyright ownership.
% The ASF licenses this file to you under the Apache License, Version
% 2.0 (the "License"); you may not use this file except in compliance
% with the License. You may obtain a copy of the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS,
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
% implied. See the License for the specific language governing
% permissions and limitations under the License.
classdef Writer < matlab.mixin.Scalar
% Writer Writes MATLAB tables to feather V1 files.
%
%   WRITER = arrow.internal.io.feather.Writer(FILENAME) creates a writer
%   bound to FILENAME (a string scalar or character vector).
%
%   WRITER.write(T) converts T to an arrow RecordBatch via
%   arrow.recordbatch and writes it to WRITER.Filename.

    properties(Hidden, SetAccess=private, GetAccess=public)
        % Proxy created by arrow.internal.proxy.create for the C++ class
        % registered as "arrow.io.feather.proxy.FeatherWriter".
        Proxy
    end

    properties(Dependent)
        % Filename the writer was constructed with, as reported by the
        % C++ proxy.
        Filename
    end

    methods
        function obj = Writer(filename)
            arguments
                % Convert to string so that character vectors are accepted
                % and the C++ proxy, which reads Filename as a string
                % array, always receives a string scalar.
                filename(1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText}
            end

            args = struct(Filename=filename);
            proxyName = "arrow.io.feather.proxy.FeatherWriter";
            obj.Proxy = arrow.internal.proxy.create(proxyName, args);
        end

        function write(obj, T)
            % Convert T to a RecordBatch and hand the ID of its proxy to
            % the C++ writer, which looks the RecordBatch up by that ID.
            rb = arrow.recordbatch(T);
            args = struct(RecordBatchProxyID=rb.Proxy.ID);
            obj.Proxy.write(args);
        end

        function filename = get.Filename(obj)
            filename = obj.Proxy.getFilename();
        end
    end
end
2 changes: 1 addition & 1 deletion matlab/src/matlab/+arrow/+tabular/RecordBatch.m
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
ColumnNames
end

properties (Access=protected)
properties (Hidden, SetAccess=private, GetAccess=public)
Proxy
end

Expand Down
52 changes: 52 additions & 0 deletions matlab/test/arrow/io/feather/tRoundTrip.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
%TROUNDTRIP Round trip tests for feather.

% Licensed to the Apache Software Foundation (ASF) under one or more
% contributor license agreements. See the NOTICE file distributed with
% this work for additional information regarding copyright ownership.
% The ASF licenses this file to you under the Apache License, Version
% 2.0 (the "License"); you may not use this file except in compliance
% with the License. You may obtain a copy of the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS,
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
% implied. See the License for the specific language governing
% permissions and limitations under the License.
classdef tRoundTrip < matlab.unittest.TestCase
% tRoundTrip Writes a table with the new feather Writer and reads it back.

    methods(TestClassSetup)
        % Delete once arrow.internal.io.feather.Reader is submitted.
        function addFeatherFunctionsToMATLABPath(testCase)
            import matlab.unittest.fixtures.PathFixture
            % Add Feather test utilities to the MATLAB path.
            testCase.applyFixture(PathFixture('../../../util'));
            % arrow.cpp.call must be on the MATLAB path.
            testCase.assertTrue(~isempty(which('arrow.cpp.call')), ...
                '''arrow.cpp.call'' must be on the MATLAB path. Use ''addpath'' to add folders to the MATLAB path.');
        end
    end

    methods(Test)
        function Basic(testCase)
            % Round-trips a table with one double and one single variable
            % through a temporary feather file.
            import matlab.unittest.fixtures.TemporaryFolderFixture

            fixture = testCase.applyFixture(TemporaryFolderFixture);
            filename = fullfile(fixture.Folder, "temp.feather");

            DoubleVar = [10; 20; 30; 40];
            SingleVar = single([10; 15; 20; 25]);
            tWrite = table(DoubleVar, SingleVar);

            % Calls the local featherwrite helper defined at the end of
            % this file, which uses arrow.internal.io.feather.Writer.
            featherwrite(tWrite, filename);
            tRead = featherread(filename);
            % verifyEqual takes (actual, expected): the table read back is
            % the actual value, the table written is the expected value.
            testCase.verifyEqual(tRead, tWrite);
        end
    end
end

function featherwrite(T, filename)
% featherwrite Local helper that writes table T to FILENAME using
% arrow.internal.io.feather.Writer.
    featherWriter = arrow.internal.io.feather.Writer(filename);
    featherWriter.write(T);
end
4 changes: 3 additions & 1 deletion matlab/tools/cmake/BuildMatlabArrowInterface.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/string_type.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc")
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc")



set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")
Expand Down

0 comments on commit 71329ce

Please sign in to comment.