diff --git a/format/ColumnBag.fbs b/format/ColumnBag.fbs new file mode 100644 index 0000000000000..cf6ca3ac050cb --- /dev/null +++ b/format/ColumnBag.fbs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// EXPERIMENTAL: A less restrictive version of RecordBatch that can +/// reduce wire overhead for small messages. + +include "Schema.fbs"; + +namespace org.apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// Data structures to represent a bag of columns (a collection of +/// Arrow arrays of possibly differing lengths) + +/// A range of top-level field nodes, identified by their ordering in the +/// schema. The offsets are zero-indexed, but only top-level field nodes are +/// counted; it is impossible to refer to a child node anywhere in the schema. +struct FieldNodeRange { + /// The starting offset (inclusive), counted in top-level field nodes + start: long; + + /// The ending offset (exclusive), counted in top-level field nodes + end: long; +} + +/// A data header describing the shared memory layout of a "bag" of "columns". +/// It is similar to a RecordBatch but not every top-level node is required +/// to be included in the wire payload. 
+table ColumnBag { + /// If not provided, all nodes are included and this payload is identical + /// to a RecordBatch. Otherwise the reader needs to skip top-level FieldNodes + /// (and the parallel Buffers) that were not included. Ranges must be listed + /// in strictly increasing order and must not overlap. + includedNodes: [FieldNodeRange]; + + /// Nodes correspond to the pre-ordered flattened logical schema. + nodes: [FieldNode]; + + /// Buffers correspond to the pre-ordered flattened buffer tree. + /// + /// The number of buffers appended to this list depends on the schema. For + /// example, most primitive arrays will have 2 buffers, 1 for the validity + /// bitmap and 1 for the values. For struct arrays, there will only be a + /// single buffer for the validity (nulls) bitmap. + buffers: [Buffer]; + + /// Optional compression of the message body. + compression: BodyCompression; +} \ No newline at end of file diff --git a/format/Message.fbs b/format/Message.fbs index 24554abbb8358..22fc8b83b5f62 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +include "ColumnBag.fbs"; include "Schema.fbs"; include "SparseTensor.fbs"; include "Tensor.fbs"; @@ -25,23 +26,6 @@ namespace org.apache.arrow.flatbuf; /// Data structures for describing a table row batch (a collection of /// equal-length Arrow arrays) -/// Metadata about a field at some level of a nested type tree (but not -/// its children). -/// -/// For example, a List with values `[[1, 2, 3], null, [4], [5, 6], null]` -/// would have {length: 5, null_count: 2} for its List node, and {length: 6, -/// null_count: 0} for its Int16 node, as separate FieldNode structs -struct FieldNode { - /// The number of value slots in the Arrow array at this level of a nested - /// tree - length: long; - - /// The number of observed nulls. 
Fields with null_count == 0 may choose not - /// to write their physical validity bitmap out as a materialized buffer, - /// instead setting the length of the bitmap buffer to 0. - null_count: long; -} - enum CompressionType:byte { // LZ4 frame format, for portability, as provided by lz4frame.h or wrappers // thereof. Not to be confused with "raw" (also called "block") format @@ -117,40 +101,6 @@ table DictionaryBatch { isDelta: bool = false; } -/// A range of field nodes, identified by their offset in the schema. -/// The offsets are zero-indexed. -struct FieldNodeRange { - /// The starting offset (inclusive) - start: long; - - /// The ending offset (exclusive) - end: long; -} - -/// A data header describing the shared memory layout of a "bag" of "columns". -/// It is similar to a RecordBatch but not every top level FieldNode is required -/// to be included in the wire payload. -table ColumnBag { - /// If not provided, all field nodes are included and this payload is - /// identical to a RecordBatch. Otherwise the reader needs to skip - /// top level FieldNodes that were not included. - includedNodes: [FieldNodeRange]; - - /// Nodes correspond to the pre-ordered flattened logical schema - nodes: [FieldNode]; - - /// Buffers correspond to the pre-ordered flattened buffer tree - /// - /// The number of buffers appended to this list depends on the schema. For - /// example, most primitive arrays will have 2 buffers, 1 for the validity - /// bitmap and 1 for the values. 
For struct arrays, there will only be a - /// single buffer for the validity (nulls) bitmap - buffers: [Buffer]; - - /// Optional compression of the message body - compression: BodyCompression; -} - /// ---------------------------------------------------------------------- /// The root Message type diff --git a/format/Schema.fbs b/format/Schema.fbs index 3d201ebd67785..3b929cfe3759b 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -37,7 +37,7 @@ enum MetadataVersion:short { /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. V4, - /// 1.0.0 -> 6.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// @@ -45,12 +45,6 @@ enum MetadataVersion:short { /// - Union buffer layout has changed. In V5, Unions don't have a validity /// bitmap buffer. V5, - - /// >= 7.0.0 (Jan 2022). Backwards compatible with V4 and V5. - /// - /// Adds ColumnBag to wire format. It has looser restrictions than RecordBatch but is - /// otherwise similar in intent. - V6 } /// Represents Arrow Features that might not have full support @@ -507,6 +501,24 @@ struct Buffer { length: long; } +/// ---------------------------------------------------------------------- +/// FieldNode represents metadata about a field at some level of a nested +/// type tree (but not its children). +/// +/// For example, a List with values `[[1, 2, 3], null, [4], [5, 6], null]` +/// would have {length: 5, null_count: 2} for its List node, and {length: 6, +/// null_count: 0} for its Int16 node, as separate FieldNode structs +struct FieldNode { + /// The number of value slots in the Arrow array at this level of a nested + /// tree + length: long; + + /// The number of observed nulls. 
Fields with null_count == 0 may choose not + /// to write their physical validity bitmap out as a materialized buffer, + /// instead setting the length of the bitmap buffer to 0. + null_count: long; +} + /// ---------------------------------------------------------------------- /// A Schema describes the columns in a row batch