Skip to content

Commit

Permalink
David Li's feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
nbauernfeind committed Nov 9, 2021
1 parent e361191 commit 9366a04
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 58 deletions.
63 changes: 63 additions & 0 deletions format/ColumnBag.fbs
@@ -0,0 +1,63 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

/// EXPERIMENTAL: ColumnBag is a less restrictive version of RecordBatch that
/// can reduce wire overhead for small messages.

include "Schema.fbs";

namespace org.apache.arrow.flatbuf;

/// ----------------------------------------------------------------------
/// Data structures to represent a bag of columns (a collection of
/// possibly differing length Arrow arrays)

/// A half-open range [start, end) of top level field nodes, identified by
/// their ordering in the schema. The offsets are zero-indexed, but only top
/// level field nodes are counted; it is impossible to refer to a child node
/// anywhere in the schema.
// NOTE(review): presumably start <= end is expected; nothing in this
// declaration enforces it, so readers should validate -- confirm.
struct FieldNodeRange {
/// The starting offset (inclusive): the index of the first top level field
/// node covered by this range.
start: long;

/// The ending offset (exclusive): one past the index of the last top level
/// field node covered by this range.
end: long;
}

/// A data header describing the shared memory layout of a "bag" of "columns".
/// It is similar to a RecordBatch, but not every top level node is required
/// to be included in the wire payload.
table ColumnBag {
/// The ranges of top level field nodes that are present in this payload.
/// If not provided, all nodes are included and this payload is identical
/// to a RecordBatch. Otherwise the reader needs to skip top level FieldNodes
/// (and the parallel Buffers) that were not included. Note that ranges must
/// be listed in strictly increasing order and be non-overlapping.
includedNodes: [FieldNodeRange];

// NOTE(review): presumably, when includedNodes is provided, this list holds
// only the included top level nodes together with their children -- confirm.
/// Nodes correspond to the pre-ordered flattened logical schema
nodes: [FieldNode];

/// Buffers correspond to the pre-ordered flattened buffer tree
///
/// The number of buffers appended to this list depends on the schema. For
/// example, most primitive arrays will have 2 buffers, 1 for the validity
/// bitmap and 1 for the values. For struct arrays, there will only be a
/// single buffer for the validity (nulls) bitmap
buffers: [Buffer];

// NOTE(review): BodyCompression is not declared in this file, and this file
// only includes "Schema.fbs". It looks like the type lives in Message.fbs,
// which itself includes this file -- confirm that this file still compiles
// when passed to flatc on its own.
/// Optional compression of the message body
compression: BodyCompression;
}
52 changes: 1 addition & 51 deletions format/Message.fbs
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

include "ColumnBag.fbs";
include "Schema.fbs";
include "SparseTensor.fbs";
include "Tensor.fbs";
Expand All @@ -25,23 +26,6 @@ namespace org.apache.arrow.flatbuf;
/// Data structures for describing a table row batch (a collection of
/// equal-length Arrow arrays)

/// Metadata about a field at some level of a nested type tree (but not
/// its children).
///
/// For example, a List<Int16> with values `[[1, 2, 3], null, [4], [5, 6], null]`
/// would have {length: 5, null_count: 2} for its List node (five list slots,
/// two of them null), and {length: 6, null_count: 0} for its Int16 node (the
/// six child values 1 through 6), as separate FieldNode structs
struct FieldNode {
/// The number of value slots in the Arrow array at this level of a nested
/// tree
length: long;

/// The number of observed nulls. Fields with null_count == 0 may choose not
/// to write their physical validity bitmap out as a materialized buffer,
/// instead setting the length of the bitmap buffer to 0.
null_count: long;
}

enum CompressionType:byte {
// LZ4 frame format, for portability, as provided by lz4frame.h or wrappers
// thereof. Not to be confused with "raw" (also called "block") format
Expand Down Expand Up @@ -117,40 +101,6 @@ table DictionaryBatch {
isDelta: bool = false;
}

/// A half-open range [start, end) of field nodes, identified by their offset
/// in the schema. The offsets are zero-indexed.
// NOTE(review): this comment does not say whether child nodes are counted;
// presumably only top level field nodes are -- confirm.
struct FieldNodeRange {
/// The starting offset (inclusive)
start: long;

/// The ending offset (exclusive)
end: long;
}

/// A data header describing the shared memory layout of a "bag" of "columns".
/// It is similar to a RecordBatch, but not every top level FieldNode is
/// required to be included in the wire payload.
table ColumnBag {
// NOTE(review): no ordering or overlap rule is stated for these ranges;
// presumably they should be increasing and non-overlapping -- confirm.
/// The ranges of field nodes that are present in this payload. If not
/// provided, all field nodes are included and this payload is identical
/// to a RecordBatch. Otherwise the reader needs to skip top level
/// FieldNodes that were not included.
includedNodes: [FieldNodeRange];

/// Nodes correspond to the pre-ordered flattened logical schema
nodes: [FieldNode];

/// Buffers correspond to the pre-ordered flattened buffer tree
///
/// The number of buffers appended to this list depends on the schema. For
/// example, most primitive arrays will have 2 buffers, 1 for the validity
/// bitmap and 1 for the values. For struct arrays, there will only be a
/// single buffer for the validity (nulls) bitmap
buffers: [Buffer];

/// Optional compression of the message body
compression: BodyCompression;
}

/// ----------------------------------------------------------------------
/// The root Message type

Expand Down
26 changes: 19 additions & 7 deletions format/Schema.fbs
Expand Up @@ -37,20 +37,14 @@ enum MetadataVersion:short {
/// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
V4,

/// 1.0.0 -> 6.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
/// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
/// metadata and IPC messages). Implementations are recommended to provide a
/// V4 compatibility mode with V5 format changes disabled.
///
/// Incompatible changes between V4 and V5:
/// - Union buffer layout has changed. In V5, Unions don't have a validity
/// bitmap buffer.
V5,

/// >= 7.0.0 (Jan 2022). Backwards compatible with V4 and V5.
///
/// Adds ColumnBag to wire format. It has looser restrictions than RecordBatch but is
/// otherwise similar in intent.
V6
}

/// Represents Arrow Features that might not have full support
Expand Down Expand Up @@ -507,6 +501,24 @@ struct Buffer {
length: long;
}

/// ----------------------------------------------------------------------
/// FieldNode represents metadata about a field at some level of a nested
/// type tree (but not its children).
///
/// For example, a List<Int16> with values `[[1, 2, 3], null, [4], [5, 6], null]`
/// would have {length: 5, null_count: 2} for its List node (five list slots,
/// two of them null), and {length: 6, null_count: 0} for its Int16 node (the
/// six child values 1 through 6), as separate FieldNode structs
struct FieldNode {
/// The number of value slots in the Arrow array at this level of a nested
/// tree
length: long;

/// The number of observed nulls. Fields with null_count == 0 may choose not
/// to write their physical validity bitmap out as a materialized buffer,
/// instead setting the length of the bitmap buffer to 0.
null_count: long;
}

/// ----------------------------------------------------------------------
/// A Schema describes the columns in a row batch

Expand Down

0 comments on commit 9366a04

Please sign in to comment.