diff --git a/datafusion/row/README.md b/datafusion/row/README.md index 9a93bbaa7ea5..4952d1fdc6b6 100644 --- a/datafusion/row/README.md +++ b/datafusion/row/README.md @@ -21,6 +21,9 @@ [DataFusion](df) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. -This crate is a submodule of DataFusion that provides a format for row-based data. +This crate is a submodule of DataFusion that provides an optimized row based format for row-based operations. + +See the documentation in [`lib.rs`] for more details. [df]: https://crates.io/crates/datafusion +[`lib.rs`]: https://github.com/apache/arrow-datafusion/blob/master/datafusion/row/src/lib.rs diff --git a/datafusion/row/src/accessor.rs b/datafusion/row/src/accessor.rs index b6ec41d3345b..f8e34578dbda 100644 --- a/datafusion/row/src/accessor.rs +++ b/datafusion/row/src/accessor.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Setter/Getter for row with all fixed-sized fields. +//! [`RowAccessor`] provides a Read/Write/Modify access for row with all fixed-sized fields: use crate::layout::{RowLayout, RowType}; use crate::validity::NullBitsFormatter; @@ -27,7 +27,21 @@ use std::sync::Arc; //TODO: DRY with reader and writer -/// Read the tuple `data[base_offset..]` we are currently pointing to +/// Provides read/write/modify access to a tuple stored in Row format +/// at `data[base_offset..]` +/// +/// ```text +/// Set / Update data +/// in [u8] +/// ─ ─ ─ ─ ─ ─ ─ ┐ Read data out as native +/// │ types or ScalarValues +/// │ +/// │ ┌───────────────────────┐ +/// │ │ +/// └ ▶│ [u8] │─ ─ ─ ─ ─ ─ ─ ─▶ +/// │ │ +/// └───────────────────────┘ +/// ``` pub struct RowAccessor<'a> { /// Layout on how to read each field layout: Arc, diff --git a/datafusion/row/src/layout.rs b/datafusion/row/src/layout.rs index 0c92025a74f4..e5214f7c307e 100644 --- a/datafusion/row/src/layout.rs +++ b/datafusion/row/src/layout.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Various row layout for different use case +//! Various row layouts for different use case use crate::schema_null_free; use arrow::datatypes::{DataType, Schema}; @@ -27,10 +27,47 @@ const BINARY_DEFAULT_SIZE: usize = 100; #[derive(Copy, Clone, Debug)] /// Type of a RowLayout pub enum RowType { - /// This type of layout will store each field with minimum bytes for space efficiency. - /// Its typical use case represents a sorting payload that accesses all row fields as a unit. + /// Stores each field with minimum bytes for space efficiency. + /// + /// Its typical use case represents a sorting payload that + /// accesses all row fields as a unit. + /// + /// Each tuple consists of up to three parts: "`null bit set`" , + /// "`values`" and "`var length data`" + /// + /// The null bit set is used for null tracking and is aligned to 1-byte. It stores + /// one bit per field. + /// + /// In the region of the values, we store the fields in the order they are defined in the schema. + /// - For fixed-length, sequential access fields, we store them directly. + /// E.g., 4 bytes for int and 1 byte for bool. + /// - For fixed-length, update often fields, we store one 8-byte word per field. + /// - For fields of non-primitive or variable-length types, + /// we append their actual content to the end of the var length region and + /// store their offset relative to row base and their length, packed into an 8-byte word. + /// + /// ```plaintext + /// ┌────────────────┬──────────────────────────┬───────────────────────┐ ┌───────────────────────┬────────────┐ + /// │Validity Bitmask│ Fixed Width Field │ Variable Width Field │ ... │ vardata area │ padding │ + /// │ (byte aligned) │ (native type width) │(vardata offset + len) │ │ (variable length) │ bytes │ + /// └────────────────┴──────────────────────────┴───────────────────────┘ └───────────────────────┴────────────┘ + /// ``` + /// + /// For example, given the schema (Int8, Utf8, Float32, Utf8) + /// + /// Encoding the tuple (1, "FooBar", NULL, "baz") + /// + /// Requires 32 bytes (31 bytes payload and 1 byte padding to make each tuple 8-bytes aligned): + /// + /// ```plaintext + /// ┌──────────┬──────────┬──────────────────────┬──────────────┬──────────────────────┬───────────────────────┬──────────┐ + /// │0b00001011│ 0x01 │0x00000016 0x00000006│ 0x00000000 │0x0000001C 0x00000003│ FooBarbaz │ 0x00 │ + /// └──────────┴──────────┴──────────────────────┴──────────────┴──────────────────────┴───────────────────────┴──────────┘ + /// 0 1 2 10 14 22 31 32 + /// ``` Compact, - /// This type of layout will store one 8-byte word per field for CPU-friendly, + + /// This type of layout stores one 8-byte word per field for CPU-friendly, /// It is mainly used to represent the rows with frequently updated content, /// for example, grouping state for hash aggregation. WordAligned, @@ -154,8 +191,10 @@ pub(crate) fn estimate_row_width(schema: &Schema, layout: &RowLayout) -> usize { round_upto_power_of_2(width, 8) } -/// Tell if we can create raw-bytes based rows since we currently -/// has limited data type supports in the row format +/// Return true of data in `schema` can be converted to raw-bytes +/// based rows. +/// +/// Note all schemas can be supported in the row format pub fn row_supported(schema: &Schema, row_type: RowType) -> bool { schema .fields() diff --git a/datafusion/row/src/lib.rs b/datafusion/row/src/lib.rs index d77c37063e92..5a76693564ab 100644 --- a/datafusion/row/src/lib.rs +++ b/datafusion/row/src/lib.rs @@ -15,41 +15,21 @@ // specific language governing permissions and limitations // under the License. -//! An implementation of Row backed by raw bytes +//! This module contains code to translate arrays back and forth to a +//! row based format. The row based format is backed by raw bytes +//! ([`[u8]`]) and used to optimize certain operations. //! -//! Each tuple consists of up to three parts: "`null bit set`" , "`values`" and "`var length data`" +//! In general, DataFusion is a so called "vectorized" execution +//! model, specifically it uses the optimized calculation kernels in +//! [`arrow`] to amortize dispatch overhead. //! -//! The null bit set is used for null tracking and is aligned to 1-byte. It stores -//! one bit per field. -//! -//! In the region of the values, we store the fields in the order they are defined in the schema. -//! - For fixed-length, sequential access fields, we store them directly. -//! E.g., 4 bytes for int and 1 byte for bool. -//! - For fixed-length, update often fields, we store one 8-byte word per field. -//! - For fields of non-primitive or variable-length types, -//! we append their actual content to the end of the var length region and -//! store their offset relative to row base and their length, packed into an 8-byte word. -//! -//! ```plaintext -//! ┌────────────────┬──────────────────────────┬───────────────────────┐ ┌───────────────────────┬────────────┐ -//! │Validity Bitmask│ Fixed Width Field │ Variable Width Field │ ... │ vardata area │ padding │ -//! │ (byte aligned) │ (native type width) │(vardata offset + len) │ │ (variable length) │ bytes │ -//! └────────────────┴──────────────────────────┴───────────────────────┘ └───────────────────────┴────────────┘ -//! ``` -//! -//! For example, given the schema (Int8, Utf8, Float32, Utf8) -//! -//! Encoding the tuple (1, "FooBar", NULL, "baz") -//! -//! Requires 32 bytes (31 bytes payload and 1 byte padding to make each tuple 8-bytes aligned): -//! -//! ```plaintext -//! ┌──────────┬──────────┬──────────────────────┬──────────────┬──────────────────────┬───────────────────────┬──────────┐ -//! │0b00001011│ 0x01 │0x00000016 0x00000006│ 0x00000000 │0x0000001C 0x00000003│ FooBarbaz │ 0x00 │ -//! └──────────┴──────────┴──────────────────────┴──────────────┴──────────────────────┴───────────────────────┴──────────┘ -//! 0 1 2 10 14 22 31 32 -//! ``` +//! However, as mentioned in [this paper], there are some "row +//! oriented" operations in a database that are not typically amenable +//! to vectorization. The "classics" are: hash table updates in joins +//! and hash aggregates, as well as comparing tuples in sort / +//! merging. //! +//! [this paper]: https://db.in.tum.de/~kersten/vectorization_vs_compilation.pdf use arrow::array::{make_builder, ArrayBuilder, ArrayRef}; use arrow::datatypes::Schema; @@ -72,7 +52,7 @@ pub(crate) fn schema_null_free(schema: &Schema) -> bool { schema.fields().iter().all(|f| !f.is_nullable()) } -/// Columnar Batch buffer +/// Columnar Batch buffer that assists creating `RecordBatches` pub struct MutableRecordBatch { arrays: Vec>, schema: Arc, diff --git a/datafusion/row/src/reader.rs b/datafusion/row/src/reader.rs index 1bf6e102a9f2..7982ca120731 100644 --- a/datafusion/row/src/reader.rs +++ b/datafusion/row/src/reader.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Accessing row from raw bytes +//! [`read_as_batch`] converts raw bytes to [`RecordBatch`] use crate::layout::{RowLayout, RowType}; use crate::validity::{all_valid, NullBitsFormatter}; @@ -27,7 +27,22 @@ use arrow::util::bit_util::get_bit_raw; use datafusion_common::{DataFusionError, Result}; use std::sync::Arc; -/// Read `data` of raw-bytes rows starting at `offsets` out to a record batch +/// Read raw-bytes from `data` rows starting at `offsets` out to a [`RecordBatch`] +/// +/// +/// ```text +/// Read data to RecordBatch ┌──────────────────┐ +/// │ │ +/// │ │ +/// ┌───────────────────────┐ │ │ +/// │ │ │ RecordBatch │ +/// │ [u8] │─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ▶│ │ +/// │ │ │ (... N Rows ...) │ +/// └───────────────────────┘ │ │ +/// │ │ +/// │ │ +/// └──────────────────┘ +/// ``` pub fn read_as_batch( data: &[u8], schema: Arc, diff --git a/datafusion/row/src/writer.rs b/datafusion/row/src/writer.rs index d71e1dbc073c..2992ec175b65 100644 --- a/datafusion/row/src/writer.rs +++ b/datafusion/row/src/writer.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Reusable row writer backed by Vec to stitch attributes together +//! [`RowWriter`] writes [`RecordBatch`]es to Vec to stitch attributes together use crate::layout::{estimate_row_width, RowLayout, RowType}; use arrow::array::*; @@ -98,6 +98,22 @@ macro_rules! fn_set_idx { } /// Reusable row writer backed by Vec +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ +/// RowWriter │ +/// ┌───────────────────────┐ │ [RowFormat] +/// │ │ │ +/// │ │ │(copy from Array +/// │ │ to [u8]) │ ┌───────────────────────┐ +/// │ RecordBatch │ └ ─ ─ ─ ─ ─ ─ ─ ─ │ RowFormat │ +/// │ │──────────────────────────────▶│ Vec │ +/// │ (... N Rows ...) │ │ │ +/// │ │ └───────────────────────┘ +/// │ │ +/// │ │ +/// └───────────────────────┘ +/// ``` pub struct RowWriter { /// Layout on how to write each field layout: RowLayout,