perf: elide all possible casts in parquet reading (pola-rs#13604)
ritchie46 authored and bushuyev committed Jan 11, 2024
1 parent 930bfc7 · commit 55a3ad1
Showing 8 changed files with 159 additions and 73 deletions.
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/datatypes/field.rs
@@ -11,7 +11,7 @@ use super::{ArrowDataType, Metadata};
 ///
 /// Almost all IO in this crate uses [`Field`] to represent logical information about the data
 /// to be serialized.
-#[derive(Debug, Clone, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash, Default)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct Field {
     /// Its name
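
Deriving `Default` only compiles if every member of `Field` has a `Default` of its own, which is why `ArrowDataType` picks one up in the next file. A minimal sketch of what the derive buys callers, using stand-in types rather than the crate's definitions:

    // Stand-in mirroring the shape of a field: a name plus nullability.
    #[derive(Debug, Clone, Eq, PartialEq, Hash, Default)]
    struct FieldLike {
        name: String,
        is_nullable: bool,
    }

    fn main() {
        // Struct-update syntax: members not named fall back to their Default.
        let f = FieldLike { name: "a".to_string(), ..Default::default() };
        assert!(!f.is_nullable);
    }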
5 changes: 4 additions & 1 deletion crates/polars-arrow/src/datatypes/mod.rs
@@ -26,10 +26,11 @@ pub(crate) type Extension = Option<(String, Option<String>)>;
 /// which declares the in-memory representation of data.
 /// The [`ArrowDataType::Extension`] is special in that it augments a [`ArrowDataType`] with metadata to support custom types.
 /// Use `to_logical_type` to desugar such type and return its corresponding logical type.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub enum ArrowDataType {
     /// Null type
+    #[default]
     Null,
     /// `true` and `false`.
     Boolean,
@@ -134,6 +135,8 @@ pub enum ArrowDataType {
     /// The metadata is structured so that Arrow systems without special handling
     /// for Map can make Map an alias for List. The "layout" attribute for the Map
     /// field must have the same contents as a List.
+    /// - Field
+    /// - ordered
     Map(Box<Field>, bool),
     /// A dictionary encoded array (`key_type`, `value_type`), where
     /// each array element is an index of `key_type` into an
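
On an enum, `#[derive(Default)]` (stable since Rust 1.62) needs exactly one variant tagged `#[default]`, which is what the annotation on `Null` above provides. A standalone sketch of the mechanics, with a stand-in enum:

    #[derive(Debug, PartialEq, Default)]
    enum DataTypeLike {
        // The tagged variant is the one `Default::default()` returns.
        #[default]
        Null,
        Boolean,
    }

    fn main() {
        assert_eq!(DataTypeLike::default(), DataTypeLike::Null);
        let _ = DataTypeLike::Boolean; // silence the unused-variant lint
    }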
35 changes: 24 additions & 11 deletions crates/polars-io/src/parquet/read_impl.rs
@@ -25,21 +25,30 @@ use crate::predicates::{apply_predicate, PhysicalIoExpr};
 use crate::utils::get_reader_bytes;
 use crate::RowIndex;
 
-fn enlarge_data_type(mut data_type: ArrowDataType) -> ArrowDataType {
+#[cfg(debug_assertions)]
+// Ensure we get the proper polars types from schema inference
+// This saves unneeded casts.
+fn assert_dtypes(data_type: &ArrowDataType) {
     match data_type {
         ArrowDataType::Utf8 => {
-            data_type = ArrowDataType::LargeUtf8;
+            unreachable!()
         },
         ArrowDataType::Binary => {
-            data_type = ArrowDataType::LargeBinary;
+            unreachable!()
         },
-        ArrowDataType::List(mut inner_field) => {
-            inner_field.data_type = enlarge_data_type(inner_field.data_type);
-            data_type = ArrowDataType::LargeList(inner_field);
+        ArrowDataType::List(_) => {
+            unreachable!()
+        },
+        ArrowDataType::LargeList(inner) => {
+            assert_dtypes(&inner.data_type);
+        },
+        ArrowDataType::Struct(fields) => {
+            for fld in fields {
+                assert_dtypes(fld.data_type())
+            }
         },
         _ => {},
     }
-    data_type
 }
 
 fn column_idx_to_series(
@@ -50,16 +59,20 @@ fn column_idx_to_series(
     store: &mmap::ColumnStore,
     chunk_size: usize,
 ) -> PolarsResult<Series> {
-    let mut field = file_schema.fields[column_i].clone();
-    field.data_type = enlarge_data_type(field.data_type);
+    let field = &file_schema.fields[column_i];
+
+    #[cfg(debug_assertions)]
+    {
+        assert_dtypes(field.data_type())
+    }
 
     let columns = mmap_columns(store, md.columns(), &field.name);
     let iter = mmap::to_deserializer(columns, field.clone(), remaining_rows, Some(chunk_size))?;
 
     if remaining_rows < md.num_rows() {
-        array_iter_to_series(iter, &field, Some(remaining_rows))
+        array_iter_to_series(iter, field, Some(remaining_rows))
     } else {
-        array_iter_to_series(iter, &field, None)
+        array_iter_to_series(iter, field, None)
     }
 }
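
The deleted `enlarge_data_type` rewrote `Utf8`, `Binary`, and `List` into their `Large*` counterparts on every column read; the replacement assumes schema inference already produced the large variants and merely verifies that in debug builds, so release builds skip both the check and the cast. A hedged sketch of the `#[cfg(debug_assertions)]` pattern, with stand-in types:

    #[derive(Debug)]
    enum DataTypeLike {
        Utf8,
        LargeUtf8,
    }

    // Compiled into debug builds only; release builds contain no trace of it.
    #[cfg(debug_assertions)]
    fn assert_large(dt: &DataTypeLike) {
        match dt {
            DataTypeLike::Utf8 => unreachable!("inference should have emitted LargeUtf8"),
            DataTypeLike::LargeUtf8 => {},
        }
    }

    fn read_column(dt: &DataTypeLike) {
        #[cfg(debug_assertions)]
        {
            assert_large(dt);
        }
        // ... deserialize directly, no enlarging cast needed ...
    }

    fn main() {
        read_column(&DataTypeLike::LargeUtf8);
    }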
14 changes: 0 additions & 14 deletions crates/polars-parquet/src/arrow/read/deserialize/nested.rs
@@ -221,17 +221,6 @@ where
             chunk_size,
         ))
     },
-    Binary | Utf8 => {
-        init.push(InitNested::Primitive(field.is_nullable));
-        types.pop();
-        remove_nested(binary::NestedIter::<i32, _>::new(
-            columns.pop().unwrap(),
-            init,
-            field.data_type().clone(),
-            num_rows,
-            chunk_size,
-        ))
-    },
     LargeBinary | LargeUtf8 => {
         init.push(InitNested::Primitive(field.is_nullable));
         types.pop();
@@ -570,9 +559,6 @@ fn dict_read<'a, K: DictionaryKey, I: 'a + PagesIter>(
             chunk_size,
             |x: f64| x,
         )),
-        Utf8 | Binary => primitive(binary::NestedDictIter::<K, i32, _>::new(
-            iter, init, data_type, num_rows, chunk_size,
-        )),
         LargeUtf8 | LargeBinary => primitive(binary::NestedDictIter::<K, i64, _>::new(
             iter, init, data_type, num_rows, chunk_size,
         )),
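
Both deleted arms covered the 32-bit offset cases (`Utf8`/`Binary` use `i32`, their `Large` twins `i64`), which can no longer reach this code. That type parameter governs the width of the offsets that delimit variable-length values in a flat buffer; a toy illustration of the idea, not the crate's representation:

    // Each offset marks where a value ends; i32 offsets cap a column at
    // 2^31 - 1 bytes, which is why polars standardizes on i64 (Large) types.
    fn ends_as_i64(values: &[&str]) -> Vec<i64> {
        let mut end = 0i64;
        values
            .iter()
            .map(|v| {
                end += v.len() as i64;
                end
            })
            .collect()
    }

    fn main() {
        assert_eq!(ends_as_i64(&["ab", "", "cde"]), vec![2, 2, 5]);
    }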
3 changes: 0 additions & 3 deletions crates/polars-parquet/src/arrow/read/deserialize/simple.rs
@@ -636,9 +636,6 @@ fn dict_read<'a, K: DictionaryKey, I: PagesIter + 'a>(
             chunk_size,
             |x: f64| x,
         )),
-        (PhysicalType::ByteArray, Utf8 | Binary) => dyn_iter(binary::DictIter::<K, i32, _>::new(
-            iter, data_type, num_rows, chunk_size,
-        )),
         (PhysicalType::ByteArray, LargeUtf8 | LargeBinary) => dyn_iter(
             binary::DictIter::<K, i64, _>::new(iter, data_type, num_rows, chunk_size),
         ),
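
The same pruning applies to dictionary-encoded columns: keys of type `K` index into a values array whose offsets are now always `i64`. A toy sketch of dictionary decoding, stand-in code rather than the crate's API:

    // Decode a dictionary column: each key selects a value by index.
    fn decode<'a>(keys: &[u32], values: &[&'a str]) -> Vec<&'a str> {
        keys.iter().map(|&k| values[k as usize]).collect()
    }

    fn main() {
        assert_eq!(decode(&[0, 1, 0], &["lo", "hi"]), vec!["lo", "hi", "lo"]);
    }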
1 change: 0 additions & 1 deletion crates/polars-parquet/src/arrow/read/deserialize/utils.rs
@@ -262,7 +262,6 @@ fn reserve_pushable_and_validity<'a, T: Default, P: Pushable<T>>(
     runs
 }
 
-// TODO! Check if we can monomorphisize this. This is all dynamic dispatch now.
 /// Extends a [`Pushable`] from an iterator of non-null values and an hybrid-rle decoder
 pub(super) fn extend_from_decoder<T: Default, P: Pushable<T>, I: Iterator<Item = T>>(
     validity: &mut MutableBitmap,
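
The TODO was presumably removed as stale: `extend_from_decoder` is generic over `P` and `I`, so each call site is already monomorphized rather than dynamically dispatched. For contrast, a minimal sketch of the two dispatch styles in plain Rust:

    trait PushableLike {
        fn push_one(&mut self, v: u32);
    }

    impl PushableLike for Vec<u32> {
        fn push_one(&mut self, v: u32) {
            self.push(v);
        }
    }

    // Static dispatch: the compiler emits a specialized copy per concrete P.
    fn extend_static<P: PushableLike>(p: &mut P, vals: &[u32]) {
        for &v in vals {
            p.push_one(v);
        }
    }

    // Dynamic dispatch: one compiled body, every call goes through a vtable.
    fn extend_dyn(p: &mut dyn PushableLike, vals: &[u32]) {
        for &v in vals {
            p.push_one(v);
        }
    }

    fn main() {
        let mut v: Vec<u32> = Vec::new();
        extend_static(&mut v, &[1, 2]);
        extend_dyn(&mut v, &[3]);
        assert_eq!(v, vec![1, 2, 3]);
    }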