diff --git a/crates/polars-arrow/src/datatypes/field.rs b/crates/polars-arrow/src/datatypes/field.rs index d17752417352..950f081017c4 100644 --- a/crates/polars-arrow/src/datatypes/field.rs +++ b/crates/polars-arrow/src/datatypes/field.rs @@ -11,7 +11,7 @@ use super::{ArrowDataType, Metadata}; /// /// Almost all IO in this crate uses [`Field`] to represent logical information about the data /// to be serialized. -#[derive(Debug, Clone, Eq, PartialEq, Hash)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, Default)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Field { /// Its name diff --git a/crates/polars-arrow/src/datatypes/mod.rs b/crates/polars-arrow/src/datatypes/mod.rs index ba2b2995d81c..8fb4019f76f4 100644 --- a/crates/polars-arrow/src/datatypes/mod.rs +++ b/crates/polars-arrow/src/datatypes/mod.rs @@ -26,10 +26,11 @@ pub(crate) type Extension = Option<(String, Option)>; /// which declares the in-memory representation of data. /// The [`ArrowDataType::Extension`] is special in that it augments a [`ArrowDataType`] with metadata to support custom types. /// Use `to_logical_type` to desugar such type and return its corresponding logical type. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum ArrowDataType { /// Null type + #[default] Null, /// `true` and `false`. Boolean, @@ -134,6 +135,8 @@ pub enum ArrowDataType { /// The metadata is structured so that Arrow systems without special handling /// for Map can make Map an alias for List. The "layout" attribute for the Map /// field must have the same contents as a List. + /// - Field + /// - ordered Map(Box, bool), /// A dictionary encoded array (`key_type`, `value_type`), where /// each array element is an index of `key_type` into an diff --git a/crates/polars-io/src/parquet/read_impl.rs b/crates/polars-io/src/parquet/read_impl.rs index 9dc1f6182daf..312d30dc6e94 100644 --- a/crates/polars-io/src/parquet/read_impl.rs +++ b/crates/polars-io/src/parquet/read_impl.rs @@ -25,21 +25,30 @@ use crate::predicates::{apply_predicate, PhysicalIoExpr}; use crate::utils::get_reader_bytes; use crate::RowIndex; -fn enlarge_data_type(mut data_type: ArrowDataType) -> ArrowDataType { +#[cfg(debug_assertions)] +// Ensure we get the proper polars types from schema inference +// This saves unneeded casts. +fn assert_dtypes(data_type: &ArrowDataType) { match data_type { ArrowDataType::Utf8 => { - data_type = ArrowDataType::LargeUtf8; + unreachable!() }, ArrowDataType::Binary => { - data_type = ArrowDataType::LargeBinary; + unreachable!() }, - ArrowDataType::List(mut inner_field) => { - inner_field.data_type = enlarge_data_type(inner_field.data_type); - data_type = ArrowDataType::LargeList(inner_field); + ArrowDataType::List(_) => { + unreachable!() + }, + ArrowDataType::LargeList(inner) => { + assert_dtypes(&inner.data_type); + }, + ArrowDataType::Struct(fields) => { + for fld in fields { + assert_dtypes(fld.data_type()) + } }, _ => {}, } - data_type } fn column_idx_to_series( @@ -50,16 +59,20 @@ fn column_idx_to_series( store: &mmap::ColumnStore, chunk_size: usize, ) -> PolarsResult { - let mut field = file_schema.fields[column_i].clone(); - field.data_type = enlarge_data_type(field.data_type); + let field = &file_schema.fields[column_i]; + + #[cfg(debug_assertions)] + { + assert_dtypes(field.data_type()) + } let columns = mmap_columns(store, md.columns(), &field.name); let iter = mmap::to_deserializer(columns, field.clone(), remaining_rows, Some(chunk_size))?; if remaining_rows < md.num_rows() { - array_iter_to_series(iter, &field, Some(remaining_rows)) + array_iter_to_series(iter, field, Some(remaining_rows)) } else { - array_iter_to_series(iter, &field, None) + array_iter_to_series(iter, field, None) } } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs index 8bee072dc170..0341be876fb9 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs @@ -221,17 +221,6 @@ where chunk_size, )) }, - Binary | Utf8 => { - init.push(InitNested::Primitive(field.is_nullable)); - types.pop(); - remove_nested(binary::NestedIter::::new( - columns.pop().unwrap(), - init, - field.data_type().clone(), - num_rows, - chunk_size, - )) - }, LargeBinary | LargeUtf8 => { init.push(InitNested::Primitive(field.is_nullable)); types.pop(); @@ -570,9 +559,6 @@ fn dict_read<'a, K: DictionaryKey, I: 'a + PagesIter>( chunk_size, |x: f64| x, )), - Utf8 | Binary => primitive(binary::NestedDictIter::::new( - iter, init, data_type, num_rows, chunk_size, - )), LargeUtf8 | LargeBinary => primitive(binary::NestedDictIter::::new( iter, init, data_type, num_rows, chunk_size, )), diff --git a/crates/polars-parquet/src/arrow/read/deserialize/simple.rs b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs index d6f6eb8f9598..abadab3df26c 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/simple.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs @@ -636,9 +636,6 @@ fn dict_read<'a, K: DictionaryKey, I: PagesIter + 'a>( chunk_size, |x: f64| x, )), - (PhysicalType::ByteArray, Utf8 | Binary) => dyn_iter(binary::DictIter::::new( - iter, data_type, num_rows, chunk_size, - )), (PhysicalType::ByteArray, LargeUtf8 | LargeBinary) => dyn_iter( binary::DictIter::::new(iter, data_type, num_rows, chunk_size), ), diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs index 2a5b5d103b61..f41f98b46bf1 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs @@ -262,7 +262,6 @@ fn reserve_pushable_and_validity<'a, T: Default, P: Pushable>( runs } -// TODO! Check if we can monomorphisize this. This is all dynamic dispatch now. /// Extends a [`Pushable`] from an iterator of non-null values and an hybrid-rle decoder pub(super) fn extend_from_decoder, I: Iterator>( validity: &mut MutableBitmap, diff --git a/crates/polars-parquet/src/arrow/read/schema/convert.rs b/crates/polars-parquet/src/arrow/read/schema/convert.rs index 6f4d763ea60c..5eeaa94a1355 100644 --- a/crates/polars-parquet/src/arrow/read/schema/convert.rs +++ b/crates/polars-parquet/src/arrow/read/schema/convert.rs @@ -150,15 +150,15 @@ fn from_byte_array( converted_type: &Option, ) -> ArrowDataType { match (logical_type, converted_type) { - (Some(PrimitiveLogicalType::String), _) => ArrowDataType::Utf8, - (Some(PrimitiveLogicalType::Json), _) => ArrowDataType::Binary, - (Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::Binary, - (Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::Binary, - (_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::Binary, - (_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::Binary, - (_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::Binary, - (_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::Utf8, - (_, _) => ArrowDataType::Binary, + (Some(PrimitiveLogicalType::String), _) => ArrowDataType::LargeUtf8, + (Some(PrimitiveLogicalType::Json), _) => ArrowDataType::LargeBinary, + (Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::LargeBinary, + (Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::LargeBinary, + (_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::LargeBinary, + (_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::LargeBinary, + (_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::LargeBinary, + (_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::LargeUtf8, + (_, _) => ArrowDataType::LargeBinary, } } @@ -221,7 +221,7 @@ fn to_primitive_type( let base_type = to_primitive_type_inner(primitive_type, options); if primitive_type.field_info.repetition == Repetition::Repeated { - ArrowDataType::List(Box::new(Field::new( + ArrowDataType::LargeList(Box::new(Field::new( &primitive_type.field_info.name, base_type, is_nullable(&primitive_type.field_info), @@ -284,7 +284,7 @@ fn to_group_type( ) -> Option { debug_assert!(!fields.is_empty()); if field_info.repetition == Repetition::Repeated { - Some(ArrowDataType::List(Box::new(Field::new( + Some(ArrowDataType::LargeList(Box::new(Field::new( &field_info.name, to_struct(fields, options)?, is_nullable(field_info), @@ -361,7 +361,7 @@ fn to_list( ), }; - Some(ArrowDataType::List(Box::new(Field::new( + Some(ArrowDataType::LargeList(Box::new(Field::new( list_item_name, item_type, item_is_optional, @@ -440,8 +440,8 @@ mod tests { Field::new("int64", ArrowDataType::Int64, false), Field::new("double", ArrowDataType::Float64, true), Field::new("float", ArrowDataType::Float32, true), - Field::new("string", ArrowDataType::Utf8, true), - Field::new("string_2", ArrowDataType::Utf8, true), + Field::new("string", ArrowDataType::LargeUtf8, true), + Field::new("string_2", ArrowDataType::LargeUtf8, true), ]; let parquet_schema = SchemaDescriptor::try_from_message(message)?; @@ -460,7 +460,7 @@ mod tests { } "; let expected = vec![ - Field::new("binary", ArrowDataType::Binary, false), + Field::new("binary", ArrowDataType::LargeBinary, false), Field::new("fixed_binary", ArrowDataType::FixedSizeBinary(20), false), ]; @@ -556,7 +556,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Utf8, true))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::Utf8, + true, + ))), false, )); } @@ -570,7 +574,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Utf8, false))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::Utf8, + false, + ))), true, )); } @@ -588,11 +596,14 @@ mod tests { // } // } { - let arrow_inner_list = - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Int32, false))); + let arrow_inner_list = ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::Int32, + false, + ))); arrow_fields.push(Field::new( "array_of_arrays", - ArrowDataType::List(Box::new(Field::new("element", arrow_inner_list, false))), + ArrowDataType::LargeList(Box::new(Field::new("element", arrow_inner_list, false))), true, )); } @@ -606,7 +617,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Utf8, false))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::Utf8, + false, + ))), true, )); } @@ -618,7 +633,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Int32, false))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::Int32, + false, + ))), true, )); } @@ -637,7 +656,7 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - ArrowDataType::List(Box::new(Field::new("element", arrow_struct, false))), + ArrowDataType::LargeList(Box::new(Field::new("element", arrow_struct, false))), true, )); } @@ -654,7 +673,7 @@ mod tests { ArrowDataType::Struct(vec![Field::new("str", ArrowDataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - ArrowDataType::List(Box::new(Field::new("array", arrow_struct, false))), + ArrowDataType::LargeList(Box::new(Field::new("array", arrow_struct, false))), true, )); } @@ -671,7 +690,11 @@ mod tests { ArrowDataType::Struct(vec![Field::new("str", ArrowDataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - ArrowDataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, false))), + ArrowDataType::LargeList(Box::new(Field::new( + "my_list_tuple", + arrow_struct, + false, + ))), true, )); } @@ -681,7 +704,7 @@ mod tests { { arrow_fields.push(Field::new( "name", - ArrowDataType::List(Box::new(Field::new("name", ArrowDataType::Int32, false))), + ArrowDataType::LargeList(Box::new(Field::new("name", ArrowDataType::Int32, false))), false, )); } @@ -710,7 +733,7 @@ mod tests { { let struct_fields = vec![ - Field::new("event_name", ArrowDataType::Utf8, false), + Field::new("event_name", ArrowDataType::LargeUtf8, false), Field::new( "event_time", ArrowDataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), @@ -719,7 +742,7 @@ mod tests { ]; arrow_fields.push(Field::new( "events", - ArrowDataType::List(Box::new(Field::new( + ArrowDataType::LargeList(Box::new(Field::new( "array", ArrowDataType::Struct(struct_fields), false, @@ -768,7 +791,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list1", - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Utf8, true))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::LargeUtf8, + true, + ))), false, )); } @@ -782,7 +809,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list2", - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Utf8, false))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::LargeUtf8, + false, + ))), true, )); } @@ -796,7 +827,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list3", - ArrowDataType::List(Box::new(Field::new("element", ArrowDataType::Utf8, false))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + ArrowDataType::LargeUtf8, + false, + ))), false, )); } @@ -849,7 +884,7 @@ mod tests { let inner_group_list = Field::new( "innerGroup", - ArrowDataType::List(Box::new(Field::new( + ArrowDataType::LargeList(Box::new(Field::new( "innerGroup", ArrowDataType::Struct(vec![Field::new("leaf3", ArrowDataType::Int32, true)]), false, @@ -859,7 +894,7 @@ mod tests { let outer_group_list = Field::new( "outerGroup", - ArrowDataType::List(Box::new(Field::new( + ArrowDataType::LargeList(Box::new(Field::new( "outerGroup", ArrowDataType::Struct(vec![ Field::new("leaf2", ArrowDataType::Int32, true), @@ -929,7 +964,11 @@ mod tests { Field::new("string", ArrowDataType::Utf8, true), Field::new( "bools", - ArrowDataType::List(Box::new(Field::new("bools", ArrowDataType::Boolean, false))), + ArrowDataType::LargeList(Box::new(Field::new( + "bools", + ArrowDataType::Boolean, + false, + ))), false, ), Field::new("date", ArrowDataType::Date32, true), @@ -1020,10 +1059,10 @@ mod tests { Field::new("int64", ArrowDataType::Int64, false), Field::new("double", ArrowDataType::Float64, true), Field::new("float", ArrowDataType::Float32, true), - Field::new("string", ArrowDataType::Utf8, true), + Field::new("string", ArrowDataType::LargeUtf8, true), Field::new( "bools", - ArrowDataType::List(Box::new(Field::new( + ArrowDataType::LargeList(Box::new(Field::new( "element", ArrowDataType::Boolean, true, @@ -1032,7 +1071,7 @@ mod tests { ), Field::new( "bools_non_null", - ArrowDataType::List(Box::new(Field::new( + ArrowDataType::LargeList(Box::new(Field::new( "element", ArrowDataType::Boolean, false, @@ -1067,7 +1106,7 @@ mod tests { Field::new("uint32", ArrowDataType::UInt32, false), Field::new( "int32", - ArrowDataType::List(Box::new(Field::new( + ArrowDataType::LargeList(Box::new(Field::new( "element", ArrowDataType::Int32, true, @@ -1077,7 +1116,7 @@ mod tests { ]), false, ), - Field::new("dictionary_strings", ArrowDataType::Utf8, false), + Field::new("dictionary_strings", ArrowDataType::LargeUtf8, false), ]; let parquet_schema = SchemaDescriptor::try_from_message(message_type)?; @@ -1113,7 +1152,11 @@ mod tests { Field::new("int96_field", coerced_to.clone(), false), Field::new( "int96_list", - ArrowDataType::List(Box::new(Field::new("element", coerced_to.clone(), true))), + ArrowDataType::LargeList(Box::new(Field::new( + "element", + coerced_to.clone(), + true, + ))), true, ), Field::new( diff --git a/crates/polars-parquet/src/arrow/read/schema/metadata.rs b/crates/polars-parquet/src/arrow/read/schema/metadata.rs index 1a3582a7f964..aa974c23757c 100644 --- a/crates/polars-parquet/src/arrow/read/schema/metadata.rs +++ b/crates/polars-parquet/src/arrow/read/schema/metadata.rs @@ -1,4 +1,4 @@ -use arrow::datatypes::{ArrowSchema, Metadata}; +use arrow::datatypes::{ArrowDataType, ArrowSchema, Field, Metadata}; use arrow::io::ipc::read::deserialize_schema; use base64::engine::general_purpose; use base64::Engine as _; @@ -17,6 +17,46 @@ pub fn read_schema_from_metadata(metadata: &mut Metadata) -> PolarsResult