diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index a917c486498..11c39685911 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -636,8 +636,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { self.column_index_builder.append( null_page, - self.truncate_min_value(stat.min_bytes()), - self.truncate_max_value(stat.max_bytes()), + self.truncate_min_value( + self.props.column_index_truncate_length(), + stat.min_bytes(), + ) + .0, + self.truncate_max_value( + self.props.column_index_truncate_length(), + stat.max_bytes(), + ) + .0, self.page_metrics.num_page_nulls as i64, ); } @@ -658,26 +666,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .append_row_count(self.page_metrics.num_buffered_rows as i64); } - fn truncate_min_value(&self, data: &[u8]) -> Vec { - self.props - .column_index_truncate_length() + fn truncate_min_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { + truncation_length .filter(|l| data.len() > *l) .and_then(|l| match str::from_utf8(data) { Ok(str_data) => truncate_utf8(str_data, l), Err(_) => Some(data[..l].to_vec()), }) - .unwrap_or_else(|| data.to_vec()) + .map(|truncated| (truncated, true)) + .unwrap_or_else(|| (data.to_vec(), false)) } - fn truncate_max_value(&self, data: &[u8]) -> Vec { - self.props - .column_index_truncate_length() + fn truncate_max_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { + truncation_length .filter(|l| data.len() > *l) .and_then(|l| match str::from_utf8(data) { Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8), Err(_) => increment(data[..l].to_vec()), }) - .unwrap_or_else(|| data.to_vec()) + .map(|truncated| (truncated, true)) + .unwrap_or_else(|| (data.to_vec(), false)) } /// Adds data page. @@ -856,20 +864,64 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .set_dictionary_page_offset(dict_page_offset); if self.statistics_enabled != EnabledStatistics::None { + let backwards_compatible_min_max = self.descr.sort_order().is_signed(); + let statistics = ValueStatistics::::new( self.column_metrics.min_column_value.clone(), self.column_metrics.max_column_value.clone(), self.column_metrics.column_distinct_count, self.column_metrics.num_column_nulls, false, - ); + ) + .with_backwards_compatible_min_max(backwards_compatible_min_max) + .into(); + + let statistics = match statistics { + Statistics::ByteArray(stats) if stats.has_min_max_set() => { + let (min, did_truncate_min) = self.truncate_min_value( + self.props.statistics_truncate_length(), + stats.min_bytes(), + ); + let (max, did_truncate_max) = self.truncate_max_value( + self.props.statistics_truncate_length(), + stats.max_bytes(), + ); + Statistics::ByteArray( + ValueStatistics::new( + Some(min.into()), + Some(max.into()), + stats.distinct_count(), + stats.null_count(), + backwards_compatible_min_max, + ) + .with_max_is_exact(!did_truncate_max) + .with_min_is_exact(!did_truncate_min), + ) + } + Statistics::FixedLenByteArray(stats) if stats.has_min_max_set() => { + let (min, did_truncate_min) = self.truncate_min_value( + self.props.statistics_truncate_length(), + stats.min_bytes(), + ); + let (max, did_truncate_max) = self.truncate_max_value( + self.props.statistics_truncate_length(), + stats.max_bytes(), + ); + Statistics::FixedLenByteArray( + ValueStatistics::new( + Some(min.into()), + Some(max.into()), + stats.distinct_count(), + stats.null_count(), + backwards_compatible_min_max, + ) + .with_max_is_exact(!did_truncate_max) + .with_min_is_exact(!did_truncate_min), + ) + } + stats => stats, + }; - // Some common readers only support the deprecated statistics - // format so we also write them out if possible - // See https://github.com/apache/arrow-rs/issues/799 - let statistics = statistics - .with_backwards_compatible_min_max(self.descr.sort_order().is_signed()) - .into(); builder = builder.set_statistics(statistics); } @@ -2612,6 +2664,148 @@ mod tests { } } + #[test] + fn test_statistics_truncating_byte_array() { + let page_writer = get_test_page_writer(); + + const TEST_TRUNCATE_LENGTH: usize = 1; + + // Truncate values at 1 byte + let builder = + WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH)); + let props = Arc::new(builder.build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + + let mut data = vec![ByteArray::default(); 1]; + // This is the expected min value + data[0].set_data(Bytes::from(String::from("Blart Versenwald III"))); + + writer.write_batch(&data, None, None).unwrap(); + + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + + assert_eq!(1, r.rows_written); + + let stats = r.metadata.statistics().expect("statistics"); + assert!(stats.has_min_max_set()); + assert_eq!(stats.null_count(), 0); + assert_eq!(stats.distinct_count(), None); + if let Statistics::ByteArray(_stats) = stats { + let min_value = _stats.min(); + let max_value = _stats.max(); + + assert!(!_stats.min_is_exact()); + assert!(!_stats.max_is_exact()); + + assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH); + assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH); + + assert_eq!("B".as_bytes(), min_value.as_bytes()); + assert_eq!("C".as_bytes(), max_value.as_bytes()); + } else { + panic!("expecting Statistics::ByteArray"); + } + } + + #[test] + fn test_statistics_truncating_fixed_len_byte_array() { + let page_writer = get_test_page_writer(); + + const TEST_TRUNCATE_LENGTH: usize = 1; + + // Truncate values at 1 byte + let builder = + WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH)); + let props = Arc::new(builder.build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + + let mut data = vec![FixedLenByteArray::default(); 1]; + + const PSEUDO_DECIMAL_VALUE: i128 = 6541894651216648486512564456564654; + const PSEUDO_DECIMAL_BYTES: [u8; 16] = PSEUDO_DECIMAL_VALUE.to_be_bytes(); + + const EXPECTED_MIN: [u8; TEST_TRUNCATE_LENGTH] = [PSEUDO_DECIMAL_BYTES[0]]; // parquet specifies big-endian order for decimals + const EXPECTED_MAX: [u8; TEST_TRUNCATE_LENGTH] = + [PSEUDO_DECIMAL_BYTES[0].overflowing_add(1).0]; + + // This is the expected min value + data[0].set_data(Bytes::from(PSEUDO_DECIMAL_BYTES.as_slice())); + + writer.write_batch(&data, None, None).unwrap(); + + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + + assert_eq!(1, r.rows_written); + + let stats = r.metadata.statistics().expect("statistics"); + assert!(stats.has_min_max_set()); + assert_eq!(stats.null_count(), 0); + assert_eq!(stats.distinct_count(), None); + if let Statistics::FixedLenByteArray(_stats) = stats { + let min_value = _stats.min(); + let max_value = _stats.max(); + + assert!(!_stats.min_is_exact()); + assert!(!_stats.max_is_exact()); + + assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH); + assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH); + + assert_eq!(EXPECTED_MIN.as_slice(), min_value.as_bytes()); + assert_eq!(EXPECTED_MAX.as_slice(), max_value.as_bytes()); + + let reconstructed_min = i128::from_be_bytes([ + min_value.as_bytes()[0], + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ]); + + let reconstructed_max = i128::from_be_bytes([ + max_value.as_bytes()[0], + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ]); + + // check that the inner value is correctly bounded by the min/max + println!("min: {reconstructed_min} {PSEUDO_DECIMAL_VALUE}"); + assert!(reconstructed_min <= PSEUDO_DECIMAL_VALUE); + println!("max {reconstructed_max} {PSEUDO_DECIMAL_VALUE}"); + assert!(reconstructed_max >= PSEUDO_DECIMAL_VALUE); + } else { + panic!("expecting Statistics::FixedLenByteArray"); + } + } + #[test] fn test_send() { fn test() {} diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index ea71763a010..287e73c9906 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -51,6 +51,8 @@ pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05; /// Default value for [`BloomFilterProperties::ndv`] pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64; +/// Default values for [`WriterProperties::statistics_truncate_length`] +pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option = None; /// Parquet writer version. /// @@ -136,6 +138,7 @@ pub struct WriterProperties { column_properties: HashMap, sorting_columns: Option>, column_index_truncate_length: Option, + statistics_truncate_length: Option, } impl Default for WriterProperties { @@ -241,6 +244,13 @@ impl WriterProperties { self.column_index_truncate_length } + /// Returns the maximum length of truncated min/max values in statistics. + /// + /// `None` if truncation is disabled, must be greater than 0 otherwise. + pub fn statistics_truncate_length(&self) -> Option { + self.statistics_truncate_length + } + /// Returns encoding for a data page, when dictionary encoding is enabled. /// This is not configurable. #[inline] @@ -334,6 +344,7 @@ pub struct WriterPropertiesBuilder { column_properties: HashMap, sorting_columns: Option>, column_index_truncate_length: Option, + statistics_truncate_length: Option, } impl WriterPropertiesBuilder { @@ -352,6 +363,7 @@ impl WriterPropertiesBuilder { column_properties: HashMap::new(), sorting_columns: None, column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, + statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH, } } @@ -370,6 +382,7 @@ impl WriterPropertiesBuilder { column_properties: self.column_properties, sorting_columns: self.sorting_columns, column_index_truncate_length: self.column_index_truncate_length, + statistics_truncate_length: self.statistics_truncate_length, } } @@ -643,6 +656,17 @@ impl WriterPropertiesBuilder { self.column_index_truncate_length = max_length; self } + + /// Sets the max length of min/max value fields in statistics. Must be greater than 0. + /// If set to `None` - there's no effective limit. + pub fn set_statistics_truncate_length(mut self, max_length: Option) -> Self { + if let Some(value) = max_length { + assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."); + } + + self.statistics_truncate_length = max_length; + self + } } /// Controls the level of statistics to be computed by the writer diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 345fe7dd261..1bc003d4885 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -27,6 +27,8 @@ //! assert_eq!(stats.null_count(), 3); //! assert!(stats.has_min_max_set()); //! assert!(stats.is_min_max_deprecated()); +//! assert!(stats.min_is_exact()); +//! assert!(stats.max_is_exact()); //! //! match stats { //! Statistics::Int32(ref typed) => { @@ -206,19 +208,27 @@ pub fn from_thrift( null_count, old_format, ), - Type::BYTE_ARRAY => Statistics::byte_array( - min.map(ByteArray::from), - max.map(ByteArray::from), - distinct_count, - null_count, - old_format, + Type::BYTE_ARRAY => Statistics::ByteArray( + ValueStatistics::new( + min.map(ByteArray::from), + max.map(ByteArray::from), + distinct_count, + null_count, + old_format, + ) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), ), - Type::FIXED_LEN_BYTE_ARRAY => Statistics::fixed_len_byte_array( - min.map(ByteArray::from).map(FixedLenByteArray::from), - max.map(ByteArray::from).map(FixedLenByteArray::from), - distinct_count, - null_count, - old_format, + Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray( + ValueStatistics::new( + min.map(ByteArray::from).map(FixedLenByteArray::from), + max.map(ByteArray::from).map(FixedLenByteArray::from), + distinct_count, + null_count, + old_format, + ) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), ), }; @@ -248,13 +258,15 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { }; // Get min/max if set. - let (min, max) = if stats.has_min_max_set() { + let (min, max, min_exact, max_exact) = if stats.has_min_max_set() { ( Some(stats.min_bytes().to_vec()), Some(stats.max_bytes().to_vec()), + Some(stats.min_is_exact()), + Some(stats.max_is_exact()), ) } else { - (None, None) + (None, None, None, None) }; if stats.is_min_max_backwards_compatible() { @@ -268,6 +280,9 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { thrift_stats.max_value = max; } + thrift_stats.is_min_value_exact = min_exact; + thrift_stats.is_max_value_exact = max_exact; + Some(thrift_stats) } @@ -374,6 +389,16 @@ impl Statistics { statistics_enum_func![self, has_min_max_set] } + /// Returns `true` if the min value is set, and is an exact min value. + pub fn min_is_exact(&self) -> bool { + statistics_enum_func![self, min_is_exact] + } + + /// Returns `true` if the max value is set, and is an exact max value. + pub fn max_is_exact(&self) -> bool { + statistics_enum_func![self, max_is_exact] + } + /// Returns slice of bytes that represent min value. /// Panics if min value is not set. pub fn min_bytes(&self) -> &[u8] { @@ -428,6 +453,10 @@ pub struct ValueStatistics { distinct_count: Option, null_count: u64, + // Whether or not the min or max values are exact, or truncated. + is_max_value_exact: bool, + is_min_value_exact: bool, + /// If `true` populate the deprecated `min` and `max` fields instead of /// `min_value` and `max_value` is_min_max_deprecated: bool, @@ -447,6 +476,8 @@ impl ValueStatistics { is_min_max_deprecated: bool, ) -> Self { Self { + is_max_value_exact: max.is_some(), + is_min_value_exact: min.is_some(), min, max, distinct_count, @@ -456,6 +487,28 @@ impl ValueStatistics { } } + /// Set whether the stored `min` field represents the exact + /// minimum, or just a bound on the minimum value. + /// + /// see [`Self::min_is_exact`] + pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self { + Self { + is_min_value_exact, + ..self + } + } + + /// Set whether the stored `max` field represents the exact + /// maximum, or just a bound on the maximum value. + /// + /// see [`Self::max_is_exact`] + pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self { + Self { + is_max_value_exact, + ..self + } + } + /// Set whether to write the deprecated `min` and `max` fields /// for compatibility with older parquet writers /// @@ -506,13 +559,23 @@ impl ValueStatistics { self.min.is_some() && self.max.is_some() } + /// Whether or not max value is set, and is an exact value. + pub fn max_is_exact(&self) -> bool { + self.max.is_some() && self.is_max_value_exact + } + + /// Whether or not min value is set, and is an exact value. + pub fn min_is_exact(&self) -> bool { + self.min.is_some() && self.is_min_value_exact + } + /// Returns optional value of number of distinct values occurring. - fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { self.distinct_count } /// Returns null count. - fn null_count(&self) -> u64 { + pub fn null_count(&self) -> u64 { self.null_count } @@ -556,6 +619,8 @@ impl fmt::Display for ValueStatistics { } write!(f, ", null_count: {}", self.null_count)?; write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?; + write!(f, ", max_value_exact: {}", self.is_max_value_exact)?; + write!(f, ", min_value_exact: {}", self.is_min_value_exact)?; write!(f, "}}") } } @@ -565,13 +630,15 @@ impl fmt::Debug for ValueStatistics { write!( f, "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ - min_max_deprecated: {}, min_max_backwards_compatible: {}}}", + min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}", self.min, self.max, self.distinct_count, self.null_count, self.is_min_max_deprecated, - self.is_min_max_backwards_compatible + self.is_min_max_backwards_compatible, + self.is_max_value_exact, + self.is_min_value_exact ) } } @@ -628,14 +695,14 @@ mod tests { assert_eq!( format!("{stats:?}"), "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ - min_max_deprecated: true, min_max_backwards_compatible: true})" + min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})" ); let stats = Statistics::int32(None, None, None, 7, false); assert_eq!( format!("{stats:?}"), "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ - min_max_deprecated: false, min_max_backwards_compatible: false})" + min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})" ) } @@ -644,14 +711,14 @@ mod tests { let stats = Statistics::int32(Some(1), Some(12), None, 12, true); assert_eq!( format!("{stats}"), - "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true}" + "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}" ); let stats = Statistics::int64(None, None, None, 7, false); assert_eq!( format!("{stats}"), "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \ - false}" + false, max_value_exact: false, min_value_exact: false}" ); let stats = Statistics::int96( @@ -664,19 +731,23 @@ mod tests { assert_eq!( format!("{stats}"), "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \ - min_max_deprecated: true}" + min_max_deprecated: true, max_value_exact: true, min_value_exact: true}" ); - let stats = Statistics::byte_array( - Some(ByteArray::from(vec![1u8])), - Some(ByteArray::from(vec![2u8])), - Some(5), - 7, - false, + let stats = Statistics::ByteArray( + ValueStatistics::new( + Some(ByteArray::from(vec![1u8])), + Some(ByteArray::from(vec![2u8])), + Some(5), + 7, + false, + ) + .with_max_is_exact(false) + .with_min_is_exact(false), ); assert_eq!( format!("{stats}"), - "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false}" + "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}" ); } @@ -712,7 +783,45 @@ mod tests { Some(ByteArray::from(vec![1, 2, 3]).into()), None, 0, - true + true, + ) + ); + + assert!( + Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) != Statistics::ByteArray( + ValueStatistics::new( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) + .with_max_is_exact(false) + ) + ); + + assert!( + Statistics::fixed_len_byte_array( + Some(FixedLenByteArray::from(vec![1, 2, 3])), + Some(FixedLenByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) != Statistics::FixedLenByteArray( + ValueStatistics::new( + Some(FixedLenByteArray::from(vec![1, 2, 3])), + Some(FixedLenByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) + .with_min_is_exact(false) ) ); } diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index fab87f32f5c..cd124031cfd 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -185,7 +185,7 @@ fn test_primitive() { pages: (0..8) .map(|_| Page { rows: 250, - page_header_size: 34, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -214,14 +214,14 @@ fn test_primitive() { pages: vec![ Page { rows: 250, - page_header_size: 34, + page_header_size: 36, compressed_size: 258, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1750, - page_header_size: 34, + page_header_size: 36, compressed_size: 7000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -229,7 +229,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 250, - page_header_size: 34, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -256,42 +256,42 @@ fn test_primitive() { pages: vec![ Page { rows: 400, - page_header_size: 34, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 34, + page_header_size: 36, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 34, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -299,7 +299,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 34, + page_header_size: 36, compressed_size: 8000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -325,7 +325,7 @@ fn test_primitive() { pages: (0..20) .map(|_| Page { rows: 100, - page_header_size: 34, + page_header_size: 36, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -360,14 +360,14 @@ fn test_string() { pages: (0..15) .map(|_| Page { rows: 130, - page_header_size: 34, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }) .chain(std::iter::once(Page { rows: 50, - page_header_size: 33, + page_header_size: 35, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -396,21 +396,21 @@ fn test_string() { pages: vec![ Page { rows: 130, - page_header_size: 34, + page_header_size: 36, compressed_size: 138, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1250, - page_header_size: 36, + page_header_size: 38, compressed_size: 10000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }, Page { rows: 620, - page_header_size: 34, + page_header_size: 36, compressed_size: 4960, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -418,7 +418,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 130, - page_header_size: 34, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -445,42 +445,42 @@ fn test_string() { pages: vec![ Page { rows: 400, - page_header_size: 34, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 34, + page_header_size: 36, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 34, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -488,7 +488,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 34, + page_header_size: 36, compressed_size: 16000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -528,7 +528,7 @@ fn test_list() { pages: (0..10) .map(|_| Page { rows: 20, - page_header_size: 34, + page_header_size: 36, compressed_size: 672, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE,