Expose page encoding ColumnChunkMetadata (#1322)
* init

* replaced test file

* init

* thrift conversion

* refactor

* tests

* clippy
shanisolomon committed Feb 17, 2022
1 parent 9870533 commit f4c7102
Showing 4 changed files with 131 additions and 2 deletions.
46 changes: 45 additions & 1 deletion parquet/src/file/metadata.rs
@@ -39,6 +39,7 @@ use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup};

use crate::basic::{ColumnOrder, Compression, Encoding, Type};
use crate::errors::{ParquetError, Result};
use crate::file::page_encoding_stats::{self, PageEncodingStats};
use crate::file::statistics::{self, Statistics};
use crate::schema::types::{
ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
@@ -349,6 +350,7 @@ pub struct ColumnChunkMetaData {
index_page_offset: Option<i64>,
dictionary_page_offset: Option<i64>,
statistics: Option<Statistics>,
encoding_stats: Option<Vec<PageEncodingStats>>,
bloom_filter_offset: Option<i64>,
offset_index_offset: Option<i64>,
offset_index_length: Option<i32>,
@@ -467,6 +469,17 @@ impl ColumnChunkMetaData {
self.statistics.as_ref()
}

/// Returns `true` if this column chunk contains page encoding stats, `false` otherwise.
pub fn has_page_encoding_stats(&self) -> bool {
self.encoding_stats.is_some()
}

/// Returns the page encoding stats for this column chunk,
/// or `None` if no page encoding stats are available.
pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
self.encoding_stats.as_ref()
}

/// Returns `true` if this column chunk contains a bloom filter offset, `false` otherwise.
pub fn has_bloom_filter(&self) -> bool {
self.bloom_filter_offset.is_some()
@@ -528,11 +541,16 @@ impl ColumnChunkMetaData {
let index_page_offset = col_metadata.index_page_offset;
let dictionary_page_offset = col_metadata.dictionary_page_offset;
let statistics = statistics::from_thrift(column_type, col_metadata.statistics);
let encoding_stats = col_metadata
.encoding_stats
.as_ref()
.map(|vec| vec.iter().map(page_encoding_stats::from_thrift).collect());
let bloom_filter_offset = col_metadata.bloom_filter_offset;
let offset_index_offset = cc.offset_index_offset;
let offset_index_length = cc.offset_index_length;
let column_index_offset = cc.column_index_offset;
let column_index_length = cc.column_index_length;

let result = ColumnChunkMetaData {
column_type,
column_path,
@@ -548,6 +566,7 @@
index_page_offset,
dictionary_page_offset,
statistics,
encoding_stats,
bloom_filter_offset,
offset_index_offset,
offset_index_length,
@@ -572,7 +591,10 @@
index_page_offset: self.index_page_offset,
dictionary_page_offset: self.dictionary_page_offset,
statistics: statistics::to_thrift(self.statistics.as_ref()),
encoding_stats: None,
encoding_stats: self
.encoding_stats
.as_ref()
.map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
bloom_filter_offset: self.bloom_filter_offset,
};

@@ -604,6 +626,7 @@ pub struct ColumnChunkMetaDataBuilder {
index_page_offset: Option<i64>,
dictionary_page_offset: Option<i64>,
statistics: Option<Statistics>,
encoding_stats: Option<Vec<PageEncodingStats>>,
bloom_filter_offset: Option<i64>,
offset_index_offset: Option<i64>,
offset_index_length: Option<i32>,
@@ -627,6 +650,7 @@ impl ColumnChunkMetaDataBuilder {
index_page_offset: None,
dictionary_page_offset: None,
statistics: None,
encoding_stats: None,
bloom_filter_offset: None,
offset_index_offset: None,
offset_index_length: None,
@@ -701,6 +725,12 @@ impl ColumnChunkMetaDataBuilder {
self
}

/// Sets page encoding stats for this column chunk.
pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
self.encoding_stats = Some(value);
self
}

/// Sets optional bloom filter offset in bytes.
pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
self.bloom_filter_offset = value;
@@ -748,6 +778,7 @@ impl ColumnChunkMetaDataBuilder {
index_page_offset: self.index_page_offset,
dictionary_page_offset: self.dictionary_page_offset,
statistics: self.statistics,
encoding_stats: self.encoding_stats,
bloom_filter_offset: self.bloom_filter_offset,
offset_index_offset: self.offset_index_offset,
offset_index_length: self.offset_index_length,
@@ -760,6 +791,7 @@ impl ColumnChunkMetaDataBuilder {
#[cfg(test)]
mod tests {
use super::*;
use crate::basic::{Encoding, PageType};

#[test]
fn test_row_group_metadata_thrift_conversion() {
@@ -815,6 +847,18 @@ mod tests {
.set_total_uncompressed_size(3000)
.set_data_page_offset(4000)
.set_dictionary_page_offset(Some(5000))
.set_page_encoding_stats(vec![
PageEncodingStats {
page_type: PageType::DATA_PAGE,
encoding: Encoding::PLAIN,
count: 3,
},
PageEncodingStats {
page_type: PageType::DATA_PAGE,
encoding: Encoding::RLE,
count: 5,
},
])
.set_bloom_filter_offset(Some(6000))
.set_offset_index_offset(Some(7000))
.set_offset_index_length(Some(25))
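
For context, a minimal sketch of the new surface against the public `parquet` API: build a `ColumnChunkMetaData` with `set_page_encoding_stats` and read it back through the new getters. The schema string, sizes, and counts are illustrative values only, not taken from this change:

```rust
use std::sync::Arc;

use parquet::basic::{Encoding, PageType};
use parquet::file::metadata::ColumnChunkMetaData;
use parquet::file::page_encoding_stats::PageEncodingStats;
use parquet::schema::parser::parse_message_type;
use parquet::schema::types::SchemaDescriptor;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A one-column schema supplies the ColumnDescPtr that the builder needs.
    let schema = Arc::new(parse_message_type("message schema { REQUIRED INT32 id; }")?);
    let descr = SchemaDescriptor::new(schema);

    // Record page encoding stats while building the column chunk metadata.
    let metadata = ColumnChunkMetaData::builder(descr.column(0))
        .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
        .set_total_compressed_size(500)
        .set_total_uncompressed_size(700)
        .set_data_page_offset(0)
        .set_page_encoding_stats(vec![PageEncodingStats {
            page_type: PageType::DATA_PAGE,
            encoding: Encoding::PLAIN,
            count: 3,
        }])
        .build()?;

    // The new getters expose exactly what was set above.
    assert!(metadata.has_page_encoding_stats());
    assert_eq!(metadata.page_encoding_stats().unwrap()[0].count, 3);
    Ok(())
}
```

Because `to_thrift`/`from_thrift` now map `encoding_stats` in both directions, stats set this way also survive the Thrift round trip exercised by the test below.
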
1 change: 1 addition & 0 deletions parquet/src/file/mod.rs
@@ -97,6 +97,7 @@
//! ```
pub mod footer;
pub mod metadata;
pub mod page_encoding_stats;
pub mod properties;
pub mod reader;
pub mod serialized_reader;
75 changes: 75 additions & 0 deletions parquet/src/file/page_encoding_stats.rs
@@ -0,0 +1,75 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::basic::{Encoding, PageType};
use parquet_format::{
Encoding as TEncoding, PageEncodingStats as TPageEncodingStats, PageType as TPageType,
};

/// PageEncodingStats for a column chunk and data page.
#[derive(Clone, Debug, PartialEq)]
pub struct PageEncodingStats {
/// the page type (data/dictionary/...)
pub page_type: PageType,
/// encoding of the page
pub encoding: Encoding,
/// number of pages of this type with this encoding
pub count: i32,
}

/// Converts Thrift definition into `PageEncodingStats`.
pub fn from_thrift(thrift_encoding_stats: &TPageEncodingStats) -> PageEncodingStats {
let page_type = PageType::from(thrift_encoding_stats.page_type);
let encoding = Encoding::from(thrift_encoding_stats.encoding);
let count = thrift_encoding_stats.count;

PageEncodingStats {
page_type,
encoding,
count,
}
}

/// Converts `PageEncodingStats` into Thrift definition.
pub fn to_thrift(encoding_stats: &PageEncodingStats) -> TPageEncodingStats {
let page_type = TPageType::from(encoding_stats.page_type);
let encoding = TEncoding::from(encoding_stats.encoding);
let count = encoding_stats.count;

TPageEncodingStats {
page_type,
encoding,
count,
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::basic::{Encoding, PageType};

#[test]
fn test_page_encoding_stats_from_thrift() {
let stats = PageEncodingStats {
page_type: PageType::DATA_PAGE,
encoding: Encoding::PLAIN,
count: 1,
};

assert_eq!(from_thrift(&to_thrift(&stats)), stats);
}
}
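
Since `parquet/src/file/mod.rs` now declares `pub mod page_encoding_stats;`, the conversion helpers above are callable directly from user code; a small round-trip sketch with arbitrary dictionary-page values:

```rust
use parquet::basic::{Encoding, PageType};
use parquet::file::page_encoding_stats::{self, PageEncodingStats};

fn main() {
    let stats = PageEncodingStats {
        page_type: PageType::DICTIONARY_PAGE,
        encoding: Encoding::RLE_DICTIONARY,
        count: 1,
    };

    // to_thrift yields the parquet_format struct that is written into the footer;
    // from_thrift recovers the crate-level type when the footer is read back.
    let thrift = page_encoding_stats::to_thrift(&stats);
    assert_eq!(page_encoding_stats::from_thrift(&thrift), stats);
}
```
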
11 changes: 10 additions & 1 deletion parquet/src/file/serialized_reader.rs
@@ -400,7 +400,7 @@ impl<T: Read + Send> PageReader for SerializedPageReader<T> {
#[cfg(test)]
mod tests {
use super::*;
use crate::basic::ColumnOrder;
use crate::basic::{self, ColumnOrder};
use crate::record::RowAccessor;
use crate::schema::parser::parse_message_type;
use crate::util::test_common::{get_test_file, get_test_path};
@@ -772,6 +772,15 @@ mod tests {
// test optional bloom filter offset
assert_eq!(col0_metadata.bloom_filter_offset().unwrap(), 192);

// test page encoding stats
assert!(col0_metadata.has_page_encoding_stats());
let page_encoding_stats =
col0_metadata.page_encoding_stats().unwrap().get(0).unwrap();

assert_eq!(page_encoding_stats.page_type, basic::PageType::DATA_PAGE);
assert_eq!(page_encoding_stats.encoding, Encoding::PLAIN);
assert_eq!(page_encoding_stats.count, 1);

// test optional column index offset
assert!(col0_metadata.has_column_index());
assert_eq!(col0_metadata.column_index_offset().unwrap(), 156);
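
End to end, the accessor is reachable from any footer read. A hedged sketch, where `data.parquet` is a hypothetical path to a file whose writer recorded `encoding_stats` (writers that omit the field simply yield `None`):

```rust
use std::fs::File;

use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let reader = SerializedFileReader::new(File::open("data.parquet")?)?;

    for (i, row_group) in reader.metadata().row_groups().iter().enumerate() {
        for column in row_group.columns() {
            // None when the writer did not record encoding_stats for this chunk.
            if let Some(stats) = column.page_encoding_stats() {
                for s in stats {
                    println!(
                        "row group {}, column {}: {} {:?} page(s) encoded as {:?}",
                        i,
                        column.column_path(),
                        s.count,
                        s.page_type,
                        s.encoding
                    );
                }
            }
        }
    }
    Ok(())
}
```
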
