Skip to content

Commit

Permalink
Expose column index and offset index (#1318)
Browse files Browse the repository at this point in the history
# Which issue does this PR close?
Closes #1317.

Exposes the offsets and lengths of the column index and the offset index so that Parquet engines can optimize their reads.
  • Loading branch information
shanisolomon committed Feb 16, 2022
1 parent c26a0a1 commit 827cc3e
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 4 deletions.
88 changes: 84 additions & 4 deletions parquet/src/file/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,10 @@ pub struct ColumnChunkMetaData {
dictionary_page_offset: Option<i64>,
statistics: Option<Statistics>,
bloom_filter_offset: Option<i64>,
offset_index_offset: Option<i64>,
offset_index_length: Option<i32>,
column_index_offset: Option<i64>,
column_index_length: Option<i32>,
}

/// Represents common operations for a column chunk.
Expand Down Expand Up @@ -473,6 +477,34 @@ impl ColumnChunkMetaData {
self.bloom_filter_offset
}

/// Reports whether this column chunk carries complete page-index location data.
///
/// Returns `true` only when all four values are present: the offset and
/// length of the column index and the offset and length of the offset index.
/// If any one of them is `None`, the result is `false`.
pub fn has_column_index(&self) -> bool {
    // All four fields are `Option`s of `Copy` integers, so they can be
    // gathered into a tuple and tested with a single pattern.
    matches!(
        (
            self.column_index_offset,
            self.column_index_length,
            self.offset_index_offset,
            self.offset_index_length,
        ),
        (Some(_), Some(_), Some(_), Some(_))
    )
}

/// Returns the offset of the column index for this column chunk, or `None`
/// if no column index offset is stored in this metadata.
pub fn column_index_offset(&self) -> Option<i64> {
self.column_index_offset
}

/// Returns the length of the column index for this column chunk, or `None`
/// if no column index length is stored in this metadata.
///
/// Note that this is a length, not an offset; the matching offset is
/// available from `column_index_offset`.
pub fn column_index_length(&self) -> Option<i32> {
self.column_index_length
}

/// Returns the offset of the offset index for this column chunk, or `None`
/// if no offset index offset is stored in this metadata.
pub fn offset_index_offset(&self) -> Option<i64> {
self.offset_index_offset
}

/// Returns the length of the offset index for this column chunk, or `None`
/// if no offset index length is stored in this metadata.
///
/// Note that this is a length, not an offset; the matching offset is
/// available from `offset_index_offset`.
pub fn offset_index_length(&self) -> Option<i32> {
self.offset_index_length
}

/// Method to convert from Thrift.
pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
if cc.meta_data.is_none() {
Expand All @@ -497,6 +529,10 @@ impl ColumnChunkMetaData {
let dictionary_page_offset = col_metadata.dictionary_page_offset;
let statistics = statistics::from_thrift(column_type, col_metadata.statistics);
let bloom_filter_offset = col_metadata.bloom_filter_offset;
let offset_index_offset = cc.offset_index_offset;
let offset_index_length = cc.offset_index_length;
let column_index_offset = cc.column_index_offset;
let column_index_length = cc.column_index_length;
let result = ColumnChunkMetaData {
column_type,
column_path,
Expand All @@ -513,6 +549,10 @@ impl ColumnChunkMetaData {
dictionary_page_offset,
statistics,
bloom_filter_offset,
offset_index_offset,
offset_index_length,
column_index_offset,
column_index_length,
};
Ok(result)
}
Expand Down Expand Up @@ -540,10 +580,10 @@ impl ColumnChunkMetaData {
file_path: self.file_path().map(|s| s.to_owned()),
file_offset: self.file_offset,
meta_data: Some(column_metadata),
offset_index_offset: None,
offset_index_length: None,
column_index_offset: None,
column_index_length: None,
offset_index_offset: self.offset_index_offset,
offset_index_length: self.offset_index_length,
column_index_offset: self.column_index_offset,
column_index_length: self.column_index_length,
crypto_metadata: None,
encrypted_column_metadata: None,
}
Expand All @@ -565,6 +605,10 @@ pub struct ColumnChunkMetaDataBuilder {
dictionary_page_offset: Option<i64>,
statistics: Option<Statistics>,
bloom_filter_offset: Option<i64>,
offset_index_offset: Option<i64>,
offset_index_length: Option<i32>,
column_index_offset: Option<i64>,
column_index_length: Option<i32>,
}

impl ColumnChunkMetaDataBuilder {
Expand All @@ -584,6 +628,10 @@ impl ColumnChunkMetaDataBuilder {
dictionary_page_offset: None,
statistics: None,
bloom_filter_offset: None,
offset_index_offset: None,
offset_index_length: None,
column_index_offset: None,
column_index_length: None,
}
}

Expand Down Expand Up @@ -659,6 +707,30 @@ impl ColumnChunkMetaDataBuilder {
self
}

/// Sets optional offset index offset in bytes.
pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
self.offset_index_offset = value;
self
}

/// Sets optional offset index length in bytes.
pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
self.offset_index_length = value;
self
}

/// Sets optional column index offset in bytes.
pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
self.column_index_offset = value;
self
}

/// Sets optional column index length in bytes.
pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
self.column_index_length = value;
self
}

/// Builds column chunk metadata.
pub fn build(self) -> Result<ColumnChunkMetaData> {
Ok(ColumnChunkMetaData {
Expand All @@ -677,6 +749,10 @@ impl ColumnChunkMetaDataBuilder {
dictionary_page_offset: self.dictionary_page_offset,
statistics: self.statistics,
bloom_filter_offset: self.bloom_filter_offset,
offset_index_offset: self.offset_index_offset,
offset_index_length: self.offset_index_length,
column_index_offset: self.column_index_offset,
column_index_length: self.column_index_length,
})
}
}
Expand Down Expand Up @@ -740,6 +816,10 @@ mod tests {
.set_data_page_offset(4000)
.set_dictionary_page_offset(Some(5000))
.set_bloom_filter_offset(Some(6000))
.set_offset_index_offset(Some(7000))
.set_offset_index_length(Some(25))
.set_column_index_offset(Some(8000))
.set_column_index_length(Some(25))
.build()
.unwrap();

Expand Down
9 changes: 9 additions & 0 deletions parquet/src/file/serialized_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,15 @@ mod tests {

// test optional bloom filter offset
assert_eq!(col0_metadata.bloom_filter_offset().unwrap(), 192);

// test optional column index offset
assert!(col0_metadata.has_column_index());
assert_eq!(col0_metadata.column_index_offset().unwrap(), 156);
assert_eq!(col0_metadata.column_index_length().unwrap(), 25);

// test optional offset index offset
assert_eq!(col0_metadata.offset_index_offset().unwrap(), 181);
assert_eq!(col0_metadata.offset_index_length().unwrap(), 11);
}

#[test]
Expand Down
20 changes: 20 additions & 0 deletions parquet/src/schema/printer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,26 @@ fn print_column_chunk_metadata(
Some(bfo) => bfo.to_string(),
};
writeln!(out, "bloom filter offset: {}", bloom_filter_offset_str);
let offset_index_offset_str = match cc_metadata.offset_index_offset() {
None => "N/A".to_owned(),
Some(oio) => oio.to_string(),
};
writeln!(out, "offset index offset: {}", offset_index_offset_str);
let offset_index_length_str = match cc_metadata.offset_index_length() {
None => "N/A".to_owned(),
Some(oil) => oil.to_string(),
};
writeln!(out, "offset index length: {}", offset_index_length_str);
let column_index_offset_str = match cc_metadata.column_index_offset() {
None => "N/A".to_owned(),
Some(cio) => cio.to_string(),
};
writeln!(out, "column index offset: {}", column_index_offset_str);
let column_index_length_str = match cc_metadata.column_index_length() {
None => "N/A".to_owned(),
Some(cil) => cil.to_string(),
};
writeln!(out, "column index length: {}", column_index_length_str);
writeln!(out);
}

Expand Down

0 comments on commit 827cc3e

Please sign in to comment.