diff --git a/parquet/src/bin/parquet-layout.rs b/parquet/src/bin/parquet-layout.rs index 42e5a28edf69..007f93517d96 100644 --- a/parquet/src/bin/parquet-layout.rs +++ b/parquet/src/bin/parquet-layout.rs @@ -55,9 +55,21 @@ use parquet::file::reader::ChunkReader; use parquet::format::PageHeader; use parquet::thrift::TSerializable; +#[derive(Serialize, Debug)] +struct Index { + offset: i64, + length: Option, +} + +#[derive(Serialize, Debug)] +struct Footer { + metadata_size: Option, +} + #[derive(Serialize, Debug)] struct ParquetFile { row_groups: Vec, + footer: Footer, } #[derive(Serialize, Debug)] @@ -72,6 +84,9 @@ struct ColumnChunk { has_offset_index: bool, has_column_index: bool, has_bloom_filter: bool, + offset_index: Option, + column_index: Option, + bloom_filter: Option, pages: Vec, } @@ -89,7 +104,10 @@ struct Page { #[allow(deprecated)] fn do_layout(reader: &C) -> Result { - let metadata = ParquetMetaDataReader::new().parse_and_finish(reader)?; + let mut metadata_reader = ParquetMetaDataReader::new(); + metadata_reader.try_parse(reader)?; + let metadata_size = metadata_reader.metadata_size(); + let metadata = metadata_reader.finish()?; let schema = metadata.file_metadata().schema_descr(); let row_groups = (0..metadata.num_row_groups()) @@ -155,6 +173,18 @@ fn do_layout(reader: &C) -> Result { has_offset_index: column.offset_index_offset().is_some(), has_column_index: column.column_index_offset().is_some(), has_bloom_filter: column.bloom_filter_offset().is_some(), + offset_index: column.offset_index_offset().map(|offset| Index { + offset, + length: column.offset_index_length(), + }), + column_index: column.column_index_offset().map(|offset| Index { + offset, + length: column.column_index_length(), + }), + bloom_filter: column.bloom_filter_offset().map(|offset| Index { + offset, + length: column.bloom_filter_length(), + }), pages, }) }) @@ -167,7 +197,10 @@ fn do_layout(reader: &C) -> Result { }) .collect::>>()?; - Ok(ParquetFile { row_groups }) + Ok(ParquetFile { + row_groups, + footer: Footer { metadata_size }, + }) } /// Reads the page header at `offset` from `reader`, returning diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 15b7264c90f4..091895e65919 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -561,6 +561,12 @@ impl ParquetMetaDataReader { self.decode_footer_metadata(bytes, file_size, footer) } + /// Size of the serialized thrift metadata plus the 8 byte footer. Only set if + /// `self.parse_metadata` is called. + pub fn metadata_size(&self) -> Option { + self.metadata_size + } + /// Return the number of bytes to read in the initial pass. If `prefetch_size` has /// been provided, then return that value if it is larger than the size of the Parquet /// file footer (8 bytes). Otherwise returns `8`.