Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions parquet/src/bin/parquet-layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,21 @@ use parquet::file::reader::ChunkReader;
use parquet::format::PageHeader;
use parquet::thrift::TSerializable;

/// Byte range of an auxiliary structure (offset index, column index, or
/// bloom filter) within the Parquet file, as recorded in the column
/// chunk metadata.
#[derive(Serialize, Debug)]
struct Index {
    /// Absolute byte offset of the structure within the file.
    offset: i64,
    /// Serialized length in bytes; `None` when the writer did not record
    /// a length for this structure.
    length: Option<i32>,
}

#[derive(Serialize, Debug)]
struct Footer {
metadata_size: Option<usize>,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is required to estimate a prefetch_hint for files.

}

/// Top-level description of the physical layout of a Parquet file:
/// its row groups plus summary information about the footer.
#[derive(Serialize, Debug)]
struct ParquetFile {
    /// One entry per row group, in file order.
    row_groups: Vec<RowGroup>,
    /// Footer summary (e.g. serialized metadata size).
    footer: Footer,
}

#[derive(Serialize, Debug)]
Expand All @@ -72,6 +84,9 @@ struct ColumnChunk {
has_offset_index: bool,
has_column_index: bool,
has_bloom_filter: bool,
offset_index: Option<Index>,
column_index: Option<Index>,
bloom_filter: Option<Index>,
pages: Vec<Page>,
}

Expand All @@ -89,7 +104,10 @@ struct Page {

#[allow(deprecated)]
fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
let metadata = ParquetMetaDataReader::new().parse_and_finish(reader)?;
let mut metadata_reader = ParquetMetaDataReader::new();
metadata_reader.try_parse(reader)?;
let metadata_size = metadata_reader.metadata_size();
let metadata = metadata_reader.finish()?;
let schema = metadata.file_metadata().schema_descr();

let row_groups = (0..metadata.num_row_groups())
Expand Down Expand Up @@ -155,6 +173,18 @@ fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
has_offset_index: column.offset_index_offset().is_some(),
has_column_index: column.column_index_offset().is_some(),
has_bloom_filter: column.bloom_filter_offset().is_some(),
offset_index: column.offset_index_offset().map(|offset| Index {
offset,
length: column.offset_index_length(),
}),
column_index: column.column_index_offset().map(|offset| Index {
offset,
length: column.column_index_length(),
}),
bloom_filter: column.bloom_filter_offset().map(|offset| Index {
offset,
length: column.bloom_filter_length(),
}),
pages,
})
})
Expand All @@ -167,7 +197,10 @@ fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
})
.collect::<Result<Vec<_>>>()?;

Ok(ParquetFile { row_groups })
Ok(ParquetFile {
row_groups,
footer: Footer { metadata_size },
})
}

/// Reads the page header at `offset` from `reader`, returning
Expand Down
6 changes: 6 additions & 0 deletions parquet/src/file/metadata/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,12 @@ impl ParquetMetaDataReader {
self.decode_footer_metadata(bytes, file_size, footer)
}

/// Size of the serialized thrift metadata plus the 8 byte footer. Only set if
/// `self.parse_metadata` is called.
///
/// Returns `None` when the metadata size has not (yet) been determined,
/// i.e. before a successful parse of the footer.
pub fn metadata_size(&self) -> Option<usize> {
    self.metadata_size
}

/// Return the number of bytes to read in the initial pass. If `prefetch_size` has
/// been provided, then return that value if it is larger than the size of the Parquet
/// file footer (8 bytes). Otherwise returns `8`.
Expand Down
Loading