Skip to content

Commit

Permalink
Set write batch size in parquet fuzz tests
Browse files Browse the repository at this point in the history
Fix bug in column writer with small page sizes
  • Loading branch information
tustvold committed Jan 21, 2022
1 parent 761bd90 commit bf28e16
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 0 deletions.
7 changes: 7 additions & 0 deletions parquet/src/arrow/arrow_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,11 @@ mod tests {
record_batch_size: usize,
/// Percentage of nulls in column or None if required
null_percent: Option<usize>,
/// Write batch size
///
/// This is the number of rows that are written at once to a page and
/// therefore acts as a bound on the page granularity of a row group
write_batch_size: usize,
/// Maximum size of page in bytes
max_data_page_size: usize,
/// Maximum size of dictionary page in bytes
Expand All @@ -595,6 +600,7 @@ mod tests {
num_rows: 100,
record_batch_size: 15,
null_percent: None,
write_batch_size: 64,
max_data_page_size: 1024 * 1024,
max_dict_page_size: 1024 * 1024,
writer_version: WriterVersion::PARQUET_1_0,
Expand Down Expand Up @@ -637,6 +643,7 @@ mod tests {
fn writer_props(&self) -> WriterProperties {
let builder = WriterProperties::builder()
.set_data_pagesize_limit(self.max_data_page_size)
.set_write_batch_size(self.write_batch_size)
.set_writer_version(self.writer_version);

let builder = match self.encoding {
Expand Down
8 changes: 8 additions & 0 deletions parquet/src/column/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,14 @@ impl<T: DataType> ColumnWriterImpl<T> {
/// Returns true if there is enough data for a data page, false otherwise.
#[inline]
fn should_add_data_page(&self) -> bool {
// This is necessary in the event of a much larger dictionary size than page size
//
// In such a scenario the dictionary encoder may return an estimated encoded
// size in excess of the page size limit, even when there are no buffered values
if self.num_buffered_values == 0 {
return false;
}

match self.dict_encoder {
Some(ref encoder) => {
encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit()
Expand Down

0 comments on commit bf28e16

Please sign in to comment.