From be7f41a8a8ab85b8d05dcf893a0bc4fef7db4969 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Sun, 14 Apr 2019 08:37:34 +0200 Subject: [PATCH 1/3] check dict encoder for data pages --- rust/parquet/src/column/writer.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index b520997e46ffb..df29f3f060d7a 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -421,7 +421,15 @@ impl ColumnWriterImpl { /// Returns true if there is enough data for a data page, false otherwise. #[inline] fn should_add_data_page(&self) -> bool { - self.encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit() + match self.dict_encoder { + Some(ref encoder) => { + encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit() + }, + None => { + self.encoder.estimated_data_encoded_size() >= + self.props.data_pagesize_limit() + } + } } /// Performs dictionary fallback. From 42eb4de8d75c36dc7ba81108a96d8390aea2c321 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Sun, 14 Apr 2019 10:10:23 +0200 Subject: [PATCH 2/3] add test --- rust/parquet/src/column/writer.rs | 42 +++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index df29f3f060d7a..4658ac5ba64da 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -1387,6 +1387,48 @@ mod tests { ); } + #[test] + fn test_column_writer_add_data_pages_with_dict() { + // ARROW-5129: Test verifies that we add data page in case of dictionary encoding + // and no fallback occured so far. + let file = get_temp_file("test_column_writer_add_data_pages_with_dict", &[]); + let sink = FileSink::new(&file); + let page_writer = Box::new(SerializedPageWriter::new(sink)); + let props = Rc::new(WriterProperties::builder() + .set_data_pagesize_limit(15) // actually each page will have size 15-18 bytes + .set_write_batch_size(3) // write 3 values at a time + .build()); + let data = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(data, None, None).unwrap(); + let (bytes_written, _, _) = writer.close().unwrap(); + + // Read pages and check the sequence + let source = FileSource::new(&file, 0, bytes_written as usize); + let mut page_reader = Box::new( + SerializedPageReader::new( + source, + data.len() as i64, + Compression::UNCOMPRESSED, + Int32Type::get_physical_type() + ).unwrap() + ); + let mut res = Vec::new(); + while let Some(page) = page_reader.get_next_page().unwrap() { + res.push((page.page_type(), page.num_values())); + } + assert_eq!( + res, + vec![ + (PageType::DICTIONARY_PAGE, 10), + (PageType::DATA_PAGE, 3), + (PageType::DATA_PAGE, 3), + (PageType::DATA_PAGE, 3), + (PageType::DATA_PAGE, 1) + ] + ); + } + /// Performs write-read roundtrip with randomly generated values and levels. /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write /// for a column. From 61d2741b1a281972aa05e486459158f145bda0f0 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Sun, 14 Apr 2019 11:17:11 +0200 Subject: [PATCH 3/3] fix style --- rust/parquet/src/column/writer.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index 4658ac5ba64da..08e6e4623c6f5 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -424,10 +424,10 @@ impl ColumnWriterImpl { match self.dict_encoder { Some(ref encoder) => { encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit() - }, + } None => { - self.encoder.estimated_data_encoded_size() >= - self.props.data_pagesize_limit() + self.encoder.estimated_data_encoded_size() + >= self.props.data_pagesize_limit() } } } @@ -1394,10 +1394,12 @@ mod tests { let file = get_temp_file("test_column_writer_add_data_pages_with_dict", &[]); let sink = FileSink::new(&file); let page_writer = Box::new(SerializedPageWriter::new(sink)); - let props = Rc::new(WriterProperties::builder() - .set_data_pagesize_limit(15) // actually each page will have size 15-18 bytes - .set_write_batch_size(3) // write 3 values at a time - .build()); + let props = Rc::new( + WriterProperties::builder() + .set_data_pagesize_limit(15) // actually each page will have size 15-18 bytes + .set_write_batch_size(3) // write 3 values at a time + .build(), + ); let data = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; let mut writer = get_test_column_writer::(page_writer, 0, 0, props); writer.write_batch(data, None, None).unwrap(); @@ -1410,8 +1412,9 @@ mod tests { source, data.len() as i64, Compression::UNCOMPRESSED, - Int32Type::get_physical_type() - ).unwrap() + Int32Type::get_physical_type(), + ) + .unwrap(), ); let mut res = Vec::new(); while let Some(page) = page_reader.get_next_page().unwrap() {