From 64ac3935fe0f62dc4430fb4ce902765509d906c7 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 1 Jul 2024 23:47:41 -0500 Subject: [PATCH 1/4] Make statistics_from_parquet_meta a sync function --- .../core/src/datasource/file_format/parquet.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 5921d8a797ac..3eb2c3124f4c 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -460,14 +460,14 @@ async fn fetch_statistics( metadata_size_hint: Option, ) -> Result { let metadata = fetch_parquet_metadata(store, file, metadata_size_hint).await?; - statistics_from_parquet_meta(&metadata, table_schema).await + statistics_from_parquet_meta_calc(&metadata, table_schema) } /// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using ['StatisticsConverter`] /// /// The statistics are calculated for each column in the table schema /// using the row group statistics in the parquet metadata. -pub async fn statistics_from_parquet_meta( +pub fn statistics_from_parquet_meta_calc( metadata: &ParquetMetaData, table_schema: SchemaRef, ) -> Result { @@ -543,6 +543,16 @@ pub async fn statistics_from_parquet_meta( Ok(statistics) } +/// Deprecated +/// Use [`statistics_from_parquet_meta_calc`] instead +/// #[deprecated(since="0.40.0", note="please use `statistics_from_parquet_meta_calc` instead")] +pub async fn statistics_from_parquet_meta( + metadata: &ParquetMetaData, + table_schema: SchemaRef, +) -> Result { + statistics_from_parquet_meta_calc(metadata, table_schema) +} + fn summarize_min_max_null_counts( min_accs: &mut [Option], max_accs: &mut [Option], @@ -1514,7 +1524,7 @@ mod tests { // Fetch statistics for first file let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[0], None).await?; - let stats = statistics_from_parquet_meta(&pq_meta, schema.clone()).await?; + let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?; // assert_eq!(stats.num_rows, Precision::Exact(3)); // column c1 @@ -1536,7 +1546,7 @@ mod tests { // Fetch statistics for second file let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[1], None).await?; - let stats = statistics_from_parquet_meta(&pq_meta, schema.clone()).await?; + let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?; assert_eq!(stats.num_rows, Precision::Exact(3)); // column c1: missing from the file so the table treats all 3 rows as null let c1_stats = &stats.column_statistics[0]; From 5bdd3a1fc9c2cb27a0ec51e29cedf5620b64bedb Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 1 Jul 2024 23:48:29 -0500 Subject: [PATCH 2/4] Fix version --- datafusion/core/src/datasource/file_format/parquet.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 3eb2c3124f4c..7cdcb6bc6085 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -545,7 +545,7 @@ pub fn statistics_from_parquet_meta_calc( /// Deprecated /// Use [`statistics_from_parquet_meta_calc`] instead -/// #[deprecated(since="0.40.0", note="please use `statistics_from_parquet_meta_calc` instead")] +/// #[deprecated(since="40.0.0", note="please use `statistics_from_parquet_meta_calc` instead")] pub async fn statistics_from_parquet_meta( metadata: &ParquetMetaData, table_schema: SchemaRef, From b91179434b260879b21338fb373ae015967f01b1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 1 Jul 2024 23:49:23 -0500 Subject: [PATCH 3/4] improve comment --- datafusion/core/src/datasource/file_format/parquet.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 7cdcb6bc6085..ddac669c7db2 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -544,8 +544,10 @@ pub fn statistics_from_parquet_meta_calc( } /// Deprecated -/// Use [`statistics_from_parquet_meta_calc`] instead -/// #[deprecated(since="40.0.0", note="please use `statistics_from_parquet_meta_calc` instead")] +/// Use [`statistics_from_parquet_meta_calc`] instead. +/// This method was deprecated because it didn't need to be async so a new method was created +/// that exposes a synchronous API. +#[deprecated(since="40.0.0", note="please use `statistics_from_parquet_meta_calc` instead")] pub async fn statistics_from_parquet_meta( metadata: &ParquetMetaData, table_schema: SchemaRef, From 4ad7e2c9a7b71d800c8e92d5e138b55fde4324f6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 2 Jul 2024 08:59:47 -0400 Subject: [PATCH 4/4] Clippy and fmt --- datafusion/core/src/datasource/file_format/parquet.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index ddac669c7db2..27d783cd89b5 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -547,7 +547,10 @@ pub fn statistics_from_parquet_meta_calc( /// Use [`statistics_from_parquet_meta_calc`] instead. /// This method was deprecated because it didn't need to be async so a new method was created /// that exposes a synchronous API. -#[deprecated(since="40.0.0", note="please use `statistics_from_parquet_meta_calc` instead")] +#[deprecated( + since = "40.0.0", + note = "please use `statistics_from_parquet_meta_calc` instead" +)] pub async fn statistics_from_parquet_meta( metadata: &ParquetMetaData, table_schema: SchemaRef, @@ -1479,7 +1482,7 @@ mod tests { // Fetch statistics for first file let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[0], None).await?; - let stats = statistics_from_parquet_meta(&pq_meta, schema.clone()).await?; + let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?; assert_eq!(stats.num_rows, Precision::Exact(4)); // column c_dic