From d4228feca341cd707a3a26372cae71a94a93b4fd Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 16 Jun 2024 18:54:11 -0700 Subject: [PATCH 1/4] refactor: remove extra default in max rows (#10941) --- datafusion-cli/src/main.rs | 2 +- docs/source/user-guide/cli/usage.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 187f856894b2..f2b29fe78690 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -133,7 +133,7 @@ struct Args { #[clap( long, - help = "The max number of rows to display for 'Table' format\n[default: 40] [possible values: numbers(0/10/...), inf(no limit)]", + help = "The max number of rows to display for 'Table' format\n[possible values: numbers(0/10/...), inf(no limit)]", default_value = "40" )] maxrows: MaxRows, diff --git a/docs/source/user-guide/cli/usage.md b/docs/source/user-guide/cli/usage.md index 617b462875c7..6a620fc69252 100644 --- a/docs/source/user-guide/cli/usage.md +++ b/docs/source/user-guide/cli/usage.md @@ -52,7 +52,7 @@ OPTIONS: --maxrows The max number of rows to display for 'Table' format - [default: 40] [possible values: numbers(0/10/...), inf(no limit)] + [possible values: numbers(0/10/...), inf(no limit)] [default: 40] --mem-pool-type Specify the memory pool type 'greedy' or 'fair', default to 'greedy' From 378b9eecd4a77386a59953209f75fc5c192d7af4 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Mon, 17 Jun 2024 17:43:20 +0800 Subject: [PATCH 2/4] chore: Improve performance of Parquet statistics conversion (#10932) --- .../physical_plan/parquet/statistics.rs | 32 +++---------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index a2e0d8fa66be..327a516f1af1 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -303,24 +303,12 @@ macro_rules! get_statistics { ))), DataType::Int8 => Ok(Arc::new(Int8Array::from_iter( [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = i8::try_from(*x) { - Some(v) - } else { - None - } - }) + x.and_then(|x| i8::try_from(*x).ok()) }), ))), DataType::Int16 => Ok(Arc::new(Int16Array::from_iter( [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = i16::try_from(*x) { - Some(v) - } else { - None - } - }) + x.and_then(|x| i16::try_from(*x).ok()) }), ))), DataType::Int32 => Ok(Arc::new(Int32Array::from_iter( @@ -331,24 +319,12 @@ macro_rules! 
get_statistics { ))), DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter( [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = u8::try_from(*x) { - Some(v) - } else { - None - } - }) + x.and_then(|x| u8::try_from(*x).ok()) }), ))), DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter( [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = u16::try_from(*x) { - Some(v) - } else { - None - } - }) + x.and_then(|x| u16::try_from(*x).ok()) }), ))), DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter( From c4fd7545ba7719d6d12473694fcdf6f34d25b8cb Mon Sep 17 00:00:00 2001 From: Leonardo Yvens Date: Mon, 17 Jun 2024 12:17:58 +0100 Subject: [PATCH 3/4] Add catalog::resolve_table_references (#10876) * resolve information_schema references only when necessary * add `catalog::resolve_table_references` as a public utility * collect CTEs separately in resolve_table_references * test CTE name shadowing * handle CTE name shadowing in resolve_table_references * handle unions, recursive and nested CTEs in resolve_table_references --- datafusion/core/src/catalog/mod.rs | 239 +++++++++++++++++- .../core/src/execution/session_state.rs | 96 +------ datafusion/sqllogictest/test_files/cte.slt | 7 + 3 files changed, 256 insertions(+), 86 deletions(-) diff --git a/datafusion/core/src/catalog/mod.rs b/datafusion/core/src/catalog/mod.rs index 209d9b2af297..53b133339924 100644 --- a/datafusion/core/src/catalog/mod.rs +++ b/datafusion/core/src/catalog/mod.rs @@ -27,6 +27,8 @@ use crate::catalog::schema::SchemaProvider; use dashmap::DashMap; use datafusion_common::{exec_err, not_impl_err, Result}; use std::any::Any; +use std::collections::BTreeSet; +use std::ops::ControlFlow; use std::sync::Arc; /// Represent a list of named [`CatalogProvider`]s. @@ -157,11 +159,11 @@ impl CatalogProviderList for MemoryCatalogProviderList { /// access required to read table details (e.g. statistics). /// /// The pattern that DataFusion itself uses to plan SQL queries is to walk over -/// the query to [find all schema / table references in an `async` function], +/// the query to [find all table references], /// performing required remote catalog in parallel, and then plans the query /// using that snapshot. /// -/// [find all schema / table references in an `async` function]: crate::execution::context::SessionState::resolve_table_references +/// [find all table references]: resolve_table_references /// /// # Example Catalog Implementations /// @@ -295,6 +297,182 @@ impl CatalogProvider for MemoryCatalogProvider { } } +/// Collects all tables and views referenced in the SQL statement. CTEs are collected separately. +/// This can be used to determine which tables need to be in the catalog for a query to be planned. +/// +/// # Returns +/// +/// A `(table_refs, ctes)` tuple, the first element contains table and view references and the second +/// element contains any CTE aliases that were defined and possibly referenced. 
+/// +/// ## Example +/// +/// ``` +/// # use datafusion_sql::parser::DFParser; +/// # use datafusion::catalog::resolve_table_references; +/// let query = "SELECT a FROM foo where x IN (SELECT y FROM bar)"; +/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); +/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); +/// assert_eq!(table_refs.len(), 2); +/// assert_eq!(table_refs[0].to_string(), "bar"); +/// assert_eq!(table_refs[1].to_string(), "foo"); +/// assert_eq!(ctes.len(), 0); +/// ``` +/// +/// ## Example with CTEs +/// +/// ``` +/// # use datafusion_sql::parser::DFParser; +/// # use datafusion::catalog::resolve_table_references; +/// let query = "with my_cte as (values (1), (2)) SELECT * from my_cte;"; +/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); +/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); +/// assert_eq!(table_refs.len(), 0); +/// assert_eq!(ctes.len(), 1); +/// assert_eq!(ctes[0].to_string(), "my_cte"); +/// ``` +pub fn resolve_table_references( + statement: &datafusion_sql::parser::Statement, + enable_ident_normalization: bool, +) -> datafusion_common::Result<(Vec, Vec)> { + use crate::sql::planner::object_name_to_table_reference; + use datafusion_sql::parser::{ + CopyToSource, CopyToStatement, Statement as DFStatement, + }; + use information_schema::INFORMATION_SCHEMA; + use information_schema::INFORMATION_SCHEMA_TABLES; + use sqlparser::ast::*; + + struct RelationVisitor { + relations: BTreeSet, + all_ctes: BTreeSet, + ctes_in_scope: Vec, + } + + impl RelationVisitor { + /// Record the reference to `relation`, if it's not a CTE reference. + fn insert_relation(&mut self, relation: &ObjectName) { + if !self.relations.contains(relation) + && !self.ctes_in_scope.contains(relation) + { + self.relations.insert(relation.clone()); + } + } + } + + impl Visitor for RelationVisitor { + type Break = (); + + fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow<()> { + self.insert_relation(relation); + ControlFlow::Continue(()) + } + + fn pre_visit_query(&mut self, q: &Query) -> ControlFlow { + if let Some(with) = &q.with { + for cte in &with.cte_tables { + // The non-recursive CTE name is not in scope when evaluating the CTE itself, so this is valid: + // `WITH t AS (SELECT * FROM t) SELECT * FROM t` + // Where the first `t` refers to a predefined table. So we are careful here + // to visit the CTE first, before putting it in scope. + if !with.recursive { + // This is a bit hackish as the CTE will be visited again as part of visiting `q`, + // but thankfully `insert_relation` is idempotent. + cte.visit(self); + } + self.ctes_in_scope + .push(ObjectName(vec![cte.alias.name.clone()])); + } + } + ControlFlow::Continue(()) + } + + fn post_visit_query(&mut self, q: &Query) -> ControlFlow { + if let Some(with) = &q.with { + for _ in &with.cte_tables { + // Unwrap: We just pushed these in `pre_visit_query` + self.all_ctes.insert(self.ctes_in_scope.pop().unwrap()); + } + } + ControlFlow::Continue(()) + } + + fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow<()> { + if let Statement::ShowCreate { + obj_type: ShowCreateObject::Table | ShowCreateObject::View, + obj_name, + } = statement + { + self.insert_relation(obj_name) + } + + // SHOW statements will later be rewritten into a SELECT from the information_schema + let requires_information_schema = matches!( + statement, + Statement::ShowFunctions { .. } + | Statement::ShowVariable { .. 
} + | Statement::ShowStatus { .. } + | Statement::ShowVariables { .. } + | Statement::ShowCreate { .. } + | Statement::ShowColumns { .. } + | Statement::ShowTables { .. } + | Statement::ShowCollation { .. } + ); + if requires_information_schema { + for s in INFORMATION_SCHEMA_TABLES { + self.relations.insert(ObjectName(vec![ + Ident::new(INFORMATION_SCHEMA), + Ident::new(*s), + ])); + } + } + ControlFlow::Continue(()) + } + } + + let mut visitor = RelationVisitor { + relations: BTreeSet::new(), + all_ctes: BTreeSet::new(), + ctes_in_scope: vec![], + }; + + fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) { + match statement { + DFStatement::Statement(s) => { + let _ = s.as_ref().visit(visitor); + } + DFStatement::CreateExternalTable(table) => { + visitor + .relations + .insert(ObjectName(vec![Ident::from(table.name.as_str())])); + } + DFStatement::CopyTo(CopyToStatement { source, .. }) => match source { + CopyToSource::Relation(table_name) => { + visitor.insert_relation(table_name); + } + CopyToSource::Query(query) => { + query.visit(visitor); + } + }, + DFStatement::Explain(explain) => visit_statement(&explain.statement, visitor), + } + } + + visit_statement(statement, &mut visitor); + + let table_refs = visitor + .relations + .into_iter() + .map(|x| object_name_to_table_reference(x, enable_ident_normalization)) + .collect::>()?; + let ctes = visitor + .all_ctes + .into_iter() + .map(|x| object_name_to_table_reference(x, enable_ident_normalization)) + .collect::>()?; + Ok((table_refs, ctes)) +} + #[cfg(test)] mod tests { use super::*; @@ -363,4 +541,61 @@ mod tests { let cat = Arc::new(MemoryCatalogProvider::new()) as Arc; assert!(cat.deregister_schema("foo", false).unwrap().is_none()); } + + #[test] + fn resolve_table_references_shadowed_cte() { + use datafusion_sql::parser::DFParser; + + // An interesting edge case where the `t` name is used both as an ordinary table reference + // and as a CTE reference. + let query = "WITH t AS (SELECT * FROM t) SELECT * FROM t"; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); + assert_eq!(table_refs.len(), 1); + assert_eq!(ctes.len(), 1); + assert_eq!(ctes[0].to_string(), "t"); + assert_eq!(table_refs[0].to_string(), "t"); + + // UNION is a special case where the CTE is not in scope for the second branch. + let query = "(with t as (select 1) select * from t) union (select * from t)"; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); + assert_eq!(table_refs.len(), 1); + assert_eq!(ctes.len(), 1); + assert_eq!(ctes[0].to_string(), "t"); + assert_eq!(table_refs[0].to_string(), "t"); + + // Nested CTEs are also handled. + // Here the first `u` is a CTE, but the second `u` is a table reference. + // While `t` is always a CTE. 
+ let query = "(with t as (with u as (select 1) select * from u) select * from u cross join t)"; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); + assert_eq!(table_refs.len(), 1); + assert_eq!(ctes.len(), 2); + assert_eq!(ctes[0].to_string(), "t"); + assert_eq!(ctes[1].to_string(), "u"); + assert_eq!(table_refs[0].to_string(), "u"); + } + + #[test] + fn resolve_table_references_recursive_cte() { + use datafusion_sql::parser::DFParser; + + let query = " + WITH RECURSIVE nodes AS ( + SELECT 1 as id + UNION ALL + SELECT id + 1 as id + FROM nodes + WHERE id < 10 + ) + SELECT * FROM nodes + "; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); + assert_eq!(table_refs.len(), 0); + assert_eq!(ctes.len(), 1); + assert_eq!(ctes[0].to_string(), "nodes"); + } } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index fed101bd239b..1df77a1f9e0b 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -66,15 +66,12 @@ use datafusion_optimizer::{ use datafusion_physical_expr::create_physical_expr; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; -use datafusion_sql::parser::{CopyToSource, CopyToStatement, DFParser, Statement}; -use datafusion_sql::planner::{ - object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel, -}; +use datafusion_sql::parser::{DFParser, Statement}; +use datafusion_sql::planner::{ContextProvider, ParserOptions, SqlToRel}; use sqlparser::dialect::dialect_from_str; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; -use std::ops::ControlFlow; use std::sync::Arc; use url::Url; use uuid::Uuid; @@ -493,91 +490,22 @@ impl SessionState { Ok(statement) } - /// Resolve all table references in the SQL statement. + /// Resolve all table references in the SQL statement. Does not include CTE references. + /// + /// See [`catalog::resolve_table_references`] for more information. 
+ /// + /// [`catalog::resolve_table_references`]: crate::catalog::resolve_table_references pub fn resolve_table_references( &self, statement: &datafusion_sql::parser::Statement, ) -> datafusion_common::Result> { - use crate::catalog::information_schema::INFORMATION_SCHEMA_TABLES; - use datafusion_sql::parser::Statement as DFStatement; - use sqlparser::ast::*; - - // Getting `TableProviders` is async but planing is not -- thus pre-fetch - // table providers for all relations referenced in this query - let mut relations = hashbrown::HashSet::with_capacity(10); - - struct RelationVisitor<'a>(&'a mut hashbrown::HashSet); - - impl<'a> RelationVisitor<'a> { - /// Record that `relation` was used in this statement - fn insert(&mut self, relation: &ObjectName) { - self.0.get_or_insert_with(relation, |_| relation.clone()); - } - } - - impl<'a> Visitor for RelationVisitor<'a> { - type Break = (); - - fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow<()> { - self.insert(relation); - ControlFlow::Continue(()) - } - - fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow<()> { - if let Statement::ShowCreate { - obj_type: ShowCreateObject::Table | ShowCreateObject::View, - obj_name, - } = statement - { - self.insert(obj_name) - } - ControlFlow::Continue(()) - } - } - - let mut visitor = RelationVisitor(&mut relations); - fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor<'_>) { - match statement { - DFStatement::Statement(s) => { - let _ = s.as_ref().visit(visitor); - } - DFStatement::CreateExternalTable(table) => { - visitor - .0 - .insert(ObjectName(vec![Ident::from(table.name.as_str())])); - } - DFStatement::CopyTo(CopyToStatement { source, .. }) => match source { - CopyToSource::Relation(table_name) => { - visitor.insert(table_name); - } - CopyToSource::Query(query) => { - query.visit(visitor); - } - }, - DFStatement::Explain(explain) => { - visit_statement(&explain.statement, visitor) - } - } - } - - visit_statement(statement, &mut visitor); - - // Always include information_schema if available - if self.config.information_schema() { - for s in INFORMATION_SCHEMA_TABLES { - relations.insert(ObjectName(vec![ - Ident::new(INFORMATION_SCHEMA), - Ident::new(*s), - ])); - } - } - let enable_ident_normalization = self.config.options().sql_parser.enable_ident_normalization; - relations - .into_iter() - .map(|x| object_name_to_table_reference(x, enable_ident_normalization)) - .collect::>() + let (table_refs, _) = crate::catalog::resolve_table_references( + statement, + enable_ident_normalization, + )?; + Ok(table_refs) } /// Convert an AST Statement into a LogicalPlan diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index 1ff108cf6c5f..d8eaa51fc88a 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -828,3 +828,10 @@ SELECT * FROM non_recursive_cte, recursive_cte; ---- 1 1 1 3 + +# Name shadowing: +# The first `t` refers to the table, the second to the CTE. 
+query I +WITH t AS (SELECT * FROM t where t.a < 2) SELECT * FROM t +---- +1 \ No newline at end of file From a923c659cf932f6369f2d5257e5b99128b67091a Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Mon, 17 Jun 2024 19:22:55 +0800 Subject: [PATCH 4/4] feat: Add support for Int8 and Int16 data types in data page statistics (#10931) --- .../physical_plan/parquet/statistics.rs | 30 +++++++++++++++++++ .../core/tests/parquet/arrow_statistics.rs | 24 ++------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 327a516f1af1..a2f17ca9b7a7 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -548,6 +548,8 @@ macro_rules! make_data_page_stats_iterator { }; } +make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min, Index::INT32, i32); +make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max, Index::INT32, i32); make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min, Index::INT64, i64); make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max, Index::INT64, i64); @@ -555,6 +557,29 @@ macro_rules! get_data_page_statistics { ($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => { paste! { match $data_type { + Some(DataType::Int8) => Ok(Arc::new( + Int8Array::from_iter( + [<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator) + .map(|x| { + x.into_iter().filter_map(|x| { + x.and_then(|x| i8::try_from(x).ok()) + }) + }) + .flatten() + ) + )), + Some(DataType::Int16) => Ok(Arc::new( + Int16Array::from_iter( + [<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator) + .map(|x| { + x.into_iter().filter_map(|x| { + x.and_then(|x| i16::try_from(x).ok()) + }) + }) + .flatten() + ) + )), + Some(DataType::Int32) => Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))), Some(DataType::Int64) => Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))), _ => unimplemented!() } @@ -642,6 +667,11 @@ where { let iter = iterator.flat_map(|(len, index)| match index { Index::NONE => vec![None; len], + Index::INT32(native_index) => native_index + .indexes + .iter() + .map(|x| x.null_count.map(|x| x as u64)) + .collect::>(), Index::INT64(native_index) => native_index .indexes .iter() diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 6b8705441d12..87bd1372225f 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -550,16 +550,11 @@ async fn test_int_32() { // row counts are [5, 5, 5, 5] expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), column_name: "i32", - check: Check::RowGroup, + check: Check::Both, } .run(); } -// BUG: ignore this test for now -// https://github.com/apache/datafusion/issues/10585 -// Note that the file has 4 columns named "i8", "i16", "i32", "i64". -// - The tests on column i32 and i64 passed. -// - The tests on column i8 and i16 failed. 
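The narrowing idiom that patch 2 applies to `get_statistics` and that patch 4 reuses in `get_data_page_statistics` is what lets the `i8`/`i16` tests below exercise both row-group and data-page statistics. A minimal standalone sketch of that idiom — this is not the macro code itself, it assumes the `arrow` crate as a dependency, and the input values are invented:

```rust
use arrow::array::{Array, Int16Array};

fn main() {
    // Optional i32 statistics, as the parquet Int32 statistics/page iterators yield them.
    let stats: Vec<Option<i32>> = vec![Some(-5), None, Some(40_000), Some(9)];

    // `i16::try_from(x).ok()` replaces the verbose `if let Ok(v) { Some(v) } else { None }`:
    // values that do not fit in the narrower type simply become NULL entries.
    let array = Int16Array::from_iter(
        stats.into_iter().map(|x| x.and_then(|x| i16::try_from(x).ok())),
    );

    assert_eq!(array.len(), 4);
    assert!(array.is_null(1)); // a missing statistic stays NULL
    assert!(array.is_null(2)); // 40_000 does not fit in i16
    assert_eq!(array.value(3), 9);
}
```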
#[tokio::test] async fn test_int_16() { // This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64" @@ -573,16 +568,6 @@ async fn test_int_16() { Test { reader: &reader, // mins are [-5, -4, 0, 5] - // BUG: not sure why this returns same data but in Int32Array type even though I debugged and the columns name is "i16" an its data is Int16 - // My debugging tells me the bug is either at: - // 1. The new code to get "iter". See the code in this PR with - // // Get an iterator over the column statistics - // let iter = row_groups - // .iter() - // .map(|x| x.column(parquet_idx).statistics()); - // OR - // 2. in the function (and/or its marco) `pub(crate) fn min_statistics<'a, I: Iterator>>` here - // https://github.com/apache/datafusion/blob/ea023e2d4878240eece870cf4b346c7a0667aeed/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs#L179 expected_min: Arc::new(Int16Array::from(vec![-5, -4, 0, 5])), // panic here because the actual data is Int32Array // maxes are [-1, 0, 4, 9] expected_max: Arc::new(Int16Array::from(vec![-1, 0, 4, 9])), @@ -591,13 +576,11 @@ async fn test_int_16() { // row counts are [5, 5, 5, 5] expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), column_name: "i16", - check: Check::RowGroup, + check: Check::Both, } .run(); } -// BUG (same as above): ignore this test for now -// https://github.com/apache/datafusion/issues/10585 #[tokio::test] async fn test_int_8() { // This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64" @@ -611,7 +594,6 @@ async fn test_int_8() { Test { reader: &reader, // mins are [-5, -4, 0, 5] - // BUG: not sure why this returns same data but in Int32Array even though I debugged and the columns name is "i8" an its data is Int8 expected_min: Arc::new(Int8Array::from(vec![-5, -4, 0, 5])), // panic here because the actual data is Int32Array // maxes are [-1, 0, 4, 9] expected_max: Arc::new(Int8Array::from(vec![-1, 0, 4, 9])), @@ -620,7 +602,7 @@ async fn test_int_8() { // row counts are [5, 5, 5, 5] expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), column_name: "i8", - check: Check::RowGroup, + check: Check::Both, } .run(); }
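Taken together, the new `catalog::resolve_table_references` entry point from the third patch supports the workflow its doc comment describes: discover which tables a statement needs, fetch or register them (possibly from a remote catalog) up front, then plan. A sketch of that flow, adapted from the doc examples added above — the `orders`/`customers` table names are invented, and the `datafusion`, `datafusion-sql`, and `datafusion-common` crates are assumed at a version that includes this change:

```rust
use datafusion::catalog::resolve_table_references;
use datafusion_sql::parser::DFParser;

fn main() -> datafusion_common::Result<()> {
    // Hypothetical query: `orders` and `customers` are real tables, `recent` is only a CTE.
    let sql = "WITH recent AS (SELECT * FROM orders WHERE region = 'EU') \
               SELECT c.name, r.total FROM recent r JOIN customers c ON c.id = r.customer_id";
    let statement = DFParser::parse_sql(sql).unwrap().pop_back().unwrap();

    // `true` enables identifier normalization, matching the default SQL parser options.
    let (table_refs, ctes) = resolve_table_references(&statement, true)?;

    // Only the real tables need to exist in the catalog before planning;
    // CTE aliases are reported separately and never require catalog entries.
    for table in &table_refs {
        println!("needs table: {table}");
    }
    for cte in &ctes {
        println!("defines cte: {cte}");
    }
    Ok(())
}
```

Within DataFusion itself, `SessionState::resolve_table_references` now simply delegates to this function and discards the CTE list, as shown in the session_state.rs hunk above.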