From 61b1a1632ffe40e33dc252f1d9b7b9c7253af7dc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Oct 2022 16:58:48 -0600 Subject: [PATCH 1/7] Optimizer example and docs --- datafusion/optimizer/README.md | 140 ++++++++++++++- datafusion/optimizer/examples/rewrite_expr.rs | 163 ++++++++++++++++++ 2 files changed, 300 insertions(+), 3 deletions(-) create mode 100644 datafusion/optimizer/examples/rewrite_expr.rs diff --git a/datafusion/optimizer/README.md b/datafusion/optimizer/README.md index 39d28a8fae37..fc2b740d3295 100644 --- a/datafusion/optimizer/README.md +++ b/datafusion/optimizer/README.md @@ -17,10 +17,144 @@ under the License. --> -# DataFusion Query Optimizer Rules +# DataFusion Query Optimizer -[DataFusion](df) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[DataFusion](df) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory +format. -This crate is a submodule of DataFusion that provides query optimizer rules. +DataFusion has modular design, allowing individual crates to be re-used in other projects. + +This crate is a submodule of DataFusion that provides a query optimizer for logical plans. + +## Running the Optimizer + +The following code demonstrates the basic flow of creating the optimizer with a default set of optimization rules +and applying it to a logical plan to produce an optimized logical plan. + +```rust + +// We need a logical plan as the starting point. There are many ways to build a logical plan: +// +// The `datafusion-expr` crate provides a LogicalPlanBuilder +// The `datafusion-sql` crate provides a SQL query planner that can create a LogicalPlan from SQL +// The `datafusion` crate provides a DataFrame API that can create a LogicalPlan +let logical_plan = ... + +let mut config = OptimizerConfig::default(); +let optimizer = Optimizer::new(&config); +let optimized_plan = optimizer.optimize(&logical_plan, &mut config, observe)?; + +fn observe(plan: &LogicalPlan, rule: &dyn OptimizerRule) { + println!( + "After applying rule '{}':\n{}", + rule.name(), + plan.display_indent() + ) +} +``` + +## Providing Custom Rules + +The optimizer can be created with a custom set of rules. + +```rust +let optimizer = Optimizer::with_rules(vec![ + Arc::new(MyRule {}) +]); +``` + +## Writing Optimization Rules + +Please refer to the [examples](examples) to learn more about the general approach to writing optimizer rules and +then move onto studying the existing rules. + +All rules must implement the `OptimizerRule` trait. + +```rust +/// `OptimizerRule` transforms one ['LogicalPlan'] into another which +/// computes the same results, but in a potentially more efficient +/// way. +pub trait OptimizerRule { + /// Rewrite `plan` to an optimized form + fn optimize( + &self, + plan: &LogicalPlan, + optimizer_config: &mut OptimizerConfig, + ) -> Result; + + /// A human readable name for this optimizer rule + fn name(&self) -> &str; +} +``` + +### General Guidelines + +Rules typical walk the logical plan and walk the expression trees inside operators and selectively mutate +individual operators or expressions. + +Sometimes there is an initial pass that visits the plan and builds state that is used in a second pass that performs +the actual optimization. This approach is used in projection push down and filter push down. + +### Utilities + +There are a number of utility methods provided that take care of some common tasks. + +### ExprVisitor + +TBD + +### Rewriting Expressions + +The `MyExprRewriter` trait can be implemented to provide a way to rewrite expressions. This rule can then be applied +to an expression by calling `Expr::rewrite` (from the `ExprRewritable` trait). + +The `rewrite` method will perform a depth first walk of the expression and its children to rewrite an expression, +consuming `self` producing a new expression. + +```rust +let mut expr_rewriter = MyExprRewriter {}; +let expr = expr.rewrite(&mut expr_rewriter)?; +``` + +Here is an example implementation which will rewrite `expr BETWEEN a AND b` as `expr >= a AND expr <= b`. Note that the +implementation does not need to perform any recursion since this is handled by the `rewrite` method. + +```rust +struct MyExprRewriter {} + +impl ExprRewriter for MyExprRewriter { + fn mutate(&mut self, expr: Expr) -> Result { + match expr { + Expr::Between { + negated, + expr, + low, + high, + } => { + let expr: Expr = expr.as_ref().clone(); + let low: Expr = low.as_ref().clone(); + let high: Expr = high.as_ref().clone(); + if negated { + Ok(expr.clone().lt(low).or(expr.clone().gt(high))) + } else { + Ok(expr.clone().gt_eq(low).and(expr.clone().lt_eq(high))) + } + } + _ => Ok(expr.clone()), + } + } +} +``` + +### optimize_children + +TBD + +### Writing Tests + +There should be unit tests in the same file as the new rule that test the effect of the rule being applied to a plan +in isolation (without any other rule being applied). + +There should also be a test in `integration-tests.rs` that tests the rule as part of the overall optimization process. [df]: https://crates.io/crates/datafusion diff --git a/datafusion/optimizer/examples/rewrite_expr.rs b/datafusion/optimizer/examples/rewrite_expr.rs new file mode 100644 index 000000000000..7d2c312fb03a --- /dev/null +++ b/datafusion/optimizer/examples/rewrite_expr.rs @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion_common::{DataFusionError, Result}; +use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter}; +use datafusion_expr::{AggregateUDF, Expr, Filter, LogicalPlan, ScalarUDF, TableSource}; +use datafusion_optimizer::optimizer::Optimizer; +use datafusion_optimizer::{utils, OptimizerConfig, OptimizerRule}; +use datafusion_sql::planner::{ContextProvider, SqlToRel}; +use datafusion_sql::sqlparser::dialect::PostgreSqlDialect; +use datafusion_sql::sqlparser::parser::Parser; +use datafusion_sql::TableReference; +use std::any::Any; +use std::sync::Arc; + +pub fn main() -> Result<()> { + // produce a logical plan using the datafusion-sql crate + let dialect = PostgreSqlDialect {}; + let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32"; + let statements = Parser::parse_sql(&dialect, sql)?; + + // produce a logical plan using the datafusion-sql crate + let context_provider = MyContextProvider {}; + let sql_to_rel = SqlToRel::new(&context_provider); + let logical_plan = sql_to_rel.sql_statement_to_plan(statements[0].clone())?; + println!( + "Unoptimized Logical Plan:\n\n{}\n", + logical_plan.display_indent() + ); + + // now run the optimizer with our custom rule + let optimizer = Optimizer::with_rules(vec![Arc::new(MyRule {})]); + let mut optimizer_config = OptimizerConfig::default().with_skip_failing_rules(false); + let optimized_plan = + optimizer.optimize(&logical_plan, &mut optimizer_config, observe)?; + println!( + "Optimized Logical Plan:\n\n{}\n", + optimized_plan.display_indent() + ); + + Ok(()) +} + +fn observe(plan: &LogicalPlan, rule: &dyn OptimizerRule) { + println!( + "After applying rule '{}':\n{}\n", + rule.name(), + plan.display_indent() + ) +} + +struct MyRule {} + +impl OptimizerRule for MyRule { + fn name(&self) -> &str { + "my_rule" + } + + fn optimize( + &self, + plan: &LogicalPlan, + _config: &mut OptimizerConfig, + ) -> Result { + // recurse down and optimize children first + let plan = utils::optimize_children(self, plan, _config)?; + + match plan { + LogicalPlan::Filter(filter) => { + let mut expr_rewriter = MyExprRewriter {}; + let predicate = filter.predicate.clone(); + let predicate = predicate.rewrite(&mut expr_rewriter)?; + Ok(LogicalPlan::Filter(Filter { + predicate, + input: filter.input.clone(), + })) + } + _ => Ok(plan.clone()), + } + } +} + +struct MyExprRewriter {} + +impl ExprRewriter for MyExprRewriter { + fn mutate(&mut self, expr: Expr) -> Result { + match expr { + Expr::Between { + negated, + expr, + low, + high, + } => { + let expr: Expr = expr.as_ref().clone(); + let low: Expr = low.as_ref().clone(); + let high: Expr = high.as_ref().clone(); + if negated { + Ok(expr.clone().lt(low).or(expr.clone().gt(high))) + } else { + Ok(expr.clone().gt_eq(low).and(expr.clone().lt_eq(high))) + } + } + _ => Ok(expr.clone()), + } + } +} + +struct MyContextProvider {} + +impl ContextProvider for MyContextProvider { + fn get_table_provider(&self, name: TableReference) -> Result> { + if name.table() == "person" { + Ok(Arc::new(MyTableSource { + schema: Arc::new(Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::UInt8, false), + ])), + })) + } else { + Err(DataFusionError::Plan("table not found".to_string())) + } + } + + fn get_function_meta(&self, _name: &str) -> Option> { + None + } + + fn get_aggregate_meta(&self, _name: &str) -> Option> { + None + } + + fn get_variable_type(&self, _variable_names: &[String]) -> Option { + None + } +} + +struct MyTableSource { + schema: SchemaRef, +} + +impl TableSource for MyTableSource { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} From 795afc35f132b840e088e4edf071ac6d243e3fbe Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Oct 2022 18:38:24 -0600 Subject: [PATCH 2/7] Add regression tests to benchmark --- benchmarks/expected-plans/q1.txt | 6 ++ benchmarks/expected-plans/q10.txt | 12 +++ benchmarks/expected-plans/q11.txt | 19 ++++ benchmarks/expected-plans/q12.txt | 7 ++ benchmarks/expected-plans/q13.txt | 11 +++ benchmarks/expected-plans/q14.txt | 7 ++ benchmarks/expected-plans/q15.txt | 11 +++ benchmarks/expected-plans/q16.txt | 13 +++ benchmarks/expected-plans/q17.txt | 11 +++ benchmarks/expected-plans/q18.txt | 13 +++ benchmarks/expected-plans/q19.txt | 9 ++ benchmarks/expected-plans/q2.txt | 25 +++++ benchmarks/expected-plans/q20.txt | 19 ++++ benchmarks/expected-plans/q21.txt | 21 ++++ benchmarks/expected-plans/q22.txt | 15 +++ benchmarks/expected-plans/q3.txt | 11 +++ benchmarks/expected-plans/q4.txt | 8 ++ benchmarks/expected-plans/q5.txt | 16 ++++ benchmarks/expected-plans/q6.txt | 5 + benchmarks/expected-plans/q7.txt | 20 ++++ benchmarks/expected-plans/q8.txt | 25 +++++ benchmarks/expected-plans/q9.txt | 17 ++++ benchmarks/src/bin/tpch.rs | 153 ++++++++++++++++++++++++++++++ dev/release/rat_exclude_files.txt | 1 + 24 files changed, 455 insertions(+) create mode 100644 benchmarks/expected-plans/q1.txt create mode 100644 benchmarks/expected-plans/q10.txt create mode 100644 benchmarks/expected-plans/q11.txt create mode 100644 benchmarks/expected-plans/q12.txt create mode 100644 benchmarks/expected-plans/q13.txt create mode 100644 benchmarks/expected-plans/q14.txt create mode 100644 benchmarks/expected-plans/q15.txt create mode 100644 benchmarks/expected-plans/q16.txt create mode 100644 benchmarks/expected-plans/q17.txt create mode 100644 benchmarks/expected-plans/q18.txt create mode 100644 benchmarks/expected-plans/q19.txt create mode 100644 benchmarks/expected-plans/q2.txt create mode 100644 benchmarks/expected-plans/q20.txt create mode 100644 benchmarks/expected-plans/q21.txt create mode 100644 benchmarks/expected-plans/q22.txt create mode 100644 benchmarks/expected-plans/q3.txt create mode 100644 benchmarks/expected-plans/q4.txt create mode 100644 benchmarks/expected-plans/q5.txt create mode 100644 benchmarks/expected-plans/q6.txt create mode 100644 benchmarks/expected-plans/q7.txt create mode 100644 benchmarks/expected-plans/q8.txt create mode 100644 benchmarks/expected-plans/q9.txt diff --git a/benchmarks/expected-plans/q1.txt b/benchmarks/expected-plans/q1.txt new file mode 100644 index 000000000000..1a9d4b7c98cc --- /dev/null +++ b/benchmarks/expected-plans/q1.txt @@ -0,0 +1,6 @@ +Sort: lineitem.l_returnflag ASC NULLS LAST, lineitem.l_linestatus ASC NULLS LAST + Projection: lineitem.l_returnflag, lineitem.l_linestatus, SUM(lineitem.l_quantity) AS sum_qty, SUM(lineitem.l_extendedprice) AS sum_base_price, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS sum_disc_price, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax) AS sum_charge, AVG(lineitem.l_quantity) AS avg_qty, AVG(lineitem.l_extendedprice) AS avg_price, AVG(lineitem.l_discount) AS avg_disc, COUNT(UInt8(1)) AS count_order + Aggregate: groupBy=[[lineitem.l_returnflag, lineitem.l_linestatus]], aggr=[[SUM(lineitem.l_quantity), SUM(lineitem.l_extendedprice), SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2))CAST(lineitem.l_discount AS Decimal128(23, 2))lineitem.l_discountDecimal128(Some(100),23,2)CAST(lineitem.l_extendedprice AS Decimal128(38, 4))lineitem.l_extendedprice AS lineitem.l_extendedprice * Decimal128(Some(100),23,2) - lineitem.l_discount) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), SUM(CAST(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2))CAST(lineitem.l_discount AS Decimal128(23, 2))lineitem.l_discountDecimal128(Some(100),23,2)CAST(lineitem.l_extendedprice AS Decimal128(38, 4))lineitem.l_extendedprice AS lineitem.l_extendedprice * Decimal128(Some(100),23,2) - lineitem.l_discount AS Decimal128(38, 6)) * CAST(Decimal128(Some(100),23,2) + CAST(lineitem.l_tax AS Decimal128(23, 2)) AS Decimal128(38, 6))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), AVG(lineitem.l_quantity), AVG(lineitem.l_extendedprice), AVG(lineitem.l_discount), COUNT(UInt8(1))]] + Projection: CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)) AS CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2))CAST(lineitem.l_discount AS Decimal128(23, 2))lineitem.l_discountDecimal128(Some(100),23,2)CAST(lineitem.l_extendedprice AS Decimal128(38, 4))lineitem.l_extendedprice, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_tax, lineitem.l_returnflag, lineitem.l_linestatus + Filter: lineitem.l_shipdate <= Date32("10471") + TableScan: lineitem projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate] \ No newline at end of file diff --git a/benchmarks/expected-plans/q10.txt b/benchmarks/expected-plans/q10.txt new file mode 100644 index 000000000000..25189feb61e1 --- /dev/null +++ b/benchmarks/expected-plans/q10.txt @@ -0,0 +1,12 @@ +Sort: revenue DESC NULLS FIRST + Projection: customer.c_custkey, customer.c_name, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, customer.c_acctbal, nation.n_name, customer.c_address, customer.c_phone, customer.c_comment + Aggregate: groupBy=[[customer.c_custkey, customer.c_name, customer.c_acctbal, customer.c_phone, nation.n_name, customer.c_address, customer.c_comment]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Inner Join: customer.c_nationkey = nation.n_nationkey + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Inner Join: customer.c_custkey = orders.o_custkey + TableScan: customer projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] + Filter: orders.o_orderdate >= Date32("8674") AND orders.o_orderdate < Date32("8766") + TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate] + Filter: lineitem.l_returnflag = Utf8("R") + TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag] + TableScan: nation projection=[n_nationkey, n_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q11.txt b/benchmarks/expected-plans/q11.txt new file mode 100644 index 000000000000..b408340a32a0 --- /dev/null +++ b/benchmarks/expected-plans/q11.txt @@ -0,0 +1,19 @@ +Sort: value DESC NULLS FIRST + Projection: partsupp.ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS value + Filter: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 17)) > __sq_1.__value + CrossJoin: + Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[SUM(CAST(partsupp.ps_supplycost AS Decimal128(26, 2)) * CAST(partsupp.ps_availqty AS Decimal128(26, 2)))]] + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] + TableScan: supplier projection=[s_suppkey, s_nationkey] + Filter: nation.n_name = Utf8("GERMANY") + TableScan: nation projection=[n_nationkey, n_name] + Projection: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 17)) * Decimal128(Some(10000000000000),38,17) AS __value, alias=__sq_1 + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(partsupp.ps_supplycost AS Decimal128(26, 2)) * CAST(partsupp.ps_availqty AS Decimal128(26, 2)))]] + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] + TableScan: supplier projection=[s_suppkey, s_nationkey] + Filter: nation.n_name = Utf8("GERMANY") + TableScan: nation projection=[n_nationkey, n_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q12.txt b/benchmarks/expected-plans/q12.txt new file mode 100644 index 000000000000..ef3bab4b7096 --- /dev/null +++ b/benchmarks/expected-plans/q12.txt @@ -0,0 +1,7 @@ +Sort: lineitem.l_shipmode ASC NULLS LAST + Projection: lineitem.l_shipmode, SUM(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END) AS high_line_count, SUM(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END) AS low_line_count + Aggregate: groupBy=[[lineitem.l_shipmode]], aggr=[[SUM(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), SUM(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)]] + Inner Join: lineitem.l_orderkey = orders.o_orderkey + Filter: lineitem.l_shipmode IN ([Utf8("MAIL"), Utf8("SHIP")]) AND lineitem.l_commitdate < lineitem.l_receiptdate AND lineitem.l_shipdate < lineitem.l_commitdate AND lineitem.l_receiptdate >= Date32("8766") AND lineitem.l_receiptdate < Date32("9131") + TableScan: lineitem projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode] + TableScan: orders projection=[o_orderkey, o_orderpriority] \ No newline at end of file diff --git a/benchmarks/expected-plans/q13.txt b/benchmarks/expected-plans/q13.txt new file mode 100644 index 000000000000..12a9bf1dac24 --- /dev/null +++ b/benchmarks/expected-plans/q13.txt @@ -0,0 +1,11 @@ +Sort: custdist DESC NULLS FIRST, c_orders.c_count DESC NULLS FIRST + Projection: c_orders.c_count, COUNT(UInt8(1)) AS custdist + Aggregate: groupBy=[[c_orders.c_count]], aggr=[[COUNT(UInt8(1))]] + Projection: c_orders.COUNT(orders.o_orderkey) AS c_count, alias=c_orders + Projection: c_orders.COUNT(orders.o_orderkey), alias=c_orders + Projection: COUNT(orders.o_orderkey), alias=c_orders + Aggregate: groupBy=[[customer.c_custkey]], aggr=[[COUNT(orders.o_orderkey)]] + Left Join: customer.c_custkey = orders.o_custkey + TableScan: customer projection=[c_custkey] + Filter: orders.o_comment NOT LIKE Utf8("%special%requests%") + TableScan: orders projection=[o_orderkey, o_custkey, o_comment] \ No newline at end of file diff --git a/benchmarks/expected-plans/q14.txt b/benchmarks/expected-plans/q14.txt new file mode 100644 index 000000000000..c410363a5821 --- /dev/null +++ b/benchmarks/expected-plans/q14.txt @@ -0,0 +1,7 @@ +Projection: CAST(Decimal128(Some(1000000000000000000000),38,19) * CAST(SUM(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END) AS Decimal128(38, 19)) AS Decimal128(38, 38)) / CAST(SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS Decimal128(38, 38)) AS promo_revenue + Aggregate: groupBy=[[]], aggr=[[SUM(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2))CAST(lineitem.l_discount AS Decimal128(23, 2))lineitem.l_discountDecimal128(Some(100),23,2)CAST(lineitem.l_extendedprice AS Decimal128(38, 4))lineitem.l_extendedprice AS lineitem.l_extendedprice * Decimal128(Some(100),23,2) - lineitem.l_discount ELSE Decimal128(Some(0),38,4) END) AS SUM(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2))CAST(lineitem.l_discount AS Decimal128(23, 2))lineitem.l_discountDecimal128(Some(100),23,2)CAST(lineitem.l_extendedprice AS Decimal128(38, 4))lineitem.l_extendedprice AS lineitem.l_extendedprice * Decimal128(Some(100),23,2) - lineitem.l_discount) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Projection: CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)) AS CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2))CAST(lineitem.l_discount AS Decimal128(23, 2))lineitem.l_discountDecimal128(Some(100),23,2)CAST(lineitem.l_extendedprice AS Decimal128(38, 4))lineitem.l_extendedprice, part.p_type + Inner Join: lineitem.l_partkey = part.p_partkey + Filter: lineitem.l_shipdate >= Date32("9374") AND lineitem.l_shipdate < Date32("9404") + TableScan: lineitem projection=[l_partkey, l_extendedprice, l_discount, l_shipdate] + TableScan: part projection=[p_partkey, p_type] \ No newline at end of file diff --git a/benchmarks/expected-plans/q15.txt b/benchmarks/expected-plans/q15.txt new file mode 100644 index 000000000000..e2f59dc5ca0f --- /dev/null +++ b/benchmarks/expected-plans/q15.txt @@ -0,0 +1,11 @@ +EmptyRelation +Sort: supplier.s_suppkey ASC NULLS LAST + Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, revenue0.total_revenue + Inner Join: revenue0.total_revenue = __sq_1.__value + Inner Join: supplier.s_suppkey = revenue0.supplier_no + TableScan: supplier projection=[s_suppkey, s_name, s_address, s_phone] + TableScan: revenue0 projection=[supplier_no, total_revenue] + Projection: MAX(revenue0.total_revenue) AS __value, alias=__sq_1 + Aggregate: groupBy=[[]], aggr=[[MAX(revenue0.total_revenue)]] + TableScan: revenue0 projection=[total_revenue] +EmptyRelation \ No newline at end of file diff --git a/benchmarks/expected-plans/q16.txt b/benchmarks/expected-plans/q16.txt new file mode 100644 index 000000000000..11943cf2477b --- /dev/null +++ b/benchmarks/expected-plans/q16.txt @@ -0,0 +1,13 @@ +Sort: supplier_cnt DESC NULLS FIRST, part.p_brand ASC NULLS LAST, part.p_type ASC NULLS LAST, part.p_size ASC NULLS LAST + Projection: part.p_brand, part.p_type, part.p_size, COUNT(DISTINCT partsupp.ps_suppkey) AS supplier_cnt + Projection: group_alias_0 AS p_brand, group_alias_1 AS p_type, group_alias_2 AS p_size, COUNT(alias1) AS COUNT(DISTINCT partsupp.ps_suppkey) + Aggregate: groupBy=[[group_alias_0, group_alias_1, group_alias_2]], aggr=[[COUNT(alias1)]] + Aggregate: groupBy=[[part.p_brand AS group_alias_0, part.p_type AS group_alias_1, part.p_size AS group_alias_2, partsupp.ps_suppkey AS alias1]], aggr=[[]] + Anti Join: partsupp.ps_suppkey = __sq_1.s_suppkey + Inner Join: partsupp.ps_partkey = part.p_partkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey] + Filter: part.p_brand != Utf8("Brand#45") AND part.p_type NOT LIKE Utf8("MEDIUM POLISHED%") AND part.p_size IN ([Int32(49), Int32(14), Int32(23), Int32(45), Int32(19), Int32(3), Int32(36), Int32(9)]) + TableScan: part projection=[p_partkey, p_brand, p_type, p_size] + Projection: supplier.s_suppkey AS s_suppkey, alias=__sq_1 + Filter: supplier.s_comment LIKE Utf8("%Customer%Complaints%") + TableScan: supplier projection=[s_suppkey, s_comment] \ No newline at end of file diff --git a/benchmarks/expected-plans/q17.txt b/benchmarks/expected-plans/q17.txt new file mode 100644 index 000000000000..17b8e969879a --- /dev/null +++ b/benchmarks/expected-plans/q17.txt @@ -0,0 +1,11 @@ +Projection: CAST(SUM(lineitem.l_extendedprice) AS Decimal128(38, 33)) / Decimal128(Some(7000000000000000195487369212723200),38,33) AS avg_yearly + Aggregate: groupBy=[[]], aggr=[[SUM(lineitem.l_extendedprice)]] + Filter: CAST(lineitem.l_quantity AS Decimal128(38, 21)) < __sq_1.__value + Inner Join: part.p_partkey = __sq_1.l_partkey + Inner Join: lineitem.l_partkey = part.p_partkey + TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice] + Filter: part.p_brand = Utf8("Brand#23") AND part.p_container = Utf8("MED BOX") + TableScan: part projection=[p_partkey, p_brand, p_container] + Projection: lineitem.l_partkey, Decimal128(Some(200000000000000000000),38,21) * CAST(AVG(lineitem.l_quantity) AS Decimal128(38, 21)) AS __value, alias=__sq_1 + Aggregate: groupBy=[[lineitem.l_partkey]], aggr=[[AVG(lineitem.l_quantity)]] + TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice] \ No newline at end of file diff --git a/benchmarks/expected-plans/q18.txt b/benchmarks/expected-plans/q18.txt new file mode 100644 index 000000000000..ebc22ea5d886 --- /dev/null +++ b/benchmarks/expected-plans/q18.txt @@ -0,0 +1,13 @@ +Sort: orders.o_totalprice DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST + Projection: customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice, SUM(lineitem.l_quantity) + Aggregate: groupBy=[[customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice]], aggr=[[SUM(lineitem.l_quantity)]] + Semi Join: orders.o_orderkey = __sq_1.l_orderkey + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Inner Join: customer.c_custkey = orders.o_custkey + TableScan: customer projection=[c_custkey, c_name] + TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] + TableScan: lineitem projection=[l_orderkey, l_quantity] + Projection: lineitem.l_orderkey AS l_orderkey, alias=__sq_1 + Filter: SUM(lineitem.l_quantity) > Decimal128(Some(30000),25,2) + Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[SUM(lineitem.l_quantity)]] + TableScan: lineitem projection=[l_orderkey, l_quantity] \ No newline at end of file diff --git a/benchmarks/expected-plans/q19.txt b/benchmarks/expected-plans/q19.txt new file mode 100644 index 000000000000..902893ea9012 --- /dev/null +++ b/benchmarks/expected-plans/q19.txt @@ -0,0 +1,9 @@ +Projection: SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Projection: lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON") AS lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON")Utf8("DELIVER IN PERSON")lineitem.l_shipinstruct, lineitem.l_shipmode IN ([Utf8("AIR"), Utf8("AIR REG")]) AS lineitem.l_shipmode IN ([Utf8("AIR"), Utf8("AIR REG")])Utf8("AIR REG")Utf8("AIR")lineitem.l_shipmode, part.p_size >= Int32(1) AS part.p_size >= Int32(1)Int32(1)part.p_size, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, part.p_brand, part.p_size, part.p_container + Filter: part.p_brand = Utf8("Brand#12") AND part.p_container IN ([Utf8("SM CASE"), Utf8("SM BOX"), Utf8("SM PACK"), Utf8("SM PKG")]) AND lineitem.l_quantity >= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= Decimal128(Some(1100),15,2) AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#23") AND part.p_container IN ([Utf8("MED BAG"), Utf8("MED BOX"), Utf8("MED PKG"), Utf8("MED PACK")]) AND lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= Decimal128(Some(2000),15,2) AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#34") AND part.p_container IN ([Utf8("LG CASE"), Utf8("LG BOX"), Utf8("LG PACK"), Utf8("LG PKG")]) AND lineitem.l_quantity >= Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= Decimal128(Some(3000),15,2) AND part.p_size <= Int32(15) + Inner Join: lineitem.l_partkey = part.p_partkey + Filter: lineitem.l_shipmode IN ([Utf8("AIR"), Utf8("AIR REG")]) AND lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON") + TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode] + Filter: part.p_size >= Int32(1) + TableScan: part projection=[p_partkey, p_brand, p_size, p_container] \ No newline at end of file diff --git a/benchmarks/expected-plans/q2.txt b/benchmarks/expected-plans/q2.txt new file mode 100644 index 000000000000..10d68cd374db --- /dev/null +++ b/benchmarks/expected-plans/q2.txt @@ -0,0 +1,25 @@ +Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, part.p_partkey ASC NULLS LAST + Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment + Filter: partsupp.ps_supplycost = __sq_1.__value + Inner Join: part.p_partkey = __sq_1.ps_partkey + Inner Join: nation.n_regionkey = region.r_regionkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey + Inner Join: part.p_partkey = partsupp.ps_partkey + Filter: part.p_size = Int32(15) AND part.p_type LIKE Utf8("%BRASS") + TableScan: part projection=[p_partkey, p_mfgr, p_type, p_size] + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] + TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] + TableScan: nation projection=[n_nationkey, n_name, n_regionkey] + Filter: region.r_name = Utf8("EUROPE") + TableScan: region projection=[r_regionkey, r_name] + Projection: partsupp.ps_partkey, MIN(partsupp.ps_supplycost) AS __value, alias=__sq_1 + Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[MIN(partsupp.ps_supplycost)]] + Inner Join: nation.n_regionkey = region.r_regionkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] + TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] + TableScan: nation projection=[n_nationkey, n_name, n_regionkey] + Filter: region.r_name = Utf8("EUROPE") + TableScan: region projection=[r_regionkey, r_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q20.txt b/benchmarks/expected-plans/q20.txt new file mode 100644 index 000000000000..6d3ef1f6cc75 --- /dev/null +++ b/benchmarks/expected-plans/q20.txt @@ -0,0 +1,19 @@ +Sort: supplier.s_name ASC NULLS LAST + Projection: supplier.s_name, supplier.s_address + Semi Join: supplier.s_suppkey = __sq_2.ps_suppkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey] + Filter: nation.n_name = Utf8("CANADA") + TableScan: nation projection=[n_nationkey, n_name] + Projection: partsupp.ps_suppkey AS ps_suppkey, alias=__sq_2 + Filter: CAST(partsupp.ps_availqty AS Decimal128(38, 17)) > __sq_3.__value + Inner Join: partsupp.ps_partkey = __sq_3.l_partkey, partsupp.ps_suppkey = __sq_3.l_suppkey + Semi Join: partsupp.ps_partkey = __sq_1.p_partkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty] + Projection: part.p_partkey AS p_partkey, alias=__sq_1 + Filter: part.p_name LIKE Utf8("forest%") + TableScan: part projection=[p_partkey, p_name] + Projection: lineitem.l_partkey, lineitem.l_suppkey, Decimal128(Some(50000000000000000),38,17) * CAST(SUM(lineitem.l_quantity) AS Decimal128(38, 17)) AS __value, alias=__sq_3 + Aggregate: groupBy=[[lineitem.l_partkey, lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_quantity)]] + Filter: lineitem.l_shipdate >= Date32("8766") AND lineitem.l_shipdate < Date32("9131") + TableScan: lineitem projection=[l_partkey, l_suppkey, l_quantity, l_shipdate] \ No newline at end of file diff --git a/benchmarks/expected-plans/q21.txt b/benchmarks/expected-plans/q21.txt new file mode 100644 index 000000000000..5689fafc2eb4 --- /dev/null +++ b/benchmarks/expected-plans/q21.txt @@ -0,0 +1,21 @@ +Sort: numwait DESC NULLS FIRST, supplier.s_name ASC NULLS LAST + Projection: supplier.s_name, COUNT(UInt8(1)) AS numwait + Aggregate: groupBy=[[supplier.s_name]], aggr=[[COUNT(UInt8(1))]] + Anti Join: l1.l_orderkey = l3.l_orderkey Filter: l3.l_suppkey != l1.l_suppkey + Semi Join: l1.l_orderkey = l2.l_orderkey Filter: l2.l_suppkey != l1.l_suppkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: l1.l_orderkey = orders.o_orderkey + Inner Join: supplier.s_suppkey = l1.l_suppkey + TableScan: supplier projection=[s_suppkey, s_name, s_nationkey] + Filter: l1.l_receiptdate > l1.l_commitdate AND l1.l_receiptdate > l1.l_commitdate + SubqueryAlias: l1 + TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate] + Filter: orders.o_orderstatus = Utf8("F") AND orders.o_orderstatus = Utf8("F") + TableScan: orders projection=[o_orderkey, o_orderstatus] + Filter: nation.n_name = Utf8("SAUDI ARABIA") AND nation.n_name = Utf8("SAUDI ARABIA") + TableScan: nation projection=[n_nationkey, n_name] + SubqueryAlias: l2 + TableScan: lineitem projection=[l_orderkey, l_suppkey] + Filter: l3.l_receiptdate > l3.l_commitdate + SubqueryAlias: l3 + TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate] \ No newline at end of file diff --git a/benchmarks/expected-plans/q22.txt b/benchmarks/expected-plans/q22.txt new file mode 100644 index 000000000000..4b0c1a59c00b --- /dev/null +++ b/benchmarks/expected-plans/q22.txt @@ -0,0 +1,15 @@ +Sort: custsale.cntrycode ASC NULLS LAST + Projection: custsale.cntrycode, COUNT(UInt8(1)) AS numcust, SUM(custsale.c_acctbal) AS totacctbal + Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[COUNT(UInt8(1)), SUM(custsale.c_acctbal)]] + Projection: custsale.cntrycode, custsale.c_acctbal, alias=custsale + Projection: substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal, alias=custsale + Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > __sq_1.__value + CrossJoin: + Anti Join: customer.c_custkey = orders.o_custkey + Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) + TableScan: customer projection=[c_custkey, c_phone, c_acctbal] + TableScan: orders projection=[o_custkey] + Projection: AVG(customer.c_acctbal) AS __value, alias=__sq_1 + Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] + Filter: CAST(customer.c_acctbal AS Decimal128(30, 15)) > Decimal128(Some(0),30,15) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) + TableScan: customer projection=[c_phone, c_acctbal] \ No newline at end of file diff --git a/benchmarks/expected-plans/q3.txt b/benchmarks/expected-plans/q3.txt new file mode 100644 index 000000000000..7cd69b92a556 --- /dev/null +++ b/benchmarks/expected-plans/q3.txt @@ -0,0 +1,11 @@ +Sort: revenue DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST + Projection: lineitem.l_orderkey, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, orders.o_orderdate, orders.o_shippriority + Aggregate: groupBy=[[lineitem.l_orderkey, orders.o_orderdate, orders.o_shippriority]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Inner Join: customer.c_custkey = orders.o_custkey + Filter: customer.c_mktsegment = Utf8("BUILDING") + TableScan: customer projection=[c_custkey, c_mktsegment] + Filter: orders.o_orderdate < Date32("9204") + TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority] + Filter: lineitem.l_shipdate > Date32("9204") + TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate] \ No newline at end of file diff --git a/benchmarks/expected-plans/q4.txt b/benchmarks/expected-plans/q4.txt new file mode 100644 index 000000000000..a4339732e68f --- /dev/null +++ b/benchmarks/expected-plans/q4.txt @@ -0,0 +1,8 @@ +Sort: orders.o_orderpriority ASC NULLS LAST + Projection: orders.o_orderpriority, COUNT(UInt8(1)) AS order_count + Aggregate: groupBy=[[orders.o_orderpriority]], aggr=[[COUNT(UInt8(1))]] + Semi Join: orders.o_orderkey = lineitem.l_orderkey + Filter: orders.o_orderdate >= Date32("8582") AND orders.o_orderdate < Date32("8674") + TableScan: orders projection=[o_orderkey, o_orderdate, o_orderpriority] + Filter: lineitem.l_commitdate < lineitem.l_receiptdate + TableScan: lineitem projection=[l_orderkey, l_commitdate, l_receiptdate] \ No newline at end of file diff --git a/benchmarks/expected-plans/q5.txt b/benchmarks/expected-plans/q5.txt new file mode 100644 index 000000000000..0c2e0c131805 --- /dev/null +++ b/benchmarks/expected-plans/q5.txt @@ -0,0 +1,16 @@ +Sort: revenue DESC NULLS FIRST + Projection: nation.n_name, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue + Aggregate: groupBy=[[nation.n_name]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Inner Join: nation.n_regionkey = region.r_regionkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: lineitem.l_suppkey = supplier.s_suppkey, customer.c_nationkey = supplier.s_nationkey + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Inner Join: customer.c_custkey = orders.o_custkey + TableScan: customer projection=[c_custkey, c_nationkey] + Filter: orders.o_orderdate >= Date32("8766") AND orders.o_orderdate < Date32("9131") + TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate] + TableScan: lineitem projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount] + TableScan: supplier projection=[s_suppkey, s_nationkey] + TableScan: nation projection=[n_nationkey, n_name, n_regionkey] + Filter: region.r_name = Utf8("ASIA") + TableScan: region projection=[r_regionkey, r_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q6.txt b/benchmarks/expected-plans/q6.txt new file mode 100644 index 000000000000..ad27ba2b9aeb --- /dev/null +++ b/benchmarks/expected-plans/q6.txt @@ -0,0 +1,5 @@ +Projection: SUM(lineitem.l_extendedprice * lineitem.l_discount) AS revenue + Aggregate: groupBy=[[]], aggr=[[SUM(lineitem.l_extendedprice * lineitem.l_discount)]] + Projection: CAST(lineitem.l_discount AS Decimal128(30, 15)) AS CAST(lineitem.l_discount AS Decimal128(30, 15))lineitem.l_discount, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate + Filter: lineitem.l_shipdate >= Date32("8766") AND lineitem.l_shipdate < Date32("9131") AND CAST(lineitem.l_discount AS Decimal128(30, 15)) AS lineitem.l_discount >= Decimal128(Some(49999999999999),30,15) AND CAST(lineitem.l_discount AS Decimal128(30, 15)) AS lineitem.l_discount <= Decimal128(Some(69999999999999),30,15) AND lineitem.l_quantity < Decimal128(Some(2400),15,2) + TableScan: lineitem projection=[l_quantity, l_extendedprice, l_discount, l_shipdate] \ No newline at end of file diff --git a/benchmarks/expected-plans/q7.txt b/benchmarks/expected-plans/q7.txt new file mode 100644 index 000000000000..4a2866a42df7 --- /dev/null +++ b/benchmarks/expected-plans/q7.txt @@ -0,0 +1,20 @@ +Sort: shipping.supp_nation ASC NULLS LAST, shipping.cust_nation ASC NULLS LAST, shipping.l_year ASC NULLS LAST + Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, SUM(shipping.volume) AS revenue + Aggregate: groupBy=[[shipping.supp_nation, shipping.cust_nation, shipping.l_year]], aggr=[[SUM(shipping.volume)]] + Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, shipping.volume, alias=shipping + Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, datepart(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)) AS volume, alias=shipping + Filter: n1.n_name = Utf8("FRANCE") AND n2.n_name = Utf8("GERMANY") OR n1.n_name = Utf8("GERMANY") AND n2.n_name = Utf8("FRANCE") + Inner Join: customer.c_nationkey = n2.n_nationkey + Inner Join: supplier.s_nationkey = n1.n_nationkey + Inner Join: orders.o_custkey = customer.c_custkey + Inner Join: lineitem.l_orderkey = orders.o_orderkey + Inner Join: supplier.s_suppkey = lineitem.l_suppkey + TableScan: supplier projection=[s_suppkey, s_nationkey] + Filter: lineitem.l_shipdate >= Date32("9131") AND lineitem.l_shipdate <= Date32("9861") + TableScan: lineitem projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate] + TableScan: orders projection=[o_orderkey, o_custkey] + TableScan: customer projection=[c_custkey, c_nationkey] + SubqueryAlias: n1 + TableScan: nation projection=[n_nationkey, n_name] + SubqueryAlias: n2 + TableScan: nation projection=[n_nationkey, n_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q8.txt b/benchmarks/expected-plans/q8.txt new file mode 100644 index 000000000000..20452d4bde93 --- /dev/null +++ b/benchmarks/expected-plans/q8.txt @@ -0,0 +1,25 @@ +Sort: all_nations.o_year ASC NULLS LAST + Projection: all_nations.o_year, SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END) / SUM(all_nations.volume) AS mkt_share + Aggregate: groupBy=[[all_nations.o_year]], aggr=[[SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), SUM(all_nations.volume)]] + Projection: all_nations.o_year, all_nations.volume, all_nations.nation, alias=all_nations + Projection: datepart(Utf8("YEAR"), orders.o_orderdate) AS o_year, CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)) AS volume, n2.n_name AS nation, alias=all_nations + Inner Join: n1.n_regionkey = region.r_regionkey + Inner Join: supplier.s_nationkey = n2.n_nationkey + Inner Join: customer.c_nationkey = n1.n_nationkey + Inner Join: orders.o_custkey = customer.c_custkey + Inner Join: lineitem.l_orderkey = orders.o_orderkey + Inner Join: lineitem.l_suppkey = supplier.s_suppkey + Inner Join: part.p_partkey = lineitem.l_partkey + Filter: part.p_type = Utf8("ECONOMY ANODIZED STEEL") + TableScan: part projection=[p_partkey, p_type] + TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] + TableScan: supplier projection=[s_suppkey, s_nationkey] + Filter: orders.o_orderdate >= Date32("9131") AND orders.o_orderdate <= Date32("9861") + TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate] + TableScan: customer projection=[c_custkey, c_nationkey] + SubqueryAlias: n1 + TableScan: nation projection=[n_nationkey, n_regionkey] + SubqueryAlias: n2 + TableScan: nation projection=[n_nationkey, n_name] + Filter: region.r_name = Utf8("AMERICA") + TableScan: region projection=[r_regionkey, r_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q9.txt b/benchmarks/expected-plans/q9.txt new file mode 100644 index 000000000000..954c28a35bb1 --- /dev/null +++ b/benchmarks/expected-plans/q9.txt @@ -0,0 +1,17 @@ +Sort: profit.nation ASC NULLS LAST, profit.o_year DESC NULLS FIRST + Projection: profit.nation, profit.o_year, SUM(profit.amount) AS sum_profit + Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[SUM(profit.amount)]] + Projection: profit.nation, profit.o_year, profit.amount, alias=profit + Projection: nation.n_name AS nation, datepart(Utf8("YEAR"), orders.o_orderdate) AS o_year, CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)) - CAST(partsupp.ps_supplycost * lineitem.l_quantity AS Decimal128(38, 4)) AS amount, alias=profit + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: lineitem.l_orderkey = orders.o_orderkey + Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey + Inner Join: lineitem.l_suppkey = supplier.s_suppkey + Inner Join: part.p_partkey = lineitem.l_partkey + Filter: part.p_name LIKE Utf8("%green%") + TableScan: part projection=[p_partkey, p_name] + TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] + TableScan: supplier projection=[s_suppkey, s_nationkey] + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] + TableScan: orders projection=[o_orderkey, o_orderdate] + TableScan: nation projection=[n_nationkey, n_name] \ No newline at end of file diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index d3cab89a2f03..b08539eaf7af 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -598,6 +598,7 @@ mod tests { use datafusion::logical_expr::Expr; use datafusion::logical_expr::Expr::Cast; use datafusion::logical_expr::Expr::ScalarFunction; + use datafusion::sql::TableReference; const QUERY_LIMIT: [Option; 22] = [ None, @@ -624,6 +625,158 @@ mod tests { None, ]; + #[tokio::test] + async fn q1_expected_plan() -> Result<()> { + expected_plan(1).await + } + + #[tokio::test] + async fn q2_expected_plan() -> Result<()> { + expected_plan(2).await + } + + #[tokio::test] + async fn q3_expected_plan() -> Result<()> { + expected_plan(3).await + } + + #[tokio::test] + async fn q4_expected_plan() -> Result<()> { + expected_plan(4).await + } + + #[tokio::test] + async fn q5_expected_plan() -> Result<()> { + expected_plan(5).await + } + + #[tokio::test] + async fn q6_expected_plan() -> Result<()> { + expected_plan(6).await + } + + #[tokio::test] + async fn q7_expected_plan() -> Result<()> { + expected_plan(7).await + } + + #[tokio::test] + async fn q8_expected_plan() -> Result<()> { + expected_plan(8).await + } + + #[tokio::test] + async fn q9_expected_plan() -> Result<()> { + expected_plan(9).await + } + + #[tokio::test] + async fn q10_expected_plan() -> Result<()> { + expected_plan(10).await + } + + #[tokio::test] + async fn q11_expected_plan() -> Result<()> { + expected_plan(11).await + } + + #[tokio::test] + async fn q12_expected_plan() -> Result<()> { + expected_plan(12).await + } + + #[tokio::test] + async fn q13_expected_plan() -> Result<()> { + expected_plan(13).await + } + + #[tokio::test] + async fn q14_expected_plan() -> Result<()> { + expected_plan(14).await + } + + #[tokio::test] + async fn q15_expected_plan() -> Result<()> { + expected_plan(15).await + } + + #[tokio::test] + async fn q16_expected_plan() -> Result<()> { + expected_plan(16).await + } + + #[tokio::test] + async fn q17_expected_plan() -> Result<()> { + expected_plan(17).await + } + + #[tokio::test] + async fn q18_expected_plan() -> Result<()> { + expected_plan(18).await + } + + #[tokio::test] + async fn q19_expected_plan() -> Result<()> { + expected_plan(19).await + } + + #[tokio::test] + async fn q20_expected_plan() -> Result<()> { + expected_plan(20).await + } + + #[tokio::test] + async fn q21_expected_plan() -> Result<()> { + expected_plan(21).await + } + + #[tokio::test] + async fn q22_expected_plan() -> Result<()> { + expected_plan(22).await + } + + async fn expected_plan(query: usize) -> Result<()> { + let ctx = SessionContext::new(); + for table in TABLES { + let table = table.to_string(); + let schema = get_schema(&table); + let mem_table = MemTable::try_new(Arc::new(schema), vec![])?; + ctx.register_table( + TableReference::from(table.as_str()), + Arc::new(mem_table), + )?; + } + + let mut actual = String::new(); + let sql = get_query_sql(query)?; + for sql in &sql { + let df = ctx.sql(sql.as_str()).await?; + let plan = df.to_logical_plan()?; + if !actual.is_empty() { + actual += "\n"; + } + actual += &format!("{}", plan.display_indent()); + } + + let possibilities = vec![ + format!("expected-plans/q{}.txt", query), + format!("benchmarks/expected-plans/q{}.txt", query), + ]; + + let mut found = false; + for path in &possibilities { + let path = Path::new(&path); + if let Some(expected) = fs::read_to_string(path).ok() { + assert_eq!(expected, actual); + found = true; + break; + } + } + assert!(found); + + Ok(()) + } + #[tokio::test] async fn q1() -> Result<()> { verify_query(1).await diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 98e0c6a38493..91ccef60f3fa 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -113,6 +113,7 @@ python/rust-toolchain python/requirements*.txt **/testdata/* benchmarks/queries/* +benchmarks/expected-plans/* benchmarks/data/* ci/* **/*.svg From d5f193ce7bf62c9465674f8229cebaf2778e40d7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Oct 2022 18:41:01 -0600 Subject: [PATCH 3/7] revert a file --- datafusion/optimizer/examples/rewrite_expr.rs | 163 ------------------ 1 file changed, 163 deletions(-) delete mode 100644 datafusion/optimizer/examples/rewrite_expr.rs diff --git a/datafusion/optimizer/examples/rewrite_expr.rs b/datafusion/optimizer/examples/rewrite_expr.rs deleted file mode 100644 index 7d2c312fb03a..000000000000 --- a/datafusion/optimizer/examples/rewrite_expr.rs +++ /dev/null @@ -1,163 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use datafusion_common::{DataFusionError, Result}; -use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter}; -use datafusion_expr::{AggregateUDF, Expr, Filter, LogicalPlan, ScalarUDF, TableSource}; -use datafusion_optimizer::optimizer::Optimizer; -use datafusion_optimizer::{utils, OptimizerConfig, OptimizerRule}; -use datafusion_sql::planner::{ContextProvider, SqlToRel}; -use datafusion_sql::sqlparser::dialect::PostgreSqlDialect; -use datafusion_sql::sqlparser::parser::Parser; -use datafusion_sql::TableReference; -use std::any::Any; -use std::sync::Arc; - -pub fn main() -> Result<()> { - // produce a logical plan using the datafusion-sql crate - let dialect = PostgreSqlDialect {}; - let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32"; - let statements = Parser::parse_sql(&dialect, sql)?; - - // produce a logical plan using the datafusion-sql crate - let context_provider = MyContextProvider {}; - let sql_to_rel = SqlToRel::new(&context_provider); - let logical_plan = sql_to_rel.sql_statement_to_plan(statements[0].clone())?; - println!( - "Unoptimized Logical Plan:\n\n{}\n", - logical_plan.display_indent() - ); - - // now run the optimizer with our custom rule - let optimizer = Optimizer::with_rules(vec![Arc::new(MyRule {})]); - let mut optimizer_config = OptimizerConfig::default().with_skip_failing_rules(false); - let optimized_plan = - optimizer.optimize(&logical_plan, &mut optimizer_config, observe)?; - println!( - "Optimized Logical Plan:\n\n{}\n", - optimized_plan.display_indent() - ); - - Ok(()) -} - -fn observe(plan: &LogicalPlan, rule: &dyn OptimizerRule) { - println!( - "After applying rule '{}':\n{}\n", - rule.name(), - plan.display_indent() - ) -} - -struct MyRule {} - -impl OptimizerRule for MyRule { - fn name(&self) -> &str { - "my_rule" - } - - fn optimize( - &self, - plan: &LogicalPlan, - _config: &mut OptimizerConfig, - ) -> Result { - // recurse down and optimize children first - let plan = utils::optimize_children(self, plan, _config)?; - - match plan { - LogicalPlan::Filter(filter) => { - let mut expr_rewriter = MyExprRewriter {}; - let predicate = filter.predicate.clone(); - let predicate = predicate.rewrite(&mut expr_rewriter)?; - Ok(LogicalPlan::Filter(Filter { - predicate, - input: filter.input.clone(), - })) - } - _ => Ok(plan.clone()), - } - } -} - -struct MyExprRewriter {} - -impl ExprRewriter for MyExprRewriter { - fn mutate(&mut self, expr: Expr) -> Result { - match expr { - Expr::Between { - negated, - expr, - low, - high, - } => { - let expr: Expr = expr.as_ref().clone(); - let low: Expr = low.as_ref().clone(); - let high: Expr = high.as_ref().clone(); - if negated { - Ok(expr.clone().lt(low).or(expr.clone().gt(high))) - } else { - Ok(expr.clone().gt_eq(low).and(expr.clone().lt_eq(high))) - } - } - _ => Ok(expr.clone()), - } - } -} - -struct MyContextProvider {} - -impl ContextProvider for MyContextProvider { - fn get_table_provider(&self, name: TableReference) -> Result> { - if name.table() == "person" { - Ok(Arc::new(MyTableSource { - schema: Arc::new(Schema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::UInt8, false), - ])), - })) - } else { - Err(DataFusionError::Plan("table not found".to_string())) - } - } - - fn get_function_meta(&self, _name: &str) -> Option> { - None - } - - fn get_aggregate_meta(&self, _name: &str) -> Option> { - None - } - - fn get_variable_type(&self, _variable_names: &[String]) -> Option { - None - } -} - -struct MyTableSource { - schema: SchemaRef, -} - -impl TableSource for MyTableSource { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} From 17d1de8d5210dd144b944119566aecf979822840 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Oct 2022 18:41:47 -0600 Subject: [PATCH 4/7] revert a file --- datafusion/optimizer/README.md | 140 +-------------------------------- 1 file changed, 3 insertions(+), 137 deletions(-) diff --git a/datafusion/optimizer/README.md b/datafusion/optimizer/README.md index fc2b740d3295..39d28a8fae37 100644 --- a/datafusion/optimizer/README.md +++ b/datafusion/optimizer/README.md @@ -17,144 +17,10 @@ under the License. --> -# DataFusion Query Optimizer +# DataFusion Query Optimizer Rules -[DataFusion](df) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory -format. +[DataFusion](df) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. -DataFusion has modular design, allowing individual crates to be re-used in other projects. - -This crate is a submodule of DataFusion that provides a query optimizer for logical plans. - -## Running the Optimizer - -The following code demonstrates the basic flow of creating the optimizer with a default set of optimization rules -and applying it to a logical plan to produce an optimized logical plan. - -```rust - -// We need a logical plan as the starting point. There are many ways to build a logical plan: -// -// The `datafusion-expr` crate provides a LogicalPlanBuilder -// The `datafusion-sql` crate provides a SQL query planner that can create a LogicalPlan from SQL -// The `datafusion` crate provides a DataFrame API that can create a LogicalPlan -let logical_plan = ... - -let mut config = OptimizerConfig::default(); -let optimizer = Optimizer::new(&config); -let optimized_plan = optimizer.optimize(&logical_plan, &mut config, observe)?; - -fn observe(plan: &LogicalPlan, rule: &dyn OptimizerRule) { - println!( - "After applying rule '{}':\n{}", - rule.name(), - plan.display_indent() - ) -} -``` - -## Providing Custom Rules - -The optimizer can be created with a custom set of rules. - -```rust -let optimizer = Optimizer::with_rules(vec![ - Arc::new(MyRule {}) -]); -``` - -## Writing Optimization Rules - -Please refer to the [examples](examples) to learn more about the general approach to writing optimizer rules and -then move onto studying the existing rules. - -All rules must implement the `OptimizerRule` trait. - -```rust -/// `OptimizerRule` transforms one ['LogicalPlan'] into another which -/// computes the same results, but in a potentially more efficient -/// way. -pub trait OptimizerRule { - /// Rewrite `plan` to an optimized form - fn optimize( - &self, - plan: &LogicalPlan, - optimizer_config: &mut OptimizerConfig, - ) -> Result; - - /// A human readable name for this optimizer rule - fn name(&self) -> &str; -} -``` - -### General Guidelines - -Rules typical walk the logical plan and walk the expression trees inside operators and selectively mutate -individual operators or expressions. - -Sometimes there is an initial pass that visits the plan and builds state that is used in a second pass that performs -the actual optimization. This approach is used in projection push down and filter push down. - -### Utilities - -There are a number of utility methods provided that take care of some common tasks. - -### ExprVisitor - -TBD - -### Rewriting Expressions - -The `MyExprRewriter` trait can be implemented to provide a way to rewrite expressions. This rule can then be applied -to an expression by calling `Expr::rewrite` (from the `ExprRewritable` trait). - -The `rewrite` method will perform a depth first walk of the expression and its children to rewrite an expression, -consuming `self` producing a new expression. - -```rust -let mut expr_rewriter = MyExprRewriter {}; -let expr = expr.rewrite(&mut expr_rewriter)?; -``` - -Here is an example implementation which will rewrite `expr BETWEEN a AND b` as `expr >= a AND expr <= b`. Note that the -implementation does not need to perform any recursion since this is handled by the `rewrite` method. - -```rust -struct MyExprRewriter {} - -impl ExprRewriter for MyExprRewriter { - fn mutate(&mut self, expr: Expr) -> Result { - match expr { - Expr::Between { - negated, - expr, - low, - high, - } => { - let expr: Expr = expr.as_ref().clone(); - let low: Expr = low.as_ref().clone(); - let high: Expr = high.as_ref().clone(); - if negated { - Ok(expr.clone().lt(low).or(expr.clone().gt(high))) - } else { - Ok(expr.clone().gt_eq(low).and(expr.clone().lt_eq(high))) - } - } - _ => Ok(expr.clone()), - } - } -} -``` - -### optimize_children - -TBD - -### Writing Tests - -There should be unit tests in the same file as the new rule that test the effect of the rule being applied to a plan -in isolation (without any other rule being applied). - -There should also be a test in `integration-tests.rs` that tests the rule as part of the overall optimization process. +This crate is a submodule of DataFusion that provides query optimizer rules. [df]: https://crates.io/crates/datafusion From 0a249086ff3271c029f89cab985ebbb5fb08e819 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Oct 2022 21:13:55 -0600 Subject: [PATCH 5/7] fix tests on windows --- benchmarks/src/bin/tpch.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index b08539eaf7af..2341fc5b423d 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -590,6 +590,7 @@ struct QueryResult { mod tests { use super::*; use std::env; + use std::io::{BufRead, BufReader}; use std::ops::{Div, Mul}; use std::sync::Arc; @@ -766,7 +767,7 @@ mod tests { let mut found = false; for path in &possibilities { let path = Path::new(&path); - if let Some(expected) = fs::read_to_string(path).ok() { + if let Some(expected) = read_text_file(path).ok() { assert_eq!(expected, actual); found = true; break; @@ -777,6 +778,21 @@ mod tests { Ok(()) } + /// we need to read line by line and add \n so tests work on Windows + fn read_text_file(path: &Path) -> Result { + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut str = String::new(); + for line in reader.lines() { + let line = line?; + if !str.is_empty() { + str += "\n"; + } + str += &line; + } + Ok(str) + } + #[tokio::test] async fn q1() -> Result<()> { verify_query(1).await From 4608073992691fc038445b03223545b2e6e76e4a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Oct 2022 21:42:00 -0600 Subject: [PATCH 6/7] ignore one test --- benchmarks/src/bin/tpch.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 2341fc5b423d..d5c10ce503c2 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -706,7 +706,17 @@ mod tests { expected_plan(16).await } - #[tokio::test] + /// This query produces different plans depending on operating system. The difference is + /// due to re-writing the following expression: + /// + /// `sum(l_extendedprice) / 7.0 as avg_yearly` + /// + /// Linux: Decimal128(Some(7000000000000000195487369212723200),38,33) + /// Windows: Decimal128(Some(6999999999999999042565864605876224),38,33) + /// + /// See https://github.com/apache/arrow-datafusion/issues/3791 + #[tokio::test] + #[ignore] async fn q17_expected_plan() -> Result<()> { expected_plan(17).await } From b0ec2e2fd4226223a136062d94f16a2c5c72a2c7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 11 Oct 2022 05:40:09 -0600 Subject: [PATCH 7/7] clippy --- benchmarks/src/bin/tpch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index d5c10ce503c2..5205f37c114f 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -777,7 +777,7 @@ mod tests { let mut found = false; for path in &possibilities { let path = Path::new(&path); - if let Some(expected) = read_text_file(path).ok() { + if let Ok(expected) = read_text_file(path) { assert_eq!(expected, actual); found = true; break;