From 63f1588be9f7ec25bf8eb83fabfb4783e4c81df9 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Sun, 28 Jul 2024 22:39:55 +0200 Subject: [PATCH 1/5] Properly specialize nullif for scalar (3x faster) --- datafusion/functions/Cargo.toml | 4 +++ datafusion/functions/benches/nullif.rs | 42 +++++++++++++++++++++++++ datafusion/functions/src/core/nullif.rs | 7 ++--- 3 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 datafusion/functions/benches/nullif.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 0281676cabf2..12b584f74eac 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -112,6 +112,10 @@ harness = false name = "make_date" required-features = ["datetime_expressions"] +[[bench]] +harness = false +name = "nullif" + [[bench]] harness = false name = "date_bin" diff --git a/datafusion/functions/benches/nullif.rs b/datafusion/functions/benches/nullif.rs new file mode 100644 index 000000000000..630e1ba53f2f --- /dev/null +++ b/datafusion/functions/benches/nullif.rs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::util::bench_util::create_string_array_with_len; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_common::ScalarValue; +use datafusion_expr::ColumnarValue; +use datafusion_functions::core::nullif; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let nullif = nullif(); + for size in [1024, 4096, 8192] { + let array = Arc::new(create_string_array_with_len::(size, 0.2, 32)); + let args = vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some("abcd".to_string()))), + ColumnarValue::Array(array), + ]; + c.bench_function(&format!("nullif scalar array: {}", size), |b| { + b.iter(|| black_box(nullif.invoke(&args))) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index e8bf2db514c3..3d302547d30b 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. -use arrow::datatypes::DataType; +use arrow::{array::Datum, datatypes::DataType}; use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; -use arrow::array::Array; use arrow::compute::kernels::cmp::eq; use arrow::compute::kernels::nullif::nullif; use datafusion_common::ScalarValue; @@ -122,8 +121,8 @@ fn nullif_func(args: &[ColumnarValue]) -> Result { Ok(ColumnarValue::Array(array)) } (ColumnarValue::Scalar(lhs), ColumnarValue::Array(rhs)) => { - let lhs = lhs.to_array_of_size(rhs.len())?; - let array = nullif(&lhs, &eq(&lhs, &rhs)?)?; + let lhs = lhs.to_scalar()?; + let array = nullif(lhs.get().0, &eq(&lhs, &rhs)?)?; Ok(ColumnarValue::Array(array)) } (ColumnarValue::Scalar(lhs), ColumnarValue::Scalar(rhs)) => { From b24162058c48271c7d4b48b2c4d480bc5835e63b Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Sun, 28 Jul 2024 22:48:56 +0200 Subject: [PATCH 2/5] missed feature flag --- datafusion/functions/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 12b584f74eac..9675d03a0161 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -115,6 +115,7 @@ required-features = ["datetime_expressions"] [[bench]] harness = false name = "nullif" +required-features = ["core_expressions"] [[bench]] harness = false From 755c5732f5098ce9089c0f47afc2259d2ce526ae Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Mon, 29 Jul 2024 08:00:23 +0200 Subject: [PATCH 3/5] fix test --- datafusion/functions/benches/nullif.rs | 2 +- datafusion/functions/src/core/nullif.rs | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/benches/nullif.rs b/datafusion/functions/benches/nullif.rs index 630e1ba53f2f..dfabad335835 100644 --- a/datafusion/functions/benches/nullif.rs +++ b/datafusion/functions/benches/nullif.rs @@ -33,7 +33,7 @@ fn criterion_benchmark(c: &mut Criterion) { ColumnarValue::Array(array), ]; c.bench_function(&format!("nullif scalar array: {}", size), |b| { - b.iter(|| black_box(nullif.invoke(&args))) + b.iter(|| black_box(nullif.invoke(&args).unwrap())) }); } } diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 3d302547d30b..356adacd24d3 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::{array::Datum, datatypes::DataType}; +use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; @@ -121,8 +121,12 @@ fn nullif_func(args: &[ColumnarValue]) -> Result { Ok(ColumnarValue::Array(array)) } (ColumnarValue::Scalar(lhs), ColumnarValue::Array(rhs)) => { - let lhs = lhs.to_scalar()?; - let array = nullif(lhs.get().0, &eq(&lhs, &rhs)?)?; + let lhs_s = lhs.to_scalar()?; + let array = nullif( + // nullif in arrow-select dodes not support Datum, so we need to convert to array + lhs.to_array_of_size(rhs.len())?.as_ref(), + &eq(&lhs_s, &rhs)?, + )?; Ok(ColumnarValue::Array(array)) } (ColumnarValue::Scalar(lhs), ColumnarValue::Scalar(rhs)) => { From 271cb722c4f70fa4537074d8ba1006c31d07bcfa Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Mon, 29 Jul 2024 08:11:47 +0200 Subject: [PATCH 4/5] extract --- datafusion/functions/src/core/nullif.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 356adacd24d3..a5e0a2db31e4 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -122,9 +122,10 @@ fn nullif_func(args: &[ColumnarValue]) -> Result { } (ColumnarValue::Scalar(lhs), ColumnarValue::Array(rhs)) => { let lhs_s = lhs.to_scalar()?; + let lhs_a = lhs.to_array_of_size(rhs.len())?; let array = nullif( // nullif in arrow-select dodes not support Datum, so we need to convert to array - lhs.to_array_of_size(rhs.len())?.as_ref(), + lhs_a.as_ref(), &eq(&lhs_s, &rhs)?, )?; Ok(ColumnarValue::Array(array)) From 2119e218c9e7fabf13e068821f50cb20f912f3be Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Sat, 3 Aug 2024 18:05:09 +0000 Subject: [PATCH 5/5] dodes -> does Co-authored-by: Oleks V --- datafusion/functions/src/core/nullif.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index a5e0a2db31e4..6fcfbd36416e 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -124,7 +124,7 @@ fn nullif_func(args: &[ColumnarValue]) -> Result { let lhs_s = lhs.to_scalar()?; let lhs_a = lhs.to_array_of_size(rhs.len())?; let array = nullif( - // nullif in arrow-select dodes not support Datum, so we need to convert to array + // nullif in arrow-select does not support Datum, so we need to convert to array lhs_a.as_ref(), &eq(&lhs_s, &rhs)?, )?;