From ec7fb99ce7b15c863deba6dc447bf10dfdcc4fae Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 20 Jan 2023 10:06:09 +0000 Subject: [PATCH 1/2] Add dictionary_expressions feature (#4386) --- .github/workflows/rust.yml | 4 ++-- datafusion/core/Cargo.toml | 3 ++- datafusion/core/tests/path_partition.rs | 2 +- datafusion/core/tests/sql/select.rs | 1 + datafusion/physical-expr/Cargo.toml | 7 +++++-- datafusion/physical-expr/src/expressions/binary.rs | 1 + 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index c4d8cd53306d..a2c8534ed335 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -95,11 +95,11 @@ jobs: - name: Build tests run: | export PATH=$PATH:$HOME/d/protoc/bin - cargo test --features avro,jit,scheduler,json --no-run + cargo test --features avro,jit,scheduler,json,dictionary_expressions --no-run - name: Run tests run: | export PATH=$PATH:$HOME/d/protoc/bin - cargo test --features avro,jit,scheduler,json + cargo test --features avro,jit,scheduler,json,dictionary_expressions - name: Run examples run: | export PATH=$PATH:$HOME/d/protoc/bin diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 3003c22eefaf..62a93808c33c 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -53,6 +53,8 @@ regex_expressions = ["datafusion-physical-expr/regex_expressions"] scheduler = ["rayon"] simd = ["arrow/simd"] unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion-sql/unicode_expressions"] +# Enables support for non scalar, binary operations on dictionaries (note this results in significant additional codegen) +dictionary_expressions = ["datafusion-physical-expr/dictionary_expressions"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } @@ -102,7 +104,6 @@ xz2 = { version = "0.1", optional = true } [dev-dependencies] -arrow = { version = "31.0.0", features = 
["prettyprint", "dyn_cmp_dict"] } async-trait = "0.1.53" criterion = "0.4" csv = "1.1.6" diff --git a/datafusion/core/tests/path_partition.rs b/datafusion/core/tests/path_partition.rs index 2d257d49a389..670b508c3347 100644 --- a/datafusion/core/tests/path_partition.rs +++ b/datafusion/core/tests/path_partition.rs @@ -204,7 +204,7 @@ async fn csv_filter_with_file_col() -> Result<()> { ); let result = ctx - .sql("SELECT c1, c2 FROM t WHERE date='2021-10-27' and date!=c1 LIMIT 5") + .sql("SELECT c1, c2 FROM t WHERE date='2021-10-27' and c1!='2021-10-27' LIMIT 5") .await? .collect() .await?; diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index f65835101c52..124f25d36a88 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -621,6 +621,7 @@ async fn query_nested_get_indexed_field_on_struct() -> Result<()> { } #[tokio::test] +#[cfg(feature = "dictionary_expressions")] async fn query_on_string_dictionary() -> Result<()> { // Test to ensure DataFusion can operate on dictionary types // Use StringDictionary (32 bit indexes = keys) diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 5b25d97075df..5b9496de2e41 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -24,7 +24,7 @@ repository = "https://github.com/apache/arrow-datafusion" readme = "README.md" authors = ["Apache Arrow "] license = "Apache-2.0" -keywords = [ "arrow", "query", "sql" ] +keywords = ["arrow", "query", "sql"] edition = "2021" rust-version = "1.62" @@ -37,10 +37,13 @@ crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] regex_expressions = ["regex"] unicode_expressions = ["unicode-segmentation"] +# Enables support for non-scalar, binary operations on dictionaries +# Note: this results in significant additional codegen +dictionary_expressions = 
["arrow/dyn_cmp_dict", "arrow/dyn_arith_dict"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "31.0.0", features = ["prettyprint", "dyn_cmp_dict"] } +arrow = { version = "31.0.0", features = ["prettyprint"] } arrow-buffer = "31.0.0" arrow-schema = "31.0.0" blake2 = { version = "^0.10.2", optional = true } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 149eee593424..a99d6d6e773a 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -1502,6 +1502,7 @@ mod tests { // is no way at the time of this writing to create a dictionary // array using the `From` trait #[test] + #[cfg(feature = "dictionary_expressions")] fn test_dictionary_type_to_array_coersion() -> Result<()> { // Test string a string dictionary let dict_type = From 779e4956207518b9e272a806916c36bb0ad72b50 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 20 Jan 2023 11:02:43 +0000 Subject: [PATCH 2/2] Toml format --- datafusion/core/Cargo.toml | 5 +++-- datafusion/physical-expr/Cargo.toml | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 62a93808c33c..76ede0502c04 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -43,6 +43,9 @@ avro = ["apache-avro", "num-traits", "datafusion-common/avro"] compression = ["xz2", "bzip2", "flate2", "async-compression"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions"] default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "compression"] +# Enables support for non-scalar, binary operations on dictionaries +# Note: this results in significant additional codegen +dictionary_expressions = ["datafusion-physical-expr/dictionary_expressions"] # Used for testing ONLY: causes all values to hash to the same value 
(test for collisions) force_hash_collisions = [] # Used to enable JIT code generation @@ -53,8 +56,6 @@ regex_expressions = ["datafusion-physical-expr/regex_expressions"] scheduler = ["rayon"] simd = ["arrow/simd"] unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion-sql/unicode_expressions"] -# Enables support for non scalar, binary operations on dictionaries (note this results in significant additional codegen) -dictionary_expressions = ["datafusion-physical-expr/dictionary_expressions"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 5b9496de2e41..6ab1bb7fa9d5 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -35,11 +35,11 @@ path = "src/lib.rs" [features] crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] -regex_expressions = ["regex"] -unicode_expressions = ["unicode-segmentation"] # Enables support for non-scalar, binary operations on dictionaries # Note: this results in significant additional codegen dictionary_expressions = ["arrow/dyn_cmp_dict", "arrow/dyn_arith_dict"] +regex_expressions = ["regex"] +unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }