From 507d978a3b2b9fe873239ae2d4640286e423086a Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Wed, 19 Jun 2024 19:38:03 +0800 Subject: [PATCH] feat: Implement equality = and inequality <> support for StringView (#10985) * feat: Implement equality = and inequality <> support for StringView * chore: Add tests for the StringView * chore * chore: Update tests for NULL * fix: Used build_array_string! * chore: Update string_coercion function to handle Utf8View type in binary.rs * chore: add tests * chore: ci --- Cargo.toml | 24 ++-- datafusion-cli/Cargo.lock | 30 ++--- datafusion-cli/Cargo.toml | 22 ++-- datafusion/common/src/scalar/mod.rs | 2 +- datafusion/expr/src/type_coercion/binary.rs | 1 + .../sqllogictest/test_files/string_view.slt | 113 ++++++++++++++++++ 6 files changed, 153 insertions(+), 39 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/string_view.slt diff --git a/Cargo.toml b/Cargo.toml index 290dd64021b7..be6e0c672f6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -157,15 +157,15 @@ unused_imports = "deny" ## Temporary arrow-rs patch until 52.1.0 is released [patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index b0b41a12328d..15f7809ee5f5 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -131,7 +131,7 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-arith", "arrow-array", @@ -151,7 +151,7 @@ dependencies = [ [[package]] name = "arrow-arith" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -165,7 +165,7 @@ dependencies = [ [[package]] name = "arrow-array" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-buffer", @@ -181,7 +181,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "bytes", "half", @@ -191,7 +191,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -211,7 +211,7 @@ dependencies = [ [[package]] name = "arrow-csv" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -229,7 +229,7 @@ dependencies = [ [[package]] name = "arrow-data" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-buffer", "arrow-schema", @@ -240,7 +240,7 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -254,7 +254,7 @@ dependencies = [ [[package]] name = "arrow-json" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -273,7 +273,7 @@ dependencies = [ [[package]] name = "arrow-ord" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -287,7 +287,7 @@ dependencies = [ [[package]] name = "arrow-row" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-array", @@ -301,12 +301,12 @@ dependencies = [ [[package]] name = "arrow-schema" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" [[package]] name = "arrow-select" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-array", @@ -319,7 +319,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -2704,7 +2704,7 @@ dependencies = [ [[package]] name = "parquet" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-array", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index b4883264731e..0e7b712d8b19 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -66,14 +66,14 @@ rstest = "0.17" ## Temporary arrow-rs patch until 52.1.0 is released [patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 96bf4216d9a1..86ac115cca02 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1570,6 +1570,7 @@ impl ScalarValue { DataType::UInt16 => build_array_primitive!(UInt16Array, UInt16), DataType::UInt32 => build_array_primitive!(UInt32Array, UInt32), DataType::UInt64 => build_array_primitive!(UInt64Array, UInt64), + DataType::Utf8View => build_array_string!(StringViewArray, Utf8View), DataType::Utf8 => build_array_string!(StringArray, Utf8), DataType::LargeUtf8 => build_array_string!(LargeStringArray, LargeUtf8), DataType::Binary => build_array_string!(BinaryArray, Binary), @@ -1726,7 +1727,6 @@ impl ScalarValue { | DataType::Time64(TimeUnit::Millisecond) | DataType::Map(_, _) | DataType::RunEndEncoded(_, _) - | DataType::Utf8View | DataType::BinaryView | DataType::ListView(_) | DataType::LargeListView(_) => { diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index d7cb4b1a3ef6..d57b5228cb74 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -932,6 +932,7 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option (LargeUtf8, Utf8) => Some(LargeUtf8), (Utf8, LargeUtf8) => Some(LargeUtf8), (LargeUtf8, LargeUtf8) => Some(LargeUtf8), + (Utf8View, Utf8View) | (Utf8View, Utf8) | (Utf8, Utf8View) => Some(Utf8View), _ => None, } } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt new file mode 100644 index 000000000000..3be3c94770db --- /dev/null +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# test StringViewArray with Utf8View columns +statement ok +create table test as values (arrow_cast('Andrew', 'Utf8View'), arrow_cast('X', 'Utf8View')), + (arrow_cast('Xiangpeng', 'Utf8View'), arrow_cast('Xiangpeng', 'Utf8View')), + (arrow_cast('Raphael', 'Utf8View'), arrow_cast('R', 'Utf8View')), + (arrow_cast(NULL, 'Utf8View'), arrow_cast('R', 'Utf8View')); + +query B +select arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'); +---- +false + +query B +select arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'); +---- +true + +query B +select arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'); +---- +true + +query B +select arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'); +---- +true + +query ?? +select * from test where column1 = column2; +---- +Xiangpeng Xiangpeng + +query ?? +select * from test where column1 <> column2; +---- +Andrew X +Raphael R + +query ?? +select * from test where column1 = arrow_cast('Andrew', 'Utf8View'); +---- +Andrew X + +query ?? +select * from test where column1 = 'Andrew'; +---- +Andrew X + +query ?? +select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); +---- +Xiangpeng Xiangpeng +Raphael R + +query ?? +select * from test where column1 <> 'Andrew'; +---- +Xiangpeng Xiangpeng +Raphael R + +statement ok +drop table test; + + +# test StringViewArray with Utf8 and Utf8View columns +statement ok +create table test as values ('Andrew', arrow_cast('X', 'Utf8View')), + ('Xiangpeng', arrow_cast('Xiangpeng', 'Utf8View')), + ('Raphael', arrow_cast('R', 'Utf8View')), + (NULL, arrow_cast('R', 'Utf8View')); + +query T? +select * from test where column1 = column2; +---- +Xiangpeng Xiangpeng + +query T? +select * from test where column1 <> column2; +---- +Andrew X +Raphael R + +query T? +select * from test where column1 = arrow_cast('Andrew', 'Utf8View'); +---- +Andrew X + +query T? +select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); +---- +Xiangpeng Xiangpeng +Raphael R + +statement ok +drop table test;