From 905e1745635aa5438cfb209bfe3233eccf7bc062 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 5 Feb 2024 21:43:59 -0500 Subject: [PATCH 01/15] use object equality in OpenHashSet --- .../spark/util/collection/OpenHashSet.scala | 17 ++++++++-- .../util/collection/OpenHashSetSuite.scala | 31 +++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index faee9ce56a0a1..790e95b31a2e5 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -110,6 +110,18 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( this } + /** + * Check if a key exists at the provided position using object equality rather than + * cooperative equality. Otherwise, hash sets that include both 0.0 and -0.0 may drop + * one of those entries. + * + * See: https://issues.apache.org/jira/browse/SPARK-45599 + */ + @annotation.nowarn("cat=other-non-cooperative-equals") + private def keyExistsAtPos(k: T, pos: Int) = + // _data(pos) == k + _data(pos) equals k + /** * Add an element to the set. This one differs from add in that it doesn't trigger rehashing. * The caller is responsible for calling rehashIfNeeded. @@ -130,8 +142,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( _bitset.set(pos) _size += 1 return pos | NONEXISTENCE_MASK - } else if (_data(pos) == k) { - // Found an existing key. + } else if (keyExistsAtPos(k, pos)) { return pos } else { // quadratic probing with values increase by 1, 2, 3, ... @@ -165,7 +176,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( while (true) { if (!_bitset.get(pos)) { return INVALID_POS - } else if (k == _data(pos)) { + } else if (keyExistsAtPos(k, pos)) { return pos } else { // quadratic probing with values increase by 1, 2, 3, ... diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 89a308556d5df..4fd6978d540d8 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -269,4 +269,35 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(pos1 == pos2) } } + + test("SPARK-45599: 0.0 and -0.0 are equal but not the same") { + // Therefore, 0.0 and -0.0 should get separate entries in the hash set. + val spark45599Repro = Seq( + // Need exactly these elements in roughly this order to get 0.0 and -0.0 + // to hash to the same bucket. + Double.NaN, + 2.0, + 168.0, + Double.NaN, + Double.NaN, + -0.0, + 153.0, + 0.0 + ) + val set = new OpenHashSet[Double]() + spark45599Repro.foreach(set.add) + assert(set.size == 6) + val zeroPos = set.getPos(0.0) + val negZeroPos = set.getPos(-0.0) + assert(zeroPos != negZeroPos) + } + + test("SPARK-45599: NaN and NaN are the same but not equal") { + // Any mathematical comparison to NaN will return false, but when we place it in + // a hash set we want the lookup to work like a "normal" value. + val set = new OpenHashSet[Double]() + set.add(Double.NaN) + set.add(Double.NaN) + assert(set.size == 1) + } } From 0d5c2ce427e06adfebf47bc446ff6646513ae750 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 7 Feb 2024 15:50:35 -0500 Subject: [PATCH 02/15] tweak docstring --- .../scala/org/apache/spark/util/collection/OpenHashSet.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 790e95b31a2e5..5567483e29ff7 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -112,8 +112,8 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( /** * Check if a key exists at the provided position using object equality rather than - * cooperative equality. Otherwise, hash sets that include both 0.0 and -0.0 may drop - * one of those entries. + * cooperative equality. Otherwise, hash sets will mishandle values for which `==` + * and `equals` return different results, like 0.0/-0.0 and NaN/NaN. * * See: https://issues.apache.org/jira/browse/SPARK-45599 */ From 2bfc60548db2c41f4c64b63d40a2652cb22732ab Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 7 Feb 2024 15:54:34 -0500 Subject: [PATCH 03/15] =?UTF-8?q?add=20GROUP=20BY=20test=20for=20=C2=B10.0?= =?UTF-8?q?,=20NaN,=20and=20=C2=B1Infinity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../resources/sql-tests/inputs/group-by.sql | 15 +++++++++++++ .../sql-tests/results/group-by.sql.out | 22 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index ea1e2f323151a..7d6116ac1e3f1 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -259,3 +259,18 @@ FROM ( GROUP BY b ) t3 GROUP BY c; + +-- SPARK-45599: Check that "weird" doubles group and sort as desired. +SELECT col1, count(*) AS cnt +FROM VALUES + (0.0), + (-0.0), + (double('NaN')), + (double('NaN')), + (double('Infinity')), + (double('Infinity')), + (-double('Infinity')), + (-double('Infinity')) +GROUP BY col1 +ORDER BY col1 +; diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index e9addb9631536..0735752947222 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1102,3 +1102,25 @@ struct -- !query output 0 2 + + +-- !query +SELECT col1, count(*) AS cnt +FROM VALUES + (0.0), + (-0.0), + (double('NaN')), + (double('NaN')), + (double('Infinity')), + (double('Infinity')), + (-double('Infinity')), + (-double('Infinity')) +GROUP BY col1 +ORDER BY col1 +-- !query schema +struct +-- !query output +-Infinity 2 +0.0 2 +Infinity 2 +NaN 2 From 029045556a192b25aa62c7d5c74f205f75a4a1d9 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 7 Feb 2024 18:27:48 -0500 Subject: [PATCH 04/15] add test for OpenHashMapSuite --- .../util/collection/OpenHashMapSuite.scala | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 155c855c8723e..8141b9a053a0f 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -249,4 +249,34 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { map(null) = null assert(map.get(null) === Some(null)) } + + test("SPARK-45599: 0.0 and -0.0 should count distinctly") { + // Exactly these elements provided in roughly this order trigger a condition where lookups of + // 0.0 and -0.0 in the bitset happen to collide, causing their counts to be merged incorrectly + // and inconsistently if `==` is used to check for key equality. + val spark45599Repro = Seq( + Double.NaN, + 2.0, + 168.0, + Double.NaN, + Double.NaN, + -0.0, + 153.0, + 0.0 + ) + + val map1 = new OpenHashMap[Double, Int]() + spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1})) + map1.iterator.foreach(println) + assert(map1(0.0) == 1) + assert(map1(-0.0) == 1) + + val map2 = new OpenHashMap[Double, Int]() + // Simply changing the order in which the elements are added to the map should not change the + // counts for 0.0 and -0.0. + spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1})) + map2.iterator.foreach(println) + assert(map2(0.0) == 1) + assert(map2(-0.0) == 1) + } } From 21cff3b5b452737e78895295407682cc4002ca04 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 7 Feb 2024 18:28:29 -0500 Subject: [PATCH 05/15] explain details of bug in OpenHashSet --- .../spark/util/collection/OpenHashSetSuite.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 4fd6978d540d8..f32424fe7d006 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -272,9 +272,16 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { test("SPARK-45599: 0.0 and -0.0 are equal but not the same") { // Therefore, 0.0 and -0.0 should get separate entries in the hash set. + // + // Exactly these elements provided in roughly this order will trigger the following scenario: + // When probing the bitset in `getPos(-0.0)`, the loop will happen upon the entry for 0.0. + // In the old logic pre-SPARK-45599, the loop will find that the bit is set and, because + // -0.0 == 0.0, it will think that's the position of -0.0. But in reality this is the position + // of 0.0. So -0.0 and 0.0 will be stored at different positions, but `getPos()` will return + // the same position for them. This can cause users of OpenHashSet, like OpenHashMap, to + // return the wrong value for a key based on whether or not this bitset lookup collision + // happens. val spark45599Repro = Seq( - // Need exactly these elements in roughly this order to get 0.0 and -0.0 - // to hash to the same bucket. Double.NaN, 2.0, 168.0, From 0c47d4c657cbeb2e60534612b87ea63f6a938f42 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 7 Feb 2024 22:03:45 -0500 Subject: [PATCH 06/15] add analyzer test output --- .../analyzer-results/group-by.sql.out | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out index 88517449760d9..f14c87fc6ae3e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out @@ -1171,3 +1171,22 @@ Aggregate [c#x], [(c#x * 2) AS d#x] +- Project [if ((a#x < 0)) 0 else a#x AS b#x] +- SubqueryAlias t1 +- LocalRelation [a#x] + + +-- !query +SELECT col1, count(*) AS cnt +FROM VALUES + (0.0), + (-0.0), + (double('NaN')), + (double('NaN')), + (double('Infinity')), + (double('Infinity')), + (-double('Infinity')), + (-double('Infinity')) +GROUP BY col1 +ORDER BY col1 +-- !query analysis +Sort [col1#x ASC NULLS FIRST], true ++- Aggregate [col1#x], [col1#x, count(1) AS cnt#xL] + +- LocalRelation [col1#x] From 760cb49d71414468e568719d2dcddc16bb705a64 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 7 Feb 2024 23:35:21 -0500 Subject: [PATCH 07/15] add percentile test that captures repro from jira --- .../spark/sql/DataFrameAggregateSuite.scala | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 5c7cf874bd793..ab665d8943b33 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -1089,6 +1089,39 @@ class DataFrameAggregateSuite extends QueryTest ) } + test("SPARK-45599: Neither 0.0 nor -0.0 should be dropped when computing percentile") { + // To reproduce the bug described in SPARK-45599, we need exactly these rows in roughly + // this order in a DataFrame with exactly 1 partition. + // scalastyle:off line.size.limit + // See: https://issues.apache.org/jira/browse/SPARK-45599?focusedCommentId=17806954&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17806954 + // scalastyle:on line.size.limit + val spark45599Repro: DataFrame = Seq( + 0.0, + 2.0, + 153.0, + 168.0, + 3252411229536261.0, + 7.205759403792794e+16, + 1.7976931348623157e+308, + 0.25, + Double.NaN, + Double.NaN, + -0.0, + -128.0, + Double.NaN, + Double.NaN + ).toDF("val").coalesce(1) + + checkAnswer( + spark45599Repro.agg( + percentile(col("val"), lit(0.1)) + ), + // With the buggy implementation of OpenHashSet, this returns `0.050000000000000044` + // instead of `-0.0`. + List(Row(-0.0)) + ) + } + test("any_value") { checkAnswer( courseSales.groupBy("course").agg( From 7ac5e3e3430b9b785e493c317592ce122ee31cbc Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Thu, 8 Feb 2024 02:07:54 -0500 Subject: [PATCH 08/15] add contains NaN test --- .../org/apache/spark/util/collection/OpenHashSetSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index f32424fe7d006..0bc8aa067f57a 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -305,6 +305,7 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { val set = new OpenHashSet[Double]() set.add(Double.NaN) set.add(Double.NaN) + assert(set.contains(Double.NaN)) assert(set.size == 1) } } From 5bbde83ccef9264d53f9cd0d1478853e24614961 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Thu, 8 Feb 2024 02:08:14 -0500 Subject: [PATCH 09/15] remove prints --- .../org/apache/spark/util/collection/OpenHashMapSuite.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 8141b9a053a0f..c9102890accfe 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -267,7 +267,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { val map1 = new OpenHashMap[Double, Int]() spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1})) - map1.iterator.foreach(println) assert(map1(0.0) == 1) assert(map1(-0.0) == 1) @@ -275,7 +274,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { // Simply changing the order in which the elements are added to the map should not change the // counts for 0.0 and -0.0. spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1})) - map2.iterator.foreach(println) assert(map2(0.0) == 1) assert(map2(-0.0) == 1) } From 3b5118d44cdbb03016fd7dcd0b6035b8306afb7c Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 12 Feb 2024 10:08:24 -0500 Subject: [PATCH 10/15] add test to confirm -0 and -0.0 are normalized correctly --- .../analyzer-results/ansi/literals.sql.out | 14 ++++++++++++++ .../sql-tests/analyzer-results/literals.sql.out | 14 ++++++++++++++ .../test/resources/sql-tests/inputs/literals.sql | 4 ++++ .../sql-tests/results/ansi/literals.sql.out | 16 ++++++++++++++++ .../resources/sql-tests/results/literals.sql.out | 16 ++++++++++++++++ 5 files changed, 64 insertions(+) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out index 83d0ff3f2edf7..841a52dae9273 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out @@ -699,3 +699,17 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } + + +-- !query +select -0 +-- !query analysis +Project [0 AS 0#x] ++- OneRowRelation + + +-- !query +select -0.0 +-- !query analysis +Project [0.0 AS 0.0#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out index 83d0ff3f2edf7..841a52dae9273 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out @@ -699,3 +699,17 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } + + +-- !query +select -0 +-- !query analysis +Project [0 AS 0#x] ++- OneRowRelation + + +-- !query +select -0.0 +-- !query analysis +Project [0.0 AS 0.0#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/literals.sql index 9f0eefc16a8cd..8cc6202c3efb4 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql @@ -118,3 +118,7 @@ select +X'1'; select -date '1999-01-01'; select -timestamp '1999-01-01'; select -x'2379ACFe'; + +-- normalize -0 and -0.0 +select -0; +select -0.0; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out index 6e2c8a65206ef..80faeb5df9954 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out @@ -777,3 +777,19 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } + + +-- !query +select -0 +-- !query schema +struct<0:int> +-- !query output +0 + + +-- !query +select -0.0 +-- !query schema +struct<0.0:decimal(1,1)> +-- !query output +0.0 diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out index 6e2c8a65206ef..80faeb5df9954 100644 --- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out @@ -777,3 +777,19 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } + + +-- !query +select -0 +-- !query schema +struct<0:int> +-- !query output +0 + + +-- !query +select -0.0 +-- !query schema +struct<0.0:decimal(1,1)> +-- !query output +0.0 From 6b4f09141f114219a570bbe5b972c04becc20e30 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 12 Feb 2024 10:31:58 -0500 Subject: [PATCH 11/15] confirm array_union and _distinct handle 0.0/-0.0/NaN correctly --- .../analyzer-results/ansi/array.sql.out | 14 ++++++++++++++ .../sql-tests/analyzer-results/array.sql.out | 14 ++++++++++++++ .../test/resources/sql-tests/inputs/array.sql | 4 ++++ .../sql-tests/results/ansi/array.sql.out | 16 ++++++++++++++++ .../resources/sql-tests/results/array.sql.out | 16 ++++++++++++++++ 5 files changed, 64 insertions(+) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out index f4fd42d6adea3..727025d6fc83b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out @@ -747,3 +747,17 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) -- !query analysis Project [array_prepend(array(cast(null as string)), cast(null as string)) AS array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x] +- OneRowRelation + + +-- !query +select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) +-- !query analysis +Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x] ++- OneRowRelation + + +-- !query +select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) +-- !query analysis +Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out index c26bb210b0fff..216d978546b26 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out @@ -747,3 +747,17 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) -- !query analysis Project [array_prepend(array(cast(null as string)), cast(null as string)) AS array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x] +- OneRowRelation + + +-- !query +select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) +-- !query analysis +Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x] ++- OneRowRelation + + +-- !query +select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) +-- !query analysis +Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index 52a0906ea7392..865dc8bac4ea5 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -177,3 +177,7 @@ select array_prepend(CAST(null AS ARRAY), CAST(null as String)); select array_prepend(array(), 1); select array_prepend(CAST(array() AS ARRAY), CAST(NULL AS String)); select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)); + +-- SPARK-45599: Confirm 0.0, -0.0, and NaN are handled appropriately. +select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))); +select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out index 49e18411ffa37..6a07d659e39b5 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out @@ -907,3 +907,19 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) struct> -- !query output [null,null] + + +-- !query +select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) +-- !query schema +struct> +-- !query output +[0.0,NaN] + + +-- !query +select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) +-- !query schema +struct> +-- !query output +[0.0,NaN] diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out b/sql/core/src/test/resources/sql-tests/results/array.sql.out index e568f5fa7796d..d33fc62f0d9a1 100644 --- a/sql/core/src/test/resources/sql-tests/results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out @@ -788,3 +788,19 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) struct> -- !query output [null,null] + + +-- !query +select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) +-- !query schema +struct> +-- !query output +[0.0,NaN] + + +-- !query +select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) +-- !query schema +struct> +-- !query output +[0.0,NaN] From 9fe698b292bcb6e87ab3238affc24e84eb3244ec Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Thu, 15 Feb 2024 10:10:03 -0500 Subject: [PATCH 12/15] remove commented out `==` --- .../scala/org/apache/spark/util/collection/OpenHashSet.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 5567483e29ff7..a42fa9ba6bc85 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -119,7 +119,6 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( */ @annotation.nowarn("cat=other-non-cooperative-equals") private def keyExistsAtPos(k: T, pos: Int) = - // _data(pos) == k _data(pos) equals k /** From 60f698a995eb634d007aeaf6b30b5fc89d24e5ff Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Thu, 15 Feb 2024 10:11:19 -0500 Subject: [PATCH 13/15] check count of NaNs in map --- .../org/apache/spark/util/collection/OpenHashMapSuite.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index c9102890accfe..4809e292224da 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -269,6 +269,7 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1})) assert(map1(0.0) == 1) assert(map1(-0.0) == 1) + assert(map1(Double.NaN) == 3) val map2 = new OpenHashMap[Double, Int]() // Simply changing the order in which the elements are added to the map should not change the @@ -276,5 +277,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1})) assert(map2(0.0) == 1) assert(map2(-0.0) == 1) + assert(map2(Double.NaN) == 3) } } From 15a091cbfc804cc752af7e0fade67f0bdd4e221a Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Thu, 15 Feb 2024 10:12:28 -0500 Subject: [PATCH 14/15] merge literals test into single query --- .../analyzer-results/ansi/literals.sql.out | 11 ++--------- .../sql-tests/analyzer-results/literals.sql.out | 11 ++--------- .../test/resources/sql-tests/inputs/literals.sql | 3 +-- .../sql-tests/results/ansi/literals.sql.out | 14 +++----------- .../resources/sql-tests/results/literals.sql.out | 14 +++----------- 5 files changed, 11 insertions(+), 42 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out index 841a52dae9273..fece926834b7c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out @@ -702,14 +702,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query -select -0 +select -0, -0.0 -- !query analysis -Project [0 AS 0#x] -+- OneRowRelation - - --- !query -select -0.0 --- !query analysis -Project [0.0 AS 0.0#x] +Project [0 AS 0#x, 0.0 AS 0.0#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out index 841a52dae9273..fece926834b7c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out @@ -702,14 +702,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query -select -0 +select -0, -0.0 -- !query analysis -Project [0 AS 0#x] -+- OneRowRelation - - --- !query -select -0.0 --- !query analysis -Project [0.0 AS 0.0#x] +Project [0 AS 0#x, 0.0 AS 0.0#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/literals.sql index 8cc6202c3efb4..e1e4a370bffdc 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql @@ -120,5 +120,4 @@ select -timestamp '1999-01-01'; select -x'2379ACFe'; -- normalize -0 and -0.0 -select -0; -select -0.0; +select -0, -0.0; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out index 80faeb5df9954..9c3a0cce023b7 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out @@ -780,16 +780,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query -select -0 +select -0, -0.0 -- !query schema -struct<0:int> +struct<0:int,0.0:decimal(1,1)> -- !query output -0 - - --- !query -select -0.0 --- !query schema -struct<0.0:decimal(1,1)> --- !query output -0.0 +0 0.0 diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out index 80faeb5df9954..9c3a0cce023b7 100644 --- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out @@ -780,16 +780,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query -select -0 +select -0, -0.0 -- !query schema -struct<0:int> +struct<0:int,0.0:decimal(1,1)> -- !query output -0 - - --- !query -select -0.0 --- !query schema -struct<0.0:decimal(1,1)> --- !query output -0.0 +0 0.0 From 3a0b194c0b9fdeb7fe9165e3071d968fa3d674a6 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Fri, 16 Feb 2024 00:29:07 -0500 Subject: [PATCH 15/15] tweak test name --- .../org/apache/spark/util/collection/OpenHashMapSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 4809e292224da..ae9fb54bddfbe 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -250,7 +250,7 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { assert(map.get(null) === Some(null)) } - test("SPARK-45599: 0.0 and -0.0 should count distinctly") { + test("SPARK-45599: 0.0 and -0.0 should count distinctly; NaNs should count together") { // Exactly these elements provided in roughly this order trigger a condition where lookups of // 0.0 and -0.0 in the bitset happen to collide, causing their counts to be merged incorrectly // and inconsistently if `==` is used to check for key equality.