From 905e1745635aa5438cfb209bfe3233eccf7bc062 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Mon, 5 Feb 2024 21:43:59 -0500
Subject: [PATCH 01/15] use object equality in OpenHashSet

---
 .../spark/util/collection/OpenHashSet.scala   | 17 ++++++++--
 .../util/collection/OpenHashSetSuite.scala    | 31 +++++++++++++++++++
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
index faee9ce56a0a1..790e95b31a2e5 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
@@ -110,6 +110,18 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
     this
   }
 
+  /**
+   * Check if a key exists at the provided position using object equality rather than
+   * cooperative equality. Otherwise, hash sets that include both 0.0 and -0.0 may drop
+   * one of those entries.
+   *
+   * See: https://issues.apache.org/jira/browse/SPARK-45599
+   */
+  @annotation.nowarn("cat=other-non-cooperative-equals")
+  private def keyExistsAtPos(k: T, pos: Int) =
+    // _data(pos) == k
+    _data(pos) equals k
+
   /**
    * Add an element to the set. This one differs from add in that it doesn't trigger rehashing.
    * The caller is responsible for calling rehashIfNeeded.
@@ -130,8 +142,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
         _bitset.set(pos)
         _size += 1
         return pos | NONEXISTENCE_MASK
-      } else if (_data(pos) == k) {
-        // Found an existing key.
+      } else if (keyExistsAtPos(k, pos)) {
         return pos
       } else {
         // quadratic probing with values increase by 1, 2, 3, ...
@@ -165,7 +176,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
     while (true) {
       if (!_bitset.get(pos)) {
         return INVALID_POS
-      } else if (k == _data(pos)) {
+      } else if (keyExistsAtPos(k, pos)) {
         return pos
       } else {
         // quadratic probing with values increase by 1, 2, 3, ...
diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
index 89a308556d5df..4fd6978d540d8 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
@@ -269,4 +269,35 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
       assert(pos1 == pos2)
     }
   }
+
+  test("SPARK-45599: 0.0 and -0.0 are equal but not the same") {
+    // Therefore, 0.0 and -0.0 should get separate entries in the hash set.
+    val spark45599Repro = Seq(
+      // Need exactly these elements in roughly this order to get 0.0 and -0.0
+      // to hash to the same bucket.
+      Double.NaN,
+      2.0,
+      168.0,
+      Double.NaN,
+      Double.NaN,
+      -0.0,
+      153.0,
+      0.0
+    )
+    val set = new OpenHashSet[Double]()
+    spark45599Repro.foreach(set.add)
+    assert(set.size == 6)
+    val zeroPos = set.getPos(0.0)
+    val negZeroPos = set.getPos(-0.0)
+    assert(zeroPos != negZeroPos)
+  }
+
+  test("SPARK-45599: NaN and NaN are the same but not equal") {
+    // Any mathematical comparison to NaN will return false, but when we place it in
+    // a hash set we want the lookup to work like a "normal" value.
+    val set = new OpenHashSet[Double]()
+    set.add(Double.NaN)
+    set.add(Double.NaN)
+    assert(set.size == 1)
+  }
 }

From 0d5c2ce427e06adfebf47bc446ff6646513ae750 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Wed, 7 Feb 2024 15:50:35 -0500
Subject: [PATCH 02/15] tweak docstring

---
 .../scala/org/apache/spark/util/collection/OpenHashSet.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
index 790e95b31a2e5..5567483e29ff7 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
@@ -112,8 +112,8 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
 
   /**
    * Check if a key exists at the provided position using object equality rather than
-   * cooperative equality. Otherwise, hash sets that include both 0.0 and -0.0 may drop
-   * one of those entries.
+   * cooperative equality. Otherwise, hash sets will mishandle values for which `==`
+   * and `equals` return different results, like 0.0/-0.0 and NaN/NaN.
    *
    * See: https://issues.apache.org/jira/browse/SPARK-45599
    */

From 2bfc60548db2c41f4c64b63d40a2652cb22732ab Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Wed, 7 Feb 2024 15:54:34 -0500
Subject: [PATCH 03/15] =?UTF-8?q?add=20GROUP=20BY=20test=20for=20=C2=B10.0?=
 =?UTF-8?q?,=20NaN,=20and=20=C2=B1Infinity?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../resources/sql-tests/inputs/group-by.sql   | 15 +++++++++++++
 .../sql-tests/results/group-by.sql.out        | 22 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index ea1e2f323151a..7d6116ac1e3f1 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -259,3 +259,18 @@ FROM (
          GROUP BY b
      ) t3
 GROUP BY c;
+
+-- SPARK-45599: Check that "weird" doubles group and sort as desired.
+SELECT col1, count(*) AS cnt
+FROM VALUES
+  (0.0),
+  (-0.0),
+  (double('NaN')),
+  (double('NaN')),
+  (double('Infinity')),
+  (double('Infinity')),
+  (-double('Infinity')),
+  (-double('Infinity'))
+GROUP BY col1
+ORDER BY col1
+;
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index e9addb9631536..0735752947222 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1102,3 +1102,25 @@ struct<d:int>
 -- !query output
 0
 2
+
+
+-- !query
+SELECT col1, count(*) AS cnt
+FROM VALUES
+  (0.0),
+  (-0.0),
+  (double('NaN')),
+  (double('NaN')),
+  (double('Infinity')),
+  (double('Infinity')),
+  (-double('Infinity')),
+  (-double('Infinity'))
+GROUP BY col1
+ORDER BY col1
+-- !query schema
+struct<col1:double,cnt:bigint>
+-- !query output
+-Infinity	2
+0.0	2
+Infinity	2
+NaN	2

From 029045556a192b25aa62c7d5c74f205f75a4a1d9 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Wed, 7 Feb 2024 18:27:48 -0500
Subject: [PATCH 04/15] add test for OpenHashMapSuite

---
 .../util/collection/OpenHashMapSuite.scala    | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
index 155c855c8723e..8141b9a053a0f 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
@@ -249,4 +249,34 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
     map(null) = null
     assert(map.get(null) === Some(null))
   }
+
+  test("SPARK-45599: 0.0 and -0.0 should count distinctly") {
+    // Exactly these elements provided in roughly this order trigger a condition where lookups of
+    // 0.0 and -0.0 in the bitset happen to collide, causing their counts to be merged incorrectly
+    // and inconsistently if `==` is used to check for key equality.
+    val spark45599Repro = Seq(
+      Double.NaN,
+      2.0,
+      168.0,
+      Double.NaN,
+      Double.NaN,
+      -0.0,
+      153.0,
+      0.0
+    )
+
+    val map1 = new OpenHashMap[Double, Int]()
+    spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1}))
+    map1.iterator.foreach(println)
+    assert(map1(0.0) == 1)
+    assert(map1(-0.0) == 1)
+
+    val map2 = new OpenHashMap[Double, Int]()
+    // Simply changing the order in which the elements are added to the map should not change the
+    // counts for 0.0 and -0.0.
+    spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1}))
+    map2.iterator.foreach(println)
+    assert(map2(0.0) == 1)
+    assert(map2(-0.0) == 1)
+  }
 }

From 21cff3b5b452737e78895295407682cc4002ca04 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Wed, 7 Feb 2024 18:28:29 -0500
Subject: [PATCH 05/15] explain details of bug in OpenHashSet

---
 .../spark/util/collection/OpenHashSetSuite.scala      | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
index 4fd6978d540d8..f32424fe7d006 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
@@ -272,9 +272,16 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
 
   test("SPARK-45599: 0.0 and -0.0 are equal but not the same") {
     // Therefore, 0.0 and -0.0 should get separate entries in the hash set.
+    //
+    // Exactly these elements provided in roughly this order will trigger the following scenario:
+    // When probing the bitset in `getPos(-0.0)`, the loop will happen upon the entry for 0.0.
+    // In the old logic pre-SPARK-45599, the loop will find that the bit is set and, because
+    // -0.0 == 0.0, it will think that's the position of -0.0. But in reality this is the position
+    // of 0.0. So -0.0 and 0.0 will be stored at different positions, but `getPos()` will return
+    // the same position for them. This can cause users of OpenHashSet, like OpenHashMap, to
+    // return the wrong value for a key based on whether or not this bitset lookup collision
+    // happens.
     val spark45599Repro = Seq(
-      // Need exactly these elements in roughly this order to get 0.0 and -0.0
-      // to hash to the same bucket.
       Double.NaN,
       2.0,
       168.0,

From 0c47d4c657cbeb2e60534612b87ea63f6a938f42 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Wed, 7 Feb 2024 22:03:45 -0500
Subject: [PATCH 06/15] add analyzer test output

---
 .../analyzer-results/group-by.sql.out         | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out
index 88517449760d9..f14c87fc6ae3e 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out
@@ -1171,3 +1171,22 @@ Aggregate [c#x], [(c#x * 2) AS d#x]
          +- Project [if ((a#x < 0)) 0 else a#x AS b#x]
             +- SubqueryAlias t1
                +- LocalRelation [a#x]
+
+
+-- !query
+SELECT col1, count(*) AS cnt
+FROM VALUES
+  (0.0),
+  (-0.0),
+  (double('NaN')),
+  (double('NaN')),
+  (double('Infinity')),
+  (double('Infinity')),
+  (-double('Infinity')),
+  (-double('Infinity'))
+GROUP BY col1
+ORDER BY col1
+-- !query analysis
+Sort [col1#x ASC NULLS FIRST], true
++- Aggregate [col1#x], [col1#x, count(1) AS cnt#xL]
+   +- LocalRelation [col1#x]

From 760cb49d71414468e568719d2dcddc16bb705a64 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Wed, 7 Feb 2024 23:35:21 -0500
Subject: [PATCH 07/15] add percentile test that captures repro from jira

---
 .../spark/sql/DataFrameAggregateSuite.scala   | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 5c7cf874bd793..ab665d8943b33 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -1089,6 +1089,39 @@ class DataFrameAggregateSuite extends QueryTest
     )
   }
 
+  test("SPARK-45599: Neither 0.0 nor -0.0 should be dropped when computing percentile") {
+    // To reproduce the bug described in SPARK-45599, we need exactly these rows in roughly
+    // this order in a DataFrame with exactly 1 partition.
+    // scalastyle:off line.size.limit
+    // See: https://issues.apache.org/jira/browse/SPARK-45599?focusedCommentId=17806954&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17806954
+    // scalastyle:on line.size.limit
+    val spark45599Repro: DataFrame = Seq(
+        0.0,
+        2.0,
+        153.0,
+        168.0,
+        3252411229536261.0,
+        7.205759403792794e+16,
+        1.7976931348623157e+308,
+        0.25,
+        Double.NaN,
+        Double.NaN,
+        -0.0,
+        -128.0,
+        Double.NaN,
+        Double.NaN
+      ).toDF("val").coalesce(1)
+
+    checkAnswer(
+      spark45599Repro.agg(
+        percentile(col("val"), lit(0.1))
+      ),
+      // With the buggy implementation of OpenHashSet, this returns `0.050000000000000044`
+      // instead of `-0.0`.
+      List(Row(-0.0))
+    )
+  }
+
   test("any_value") {
     checkAnswer(
       courseSales.groupBy("course").agg(

From 7ac5e3e3430b9b785e493c317592ce122ee31cbc Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Thu, 8 Feb 2024 02:07:54 -0500
Subject: [PATCH 08/15] add contains NaN test

---
 .../org/apache/spark/util/collection/OpenHashSetSuite.scala      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
index f32424fe7d006..0bc8aa067f57a 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
@@ -305,6 +305,7 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
     val set = new OpenHashSet[Double]()
     set.add(Double.NaN)
     set.add(Double.NaN)
+    assert(set.contains(Double.NaN))
     assert(set.size == 1)
   }
 }

From 5bbde83ccef9264d53f9cd0d1478853e24614961 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Thu, 8 Feb 2024 02:08:14 -0500
Subject: [PATCH 09/15] remove prints

---
 .../org/apache/spark/util/collection/OpenHashMapSuite.scala     | 2 --
 1 file changed, 2 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
index 8141b9a053a0f..c9102890accfe 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
@@ -267,7 +267,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
 
     val map1 = new OpenHashMap[Double, Int]()
     spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1}))
-    map1.iterator.foreach(println)
     assert(map1(0.0) == 1)
     assert(map1(-0.0) == 1)
 
@@ -275,7 +274,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
     // Simply changing the order in which the elements are added to the map should not change the
     // counts for 0.0 and -0.0.
     spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1}))
-    map2.iterator.foreach(println)
     assert(map2(0.0) == 1)
     assert(map2(-0.0) == 1)
   }

From 3b5118d44cdbb03016fd7dcd0b6035b8306afb7c Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Mon, 12 Feb 2024 10:08:24 -0500
Subject: [PATCH 10/15] add test to confirm -0 and -0.0 are normalized
 correctly

---
 .../analyzer-results/ansi/literals.sql.out       | 14 ++++++++++++++
 .../sql-tests/analyzer-results/literals.sql.out  | 14 ++++++++++++++
 .../test/resources/sql-tests/inputs/literals.sql |  4 ++++
 .../sql-tests/results/ansi/literals.sql.out      | 16 ++++++++++++++++
 .../resources/sql-tests/results/literals.sql.out | 16 ++++++++++++++++
 5 files changed, 64 insertions(+)

diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
index 83d0ff3f2edf7..841a52dae9273 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
@@ -699,3 +699,17 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
+
+
+-- !query
+select -0
+-- !query analysis
+Project [0 AS 0#x]
++- OneRowRelation
+
+
+-- !query
+select -0.0
+-- !query analysis
+Project [0.0 AS 0.0#x]
++- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
index 83d0ff3f2edf7..841a52dae9273 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
@@ -699,3 +699,17 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
+
+
+-- !query
+select -0
+-- !query analysis
+Project [0 AS 0#x]
++- OneRowRelation
+
+
+-- !query
+select -0.0
+-- !query analysis
+Project [0.0 AS 0.0#x]
++- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/literals.sql
index 9f0eefc16a8cd..8cc6202c3efb4 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql
@@ -118,3 +118,7 @@ select +X'1';
 select -date '1999-01-01';
 select -timestamp '1999-01-01';
 select -x'2379ACFe';
+
+-- normalize -0 and -0.0
+select -0;
+select -0.0;
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
index 6e2c8a65206ef..80faeb5df9954 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
@@ -777,3 +777,19 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
+
+
+-- !query
+select -0
+-- !query schema
+struct<0:int>
+-- !query output
+0
+
+
+-- !query
+select -0.0
+-- !query schema
+struct<0.0:decimal(1,1)>
+-- !query output
+0.0
diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out
index 6e2c8a65206ef..80faeb5df9954 100644
--- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out
@@ -777,3 +777,19 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
+
+
+-- !query
+select -0
+-- !query schema
+struct<0:int>
+-- !query output
+0
+
+
+-- !query
+select -0.0
+-- !query schema
+struct<0.0:decimal(1,1)>
+-- !query output
+0.0

From 6b4f09141f114219a570bbe5b972c04becc20e30 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Mon, 12 Feb 2024 10:31:58 -0500
Subject: [PATCH 11/15] confirm array_union and _distinct handle 0.0/-0.0/NaN
 correctly

---
 .../analyzer-results/ansi/array.sql.out          | 14 ++++++++++++++
 .../sql-tests/analyzer-results/array.sql.out     | 14 ++++++++++++++
 .../test/resources/sql-tests/inputs/array.sql    |  4 ++++
 .../sql-tests/results/ansi/array.sql.out         | 16 ++++++++++++++++
 .../resources/sql-tests/results/array.sql.out    | 16 ++++++++++++++++
 5 files changed, 64 insertions(+)

diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
index f4fd42d6adea3..727025d6fc83b 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
@@ -747,3 +747,17 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String))
 -- !query analysis
 Project [array_prepend(array(cast(null as string)), cast(null as string)) AS array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x]
 +- OneRowRelation
+
+
+-- !query
+select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN")))
+-- !query analysis
+Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x]
++- OneRowRelation
+
+
+-- !query
+select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
+-- !query analysis
+Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x]
++- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
index c26bb210b0fff..216d978546b26 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
@@ -747,3 +747,17 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String))
 -- !query analysis
 Project [array_prepend(array(cast(null as string)), cast(null as string)) AS array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x]
 +- OneRowRelation
+
+
+-- !query
+select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN")))
+-- !query analysis
+Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x]
++- OneRowRelation
+
+
+-- !query
+select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
+-- !query analysis
+Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x]
++- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql
index 52a0906ea7392..865dc8bac4ea5 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/array.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql
@@ -177,3 +177,7 @@ select array_prepend(CAST(null AS ARRAY<String>), CAST(null as String));
 select array_prepend(array(), 1);
 select array_prepend(CAST(array() AS ARRAY<String>), CAST(NULL AS String));
 select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String));
+
+-- SPARK-45599: Confirm 0.0, -0.0, and NaN are handled appropriately.
+select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN")));
+select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")));
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
index 49e18411ffa37..6a07d659e39b5 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
@@ -907,3 +907,19 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String))
 struct<array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING)):array<string>>
 -- !query output
 [null,null]
+
+
+-- !query
+select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN")))
+-- !query schema
+struct<array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN)):array<double>>
+-- !query output
+[0.0,NaN]
+
+
+-- !query
+select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
+-- !query schema
+struct<array_distinct(array(0.0, 0.0, 0.0, NaN, NaN)):array<double>>
+-- !query output
+[0.0,NaN]
diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out b/sql/core/src/test/resources/sql-tests/results/array.sql.out
index e568f5fa7796d..d33fc62f0d9a1 100644
--- a/sql/core/src/test/resources/sql-tests/results/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out
@@ -788,3 +788,19 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String))
 struct<array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING)):array<string>>
 -- !query output
 [null,null]
+
+
+-- !query
+select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN")))
+-- !query schema
+struct<array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN)):array<double>>
+-- !query output
+[0.0,NaN]
+
+
+-- !query
+select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
+-- !query schema
+struct<array_distinct(array(0.0, 0.0, 0.0, NaN, NaN)):array<double>>
+-- !query output
+[0.0,NaN]

From 9fe698b292bcb6e87ab3238affc24e84eb3244ec Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Thu, 15 Feb 2024 10:10:03 -0500
Subject: [PATCH 12/15] remove commented out `==`

---
 .../scala/org/apache/spark/util/collection/OpenHashSet.scala     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
index 5567483e29ff7..a42fa9ba6bc85 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
@@ -119,7 +119,6 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
    */
   @annotation.nowarn("cat=other-non-cooperative-equals")
   private def keyExistsAtPos(k: T, pos: Int) =
-    // _data(pos) == k
     _data(pos) equals k
 
   /**

From 60f698a995eb634d007aeaf6b30b5fc89d24e5ff Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Thu, 15 Feb 2024 10:11:19 -0500
Subject: [PATCH 13/15] check count of NaNs in map

---
 .../org/apache/spark/util/collection/OpenHashMapSuite.scala     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
index c9102890accfe..4809e292224da 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
@@ -269,6 +269,7 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
     spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1}))
     assert(map1(0.0) == 1)
     assert(map1(-0.0) == 1)
+    assert(map1(Double.NaN) == 3)
 
     val map2 = new OpenHashMap[Double, Int]()
     // Simply changing the order in which the elements are added to the map should not change the
@@ -276,5 +277,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
     spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1}))
     assert(map2(0.0) == 1)
     assert(map2(-0.0) == 1)
+    assert(map2(Double.NaN) == 3)
   }
 }

From 15a091cbfc804cc752af7e0fade67f0bdd4e221a Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Thu, 15 Feb 2024 10:12:28 -0500
Subject: [PATCH 14/15] merge literals test into single query

---
 .../analyzer-results/ansi/literals.sql.out         | 11 ++---------
 .../sql-tests/analyzer-results/literals.sql.out    | 11 ++---------
 .../test/resources/sql-tests/inputs/literals.sql   |  3 +--
 .../sql-tests/results/ansi/literals.sql.out        | 14 +++-----------
 .../resources/sql-tests/results/literals.sql.out   | 14 +++-----------
 5 files changed, 11 insertions(+), 42 deletions(-)

diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
index 841a52dae9273..fece926834b7c 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
@@ -702,14 +702,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 
 
 -- !query
-select -0
+select -0, -0.0
 -- !query analysis
-Project [0 AS 0#x]
-+- OneRowRelation
-
-
--- !query
-select -0.0
--- !query analysis
-Project [0.0 AS 0.0#x]
+Project [0 AS 0#x, 0.0 AS 0.0#x]
 +- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
index 841a52dae9273..fece926834b7c 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
@@ -702,14 +702,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 
 
 -- !query
-select -0
+select -0, -0.0
 -- !query analysis
-Project [0 AS 0#x]
-+- OneRowRelation
-
-
--- !query
-select -0.0
--- !query analysis
-Project [0.0 AS 0.0#x]
+Project [0 AS 0#x, 0.0 AS 0.0#x]
 +- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/literals.sql
index 8cc6202c3efb4..e1e4a370bffdc 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql
@@ -120,5 +120,4 @@ select -timestamp '1999-01-01';
 select -x'2379ACFe';
 
 -- normalize -0 and -0.0
-select -0;
-select -0.0;
+select -0, -0.0;
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
index 80faeb5df9954..9c3a0cce023b7 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
@@ -780,16 +780,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 
 
 -- !query
-select -0
+select -0, -0.0
 -- !query schema
-struct<0:int>
+struct<0:int,0.0:decimal(1,1)>
 -- !query output
-0
-
-
--- !query
-select -0.0
--- !query schema
-struct<0.0:decimal(1,1)>
--- !query output
-0.0
+0	0.0
diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out
index 80faeb5df9954..9c3a0cce023b7 100644
--- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out
@@ -780,16 +780,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 
 
 -- !query
-select -0
+select -0, -0.0
 -- !query schema
-struct<0:int>
+struct<0:int,0.0:decimal(1,1)>
 -- !query output
-0
-
-
--- !query
-select -0.0
--- !query schema
-struct<0.0:decimal(1,1)>
--- !query output
-0.0
+0	0.0

From 3a0b194c0b9fdeb7fe9165e3071d968fa3d674a6 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Fri, 16 Feb 2024 00:29:07 -0500
Subject: [PATCH 15/15] tweak test name

---
 .../org/apache/spark/util/collection/OpenHashMapSuite.scala     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
index 4809e292224da..ae9fb54bddfbe 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
@@ -250,7 +250,7 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
     assert(map.get(null) === Some(null))
   }
 
-  test("SPARK-45599: 0.0 and -0.0 should count distinctly") {
+  test("SPARK-45599: 0.0 and -0.0 should count distinctly; NaNs should count together") {
     // Exactly these elements provided in roughly this order trigger a condition where lookups of
     // 0.0 and -0.0 in the bitset happen to collide, causing their counts to be merged incorrectly
     // and inconsistently if `==` is used to check for key equality.