Skip to content

Commit

Permalink
[SPARK-43063][SQL] df.show handle null should print NULL instead of…
Browse files Browse the repository at this point in the history
… null

### What changes were proposed in this pull request?

`df.show` should print NULL instead of null when handling null values, to keep behavior consistent;

For example, the following behavior is currently inconsistent:
``` shell
scala> spark.sql("select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle') as result").show(false)
+------+
|result|
+------+
|null  |
+------+
```
``` shell
spark-sql> DESC FUNCTION EXTENDED decode;
function_desc
Function: decode
Class: org.apache.spark.sql.catalyst.expressions.Decode
Usage:
    decode(bin, charset) - Decodes the first argument using the second argument character set.

    decode(expr, search, result [, search, result ] ... [, default]) - Compares expr
      to each search value in order. If expr is equal to a search value, decode returns
      the corresponding result. If no match is found, then it returns default. If default
      is omitted, it returns null.

Extended Usage:
    Examples:
      > SELECT decode(encode('abc', 'utf-8'), 'utf-8');
       abc
      > SELECT decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
       San Francisco
      > SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
       Non domestic
      > SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle');
       NULL

    Since: 3.2.0

Time taken: 0.074 seconds, Fetched 4 row(s)
```
``` shell
spark-sql> select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle');
NULL
```

### Why are the changes needed?

To make `df.show` handle `null` consistently with the spark-sql CLI.

### Does this PR introduce _any_ user-facing change?

Yes, null values will be displayed as NULL instead of null.

### How was this patch tested?

GA

Closes #40699 from Yikf/show-NULL.

Authored-by: Yikf <yikaifei@apache.org>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
yikf authored and cloud-fan committed Apr 13, 2023
1 parent a3839d8 commit a45affe
Show file tree
Hide file tree
Showing 13 changed files with 184 additions and 184 deletions.
2 changes: 1 addition & 1 deletion python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -5313,7 +5313,7 @@ class VectorAssembler(
+---+---+----+-------------+
| a| b| c| features|
+---+---+----+-------------+
|1.0|2.0|null|[1.0,2.0,NaN]|
|1.0|2.0|NULL|[1.0,2.0,NaN]|
|3.0|NaN| 4.0|[3.0,NaN,4.0]|
|5.0|6.0| 7.0|[5.0,6.0,7.0]|
+---+---+----+-------------+
Expand Down
20 changes: 10 additions & 10 deletions python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1530,7 +1530,7 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D
# | A| B| C|
# +---+---+----+
# | 1| 2| 3.0|
# | 4| 1|null|
# | 4| 1|NULL|
# +---+---+----+

pair_scols: List[GenericColumn] = []
Expand Down Expand Up @@ -1560,10 +1560,10 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D
# | 2| 2| 3.0| 3.0|
# | 0| 0| 4.0| 4.0|
# | 0| 1| 4.0| 1.0|
# | 0| 2| null| null|
# | 0| 2| NULL| NULL|
# | 1| 1| 1.0| 1.0|
# | 1| 2| null| null|
# | 2| 2| null| null|
# | 1| 2| NULL| NULL|
# | 2| 2| NULL| NULL|
# +-------------------+-------------------+-------------------+-------------------+
sdf = sdf.select(F.inline(F.array(*pair_scols))) # type: ignore[arg-type]

Expand All @@ -1586,15 +1586,15 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D
# +-------------------+-------------------+----------------+
# |__tmp_index_1_col__|__tmp_index_2_col__|__tmp_corr_col__|
# +-------------------+-------------------+----------------+
# | 2| 2| null|
# | 1| 2| null|
# | 2| 1| null|
# | 2| 2| NULL|
# | 1| 2| NULL|
# | 2| 1| NULL|
# | 1| 1| 1.0|
# | 0| 0| 1.0|
# | 0| 1| -1.0|
# | 1| 0| -1.0|
# | 0| 2| null|
# | 2| 0| null|
# | 0| 2| NULL|
# | 2| 0| NULL|
# +-------------------+-------------------+----------------+

auxiliary_col_name = verify_temp_column_name(sdf, "__corr_auxiliary_temp_column__")
Expand Down Expand Up @@ -12929,7 +12929,7 @@ def mode(self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True)
# |species|legs|wings|
# +-------+----+-----+
# | bird| 2| 0.0|
# | null|null| 2.0|
# | NULL|NULL| 2.0|
# +-------+----+-----+
sdf = (
sdf.select(F.arrays_zip(*[F.col(name) for name in mode_col_names]).alias(zip_col_name))
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def __ne__( # type: ignore[override]
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
| true| true| false|
| null| false| true|
| NULL| false| true|
+-------------+---------------+----------------+
>>> df2 = spark.createDataFrame([
... Row(value = 'bar'),
Expand Down
68 changes: 34 additions & 34 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2308,16 +2308,16 @@ def join(
| name|height|
+-----+------+
| Bob| 85|
|Alice| null|
| null| 80|
|Alice| NULL|
| NULL| 80|
+-----+------+
>>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
+-----+------+
| name|height|
+-----+------+
| Tom| 80|
| Bob| 85|
|Alice| null|
|Alice| NULL|
+-----+------+
Outer join for both DataFrames with multiple columns.
Expand Down Expand Up @@ -3278,10 +3278,10 @@ def rollup(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
+-----+----+-----+
| name| age|count|
+-----+----+-----+
| null|null| 2|
|Alice|null| 1|
| NULL|NULL| 2|
|Alice|NULL| 1|
|Alice| 2| 1|
| Bob|null| 1|
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+-----+
"""
Expand Down Expand Up @@ -3327,12 +3327,12 @@ def cube(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
+-----+----+-----+
| name| age|count|
+-----+----+-----+
| null|null| 2|
| null| 2| 1|
| null| 5| 1|
|Alice|null| 1|
| NULL|NULL| 2|
| NULL| 2| 1|
| NULL| 5| 1|
|Alice|NULL| 1|
|Alice| 2| 1|
| Bob|null| 1|
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+-----+
"""
Expand Down Expand Up @@ -3778,8 +3778,8 @@ def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) ->
+----+----+----+----+
|col0|col1|col2|col3|
+----+----+----+----+
| 1| 2| 3|null|
|null| 4| 5| 6|
| 1| 2| 3|NULL|
|NULL| 4| 5| 6|
+----+----+----+----+
"""
return DataFrame(self._jdf.unionByName(other._jdf, allowMissingColumns), self.sparkSession)
Expand Down Expand Up @@ -4146,10 +4146,10 @@ def fillna(
+---+------+-----+----+
|age|height| name|bool|
+---+------+-----+----+
| 10| 80.5|Alice|null|
| 5| 50.0| Bob|null|
| 50| 50.0| Tom|null|
| 50| 50.0| null|true|
| 10| 80.5|Alice|NULL|
| 5| 50.0| Bob|NULL|
| 50| 50.0| Tom|NULL|
| 50| 50.0| NULL|true|
+---+------+-----+----+
Fill all null values with ``False`` for boolean columns.
Expand All @@ -4159,9 +4159,9 @@ def fillna(
| age|height| name| bool|
+----+------+-----+-----+
| 10| 80.5|Alice|false|
| 5| null| Bob|false|
|null| null| Tom|false|
|null| null| null| true|
| 5| NULL| Bob|false|
|NULL| NULL| Tom|false|
|NULL| NULL| NULL| true|
+----+------+-----+-----+
Fill all null values with to 50 and "unknown" for 'age' and 'name' column respectively.
Expand All @@ -4170,10 +4170,10 @@ def fillna(
+---+------+-------+----+
|age|height| name|bool|
+---+------+-------+----+
| 10| 80.5| Alice|null|
| 5| null| Bob|null|
| 50| null| Tom|null|
| 50| null|unknown|true|
| 10| 80.5| Alice|NULL|
| 5| NULL| Bob|NULL|
| 50| NULL| Tom|NULL|
| 50| NULL|unknown|true|
+---+------+-------+----+
"""
if not isinstance(value, (float, int, str, bool, dict)):
Expand Down Expand Up @@ -4301,9 +4301,9 @@ def replace( # type: ignore[misc]
| age|height| name|
+----+------+-----+
| 20| 80|Alice|
| 5| null| Bob|
|null| 20| Tom|
|null| null| null|
| 5| NULL| Bob|
|NULL| 20| Tom|
|NULL| NULL| NULL|
+----+------+-----+
Replace 'Alice' to null in all columns.
Expand All @@ -4312,10 +4312,10 @@ def replace( # type: ignore[misc]
+----+------+----+
| age|height|name|
+----+------+----+
| 10| 80|null|
| 5| null| Bob|
|null| 10| Tom|
|null| null|null|
| 10| 80|NULL|
| 5| NULL| Bob|
|NULL| 10| Tom|
|NULL| NULL|NULL|
+----+------+----+
Replace 'Alice' to 'A', and 'Bob' to 'B' in the 'name' column.
Expand All @@ -4325,9 +4325,9 @@ def replace( # type: ignore[misc]
| age|height|name|
+----+------+----+
| 10| 80| A|
| 5| null| B|
|null| 10| Tom|
|null| null|null|
| 5| NULL| B|
|NULL| 10| Tom|
|NULL| NULL|NULL|
+----+------+----+
"""
if value is _NoValue:
Expand Down

0 comments on commit a45affe

Please sign in to comment.