Skip to content

Commit

Permalink
[SPARK-43063][SQL] df.show handle null should print NULL instead of…
Browse files Browse the repository at this point in the history
… null

### What changes were proposed in this pull request?

`df.show` should print NULL instead of null when handling null values, to keep behavior consistent;

For example, the following behavior is currently inconsistent:
``` shell
scala> spark.sql("select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle') as result").show(false)
+------+
|result|
+------+
|null  |
+------+
```
``` shell
spark-sql> DESC FUNCTION EXTENDED decode;
function_desc
Function: decode
Class: org.apache.spark.sql.catalyst.expressions.Decode
Usage:
    decode(bin, charset) - Decodes the first argument using the second argument character set.

    decode(expr, search, result [, search, result ] ... [, default]) - Compares expr
      to each search value in order. If expr is equal to a search value, decode returns
      the corresponding result. If no match is found, then it returns default. If default
      is omitted, it returns null.

Extended Usage:
    Examples:
      > SELECT decode(encode('abc', 'utf-8'), 'utf-8');
       abc
      > SELECT decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
       San Francisco
      > SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
       Non domestic
      > SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle');
       NULL

    Since: 3.2.0

Time taken: 0.074 seconds, Fetched 4 row(s)
```
``` shell
spark-sql> select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle');
NULL
```

### Why are the changes needed?

To make `df.show` handle `null` consistently with the spark-sql CLI.

### Does this PR introduce _any_ user-facing change?

Yes, null values will be displayed as NULL instead of null.

### How was this patch tested?

GA

Closes #40699 from Yikf/show-NULL.

Authored-by: Yikf <yikaifei@apache.org>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
yikf authored and cloud-fan committed Apr 13, 2023
1 parent a3839d8 commit a45affe
Show file tree
Hide file tree
Showing 13 changed files with 184 additions and 184 deletions.
2 changes: 1 addition & 1 deletion python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -5313,7 +5313,7 @@ class VectorAssembler(
+---+---+----+-------------+
| a| b| c| features|
+---+---+----+-------------+
|1.0|2.0|null|[1.0,2.0,NaN]|
|1.0|2.0|NULL|[1.0,2.0,NaN]|
|3.0|NaN| 4.0|[3.0,NaN,4.0]|
|5.0|6.0| 7.0|[5.0,6.0,7.0]|
+---+---+----+-------------+
Expand Down
20 changes: 10 additions & 10 deletions python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1530,7 +1530,7 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D
# | A| B| C|
# +---+---+----+
# | 1| 2| 3.0|
# | 4| 1|null|
# | 4| 1|NULL|
# +---+---+----+

pair_scols: List[GenericColumn] = []
Expand Down Expand Up @@ -1560,10 +1560,10 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D
# | 2| 2| 3.0| 3.0|
# | 0| 0| 4.0| 4.0|
# | 0| 1| 4.0| 1.0|
# | 0| 2| null| null|
# | 0| 2| NULL| NULL|
# | 1| 1| 1.0| 1.0|
# | 1| 2| null| null|
# | 2| 2| null| null|
# | 1| 2| NULL| NULL|
# | 2| 2| NULL| NULL|
# +-------------------+-------------------+-------------------+-------------------+
sdf = sdf.select(F.inline(F.array(*pair_scols))) # type: ignore[arg-type]

Expand All @@ -1586,15 +1586,15 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D
# +-------------------+-------------------+----------------+
# |__tmp_index_1_col__|__tmp_index_2_col__|__tmp_corr_col__|
# +-------------------+-------------------+----------------+
# | 2| 2| null|
# | 1| 2| null|
# | 2| 1| null|
# | 2| 2| NULL|
# | 1| 2| NULL|
# | 2| 1| NULL|
# | 1| 1| 1.0|
# | 0| 0| 1.0|
# | 0| 1| -1.0|
# | 1| 0| -1.0|
# | 0| 2| null|
# | 2| 0| null|
# | 0| 2| NULL|
# | 2| 0| NULL|
# +-------------------+-------------------+----------------+

auxiliary_col_name = verify_temp_column_name(sdf, "__corr_auxiliary_temp_column__")
Expand Down Expand Up @@ -12929,7 +12929,7 @@ def mode(self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True)
# |species|legs|wings|
# +-------+----+-----+
# | bird| 2| 0.0|
# | null|null| 2.0|
# | NULL|NULL| 2.0|
# +-------+----+-----+
sdf = (
sdf.select(F.arrays_zip(*[F.col(name) for name in mode_col_names]).alias(zip_col_name))
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def __ne__( # type: ignore[override]
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
| true| true| false|
| null| false| true|
| NULL| false| true|
+-------------+---------------+----------------+
>>> df2 = spark.createDataFrame([
... Row(value = 'bar'),
Expand Down
68 changes: 34 additions & 34 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2308,16 +2308,16 @@ def join(
| name|height|
+-----+------+
| Bob| 85|
|Alice| null|
| null| 80|
|Alice| NULL|
| NULL| 80|
+-----+------+
>>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
+-----+------+
| name|height|
+-----+------+
| Tom| 80|
| Bob| 85|
|Alice| null|
|Alice| NULL|
+-----+------+
Outer join for both DataFrames with multiple columns.
Expand Down Expand Up @@ -3278,10 +3278,10 @@ def rollup(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
+-----+----+-----+
| name| age|count|
+-----+----+-----+
| null|null| 2|
|Alice|null| 1|
| NULL|NULL| 2|
|Alice|NULL| 1|
|Alice| 2| 1|
| Bob|null| 1|
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+-----+
"""
Expand Down Expand Up @@ -3327,12 +3327,12 @@ def cube(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
+-----+----+-----+
| name| age|count|
+-----+----+-----+
| null|null| 2|
| null| 2| 1|
| null| 5| 1|
|Alice|null| 1|
| NULL|NULL| 2|
| NULL| 2| 1|
| NULL| 5| 1|
|Alice|NULL| 1|
|Alice| 2| 1|
| Bob|null| 1|
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+-----+
"""
Expand Down Expand Up @@ -3778,8 +3778,8 @@ def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) ->
+----+----+----+----+
|col0|col1|col2|col3|
+----+----+----+----+
| 1| 2| 3|null|
|null| 4| 5| 6|
| 1| 2| 3|NULL|
|NULL| 4| 5| 6|
+----+----+----+----+
"""
return DataFrame(self._jdf.unionByName(other._jdf, allowMissingColumns), self.sparkSession)
Expand Down Expand Up @@ -4146,10 +4146,10 @@ def fillna(
+---+------+-----+----+
|age|height| name|bool|
+---+------+-----+----+
| 10| 80.5|Alice|null|
| 5| 50.0| Bob|null|
| 50| 50.0| Tom|null|
| 50| 50.0| null|true|
| 10| 80.5|Alice|NULL|
| 5| 50.0| Bob|NULL|
| 50| 50.0| Tom|NULL|
| 50| 50.0| NULL|true|
+---+------+-----+----+
Fill all null values with ``False`` for boolean columns.
Expand All @@ -4159,9 +4159,9 @@ def fillna(
| age|height| name| bool|
+----+------+-----+-----+
| 10| 80.5|Alice|false|
| 5| null| Bob|false|
|null| null| Tom|false|
|null| null| null| true|
| 5| NULL| Bob|false|
|NULL| NULL| Tom|false|
|NULL| NULL| NULL| true|
+----+------+-----+-----+
Fill all null values with to 50 and "unknown" for 'age' and 'name' column respectively.
Expand All @@ -4170,10 +4170,10 @@ def fillna(
+---+------+-------+----+
|age|height| name|bool|
+---+------+-------+----+
| 10| 80.5| Alice|null|
| 5| null| Bob|null|
| 50| null| Tom|null|
| 50| null|unknown|true|
| 10| 80.5| Alice|NULL|
| 5| NULL| Bob|NULL|
| 50| NULL| Tom|NULL|
| 50| NULL|unknown|true|
+---+------+-------+----+
"""
if not isinstance(value, (float, int, str, bool, dict)):
Expand Down Expand Up @@ -4301,9 +4301,9 @@ def replace( # type: ignore[misc]
| age|height| name|
+----+------+-----+
| 20| 80|Alice|
| 5| null| Bob|
|null| 20| Tom|
|null| null| null|
| 5| NULL| Bob|
|NULL| 20| Tom|
|NULL| NULL| NULL|
+----+------+-----+
Replace 'Alice' to null in all columns.
Expand All @@ -4312,10 +4312,10 @@ def replace( # type: ignore[misc]
+----+------+----+
| age|height|name|
+----+------+----+
| 10| 80|null|
| 5| null| Bob|
|null| 10| Tom|
|null| null|null|
| 10| 80|NULL|
| 5| NULL| Bob|
|NULL| 10| Tom|
|NULL| NULL|NULL|
+----+------+----+
Replace 'Alice' to 'A', and 'Bob' to 'B' in the 'name' column.
Expand All @@ -4325,9 +4325,9 @@ def replace( # type: ignore[misc]
| age|height|name|
+----+------+----+
| 10| 80| A|
| 5| null| B|
|null| 10| Tom|
|null| null|null|
| 5| NULL| B|
|NULL| 10| Tom|
|NULL| NULL|NULL|
+----+------+----+
"""
if value is _NoValue:
Expand Down

0 comments on commit a45affe

Please sign in to comment.