From a45affe3c8e7a724aea7dbbc1af08e36001c7540 Mon Sep 17 00:00:00 2001 From: Yikf Date: Thu, 13 Apr 2023 10:15:14 +0800 Subject: [PATCH] [SPARK-43063][SQL] `df.show` should print NULL instead of null for null values ### What changes were proposed in this pull request? Make `df.show` print NULL instead of null when rendering null values, so that its output is consistent with the spark-sql CLI. For example, the following behavior is currently inconsistent: ``` shell scala> spark.sql("select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle') as result").show(false) +------+ |result| +------+ |null | +------+ ``` ``` shell spark-sql> DESC FUNCTION EXTENDED decode; function_desc Function: decode Class: org.apache.spark.sql.catalyst.expressions.Decode Usage: decode(bin, charset) - Decodes the first argument using the second argument character set. decode(expr, search, result [, search, result ] ... [, default]) - Compares expr to each search value in order. If expr is equal to a search value, decode returns the corresponding result. If no match is found, then it returns default. If default is omitted, it returns null. Extended Usage: Examples: > SELECT decode(encode('abc', 'utf-8'), 'utf-8'); abc > SELECT decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic'); San Francisco > SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic'); Non domestic > SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle'); NULL Since: 3.2.0 Time taken: 0.074 seconds, Fetched 4 row(s) ``` ``` shell spark-sql> select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle'); NULL ``` ### Why are the changes needed? To keep `df.show` consistent with the spark-sql CLI when rendering `null`. ### Does this PR introduce _any_ user-facing change? Yes, null values are displayed as NULL instead of null. ### How was this patch tested? GA (GitHub Actions). Closes #40699 from Yikf/show-NULL.
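With this patch applied, the first query above renders the null cell as `NULL`, matching the spark-sql CLI. A sketch of the expected `df.show` output after this change: ``` shell scala> spark.sql("select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle') as result").show(false) +------+ |result| +------+ |NULL | +------+ ```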
Authored-by: Yikf Signed-off-by: Wenchen Fan --- python/pyspark/ml/feature.py | 2 +- python/pyspark/pandas/frame.py | 20 ++--- python/pyspark/sql/column.py | 2 +- python/pyspark/sql/dataframe.py | 68 +++++++-------- python/pyspark/sql/functions.py | 86 +++++++++---------- python/pyspark/sql/readwriter.py | 10 +-- .../sql/tests/connect/test_connect_basic.py | 38 ++++---- .../sql/tests/connect/test_connect_column.py | 36 ++++---- .../tests/connect/test_connect_function.py | 62 ++++++------- .../spark/sql/catalyst/expressions/Cast.scala | 22 ++--- .../catalyst/expressions/CastSuiteBase.scala | 10 +-- .../scala/org/apache/spark/sql/Dataset.scala | 10 +-- .../org/apache/spark/sql/DatasetSuite.scala | 2 +- 13 files changed, 184 insertions(+), 184 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ff7aaf71f9c37..e7ec35bffa022 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -5313,7 +5313,7 @@ class VectorAssembler( +---+---+----+-------------+ | a| b| c| features| +---+---+----+-------------+ - |1.0|2.0|null|[1.0,2.0,NaN]| + |1.0|2.0|NULL|[1.0,2.0,NaN]| |3.0|NaN| 4.0|[3.0,NaN,4.0]| |5.0|6.0| 7.0|[5.0,6.0,7.0]| +---+---+----+-------------+ diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 1f81f0addf90d..8bddcb6bae881 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -1530,7 +1530,7 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D # | A| B| C| # +---+---+----+ # | 1| 2| 3.0| - # | 4| 1|null| + # | 4| 1|NULL| # +---+---+----+ pair_scols: List[GenericColumn] = [] @@ -1560,10 +1560,10 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D # | 2| 2| 3.0| 3.0| # | 0| 0| 4.0| 4.0| # | 0| 1| 4.0| 1.0| - # | 0| 2| null| null| + # | 0| 2| NULL| NULL| # | 1| 1| 1.0| 1.0| - # | 1| 2| null| null| - # | 2| 2| null| null| + # | 1| 2| NULL| NULL| + # | 2| 2| NULL| NULL| # +-------------------+-------------------+-------------------+-------------------+ sdf = sdf.select(F.inline(F.array(*pair_scols))) # type: ignore[arg-type] @@ -1586,15 +1586,15 @@ def corr(self, method: str = "pearson", min_periods: Optional[int] = None) -> "D # +-------------------+-------------------+----------------+ # |__tmp_index_1_col__|__tmp_index_2_col__|__tmp_corr_col__| # +-------------------+-------------------+----------------+ - # | 2| 2| null| - # | 1| 2| null| - # | 2| 1| null| + # | 2| 2| NULL| + # | 1| 2| NULL| + # | 2| 1| NULL| # | 1| 1| 1.0| # | 0| 0| 1.0| # | 0| 1| -1.0| # | 1| 0| -1.0| - # | 0| 2| null| - # | 2| 0| null| + # | 0| 2| NULL| + # | 2| 0| NULL| # +-------------------+-------------------+----------------+ auxiliary_col_name = verify_temp_column_name(sdf, "__corr_auxiliary_temp_column__") @@ -12929,7 +12929,7 @@ def mode(self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True) # |species|legs|wings| # +-------+----+-----+ # | bird| 2| 0.0| - # | null|null| 2.0| + # | NULL|NULL| 2.0| # +-------+----+-----+ sdf = ( sdf.select(F.arrays_zip(*[F.col(name) for name in mode_col_names]).alias(zip_col_name)) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 7339408a55f76..49a42406048c2 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -304,7 +304,7 @@ def __ne__( # type: ignore[override] |(value = foo)|(value <=> foo)|(value <=> NULL)| +-------------+---------------+----------------+ | true| true| false| - | null| false| true| + | NULL| false| true| 
+-------------+---------------+----------------+ >>> df2 = spark.createDataFrame([ ... Row(value = 'bar'), diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 3b2e547c21166..0014695aa7194 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -2308,8 +2308,8 @@ def join( | name|height| +-----+------+ | Bob| 85| - |Alice| null| - | null| 80| + |Alice| NULL| + | NULL| 80| +-----+------+ >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show() +-----+------+ @@ -2317,7 +2317,7 @@ def join( +-----+------+ | Tom| 80| | Bob| 85| - |Alice| null| + |Alice| NULL| +-----+------+ Outer join for both DataFrames with multiple columns. @@ -3278,10 +3278,10 @@ def rollup(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] +-----+----+-----+ | name| age|count| +-----+----+-----+ - | null|null| 2| - |Alice|null| 1| + | NULL|NULL| 2| + |Alice|NULL| 1| |Alice| 2| 1| - | Bob|null| 1| + | Bob|NULL| 1| | Bob| 5| 1| +-----+----+-----+ """ @@ -3327,12 +3327,12 @@ def cube(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] +-----+----+-----+ | name| age|count| +-----+----+-----+ - | null|null| 2| - | null| 2| 1| - | null| 5| 1| - |Alice|null| 1| + | NULL|NULL| 2| + | NULL| 2| 1| + | NULL| 5| 1| + |Alice|NULL| 1| |Alice| 2| 1| - | Bob|null| 1| + | Bob|NULL| 1| | Bob| 5| 1| +-----+----+-----+ """ @@ -3778,8 +3778,8 @@ def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> +----+----+----+----+ |col0|col1|col2|col3| +----+----+----+----+ - | 1| 2| 3|null| - |null| 4| 5| 6| + | 1| 2| 3|NULL| + |NULL| 4| 5| 6| +----+----+----+----+ """ return DataFrame(self._jdf.unionByName(other._jdf, allowMissingColumns), self.sparkSession) @@ -4146,10 +4146,10 @@ def fillna( +---+------+-----+----+ |age|height| name|bool| +---+------+-----+----+ - | 10| 80.5|Alice|null| - | 5| 50.0| Bob|null| - | 50| 50.0| Tom|null| - | 50| 50.0| null|true| + | 10| 80.5|Alice|NULL| + | 5| 50.0| Bob|NULL| + | 50| 50.0| Tom|NULL| + | 50| 50.0| NULL|true| +---+------+-----+----+ Fill all null values with ``False`` for boolean columns. >>> df.na.fill(False).show() +----+------+-----+-----+ | age|height| name| bool| +----+------+-----+-----+ | 10| 80.5|Alice|false| - | 5| null| Bob|false| - |null| null| Tom|false| - |null| null| null| true| + | 5| NULL| Bob|false| + |NULL| NULL| Tom|false| + |NULL| NULL| NULL| true| +----+------+-----+-----+ Fill all null values with 50 and "unknown" for the 'age' and 'name' columns respectively. >>> df.na.fill({'age': 50, 'name': 'unknown'}).show() +---+------+-------+----+ |age|height| name|bool| +---+------+-------+----+ - | 10| 80.5| Alice|null| - | 5| null| Bob|null| - | 50| null| Tom|null| - | 50| null|unknown|true| + | 10| 80.5| Alice|NULL| + | 5| NULL| Bob|NULL| + | 50| NULL| Tom|NULL| + | 50| NULL|unknown|true| +---+------+-------+----+ """ if not isinstance(value, (float, int, str, bool, dict)): @@ -4301,9 +4301,9 @@ def replace( # type: ignore[misc] | age|height| name| +----+------+-----+ | 20| 80|Alice| - | 5| null| Bob| - |null| 20| Tom| - |null| null| null| + | 5| NULL| Bob| + |NULL| 20| Tom| + |NULL| NULL| NULL| +----+------+-----+ Replace 'Alice' with null in all columns.
@@ -4312,10 +4312,10 @@ def replace( # type: ignore[misc] +----+------+----+ | age|height|name| +----+------+----+ - | 10| 80|null| - | 5| null| Bob| - |null| 10| Tom| - |null| null|null| + | 10| 80|NULL| + | 5| NULL| Bob| + |NULL| 10| Tom| + |NULL| NULL|NULL| +----+------+----+ Replace 'Alice' with 'A', and 'Bob' with 'B' in the 'name' column. @@ -4325,9 +4325,9 @@ def replace( # type: ignore[misc] | age|height|name| +----+------+----+ | 10| 80| A| - | 5| null| B| - |null| 10| Tom| - |null| null|null| + | 5| NULL| B| + |NULL| 10| Tom| + |NULL| NULL|NULL| +----+------+----+ """ if value is _NoValue: diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 2f71ac845a6de..e7f237355d835 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1749,7 +1749,7 @@ def asc_nulls_first(col: "ColumnOrName") -> Column: +---+-----+ |age| name| +---+-----+ - | 0| null| + | 0| NULL| | 2|Alice| | 1| Bob| +---+-----+ @@ -1794,7 +1794,7 @@ def asc_nulls_last(col: "ColumnOrName") -> Column: +---+-----+ | 2|Alice| | 1| Bob| - | 0| null| + | 0| NULL| +---+-----+ """ @@ -1833,7 +1833,7 @@ def desc_nulls_first(col: "ColumnOrName") -> Column: +---+-----+ |age| name| +---+-----+ - | 0| null| + | 0| NULL| | 1| Bob| | 2|Alice| +---+-----+ @@ -1878,7 +1878,7 @@ def desc_nulls_last(col: "ColumnOrName") -> Column: +---+-----+ | 1| Bob| | 2|Alice| - | 0| null| + | 0| NULL| +---+-----+ """ @@ -2712,16 +2712,16 @@ def coalesce(*cols: "ColumnOrName") -> Column: +----+----+ | a| b| +----+----+ - |null|null| - | 1|null| - |null| 2| + |NULL|NULL| + | 1|NULL| + |NULL| 2| +----+----+ >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show() +--------------+ |coalesce(a, b)| +--------------+ - | null| + | NULL| | 1| | 2| +--------------+ @@ -2730,9 +2730,9 @@ def coalesce(*cols: "ColumnOrName") -> Column: +----+----+----------------+ | a| b|coalesce(a, 0.0)| +----+----+----------------+ - |null|null| 0.0| - | 1|null| 1.0| - |null| 2| 0.0| + |NULL|NULL| 0.0| + | 1|NULL| 1.0| + |NULL| 2| 0.0| +----+----+----------------+ """ return _invoke_function_over_seq_of_columns("coalesce", cols) @@ -2939,7 +2939,7 @@ def first(col: "ColumnOrName", ignorenulls: bool = False) -> Column: +-----+----------+ | name|first(age)| +-----+----------+ - |Alice| null| + |Alice| NULL| | Bob| 5| +-----+----------+ @@ -2984,7 +2984,7 @@ def grouping(col: "ColumnOrName") -> Column: +-----+--------------+--------+ | name|grouping(name)|sum(age)| +-----+--------------+--------+ - | null| 1| 7| + | NULL| 1| 7| |Alice| 0| 2| | Bob| 0| 5| +-----+--------------+--------+ @@ -3028,12 +3028,12 @@ def grouping_id(*cols: "ColumnOrName") -> Column: +----+----+-------------+-------+ | c2| c3|grouping_id()|sum(c1)| +----+----+-------------+-------+ - |null|null| 3| 8| - |null| a| 2| 4| - |null| c| 2| 4| - | a|null| 1| 4| + |NULL|NULL| 3| 8| + |NULL| a| 2| 4| + |NULL| c| 2| 4| + | a|NULL| 1| 4| | a| a| 0| 4| - | b|null| 1| 4| + | b|NULL| 1| 4| | b| c| 0| 4| +----+----+-------------+-------+ """ @@ -3125,8 +3125,8 @@ def isnull(col: "ColumnOrName") -> Column: +----+----+-----+-----+ | a| b| r1| r2| +----+----+-----+-----+ - | 1|null|false| true| - |null| 2| true|false| + | 1|NULL|false| true| + |NULL| 2| true|false| +----+----+-----+-----+ """ return _invoke_function_over_columns("isnull", col) @@ -3169,7 +3169,7 @@ def last(col: "ColumnOrName", ignorenulls: bool = False) -> Column: +-----+---------+ | name|last(age)| +-----+---------+ - |Alice| null| + |Alice| NULL| | Bob| 5| +-----+---------+ @@ -3824,8 +3824,8 @@ def
when(condition: Column, value: Any) -> Column: +----+ | age| +----+ - |null| - |null| + |NULL| + |NULL| | 3| +----+ """ @@ -4050,10 +4050,10 @@ def lag(col: "ColumnOrName", offset: int = 1, default: Optional[Any] = None) -> +---+---+-------------+ | c1| c2|previous_value| +---+---+-------------+ - | a| 1| null| + | a| 1| NULL| | a| 2| 1| | a| 3| 2| - | b| 2| null| + | b| 2| NULL| | b| 8| 2| +---+---+-------------+ >>> df.withColumn("previous_value", lag("c2", 1, 0).over(w)).show() @@ -4133,9 +4133,9 @@ def lead(col: "ColumnOrName", offset: int = 1, default: Optional[Any] = None) -> +---+---+----------+ | a| 1| 2| | a| 2| 3| - | a| 3| null| + | a| 3| NULL| | b| 2| 8| - | b| 8| null| + | b| 8| NULL| +---+---+----------+ >>> df.withColumn("next_value", lead("c2", 1, 0).over(w)).show() +---+---+----------+ @@ -4225,10 +4225,10 @@ def nth_value(col: "ColumnOrName", offset: int, ignoreNulls: Optional[bool] = Fa +---+---+---------+ | c1| c2|nth_value| +---+---+---------+ - | a| 1| null| + | a| 1| NULL| | a| 2| 2| | a| 3| 2| - | b| 2| null| + | b| 2| NULL| | b| 8| 8| +---+---+---------+ """ @@ -7602,14 +7602,14 @@ def get(col: "ColumnOrName", index: Union["ColumnOrName", int]) -> Column: +-------------+ |get(data, -1)| +-------------+ - | null| + | NULL| +-------------+ >>> df.select(get(df.data, 3)).show() +------------+ |get(data, 3)| +------------+ - | null| + | NULL| +------------+ >>> df.select(get(df.data, "index")).show() @@ -8085,8 +8085,8 @@ def explode_outer(col: "ColumnOrName") -> Column: | id| an_array| key|value| +---+----------+----+-----+ | 1|[foo, bar]| x| 1.0| - | 2| []|null| null| - | 3| null|null| null| + | 2| []|NULL| NULL| + | 3| NULL|NULL| NULL| +---+----------+----+-----+ >>> df.select("id", "a_map", explode_outer("an_array")).show() +---+----------+----+ +---+----------+----+ | 1|{x -> 1.0}| foo| | 1|{x -> 1.0}| bar| - | 2| {}|null| - | 3| null|null| + | 2| {}|NULL| + | 3| NULL|NULL| +---+----------+----+ """ return _invoke_function_over_columns("explode_outer", col) @@ -8136,8 +8136,8 @@ def posexplode_outer(col: "ColumnOrName") -> Column: | id| an_array| pos| key|value| +---+----------+----+----+-----+ | 1|[foo, bar]| 0| x| 1.0| - | 2| []|null|null| null| - | 3| null|null|null| null| + | 2| []|NULL|NULL| NULL| + | 3| NULL|NULL|NULL| NULL| +---+----------+----+----+-----+ >>> df.select("id", "a_map", posexplode_outer("an_array")).show() +---+----------+----+----+ | 1|{x -> 1.0}| 0| foo| | 1|{x -> 1.0}| 1| bar| - | 2| {}|null|null| - | 3| null|null|null| + | 2| {}|NULL|NULL| + | 3| NULL|NULL|NULL| +---+----------+----+----+ """ return _invoke_function_over_columns("posexplode_outer", col) @@ -8191,7 +8191,7 @@ def inline_outer(col: "ColumnOrName") -> Column: +---+----+----+ | 1| 1| 2| | 1| 3| 4| - | 2|null|null| + | 2|NULL|NULL| +---+----+----+ """ return _invoke_function_over_columns("inline_outer", col) @@ -8796,14 +8796,14 @@ def flatten(col: "ColumnOrName") -> Column: |data | +------------------------+ |[[1, 2, 3], [4, 5], [6]]| - |[null, [4, 5]] | + |[NULL, [4, 5]] | +------------------------+ >>> df.select(flatten(df.data).alias('r')).show() +------------------+ | r| +------------------+ |[1, 2, 3, 4, 5, 6]| - | null| + | NULL| +------------------+ """ return _invoke_function_over_columns("flatten", col) @@ -9059,7 +9059,7 @@ def arrays_zip(*cols: "ColumnOrName") -> Column: +------------------------------------+ |zipped
| +------------------------------------+ - |[{1, 2, 3}, {2, 4, 6}, {3, 6, null}]| + |[{1, 2, 3}, {2, 4, 6}, {3, 6, NULL}]| +------------------------------------+ >>> df.printSchema() root diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 74a1320c215bc..2c42dae42f245 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -198,7 +198,7 @@ def option(self, key: str, value: "OptionalPrimitiveType") -> "DataFrameReader": +---+----+ |age|name| +---+----+ - |100|null| + |100|NULL| +---+----+ """ self._jreader = self._jreader.option(key, to_str(value)) @@ -240,7 +240,7 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataFrameReader": +---+----+ |age|name| +---+----+ - |100|null| + |100|NULL| +---+----+ """ for k in options: @@ -295,7 +295,7 @@ def load( +---+----+ |age|name| +---+----+ - |100|null| + |100|NULL| +---+----+ """ if format is not None: @@ -695,7 +695,7 @@ def csv( +---+----+ |age|name| +---+----+ - |100|null| + |100|NULL| +---+----+ """ self._set_opts( @@ -1829,7 +1829,7 @@ def csv( +---+----+ |age|name| +---+----+ - |100|null| + |100|NULL| +---+----+ """ self.mode(mode) diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index c53e073543d46..2658ad79ab066 100644 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -165,9 +165,9 @@ def test_df_get_item(self): # +-----+----+----+ # | a| b| c| # +-----+----+----+ - # | true| 1|null| - # |false|null| 2.0| - # | null| 3| 3.0| + # | true| 1|NULL| + # |false|NULL| 2.0| + # | NULL| 3| 3.0| # +-----+----+----+ cdf = self.connect.sql(query) @@ -673,7 +673,7 @@ def test_with_none_and_nan(self): # +---+-----+ # | 1| NaN| # | 2| 42.0| - # | 3| null| + # | 3| NULL| # +---+-----+ for data in [data1, data2, data3, data4, data5, data6, data7, data8]: @@ -1278,9 +1278,9 @@ def test_sort(self): # +-----+----+----+ # | a| b| c| # +-----+----+----+ - # |false| 1|null| - # |false|null| 2.0| - # | null| 3| 3.0| + # |false| 1|NULL| + # |false|NULL| 2.0| + # | NULL| 3| 3.0| # +-----+----+----+ cdf = self.connect.sql(query) @@ -1498,9 +1498,9 @@ def test_fill_na(self): # +-----+----+----+ # | a| b| c| # +-----+----+----+ - # |false| 1|null| - # |false|null| 2.0| - # | null| 3| 3.0| + # |false| 1|NULL| + # |false|NULL| 2.0| + # | NULL| 3| 3.0| # +-----+----+----+ self.assert_eq( @@ -1530,9 +1530,9 @@ def test_drop_na(self): # +-----+----+----+ # | a| b| c| # +-----+----+----+ - # |false| 1|null| - # |false|null| 2.0| - # | null| 3| 3.0| + # |false| 1|NULL| + # |false|NULL| 2.0| + # | NULL| 3| 3.0| # +-----+----+----+ self.assert_eq( @@ -1562,9 +1562,9 @@ def test_replace(self): # +-----+----+----+ # | a| b| c| # +-----+----+----+ - # |false| 1|null| - # |false|null| 2.0| - # | null| 3| 3.0| + # |false| 1|NULL| + # |false|NULL| 2.0| + # | NULL| 3| 3.0| # +-----+----+----+ self.assert_eq( @@ -2692,8 +2692,8 @@ def test_collect_nested_type(self): # | a| b| c| d| e| f| g| h| # +---+---+----+----+-----+----+------------+-------------------+ # | 1| 4| 0| 8| true|true|[1, null, 3]| {1 -> 2, 3 -> 4}| - # | 2| 5| -1|null|false|null| [1, 3]|{1 -> null, 3 -> 4}| - # | 3| 6|null| 0|false|null| [null]| null| + # | 2| 5| -1|NULL|false|NULL| [1, 3]|{1 -> null, 3 -> 4}| + # | 3| 6|NULL| 0|false|NULL| [null]| NULL| # +---+---+----+----+-----+----+------------+-------------------+ cdf = self.connect.sql(query) @@ -2750,7 +2750,7 @@ def 
test_collect_nested_type(self): # +-------------------+-------------------+ # | {1 -> 2, 3 -> 4}| {1 -> 4, 4 -> 0}| # |{1 -> null, 3 -> 4}| {2 -> 5, 5 -> -1}| - # | null|{3 -> 6, 6 -> null}| + # | NULL|{3 -> 6, 6 -> null}| # +-------------------+-------------------+ self.assertEqual( cdf.select(CF.col("h"), CF.create_map("a", "b", "b", "c")).collect(), diff --git a/python/pyspark/sql/tests/connect/test_connect_column.py b/python/pyspark/sql/tests/connect/test_connect_column.py index 30be5fb1498f2..2a22ca6ad8d60 100644 --- a/python/pyspark/sql/tests/connect/test_connect_column.py +++ b/python/pyspark/sql/tests/connect/test_connect_column.py @@ -172,8 +172,8 @@ def test_column_with_null(self): # +---+----+----+ # | a| b| c| # +---+----+----+ - # | 1| 1|null| - # | 2|null|null| + # | 1| 1|NULL| + # | 2|NULL|NULL| # | 3| 3| 1| # +---+----+----+ @@ -250,9 +250,9 @@ def test_datetime(self): # | a| b| c| # +-------------------+----------+----+ # |2022-12-22 15:50:00|2022-12-25| 1.1| - # |2022-12-22 18:50:00| null| 2.2| + # |2022-12-22 18:50:00| NULL| 2.2| # |2022-12-23 15:50:00|2022-12-24| 3.3| - # | null|2022-12-22|null| + # | NULL|2022-12-22|NULL| # +-------------------+----------+----+ cdf = self.connect.sql(query) @@ -308,8 +308,8 @@ def test_decimal(self): # +---+----+---+----+ # | a| b| c| d| # +---+----+---+----+ - # | 1| 1| 0|null| - # | 2|null| 1| 2.0| + # | 1| 1| 0|NULL| + # | 2|NULL| 1| 2.0| # | 3| 3| 4| 3.5| # +---+----+---+----+ @@ -349,9 +349,9 @@ def test_none(self): # +----+----+----+ # | a| b| c| # +----+----+----+ - # | 1| 1|null| - # | 2|null| 1| - # |null| 3| 4| + # | 1| 1|NULL| + # | 2|NULL| 1| + # |NULL| 3| 4| # +----+----+----+ cdf = self.connect.sql(query) @@ -576,8 +576,8 @@ def test_isin(self): # +---+----+---+----+ # | a| b| c| d| # +---+----+---+----+ - # | 1| 1| 0|null| - # | 2|null| 1| 2.0| + # | 1| 1| 0|NULL| + # | 2|NULL| 1| 2.0| # | 3| 3| 4| 3.5| # +---+----+---+----+ @@ -632,9 +632,9 @@ def test_between(self): # | a| b| c| # +-------------------+----------+----+ # |2022-12-22 15:50:00|2022-12-25| 1.1| - # |2022-12-22 18:50:00| null| 2.2| + # |2022-12-22 18:50:00| NULL| 2.2| # |2022-12-23 15:50:00|2022-12-24| 3.3| - # | null|2022-12-22|null| + # | NULL|2022-12-22|NULL| # +-------------------+----------+----+ cdf = self.connect.sql(query) @@ -692,7 +692,7 @@ def test_column_bitwise_ops(self): # | a| b| c| # +---+----+---+ # | 1| 1| 0| - # | 2|null| 1| + # | 2|NULL| 1| # | 3| 3| 4| # +---+----+---+ @@ -734,7 +734,7 @@ def test_column_accessor(self): # +----------------+-------------------+------------+----+ # |{1.0, 1.0, 2022}|{b -> 123, a -> kk}| [1, 2, 3]|2022| # |{2.0, 2.0, 2018}| {a -> xy}|[-1, -2, -3]|2018| - # |{3.0, 3.0, null}| {a -> ab}| [-1, 0, 1]|null| + # |{3.0, 3.0, null}| {a -> ab}| [-1, 0, 1]|NULL| # +----------------+-------------------+------------+----+ cdf = self.connect.sql(query) @@ -798,8 +798,8 @@ def test_column_arithmetic_ops(self): # +---+----+---+----+ # | a| b| c| d| # +---+----+---+----+ - # | 1| 1| 0|null| - # | 2|null| 1| 2.0| + # | 1| 1| 0|NULL| + # | 2|NULL| 1| 2.0| # | 3| 3| 4| 3.5| # +---+----+---+----+ @@ -857,7 +857,7 @@ def test_column_field_ops(self): # +----------------------+----+ # | {1.0, 1.0, 2022, 1}| 0| # |{2.0, 2.0, 2018, null}| 2| - # | {3.0, 3.0, null, 3}|null| + # | {3.0, 3.0, null, 3}|NULL| # +----------------------+----+ cdf = self.connect.sql(query) diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py index 9a9ecc1a6fec4..563db9ea63db9 
100644 --- a/python/pyspark/sql/tests/connect/test_connect_function.py +++ b/python/pyspark/sql/tests/connect/test_connect_function.py @@ -134,8 +134,8 @@ def test_broadcast(self): # +---+----+----+ # | a| b| c| # +---+----+----+ - # | 0| NaN|null| - # | 1|null| 2.0| + # | 0| NaN|NULL| + # | 1|NULL| 2.0| # | 2| 2.1| 3.5| # +---+----+----+ @@ -185,8 +185,8 @@ def test_normal_functions(self): # +---+----+----+ # | a| b| c| # +---+----+----+ - # | 0| NaN|null| - # | 1|null| 2.0| + # | 0| NaN|NULL| + # | 1|NULL| 2.0| # | 2| 2.1| 3.5| # +---+----+----+ @@ -272,8 +272,8 @@ def test_when_otherwise(self): # +---+----+----+ # | a| b| c| # +---+----+----+ - # | 0| NaN|null| - # | 1|null| 2.0| + # | 0| NaN|NULL| + # | 1|NULL| 2.0| # | 2| 2.1| 3.5| # | 3| 3.1| NaN| # +---+----+----+ @@ -414,9 +414,9 @@ def test_sort_with_nulls_order(self): # +-----+----+----+ # | a| b| c| # +-----+----+----+ - # |false| 1|null| - # | true|null| 2.0| - # | null| 3| 3.0| + # |false| 1|NULL| + # | true|NULL| 2.0| + # | NULL| 3| 3.0| # +-----+----+----+ cdf = self.connect.sql(query) @@ -460,9 +460,9 @@ def test_math_functions(self): # +-----+----+----+ # | a| b| c| # +-----+----+----+ - # |false| 1|null| - # | true|null| 2.0| - # | null| 3| 3.5| + # |false| 1|NULL| + # | true|NULL| 2.0| + # | NULL| 3| 3.5| # +-----+----+----+ cdf = self.connect.sql(query) @@ -582,8 +582,8 @@ def test_aggregation_functions(self): # +---+----+----+ # | a| b| c| # +---+----+----+ - # | 0| NaN|null| - # | 1|null| 2.0| + # | 0| NaN|NULL| + # | 1|NULL| 2.0| # | 1| 2.1| 3.5| # | 0| 0.5| 1.0| # +---+----+----+ @@ -714,8 +714,8 @@ def test_window_functions(self): # +---+----+----+ # | a| b| c| # +---+----+----+ - # | 0| NaN|null| - # | 1|null| 2.0| + # | 0| NaN|NULL| + # | 1|NULL| 2.0| # | 1| 2.1| 3.5| # | 0| 0.5| 1.0| # | 0| 1.5| 1.1| @@ -1014,8 +1014,8 @@ def test_collection_functions(self): # | a| b| c| d| e| f| # +---------+------------+------------+---+---+----+ # | [a, ab]| [1, 2, 3]|[1, null, 3]| 1| 2| a| - # |[x, null]| null| [1, 3]| 3| 4| x| - # | null|[-1, -2, -3]| []| 5| 6|null| + # |[x, null]| NULL| [1, 3]| 3| 4| x| + # | NULL|[-1, -2, -3]| []| 5| 6|NULL| # +---------+------------+------------+---+---+----+ cdf = self.connect.sql(query) @@ -1271,8 +1271,8 @@ def test_map_collection_functions(self): # | a| b| c| e| f| g| h| # +---------+-----------+----------------------+---+---+------+------+ # |{a -> ab}| {x -> ab}| {1 -> 2, 3 -> 4}| 1| a|[1, 2]|[X, Y]| - # |{x -> yz}|{c -> null}| null| 2| x|[3, 4]|[A, B]| - # |{c -> de}| null|{-1 -> null, -3 -> -4}| -3| c| null| [Z]| + # |{x -> yz}|{c -> null}| NULL| 2| x|[3, 4]|[A, B]| + # |{c -> de}| NULL|{-1 -> null, -3 -> -4}| -3| c| NULL| [Z]| # +---------+-----------+----------------------+---+---+------+------+ cdf = self.connect.sql(query) @@ -1332,8 +1332,8 @@ def test_generator_functions(self): # | a| b| c| d| e| f| g| # +---------+------------+------------+----------------------+---+---+---+ # | [a, ab]| [1, 2, 3]|[1, null, 3]| {1 -> 2, 3 -> 4}| 1|2.0| 3| - # |[x, null]| null| [1, 3]| null| 3|4.0| 5| - # | null|[-1, -2, -3]| []|{-1 -> null, -3 -> -4}| 7|NaN| 9| + # |[x, null]| NULL| [1, 3]| NULL| 3|4.0| 5| + # | NULL|[-1, -2, -3]| []|{-1 -> null, -3 -> -4}| 7|NaN| 9| # +---------+------------+------------+----------------------+---+---+---+ cdf = self.connect.sql(query) @@ -1455,9 +1455,9 @@ def test_lambda_functions(self): # +---------+------------+------------+---+---+----+-------------------+---------+ # | a| b| c| d| e| f| g| h| # 
+---------+------------+------------+---+---+----+-------------------+---------+ - # | [a, ab]| [1, 2, 3]|[1, null, 3]| 1| 2| a| null| {0 -> 0}| - # |[x, null]| null| [1, 3]| 3| 4| x| {2 -> 0}|{-1 -> 1}| - # | null|[-1, -2, -3]| []| 5| 6|null|{-1 -> 2, -3 -> -4}| null| + # | [a, ab]| [1, 2, 3]|[1, null, 3]| 1| 2| a| NULL| {0 -> 0}| + # |[x, null]| NULL| [1, 3]| 3| 4| x| {2 -> 0}|{-1 -> 1}| + # | NULL|[-1, -2, -3]| []| 5| 6|NULL|{-1 -> 2, -3 -> -4}| NULL| # +---------+------------+------------+---+---+----+-------------------+---------+ cdf = self.connect.sql(query) @@ -1880,8 +1880,8 @@ def test_string_functions_one_arg(self): # +--------+-----+----+ # | a| b| c| # +--------+-----+----+ - # | ab |ab |null| - # | ab| null| ab| + # | ab |ab |NULL| + # | ab| NULL| ab| # +--------+-----+----+ cdf = self.connect.sql(query) @@ -2276,9 +2276,9 @@ def test_misc_functions(self): # | a| b| c| d| # +---+----+----+----+ # | 0| NaN| x|[78]| - # | 1|null| y|[79]| + # | 1|NULL| y|[79]| # | 1| 2.1| z|[7A]| - # | 0| 0.5|null|null| + # | 0| 0.5|NULL|NULL| # +---+----+----+----+ cdf = self.connect.sql(query) @@ -2342,9 +2342,9 @@ def test_call_udf(self): # | a| b| c| d| # +----+----+----+----+ # |-1.0| NaN| x|[78]| - # |-2.1|null| y|[79]| + # |-2.1|NULL| y|[79]| # | 1.0| 2.1| z|[7A]| - # | 0.0| 0.5|null|null| + # | 0.0| 0.5|NULL|NULL| # +----+----+----+----+ cdf = self.connect.sql(query) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index ae09183811c77..3de31b1ed28df 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -620,7 +620,7 @@ case class Cast( if (array.numElements > 0) { val toUTF8String = castToString(et) if (array.isNullAt(0)) { - if (!legacyCastToStr) builder.append("null") + if (!legacyCastToStr) builder.append("NULL") } else { builder.append(toUTF8String(array.get(0, et)).asInstanceOf[UTF8String]) } @@ -628,7 +628,7 @@ case class Cast( while (i < array.numElements) { builder.append(",") if (array.isNullAt(i)) { - if (!legacyCastToStr) builder.append(" null") + if (!legacyCastToStr) builder.append(" NULL") } else { builder.append(" ") builder.append(toUTF8String(array.get(i, et)).asInstanceOf[UTF8String]) @@ -651,7 +651,7 @@ case class Cast( builder.append(keyToUTF8String(keyArray.get(0, kt)).asInstanceOf[UTF8String]) builder.append(" ->") if (valueArray.isNullAt(0)) { - if (!legacyCastToStr) builder.append(" null") + if (!legacyCastToStr) builder.append(" NULL") } else { builder.append(" ") builder.append(valueToUTF8String(valueArray.get(0, vt)).asInstanceOf[UTF8String]) @@ -662,7 +662,7 @@ case class Cast( builder.append(keyToUTF8String(keyArray.get(i, kt)).asInstanceOf[UTF8String]) builder.append(" ->") if (valueArray.isNullAt(i)) { - if (!legacyCastToStr) builder.append(" null") + if (!legacyCastToStr) builder.append(" NULL") } else { builder.append(" ") builder.append(valueToUTF8String(valueArray.get(i, vt)) @@ -682,7 +682,7 @@ case class Cast( val st = fields.map(_.dataType) val toUTF8StringFuncs = st.map(castToString) if (row.isNullAt(0)) { - if (!legacyCastToStr) builder.append("null") + if (!legacyCastToStr) builder.append("NULL") } else { builder.append(toUTF8StringFuncs(0)(row.get(0, st(0))).asInstanceOf[UTF8String]) } @@ -690,7 +690,7 @@ case class Cast( while (i < row.numFields) { builder.append(",") if (row.isNullAt(i)) { - 
if (!legacyCastToStr) builder.append(" null") + if (!legacyCastToStr) builder.append(" NULL") } else { builder.append(" ") builder.append(toUTF8StringFuncs(i)(row.get(i, st(i))).asInstanceOf[UTF8String]) @@ -1421,14 +1421,14 @@ case class Cast( |$buffer.append("["); |if ($array.numElements() > 0) { | if ($array.isNullAt(0)) { - | ${appendIfNotLegacyCastToStr(buffer, "null")} + | ${appendIfNotLegacyCastToStr(buffer, "NULL")} | } else { | $buffer.append($elementToStringFunc(${CodeGenerator.getValue(array, et, "0")})); | } | for (int $loopIndex = 1; $loopIndex < $array.numElements(); $loopIndex++) { | $buffer.append(","); | if ($array.isNullAt($loopIndex)) { - | ${appendIfNotLegacyCastToStr(buffer, " null")} + | ${appendIfNotLegacyCastToStr(buffer, " NULL")} | } else { | $buffer.append(" "); | $buffer.append($elementToStringFunc(${CodeGenerator.getValue(array, et, loopIndex)})); @@ -1478,7 +1478,7 @@ case class Cast( | $buffer.append($keyToStringFunc($getMapFirstKey)); | $buffer.append(" ->"); | if ($map.valueArray().isNullAt(0)) { - | ${appendIfNotLegacyCastToStr(buffer, " null")} + | ${appendIfNotLegacyCastToStr(buffer, " NULL")} | } else { | $buffer.append(" "); | $buffer.append($valueToStringFunc($getMapFirstValue)); @@ -1488,7 +1488,7 @@ case class Cast( | $buffer.append($keyToStringFunc($getMapKeyArray)); | $buffer.append(" ->"); | if ($map.valueArray().isNullAt($loopIndex)) { - | ${appendIfNotLegacyCastToStr(buffer, " null")} + | ${appendIfNotLegacyCastToStr(buffer, " NULL")} | } else { | $buffer.append(" "); | $buffer.append($valueToStringFunc($getMapValueArray)); @@ -1512,7 +1512,7 @@ case class Cast( code""" |${if (i != 0) code"""$buffer.append(",");""" else EmptyBlock} |if ($row.isNullAt($i)) { - | ${appendIfNotLegacyCastToStr(buffer, if (i == 0) "null" else " null")} + | ${appendIfNotLegacyCastToStr(buffer, if (i == 0) "NULL" else " NULL")} |} else { | ${if (i != 0) code"""$buffer.append(" ");""" else EmptyBlock} | diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala index bad85ca4176b5..504fb8648b6e4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala @@ -784,7 +784,7 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { Seq(false, true).foreach { omitNull => withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> omitNull.toString) { val ret3 = cast(Literal.create(Array("ab", null, "c")), StringType) - checkEvaluation(ret3, s"[ab,${if (omitNull) "" else " null"}, c]") + checkEvaluation(ret3, s"[ab,${if (omitNull) "" else " NULL"}, c]") } } val ret4 = @@ -813,7 +813,7 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { val ret1 = cast(Literal.create(Array(null, null)), StringType) checkEvaluation( ret1, - s"[${if (omitNull) "" else "null"},${if (omitNull) "" else " null"}]") + s"[${if (omitNull) "" else "NULL"},${if (omitNull) "" else " NULL"}]") } } } @@ -828,7 +828,7 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { val ret2 = cast( Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), StringType) - checkEvaluation(ret2, s"${lb}1 -> a, 2 ->${if (legacyCast) "" else " null"}, 3 -> c$rb") + checkEvaluation(ret2, s"${lb}1 -> a, 2 ->${if (legacyCast) "" else " NULL"}, 3 -> c$rb") val 
ret3 = cast( Literal.create(Map( 1 -> Date.valueOf("2014-12-03"), @@ -860,7 +860,7 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) checkEvaluation(ret1, s"${lb}1, a, 0.1$rb") val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) - checkEvaluation(ret2, s"${lb}1,${if (legacyCast) "" else " null"}, a$rb") + checkEvaluation(ret2, s"${lb}1,${if (legacyCast) "" else " NULL"}, a$rb") val ret3 = cast(Literal.create( (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) checkEvaluation(ret3, s"${lb}2014-12-03, 2014-12-03 15:05:00$rb") @@ -882,7 +882,7 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { val ret1 = cast(Literal.create(Tuple2[String, String](null, null)), StringType) checkEvaluation( ret1, - s"$lb${if (legacyCast) "" else "null"},${if (legacyCast) "" else " null"}$rb") + s"$lb${if (legacyCast) "" else "NULL"},${if (legacyCast) "" else " NULL"}$rb") } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index be393e46e54d5..584ce19c77a28 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -289,7 +289,7 @@ class Dataset[T] private[sql]( schema.fieldNames.map(SchemaUtils.escapeMetaCharacters).toSeq +: data.map { row => row.toSeq.map { cell => val str = cell match { - case null => "null" + case null => "NULL" case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]") case _ => // Escapes meta-characters not to break the `showString` format @@ -2392,8 +2392,8 @@ class Dataset[T] private[sql]( * // +----+----+----+----+ * // |col0|col1|col2|col3| * // +----+----+----+----+ - * // | 1| 2| 3|null| - * // | 5| 4|null| 6| + * // | 1| 2| 3|NULL| + * // | 5| 4|NULL| 6| * // +----+----+----+----+ * * df2.unionByName(df1, true).show @@ -2402,8 +2402,8 @@ class Dataset[T] private[sql]( * // +----+----+----+----+ * // |col1|col0|col3|col2| * // +----+----+----+----+ - * // | 4| 5| 6|null| - * // | 2| 1|null| 3| + * // | 4| 5| 6|NULL| + * // | 2| 1|NULL| 3| * // +----+----+----+----+ * }}} * diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index b6297bc24acb6..4aca7c8a5a666 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1430,7 +1430,7 @@ class DatasetSuite extends QueryTest """+---+----+---+ || b| a| c| |+---+----+---+ - || 0|null| 1| + || 0|NULL| 1| || 0| | 1| || 0|ab c| 1| || 0|1098| 1|