From 5fb862f6751710df319f0b6fbce9826ada81453d Mon Sep 17 00:00:00 2001 From: fusheng Date: Tue, 11 Feb 2025 11:41:41 +0800 Subject: [PATCH 01/13] Add an example for get_json_object when the JSON object is of JSON array type --- .../apache/spark/sql/catalyst/expressions/jsonExpressions.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 5e6da7ac41250..2a118bb606325 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -42,6 +42,8 @@ import org.apache.spark.unsafe.types.UTF8String Examples: > SELECT _FUNC_('{"a":"b"}', '$.a'); b + > SELECT _FUNC_('[{"a":"b"},{"a":"c"}]', '$[*].a'); + ["b","c"] """, group = "json_funcs", since = "1.5.0") From 76045edbcd205ab9c3101fbc10efc9f2fd50fe31 Mon Sep 17 00:00:00 2001 From: fusheng Date: Tue, 11 Feb 2025 21:22:01 +0800 Subject: [PATCH 02/13] add more comprehensive examples --- python/pyspark/sql/functions/builtin.py | 12 ++++++++++++ .../sql/catalyst/expressions/jsonExpressions.scala | 2 ++ 2 files changed, 14 insertions(+) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 4575bf730fcaa..6564013b1adb6 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20115,11 +20115,23 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: Examples -------- + Example1: Get json object from json object + >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] >>> df = spark.createDataFrame(data, ("key", "jstring")) >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\ ... get_json_object(df.jstring, '$.f2').alias("c1") ).collect() [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] + + Example2: Get json object from json array object + data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), ("2", '''[{"f1": "value12"},{"f1": "value13"}]''')] + df = spark.createDataFrame(data, ("key", "jarray")) + df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ + ... get_json_object(df.jarray, '$[0].f2').alias("c1") ).collect() + [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] + df.select(df.key, get_json_object(df.jarray, '$[*].f1').alias("c0"), \\ + ... get_json_object(df.jarray, '$[*].f2').alias("c1") ).collect() + [Row(key='1', c0='["value1","value3"]', c1='["value2","value4"]'), Row(key='2', c0='["value12","value13"]', c1=None)] """ from pyspark.sql.classic.column import _to_java_column diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 2a118bb606325..84b8374599d30 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -42,6 +42,8 @@ import org.apache.spark.unsafe.types.UTF8String Examples: > SELECT _FUNC_('{"a":"b"}', '$.a'); b + > SELECT _FUNC_('[{"a":"b"},{"a":"c"}]', '$[0].a'); + b > SELECT _FUNC_('[{"a":"b"},{"a":"c"}]', '$[*].a'); ["b","c"] """, From 25fb4f847b2a7cc401673c53f474c6fe417e58f8 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 12 Feb 2025 09:51:00 +0900 Subject: [PATCH 03/13] Update python/pyspark/sql/functions/builtin.py --- python/pyspark/sql/functions/builtin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 6564013b1adb6..2b12a1c357096 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20124,6 +20124,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] Example2: Get json object from json array object + data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), ("2", '''[{"f1": "value12"},{"f1": "value13"}]''')] df = spark.createDataFrame(data, ("key", "jarray")) df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ From 6b9d029a1d32a17ff58ac7541846849b8176dea0 Mon Sep 17 00:00:00 2001 From: fusheng Date: Wed, 12 Feb 2025 14:40:04 +0800 Subject: [PATCH 04/13] fix the indentation --- python/pyspark/sql/functions/builtin.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 6564013b1adb6..f2fa6210ff65a 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20115,7 +20115,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: Examples -------- - Example1: Get json object from json object + Example 1: Extracts a json object from json string >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] >>> df = spark.createDataFrame(data, ("key", "jstring")) @@ -20123,15 +20123,17 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: ... get_json_object(df.jstring, '$.f2').alias("c1") ).collect() [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] - Example2: Get json object from json array object - data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), ("2", '''[{"f1": "value12"},{"f1": "value13"}]''')] + Example 2: Extracts a json object from json array + data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), \\ + ... ("2", '''[{"f1": "value12"},{"f1": "value13"}]''')] df = spark.createDataFrame(data, ("key", "jarray")) df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ ... get_json_object(df.jarray, '$[0].f2').alias("c1") ).collect() [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] df.select(df.key, get_json_object(df.jarray, '$[*].f1').alias("c0"), \\ ... get_json_object(df.jarray, '$[*].f2').alias("c1") ).collect() - [Row(key='1', c0='["value1","value3"]', c1='["value2","value4"]'), Row(key='2', c0='["value12","value13"]', c1=None)] + [Row(key='1', c0='["value1","value3"]', c1='["value2","value4"]'), \\ + ... Row(key='2', c0='["value12","value13"]', c1=None)] """ from pyspark.sql.classic.column import _to_java_column From 922104bb15fbcf318aaf66a1bcf2e0c5a54d5ca4 Mon Sep 17 00:00:00 2001 From: fusheng Date: Wed, 12 Feb 2025 14:48:54 +0800 Subject: [PATCH 05/13] fix the indentation --- python/pyspark/sql/functions/builtin.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index f2fa6210ff65a..3728fd620548a 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20124,13 +20124,15 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] Example 2: Extracts a json object from json array - data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), \\ + + >>> data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), \\ ... ("2", '''[{"f1": "value12"},{"f1": "value13"}]''')] - df = spark.createDataFrame(data, ("key", "jarray")) - df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ + >>> df = spark.createDataFrame(data, ("key", "jarray")) + >>> df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ ... get_json_object(df.jarray, '$[0].f2').alias("c1") ).collect() [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] - df.select(df.key, get_json_object(df.jarray, '$[*].f1').alias("c0"), \\ + + >>> df.select(df.key, get_json_object(df.jarray, '$[*].f1').alias("c0"), \\ ... get_json_object(df.jarray, '$[*].f2').alias("c1") ).collect() [Row(key='1', c0='["value1","value3"]', c1='["value2","value4"]'), \\ ... Row(key='2', c0='["value12","value13"]', c1=None)] From 3b663dd26377807c8064a9716b1f57b312680f97 Mon Sep 17 00:00:00 2001 From: fusheng Date: Wed, 12 Feb 2025 17:40:03 +0800 Subject: [PATCH 06/13] fix the indentation --- python/pyspark/sql/functions/builtin.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 3728fd620548a..f326174d69847 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20126,7 +20126,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: Example 2: Extracts a json object from json array >>> data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), \\ - ... ("2", '''[{"f1": "value12"},{"f1": "value13"}]''')] + ... ("2", '''[{"f1": "value12"}]''')] >>> df = spark.createDataFrame(data, ("key", "jarray")) >>> df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ ... get_json_object(df.jarray, '$[0].f2').alias("c1") ).collect() @@ -20134,8 +20134,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: >>> df.select(df.key, get_json_object(df.jarray, '$[*].f1').alias("c0"), \\ ... get_json_object(df.jarray, '$[*].f2').alias("c1") ).collect() - [Row(key='1', c0='["value1","value3"]', c1='["value2","value4"]'), \\ - ... Row(key='2', c0='["value12","value13"]', c1=None)] + [Row(key='1', c0='["value1","value3"]', c1='["value2","value4"]'), Row(key='2', c0='"value12"', c1=None)] """ from pyspark.sql.classic.column import _to_java_column From 287d9175aa0fc989ab9a52cd3cb687a90fbe4ce3 Mon Sep 17 00:00:00 2001 From: fusheng Date: Thu, 13 Feb 2025 10:11:38 +0800 Subject: [PATCH 07/13] fix the indentation --- python/pyspark/sql/functions/builtin.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index f326174d69847..414236c54745c 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20125,16 +20125,16 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: Example 2: Extracts a json object from json array - >>> data = [("1", '''[{"f1": "value1", "f2": "value2"},{"f1": "value3", "f2": "value4"}]'''), \\ - ... ("2", '''[{"f1": "value12"}]''')] + >>> data = [("1", '''[{"f1": "value1"},{"f1": "value2"}]'''), \\ + ... ("2", '''[{"f1": "value12"},{"f2": "value13"}]''')] >>> df = spark.createDataFrame(data, ("key", "jarray")) >>> df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ - ... get_json_object(df.jarray, '$[0].f2').alias("c1") ).collect() - [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] + ... get_json_object(df.jarray, '$[1].f2').alias("c1") ).collect() + [Row(key='1', c0='value1', c1=None), Row(key='2', c0='value12', c1='value13')] >>> df.select(df.key, get_json_object(df.jarray, '$[*].f1').alias("c0"), \\ ... get_json_object(df.jarray, '$[*].f2').alias("c1") ).collect() - [Row(key='1', c0='["value1","value3"]', c1='["value2","value4"]'), Row(key='2', c0='"value12"', c1=None)] + [Row(key='1', c0='["value1","value2"]', c1=None), Row(key='2', c0='"value12"', c1='"value13"')] """ from pyspark.sql.classic.column import _to_java_column From bd91e85e0468add8fe3e11a8f16bd04d3e7a2c81 Mon Sep 17 00:00:00 2001 From: fusheng Date: Mon, 17 Feb 2025 15:58:30 +0800 Subject: [PATCH 08/13] format the example code and replace collect() with show() --- python/pyspark/sql/functions/builtin.py | 53 +++++++++++++++++-------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 414236c54745c..d704b8a50625d 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20115,26 +20115,47 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: Examples -------- - Example 1: Extracts a json object from json string + Example 1: Extract a json object from json string + >>> import pyspark.sql.functions as sf >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] >>> df = spark.createDataFrame(data, ("key", "jstring")) - >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\ - ... get_json_object(df.jstring, '$.f2').alias("c1") ).collect() - [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] - - Example 2: Extracts a json object from json array - - >>> data = [("1", '''[{"f1": "value1"},{"f1": "value2"}]'''), \\ - ... ("2", '''[{"f1": "value12"},{"f2": "value13"}]''')] + >>> extracted1 = sf.get_json_object(df.jstring, '$.f1').alias("c0") + >>> extracted2 = get_json_object(df.jstring, '$.f2').alias("c1") + >>> df.select(df.key, extracted1, extracted2).show() + +---+-------+------+ + |key| c0| c1| + +---+-------+------+ + | 1| value1|value2| + | 2|value12| null| + +---+-------+------+ + + Example 2: Extract a json object from json array + + >>> import pyspark.sql.functions as sf + >>> jarray1 = '''[{"f1": "value1"},{"f1": "value2"}]''' + >>> jarray2 = '''[{"f1": "value12"},{"f2": "value13"}]''' + >>> data = [("1", jarray1), ("2", jarray2)] >>> df = spark.createDataFrame(data, ("key", "jarray")) - >>> df.select(df.key, get_json_object(df.jarray, '$[0].f1').alias("c0"), \\ - ... get_json_object(df.jarray, '$[1].f2').alias("c1") ).collect() - [Row(key='1', c0='value1', c1=None), Row(key='2', c0='value12', c1='value13')] - - >>> df.select(df.key, get_json_object(df.jarray, '$[*].f1').alias("c0"), \\ - ... get_json_object(df.jarray, '$[*].f2').alias("c1") ).collect() - [Row(key='1', c0='["value1","value2"]', c1=None), Row(key='2', c0='"value12"', c1='"value13"')] + >>> extracted1 = get_json_object(df.jarray, '$[0].f1').alias("c0") + >>> extracted2 = get_json_object(df.jarray, '$[1].f2').alias("c1") + >>> df.select(df.key, extracted1, extracted2).show() + +---+-------+-------+ + |key| c0| c1| + +---+-------+-------+ + | 1| value1| null| + | 2|value12|value13| + +---+-------+-------+ + + >>> extracted3 = get_json_object(df.jarray, '$[*].f1').alias("c0") + >>> extracted4 = get_json_object(df.jarray, '$[*].f2').alias("c1") + >>> df.select(df.key, extracted3, extracted4).show() + +---+-------------------+---------+ + |key| c0| c1| + +---+-------------------+---------+ + | 1|["value1","value2"]| null| + | 2| "value12"|"value13"| + +---+-------------------+---------+ """ from pyspark.sql.classic.column import _to_java_column From a0d3fcbda1927189217169f1ce38a9d0f0b99ee2 Mon Sep 17 00:00:00 2001 From: fusheng Date: Mon, 17 Feb 2025 18:20:29 +0800 Subject: [PATCH 09/13] format null to NULL --- python/pyspark/sql/functions/builtin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index d704b8a50625d..554d79df55073 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20127,7 +20127,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: |key| c0| c1| +---+-------+------+ | 1| value1|value2| - | 2|value12| null| + | 2|value12| NULL| +---+-------+------+ Example 2: Extract a json object from json array @@ -20143,7 +20143,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: +---+-------+-------+ |key| c0| c1| +---+-------+-------+ - | 1| value1| null| + | 1| value1| NULL| | 2|value12|value13| +---+-------+-------+ @@ -20153,7 +20153,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: +---+-------------------+---------+ |key| c0| c1| +---+-------------------+---------+ - | 1|["value1","value2"]| null| + | 1|["value1","value2"]| NULL| | 2| "value12"|"value13"| +---+-------------------+---------+ """ From 94a3960daae3d69d43a8efc0095965a9020fd3fa Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Mon, 17 Feb 2025 17:47:18 -0800 Subject: [PATCH 10/13] Update python/pyspark/sql/functions/builtin.py --- python/pyspark/sql/functions/builtin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 554d79df55073..46b8e6ed0b309 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20121,7 +20121,7 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] >>> df = spark.createDataFrame(data, ("key", "jstring")) >>> extracted1 = sf.get_json_object(df.jstring, '$.f1').alias("c0") - >>> extracted2 = get_json_object(df.jstring, '$.f2').alias("c1") + >>> extracted2 = sf.get_json_object(df.jstring, '$.f2').alias("c1") >>> df.select(df.key, extracted1, extracted2).show() +---+-------+------+ |key| c0| c1| From 4f444e6abc1cce462fd1c3490f77c7d9c0ce2eea Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Mon, 17 Feb 2025 17:47:26 -0800 Subject: [PATCH 11/13] Update python/pyspark/sql/functions/builtin.py --- python/pyspark/sql/functions/builtin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 46b8e6ed0b309..f2f11eaa8d1db 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20137,8 +20137,8 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: >>> jarray2 = '''[{"f1": "value12"},{"f2": "value13"}]''' >>> data = [("1", jarray1), ("2", jarray2)] >>> df = spark.createDataFrame(data, ("key", "jarray")) - >>> extracted1 = get_json_object(df.jarray, '$[0].f1').alias("c0") - >>> extracted2 = get_json_object(df.jarray, '$[1].f2').alias("c1") + >>> extracted1 = sf.get_json_object(df.jarray, '$[0].f1').alias("c0") + >>> extracted2 = sf.get_json_object(df.jarray, '$[1].f2').alias("c1") >>> df.select(df.key, extracted1, extracted2).show() +---+-------+-------+ |key| c0| c1| From 140c16ffc2e271a68f56478b14041bf45952575a Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Mon, 17 Feb 2025 17:47:36 -0800 Subject: [PATCH 12/13] Update python/pyspark/sql/functions/builtin.py --- python/pyspark/sql/functions/builtin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index f2f11eaa8d1db..1765f1bbaf0d4 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20147,8 +20147,8 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: | 2|value12|value13| +---+-------+-------+ - >>> extracted3 = get_json_object(df.jarray, '$[*].f1').alias("c0") - >>> extracted4 = get_json_object(df.jarray, '$[*].f2').alias("c1") + >>> extracted3 = sf.get_json_object(df.jarray, '$[*].f1').alias("c0") + >>> extracted4 = sf.get_json_object(df.jarray, '$[*].f2').alias("c1") >>> df.select(df.key, extracted3, extracted4).show() +---+-------------------+---------+ |key| c0| c1| From 983ee828e0ec57b5fbecc255b5feea2fb162a864 Mon Sep 17 00:00:00 2001 From: fusheng Date: Tue, 18 Feb 2025 11:19:43 +0800 Subject: [PATCH 13/13] format json example --- python/pyspark/sql/functions/builtin.py | 30 +++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 554d79df55073..e895098a3620a 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -20117,12 +20117,12 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: -------- Example 1: Extract a json object from json string - >>> import pyspark.sql.functions as sf >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] >>> df = spark.createDataFrame(data, ("key", "jstring")) - >>> extracted1 = sf.get_json_object(df.jstring, '$.f1').alias("c0") - >>> extracted2 = get_json_object(df.jstring, '$.f2').alias("c1") - >>> df.select(df.key, extracted1, extracted2).show() + >>> df.select(df.key, + ... get_json_object(df.jstring, '$.f1').alias("c0"), + ... get_json_object(df.jstring, '$.f2').alias("c1") + ... ).show() +---+-------+------+ |key| c0| c1| +---+-------+------+ @@ -20132,14 +20132,15 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: Example 2: Extract a json object from json array - >>> import pyspark.sql.functions as sf - >>> jarray1 = '''[{"f1": "value1"},{"f1": "value2"}]''' - >>> jarray2 = '''[{"f1": "value12"},{"f2": "value13"}]''' - >>> data = [("1", jarray1), ("2", jarray2)] + >>> data = [ + ... ("1", '''[{"f1": "value1"},{"f1": "value2"}]'''), + ... ("2", '''[{"f1": "value12"},{"f2": "value13"}]''') + ... ] >>> df = spark.createDataFrame(data, ("key", "jarray")) - >>> extracted1 = get_json_object(df.jarray, '$[0].f1').alias("c0") - >>> extracted2 = get_json_object(df.jarray, '$[1].f2').alias("c1") - >>> df.select(df.key, extracted1, extracted2).show() + >>> df.select(df.key, + ... get_json_object(df.jarray, '$[0].f1').alias("c0"), + ... get_json_object(df.jarray, '$[1].f2').alias("c1") + ... ).show() +---+-------+-------+ |key| c0| c1| +---+-------+-------+ @@ -20147,9 +20148,10 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: | 2|value12|value13| +---+-------+-------+ - >>> extracted3 = get_json_object(df.jarray, '$[*].f1').alias("c0") - >>> extracted4 = get_json_object(df.jarray, '$[*].f2').alias("c1") - >>> df.select(df.key, extracted3, extracted4).show() + >>> df.select(df.key, + ... get_json_object(df.jarray, '$[*].f1').alias("c0"), + ... get_json_object(df.jarray, '$[*].f2').alias("c1") + ... ).show() +---+-------------------+---------+ |key| c0| c1| +---+-------------------+---------+