[SPARK-19595][SQL] Support json array in from_json #16929
Changes from all commits: a0a7091, ef007f9, 470d879, 72d6410, 54e60bb, 9f1e966, 0c088bf, 3d490e3

@@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.json._
-import org.apache.spark.sql.catalyst.util.ParseModes
+import org.apache.spark.sql.catalyst.util.{GenericArrayData, ParseModes}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.Utils
@@ -480,23 +480,45 @@ case class JsonTuple(children: Seq[Expression])
 }

 /**
- * Converts an json input string to a [[StructType]] with the specified schema.
+ * Converts an json input string to a [[StructType]] or [[ArrayType]] with the specified schema.
  */
 case class JsonToStruct(
-    schema: StructType,
+    schema: DataType,
     options: Map[String, String],
     child: Expression,
     timeZoneId: Option[String] = None)
   extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback with ExpectsInputTypes {
   override def nullable: Boolean = true

-  def this(schema: StructType, options: Map[String, String], child: Expression) =
+  def this(schema: DataType, options: Map[String, String], child: Expression) =
     this(schema, options, child, None)

+  override def checkInputDataTypes(): TypeCheckResult = schema match {
+    case _: StructType | ArrayType(_: StructType, _) =>
+      super.checkInputDataTypes()
+    case _ => TypeCheckResult.TypeCheckFailure(
+      s"Input schema ${schema.simpleString} must be a struct or an array of structs.")
+  }
+
+  @transient
+  lazy val rowSchema = schema match {
+    case st: StructType => st
+    case ArrayType(st: StructType, _) => st
+  }
+
+  // This converts parsed rows to the desired output by the given schema.
+  @transient
+  lazy val converter = schema match {
+    case _: StructType =>
+      (rows: Seq[InternalRow]) => if (rows.length == 1) rows.head else null
+
+    case ArrayType(_: StructType, _) =>
+      (rows: Seq[InternalRow]) => new GenericArrayData(rows)
+  }
+
   @transient
   lazy val parser =
     new JacksonParser(
-      schema,
+      rowSchema,
       new JSONOptions(options + ("mode" -> ParseModes.FAIL_FAST_MODE), timeZoneId.get))

   override def dataType: DataType = schema

Review thread on the struct-schema branch of `converter` (returning `null` when the parsed array has more than one row):
- this breaks previous behavior. I would still return the first element if …
- I'm okay breaking previous behavior because I'd call truncating an array a bug.
- We should list this in the release notes though (i.e. go tag the JIRA).
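To make the behavior change discussed in the thread above concrete, here is a small sketch of my own (not code from the PR). It assumes a Spark build containing this change with spark-catalyst on the classpath; `Option("GMT")` stands in for the test suite's `gmtId` helper, and the `StructSchemaBehaviorDemo` object is purely illustrative:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{JsonToStruct, Literal}
import org.apache.spark.sql.types._

// Hypothetical driver (not part of this PR) that evaluates the expression directly,
// the same way JsonExpressionsSuite does via checkEvaluation.
object StructSchemaBehaviorDemo {
  def main(args: Array[String]): Unit = {
    val schema = StructType(StructField("a", IntegerType) :: Nil)
    val multi = JsonToStruct(schema, Map.empty, Literal("""[{"a": 1}, {"a": 2}]"""), Option("GMT"))
    val single = JsonToStruct(schema, Map.empty, Literal("""[{"a": 1}]"""), Option("GMT"))

    // With a struct schema, a multi-element JSON array now evaluates to null
    // (the old code returned only the first parsed row); a single-element array
    // still yields that one row.
    println(multi.eval(InternalRow.empty))   // null
    println(single.eval(InternalRow.empty))  // InternalRow with a = 1
  }
}
```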
@@ -505,11 +527,32 @@ case class JsonToStruct(
     copy(timeZoneId = Option(timeZoneId))

   override def nullSafeEval(json: Any): Any = {
+    // When input is,
+    //   - `null`: `null`.
+    //   - invalid json: `null`.
+    //   - empty string: `null`.
+    //
+    // When the schema is array,
+    //   - json array: `Array(Row(...), ...)`
+    //   - json object: `Array(Row(...))`
+    //   - empty json array: `Array()`.
+    //   - empty json object: `Array(Row(null))`.
+    //
+    // When the schema is a struct,
+    //   - json object/array with single element: `Row(...)`
+    //   - json array with multiple elements: `null`
+    //   - empty json array: `null`.
+    //   - empty json object: `Row(null)`.
+
+    // We need `null` if the input string is an empty string. `JacksonParser` can
+    // deal with this but produces `Nil`.
+    if (json.toString.trim.isEmpty) return null
+
     try {
-      parser.parse(
+      converter(parser.parse(
         json.asInstanceOf[UTF8String],
         CreateJacksonParser.utf8String,
-        identity[UTF8String]).headOption.orNull
+        identity[UTF8String]))
     } catch {
       case _: SparkSQLJsonProcessingException => null
     }
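The comment block above documents the full output matrix. The following sketch (again my own, under the same assumptions as before: a build containing this change, spark-catalyst on the classpath, `Option("GMT")` in place of `gmtId`, hypothetical `ArraySchemaBehaviorDemo` wrapper) walks through the array-schema cases by evaluating the expression directly:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{JsonToStruct, Literal}
import org.apache.spark.sql.types._

// Hypothetical driver (not part of this PR) exercising the cases documented in
// the nullSafeEval comment for an array schema.
object ArraySchemaBehaviorDemo {
  def main(args: Array[String]): Unit = {
    val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
    def eval(json: String): Any =
      JsonToStruct(schema, Map.empty, Literal(json), Option("GMT")).eval(InternalRow.empty)

    println(eval("""[{"a": 1}, {"a": 2}]"""))  // GenericArrayData with two rows
    println(eval("""{"a": 1}"""))              // GenericArrayData with a single row
    println(eval("[ ]"))                       // empty GenericArrayData
    println(eval("{ }"))                       // GenericArrayData with one all-null row
    println(eval(""))                          // null: empty string short-circuits
    println(eval("not json"))                  // null: FAIL_FAST parse error is caught
  }
}
```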
@@ -22,7 +22,7 @@ import java.util.Calendar
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils, ParseModes}
-import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType, TimestampType}
+import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

 class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -372,6 +372,62 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     )
   }

+  test("from_json - input=array, schema=array, output=array") {
+    val input = """[{"a": 1}, {"a": 2}]"""
+    val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
+    val output = InternalRow(1) :: InternalRow(2) :: Nil
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
+  test("from_json - input=object, schema=array, output=array of single row") {
+    val input = """{"a": 1}"""
+    val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
+    val output = InternalRow(1) :: Nil
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
+  test("from_json - input=empty array, schema=array, output=empty array") {
+    val input = "[ ]"
+    val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
+    val output = Nil
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
+  test("from_json - input=empty object, schema=array, output=array of single row with null") {
+    val input = "{ }"
+    val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
+    val output = InternalRow(null) :: Nil
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
+  test("from_json - input=array of single object, schema=struct, output=single row") {
+    val input = """[{"a": 1}]"""
+    val schema = StructType(StructField("a", IntegerType) :: Nil)
+    val output = InternalRow(1)
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
+  test("from_json - input=array, schema=struct, output=null") {
+    val input = """[{"a": 1}, {"a": 2}]"""
+    val schema = StructType(StructField("a", IntegerType) :: Nil)
+    val output = null
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
+  test("from_json - input=empty array, schema=struct, output=null") {
+    val input = """[]"""
+    val schema = StructType(StructField("a", IntegerType) :: Nil)
+    val output = null
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
+  test("from_json - input=empty object, schema=struct, output=single row with null") {
+    val input = """{ }"""
+    val schema = StructType(StructField("a", IntegerType) :: Nil)
+    val output = InternalRow(null)
+    checkEvaluation(JsonToStruct(schema, Map.empty, Literal(input), gmtId), output)
+  }
+
   test("from_json null input column") {
     val schema = StructType(StructField("a", IntegerType) :: Nil)
     checkEvaluation(

Review comment on the new tests:
- these are great! thanks!
Review thread on validating the schema type:
- why not just override: …
- Uh.. I thought `schema` is not the child of the expression. Let me check again!
- I tried several combinations with `TypeCollection` but it seems not working.
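For context on the thread above: `ExpectsInputTypes`-style checks (`inputTypes`) constrain an expression's children, and `JsonToStruct`'s only child is the JSON string column, while `schema` is a constructor argument. That is presumably why the explicit `checkInputDataTypes` override was used instead of a `TypeCollection`. Below is a hedged sketch of my own showing how the new check surfaces, under the same classpath assumptions as the earlier examples (the `SchemaCheckDemo` object is hypothetical):

```scala
import org.apache.spark.sql.catalyst.expressions.{JsonToStruct, Literal}
import org.apache.spark.sql.types._

// Hypothetical driver (not part of this PR): `schema` is a constructor argument,
// not a child expression, so the new checkInputDataTypes override is what rejects
// unsupported schemas such as a plain IntegerType.
object SchemaCheckDemo {
  def main(args: Array[String]): Unit = {
    val bad = JsonToStruct(IntegerType, Map.empty, Literal("""{"a": 1}"""), Option("GMT"))
    // Expected: TypeCheckFailure("Input schema int must be a struct or an array of structs.")
    println(bad.checkInputDataTypes())
  }
}
```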