Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package org.apache.spark.sql.avro

import scala.util.control.NonFatal

import org.apache.avro.Schema
import org.apache.avro.generic.GenericDatumReader
import org.apache.avro.io.{BinaryDecoder, DecoderFactory}

Expand Down Expand Up @@ -62,9 +63,26 @@ case class AvroDataToCatalyst(

@transient private lazy val reader = new GenericDatumReader[Any](actualSchema, expectedSchema)

/**
 * The Avro schema handed to the deserializer.
 *
 * When `expectedSchema` is a union with exactly one non-null branch, that branch is used
 * in place of the union; in every other case `expectedSchema` is returned unchanged.
 * This is intended to mirror AvroSerializer.resolveNullableType, so that from_avro can
 * handle the same union schemas that to_avro accepts.
 */
@transient private lazy val resolvedSchema: Schema = expectedSchema.getType match {
  case Schema.Type.UNION =>
    AvroUtils.nonNullUnionBranches(expectedSchema) match {
      // Exactly one non-null branch: unwrap the union down to that branch.
      case Seq(onlyBranch) => onlyBranch
      // Zero or several non-null branches: keep the union as declared.
      case _ => expectedSchema
    }
  // Not a union, so there is nothing to unwrap.
  case _ => expectedSchema
}

@transient private lazy val deserializer =
new AvroDeserializer(
expectedSchema,
resolvedSchema,
dataType,
avroOptions.datetimeRebaseModeInRead,
avroOptions.useStableIdForUnionType,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -776,4 +776,24 @@ class AvroFunctionsSuite extends QueryTest with SharedSparkSession {
assert(!ex.isInstanceOf[NullPointerException],
s"Should not throw NPE, but got: ${ex.getClass.getName}: ${ex.getMessage}")
}

// Round-trips a struct through to_avro and from_avro using a top-level union schema
// of a record and null, and checks that the decoded value equals the original input.
test("SPARK-51961: roundtrip in to_avro and from_avro with union schema of record and null") {
  // Top-level union: [record "value" { age, name }, "null"].
  val recordOrNullSchema =
    """
      |[{
      | "type": "record",
      | "name": "value",
      | "fields": [
      | {"name": "age", "type": ["long", "null"]},
      | {"name": "name", "type": ["string", "null"]}
      | ]
      |}, "null"]
      |""".stripMargin

  val valueColumn = struct(lit(2L).as("age"), lit("Alice").as("name")).as("value")
  val expected = spark.range(1).select(valueColumn)

  // Encode with the union schema, then decode with the very same schema.
  val roundTripped = expected
    .select(to_avro($"value", recordOrNullSchema).as("avro"))
    .select(from_avro($"avro", recordOrNullSchema).as("value"))

  checkAnswer(roundTripped, expected)
}
}