diff --git a/.rat-excludes b/.rat-excludes
index 0240e81c45ea2..236c2db05367c 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -91,3 +91,5 @@ help/*
html/*
INDEX
.lintr
+gen-java.*
+.*avpr
diff --git a/pom.xml b/pom.xml
index bece526a2a3e5..9cf2471b51304 100644
--- a/pom.xml
+++ b/pom.xml
@@ -161,6 +161,7 @@
2.4.41.1.1.71.1.2
+ 0.9.2false
@@ -179,6 +180,8 @@
compilecompilecompile
+ <parquet.test.deps.scope>test</parquet.test.deps.scope>
+ <thrift.test.deps.scope>test</thrift.test.deps.scope>
+ <repository>
+   <id>twttr-repo</id>
+   <name>Twttr Repository</name>
+   <url>http://maven.twttr.com</url>
+   <releases>
+     <enabled>true</enabled>
+   </releases>
+   <snapshots>
+     <enabled>false</enabled>
+   </snapshots>
+ </repository>
<repository>
  <id>spark-1.4-staging</id>
@@ -1101,6 +1116,24 @@
<version>${parquet.version}</version>
<scope>${parquet.deps.scope}</scope>
</dependency>
+ <dependency>
+   <groupId>org.apache.parquet</groupId>
+   <artifactId>parquet-avro</artifactId>
+   <version>${parquet.version}</version>
+   <scope>${parquet.test.deps.scope}</scope>
+ </dependency>
+ <dependency>
+   <groupId>org.apache.parquet</groupId>
+   <artifactId>parquet-thrift</artifactId>
+   <version>${parquet.version}</version>
+   <scope>${parquet.test.deps.scope}</scope>
+ </dependency>
+ <dependency>
+   <groupId>org.apache.thrift</groupId>
+   <artifactId>libthrift</artifactId>
+   <version>${thrift.version}</version>
+   <scope>${thrift.test.deps.scope}</scope>
+ </dependency>
<dependency>
  <groupId>org.apache.flume</groupId>
  <artifactId>flume-ng-core</artifactId>
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 680b699e9e4a1..0a26b5f01d3b8 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -60,21 +60,8 @@ object MimaExcludes {
"org.apache.spark.ml.regression.LeastSquaresCostFun.this"),
// SQL execution is considered private.
excludePackage("org.apache.spark.sql.execution"),
- // NanoTime and CatalystTimestampConverter is only used inside catalyst,
- // not needed anymore
- ProblemFilters.exclude[MissingClassProblem](
- "org.apache.spark.sql.parquet.timestamp.NanoTime"),
- ProblemFilters.exclude[MissingClassProblem](
- "org.apache.spark.sql.parquet.timestamp.NanoTime$"),
- ProblemFilters.exclude[MissingClassProblem](
- "org.apache.spark.sql.parquet.CatalystTimestampConverter"),
- ProblemFilters.exclude[MissingClassProblem](
- "org.apache.spark.sql.parquet.CatalystTimestampConverter$"),
- // SPARK-6777 Implements backwards compatibility rules in CatalystSchemaConverter
- ProblemFilters.exclude[MissingClassProblem](
- "org.apache.spark.sql.parquet.ParquetTypeInfo"),
- ProblemFilters.exclude[MissingClassProblem](
- "org.apache.spark.sql.parquet.ParquetTypeInfo$")
+ // Parquet support is considered private.
+ excludePackage("org.apache.spark.sql.parquet")
) ++ Seq(
// SPARK-8479 Add numNonzeros and numActives to Matrix.
ProblemFilters.exclude[MissingMethodProblem](
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 7d00047d08d74..a4c2da8e05f5d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -17,11 +17,12 @@
package org.apache.spark.sql.types
+import scala.util.Try
import scala.util.parsing.combinator.RegexParsers
-import org.json4s._
import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
+import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.apache.spark.annotation.DeveloperApi
@@ -82,6 +83,9 @@ abstract class DataType extends AbstractDataType {
object DataType {
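+ // Parses a schema string: tries the JSON format first, then falls back to the legacy case
+ // class string format handled by `fromCaseClassString` (kept for compatibility with schema
+ // strings written by older versions of Spark SQL).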
+ private[sql] def fromString(raw: String): DataType = {
+ Try(DataType.fromJson(raw)).getOrElse(DataType.fromCaseClassString(raw))
+ }
def fromJson(json: String): DataType = parseDataType(parse(json))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 3b17566d54d9b..e2d3f53f7d978 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -311,6 +311,11 @@ object StructType extends AbstractDataType {
private[sql] override def simpleString: String = "struct"
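+ // Delegates to `DataType.fromString` (JSON first, then the legacy case class string format)
+ // and fails if the parsed result is not a struct.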
+ private[sql] def fromString(raw: String): StructType = DataType.fromString(raw) match {
+ case t: StructType => t
+ case _ => throw new RuntimeException(s"Failed parsing StructType: $raw")
+ }
+
def apply(fields: Seq[StructField]): StructType = StructType(fields.toArray)
def apply(fields: java.util.List[StructField]): StructType = {
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 8fc16928adbd9..f90099f22d4bd 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -101,9 +101,45 @@
<version>9.3-1102-jdbc41</version>
<scope>test</scope>
</dependency>
+ <dependency>
+   <groupId>org.apache.parquet</groupId>
+   <artifactId>parquet-avro</artifactId>
+   <scope>test</scope>
+ </dependency>
+ <dependency>
+   <groupId>org.apache.parquet</groupId>
+   <artifactId>parquet-thrift</artifactId>
+   <scope>test</scope>
+ </dependency>
+ <dependency>
+   <groupId>org.apache.thrift</groupId>
+   <artifactId>libthrift</artifactId>
+   <scope>test</scope>
+ </dependency>
</dependencies>
<build>
  <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
  <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+ <plugins>
+   <plugin>
+     <groupId>org.codehaus.mojo</groupId>
+     <artifactId>build-helper-maven-plugin</artifactId>
+     <executions>
+       <execution>
+         <id>add-scala-test-sources</id>
+         <phase>generate-test-sources</phase>
+         <goals>
+           <goal>add-test-source</goal>
+         </goals>
+
+
+
+
+
+
+
+
+
+
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
new file mode 100644
index 0000000000000..0c3d8fdab6bd2
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -0,0 +1,434 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parquet
+
+import java.nio.ByteOrder
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.parquet.column.Dictionary
+import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter}
+import org.apache.parquet.schema.Type.Repetition
+import org.apache.parquet.schema.{GroupType, PrimitiveType, Type}
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * A [[ParentContainerUpdater]] is used by a Parquet converter to set converted values to some
+ * corresponding parent container. For example, a converter for a `StructType` field may set
+ * converted values to a [[MutableRow]]; or a converter for array elements may append converted
+ * values to an [[ArrayBuffer]].
+ */
+private[parquet] trait ParentContainerUpdater {
+ def set(value: Any): Unit = ()
+ def setBoolean(value: Boolean): Unit = set(value)
+ def setByte(value: Byte): Unit = set(value)
+ def setShort(value: Short): Unit = set(value)
+ def setInt(value: Int): Unit = set(value)
+ def setLong(value: Long): Unit = set(value)
+ def setFloat(value: Float): Unit = set(value)
+ def setDouble(value: Double): Unit = set(value)
+}
+
+/** A no-op updater used for the root converter (which doesn't have a parent). */
+private[parquet] object NoopUpdater extends ParentContainerUpdater
+
+/**
+ * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[Row]]s. Since
+ * any Parquet record is also a struct, this converter can also be used as root converter.
+ *
+ * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have
+ * any "parent" container.
+ *
+ * @param parquetType Parquet schema of Parquet records
+ * @param catalystType Spark SQL schema that corresponds to the Parquet record type
+ * @param updater An updater which propagates converted field values to the parent container
+ */
+private[parquet] class CatalystRowConverter(
+ parquetType: GroupType,
+ catalystType: StructType,
+ updater: ParentContainerUpdater)
+ extends GroupConverter {
+
+ /**
+ * Updater used together with field converters within a [[CatalystRowConverter]]. It propagates
+ * converted field values to the `ordinal`-th cell in `currentRow`.
+ */
+ private final class RowUpdater(row: MutableRow, ordinal: Int) extends ParentContainerUpdater {
+ override def set(value: Any): Unit = row(ordinal) = value
+ override def setBoolean(value: Boolean): Unit = row.setBoolean(ordinal, value)
+ override def setByte(value: Byte): Unit = row.setByte(ordinal, value)
+ override def setShort(value: Short): Unit = row.setShort(ordinal, value)
+ override def setInt(value: Int): Unit = row.setInt(ordinal, value)
+ override def setLong(value: Long): Unit = row.setLong(ordinal, value)
+ override def setDouble(value: Double): Unit = row.setDouble(ordinal, value)
+ override def setFloat(value: Float): Unit = row.setFloat(ordinal, value)
+ }
+
+ /**
+ * The row object holding the values converted from the current Parquet record.
+ *
+ * @todo Use [[UnsafeRow]] for better performance.
+ */
+ val currentRow = new SpecificMutableRow(catalystType.map(_.dataType))
+
+ // Converters for each field.
+ private val fieldConverters: Array[Converter] = {
+ parquetType.getFields.zip(catalystType).zipWithIndex.map {
+ case ((parquetFieldType, catalystField), ordinal) =>
+ // Converted field value should be set to the `ordinal`-th cell of `currentRow`
+ newConverter(parquetFieldType, catalystField.dataType, new RowUpdater(currentRow, ordinal))
+ }.toArray
+ }
+
+ override def getConverter(fieldIndex: Int): Converter = fieldConverters(fieldIndex)
+
+ override def end(): Unit = updater.set(currentRow)
+
+ override def start(): Unit = {
+ var i = 0
+ while (i < currentRow.length) {
+ currentRow.setNullAt(i)
+ i += 1
+ }
+ }
+
+ /**
+ * Creates a converter for the given Parquet type `parquetType` and Spark SQL data type
+ * `catalystType`. Converted values are handled by `updater`.
+ */
+ private def newConverter(
+ parquetType: Type,
+ catalystType: DataType,
+ updater: ParentContainerUpdater): Converter = {
+
+ catalystType match {
+ case BooleanType | IntegerType | LongType | FloatType | DoubleType | BinaryType =>
+ new CatalystPrimitiveConverter(updater)
+
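+ // Parquet has no 8-bit or 16-bit integer primitives: BYTE and SHORT values arrive as INT32
+ // and are narrowed back down here.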
+ case ByteType =>
+ new PrimitiveConverter {
+ override def addInt(value: Int): Unit =
+ updater.setByte(value.asInstanceOf[ByteType#InternalType])
+ }
+
+ case ShortType =>
+ new PrimitiveConverter {
+ override def addInt(value: Int): Unit =
+ updater.setShort(value.asInstanceOf[ShortType#InternalType])
+ }
+
+ case t: DecimalType =>
+ new CatalystDecimalConverter(t, updater)
+
+ case StringType =>
+ new CatalystStringConverter(updater)
+
+ case TimestampType =>
+ // TODO Implement `TIMESTAMP_MICROS` once parquet-mr supports it.
+ new PrimitiveConverter {
+ // Converts nanosecond timestamps stored as INT96
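+ // An INT96 timestamp is a 12-byte binary: 8 bytes of nanoseconds within the day followed
+ // by 4 bytes of Julian day number, both little-endian (hence the buffer reads below).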
+ override def addBinary(value: Binary): Unit = {
+ assert(
+ value.length() == 12,
+ "Timestamps (with nanoseconds) are expected to be stored in 12-byte long binaries, " +
+ s"but got a ${value.length()}-byte binary.")
+
+ val buf = value.toByteBuffer.order(ByteOrder.LITTLE_ENDIAN)
+ val timeOfDayNanos = buf.getLong
+ val julianDay = buf.getInt
+ updater.setLong(DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos))
+ }
+ }
+
+ case DateType =>
+ new PrimitiveConverter {
+ override def addInt(value: Int): Unit = {
+ // DateType is not specialized in `SpecificMutableRow`, so we have to box the value here.
+ updater.set(value.asInstanceOf[DateType#InternalType])
+ }
+ }
+
+ case t: ArrayType =>
+ new CatalystArrayConverter(parquetType.asGroupType(), t, updater)
+
+ case t: MapType =>
+ new CatalystMapConverter(parquetType.asGroupType(), t, updater)
+
+ case t: StructType =>
+ new CatalystRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater {
+ override def set(value: Any): Unit = updater.set(value.asInstanceOf[Row].copy())
+ })
+
+ case t: UserDefinedType[_] =>
+ val catalystTypeForUDT = t.sqlType
+ val nullable = parquetType.isRepetition(Repetition.OPTIONAL)
+ val field = StructField("udt", catalystTypeForUDT, nullable)
+ val parquetTypeForUDT = new CatalystSchemaConverter().convertField(field)
+ newConverter(parquetTypeForUDT, catalystTypeForUDT, updater)
+
+ case _ =>
+ throw new RuntimeException(
+ s"Unable to create Parquet converter for data type ${catalystType.json}")
+ }
+ }
+
+ /**
+ * Parquet converter for Parquet primitive types. Note that not all Spark SQL atomic types
+ * are handled by this converter. Parquet primitive types are only a subset of those of Spark
+ * SQL. For example, BYTE, SHORT, and INT in Spark SQL are all covered by INT32 in Parquet.
+ */
+ private final class CatalystPrimitiveConverter(updater: ParentContainerUpdater)
+ extends PrimitiveConverter {
+
+ override def addBoolean(value: Boolean): Unit = updater.setBoolean(value)
+ override def addInt(value: Int): Unit = updater.setInt(value)
+ override def addLong(value: Long): Unit = updater.setLong(value)
+ override def addFloat(value: Float): Unit = updater.setFloat(value)
+ override def addDouble(value: Double): Unit = updater.setDouble(value)
+ override def addBinary(value: Binary): Unit = updater.set(value.getBytes)
+ }
+
+ /**
+ * Parquet converter for strings. A dictionary is used to minimize string decoding cost.
+ */
+ private final class CatalystStringConverter(updater: ParentContainerUpdater)
+ extends PrimitiveConverter {
+
+ private var expandedDictionary: Array[UTF8String] = null
+
+ override def hasDictionarySupport: Boolean = true
+
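+ // Eagerly decodes the whole dictionary into `UTF8String`s once, so each dictionary-encoded
+ // value afterwards only costs an array lookup instead of converting the Binary again.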
+ override def setDictionary(dictionary: Dictionary): Unit = {
+ this.expandedDictionary = Array.tabulate(dictionary.getMaxId + 1) { i =>
+ UTF8String.fromBytes(dictionary.decodeToBinary(i).getBytes)
+ }
+ }
+
+ override def addValueFromDictionary(dictionaryId: Int): Unit = {
+ updater.set(expandedDictionary(dictionaryId))
+ }
+
+ override def addBinary(value: Binary): Unit = {
+ updater.set(UTF8String.fromBytes(value.getBytes))
+ }
+ }
+
+ /**
+ * Parquet converter for fixed-precision decimals.
+ */
+ private final class CatalystDecimalConverter(
+ decimalType: DecimalType,
+ updater: ParentContainerUpdater)
+ extends PrimitiveConverter {
+
+ // Converts decimals stored as INT32
+ override def addInt(value: Int): Unit = {
+ addLong(value: Long)
+ }
+
+ // Converts decimals stored as INT64
+ override def addLong(value: Long): Unit = {
+ updater.set(Decimal(value, decimalType.precision, decimalType.scale))
+ }
+
+ // Converts decimals stored as either FIXED_LENGTH_BYTE_ARRAY or BINARY
+ override def addBinary(value: Binary): Unit = {
+ updater.set(toDecimal(value))
+ }
+
+ private def toDecimal(value: Binary): Decimal = {
+ val precision = decimalType.precision
+ val scale = decimalType.scale
+ val bytes = value.getBytes
+
+ var unscaled = 0L
+ var i = 0
+
+ while (i < bytes.length) {
+ unscaled = (unscaled << 8) | (bytes(i) & 0xff)
+ i += 1
+ }
+
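+ // Sign-extend the accumulated big-endian two's-complement value. For example, the 2-byte
+ // binary 0xFF 0x38 accumulates to 0x000000000000FF38; shifting left and then arithmetically
+ // right by 64 - 16 = 48 bits restores the negative value -200.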
+ val bits = 8 * bytes.length
+ unscaled = (unscaled << (64 - bits)) >> (64 - bits)
+ Decimal(unscaled, precision, scale)
+ }
+ }
+
+ /**
+ * Parquet converter for arrays. Spark SQL arrays are represented as Parquet lists. Standard
+ * Parquet lists are represented as a 3-level group annotated by `LIST`:
+ * {{{
+ * group (LIST) { <-- parquetSchema points here
+ * repeated group list {
+ * element;
+ * }
+ * }
+ * }}}
+ * The `parquetSchema` constructor argument points to the outermost group.
+ *
+ * However, before this representation was standardized, some Parquet libraries/tools used
+ * non-standard formats to represent list-like structures. Backwards-compatibility rules for
+ * handling these cases are described in the Parquet format spec.
+ *
+ * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
+ */
+ private final class CatalystArrayConverter(
+ parquetSchema: GroupType,
+ catalystSchema: ArrayType,
+ updater: ParentContainerUpdater)
+ extends GroupConverter {
+
+ private var currentArray: ArrayBuffer[Any] = _
+
+ private val elementConverter: Converter = {
+ val repeatedType = parquetSchema.getType(0)
+ val elementType = catalystSchema.elementType
+
+ if (isElementType(repeatedType, elementType)) {
+ newConverter(repeatedType, elementType, new ParentContainerUpdater {
+ override def set(value: Any): Unit = currentArray += value
+ })
+ } else {
+ new ElementConverter(repeatedType.asGroupType().getType(0), elementType)
+ }
+ }
+
+ override def getConverter(fieldIndex: Int): Converter = elementConverter
+
+ override def end(): Unit = updater.set(currentArray)
+
+ // NOTE: We can't reuse the mutable `ArrayBuffer` here; a new buffer must be instantiated for
+ // each value because `Row.copy()` only copies row cells and doesn't deep-copy the objects
+ // stored in those cells.
+ override def start(): Unit = currentArray = ArrayBuffer.empty[Any]
+
+ // scalastyle:off
+ /**
+ * Returns whether the given type is the element type of a list or is a syntactic group with
+ * one field that is the element type. This is determined by checking whether the type can be
+ * a syntactic group and by checking whether a potential syntactic group matches the expected
+ * schema.
+ * {{{
+ * group (LIST) {
+ * repeated group list { <-- repeatedType points here
+ * element;
+ * }
+ * }
+ * }}}
+ * In short, here we handle Parquet list backwards-compatibility rules on the read path. This
+ * method is based on `AvroIndexedRecordConverter.isElementType`.
+ *
+ * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
+ */
+ // scalastyle:on
+ private def isElementType(parquetRepeatedType: Type, catalystElementType: DataType): Boolean = {
+ (parquetRepeatedType, catalystElementType) match {
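+ // A repeated primitive type can only be the element itself (legacy 2-level lists). A
+ // repeated group with more than one field must be a struct element. A repeated group with a
+ // single field is the element only when that field's name matches the expected struct
+ // element's field name; otherwise it is the standard 3-level `list` wrapper group.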
+ case (t: PrimitiveType, _) => true
+ case (t: GroupType, _) if t.getFieldCount > 1 => true
+ case (t: GroupType, StructType(Array(f))) if f.name == t.getFieldName(0) => true
+ case _ => false
+ }
+ }
+
+ /** Array element converter */
+ private final class ElementConverter(parquetType: Type, catalystType: DataType)
+ extends GroupConverter {
+
+ private var currentElement: Any = _
+
+ private val converter = newConverter(parquetType, catalystType, new ParentContainerUpdater {
+ override def set(value: Any): Unit = currentElement = value
+ })
+
+ override def getConverter(fieldIndex: Int): Converter = converter
+
+ override def end(): Unit = currentArray += currentElement
+
+ override def start(): Unit = currentElement = null
+ }
+ }
+
+ /** Parquet converter for maps */
+ private final class CatalystMapConverter(
+ parquetType: GroupType,
+ catalystType: MapType,
+ updater: ParentContainerUpdater)
+ extends GroupConverter {
+
+ private var currentMap: mutable.Map[Any, Any] = _
+
+ private val keyValueConverter = {
+ val repeatedType = parquetType.getType(0).asGroupType()
+ new KeyValueConverter(
+ repeatedType.getType(0),
+ repeatedType.getType(1),
+ catalystType.keyType,
+ catalystType.valueType)
+ }
+
+ override def getConverter(fieldIndex: Int): Converter = keyValueConverter
+
+ override def end(): Unit = updater.set(currentMap)
+
+ // NOTE: We can't reuse the mutable `Map` here; a new map must be instantiated for each value
+ // because `Row.copy()` only copies row cells and doesn't deep-copy the objects stored in
+ // those cells.
+ override def start(): Unit = currentMap = mutable.Map.empty[Any, Any]
+
+ /** Parquet converter for key-value pairs within the map. */
+ private final class KeyValueConverter(
+ parquetKeyType: Type,
+ parquetValueType: Type,
+ catalystKeyType: DataType,
+ catalystValueType: DataType)
+ extends GroupConverter {
+
+ private var currentKey: Any = _
+
+ private var currentValue: Any = _
+
+ private val converters = Array(
+ // Converter for keys
+ newConverter(parquetKeyType, catalystKeyType, new ParentContainerUpdater {
+ override def set(value: Any): Unit = currentKey = value
+ }),
+
+ // Converter for values
+ newConverter(parquetValueType, catalystValueType, new ParentContainerUpdater {
+ override def set(value: Any): Unit = currentValue = value
+ }))
+
+ override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
+
+ override def end(): Unit = currentMap(currentKey) = currentValue
+
+ override def start(): Unit = {
+ currentKey = null
+ currentValue = null
+ }
+ }
+ }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index 4ab274ec17a02..de3a72d8146c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -358,9 +358,24 @@ private[parquet] class CatalystSchemaConverter(
case DateType =>
Types.primitive(INT32, repetition).as(DATE).named(field.name)
- // NOTE: !! This timestamp type is not specified in Parquet format spec !!
- // However, Impala and older versions of Spark SQL use INT96 to store timestamps with
- // nanosecond precision (not TIME_MILLIS or TIMESTAMP_MILLIS described in the spec).
+ // NOTE: Spark SQL TimestampType is NOT a well-defined type in the Parquet format spec.
+ //
+ // As stated in PARQUET-323, Parquet `INT96` was originally introduced to represent nanosecond
+ // timestamps in Impala for historical reasons; it is not recommended for any other use and
+ // will probably be deprecated in a future version of the Parquet format spec. That's why the
+ // Parquet format spec only defines `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS`, which are both
+ // logical types annotating `INT64`.
+ //
+ // Originally, Spark SQL used the same nanosecond timestamp type as Impala and Hive. Starting
+ // from Spark 1.5.0, we switched to a timestamp type with 100 ns precision so that a timestamp
+ // can be stored in a `Long`. This design decision is subject to change though; for example,
+ // we may switch to microsecond precision in the future.
+ //
+ // For Parquet, we plan to write all `TimestampType` values as `TIMESTAMP_MICROS`, but this is
+ // not implemented yet because parquet-mr 1.7.0 (the version we're currently using) does not
+ // support `TIMESTAMP_MICROS`.
+ //
+ // TODO Implement `TIMESTAMP_MICROS` once parquet-mr supports it.
case TimestampType =>
Types.primitive(INT96, repetition).named(field.name)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index 86a77bf965daa..be0a2029d233b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -17,61 +17,15 @@
package org.apache.spark.sql.parquet
-import java.nio.ByteOrder
-
-import scala.collection.mutable.{ArrayBuffer, Buffer, HashMap}
-
-import org.apache.parquet.Preconditions
-import org.apache.parquet.column.Dictionary
-import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter}
-import org.apache.parquet.schema.MessageType
-
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.parquet.CatalystConverter.FieldType
-import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
-
-/**
- * Collection of converters of Parquet types (group and primitive types) that
- * model arrays and maps. The conversions are partly based on the AvroParquet
- * converters that are part of Parquet in order to be able to process these
- * types.
- *
- * There are several types of converters:
- *
- *
- *  - [[org.apache.spark.sql.parquet.CatalystPrimitiveConverter]] for primitive
- *    (numeric, boolean and String) types
- *  - [[org.apache.spark.sql.parquet.CatalystNativeArrayConverter]] for arrays
- *    of native JVM element types; note: currently null values are not supported!
- *  - [[org.apache.spark.sql.parquet.CatalystArrayConverter]] for arrays of
- *    arbitrary element types (including nested element types); note: currently
- *    null values are not supported!
- *  - [[org.apache.spark.sql.parquet.CatalystStructConverter]] for structs
- *  - [[org.apache.spark.sql.parquet.CatalystMapConverter]] for maps; note:
- *    currently null values are not supported!
- *  - [[org.apache.spark.sql.parquet.CatalystPrimitiveRowConverter]] for rows
- *    of only primitive element types
- *  - [[org.apache.spark.sql.parquet.CatalystGroupConverter]] for other nested
- *    records, including the top-level row record
- *
- */
private[sql] object CatalystConverter {
- // The type internally used for fields
- type FieldType = StructField
-
// This is mostly Parquet convention (see, e.g., `ConversionPatterns`).
// Note that "array" for the array elements is chosen by ParquetAvro.
// Using a different value will result in Parquet silently dropping columns.
val ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME = "bag"
val ARRAY_ELEMENTS_SCHEMA_NAME = "array"
- // SPARK-4520: Thrift generated parquet files have different array element
- // schema names than avro. Thrift parquet uses array_schema_name + "_tuple"
- // as opposed to "array" used by default. For more information, check
- // TestThriftSchemaConverter.java in parquet.thrift.
- val THRIFT_ARRAY_ELEMENTS_SCHEMA_NAME_SUFFIX = "_tuple"
+
val MAP_KEY_SCHEMA_NAME = "key"
val MAP_VALUE_SCHEMA_NAME = "value"
val MAP_SCHEMA_NAME = "map"
@@ -80,787 +34,4 @@ private[sql] object CatalystConverter {
type ArrayScalaType[T] = Seq[T]
type StructScalaType[T] = InternalRow
type MapScalaType[K, V] = Map[K, V]
-
- protected[parquet] def createConverter(
- field: FieldType,
- fieldIndex: Int,
- parent: CatalystConverter): Converter = {
- val fieldType: DataType = field.dataType
- fieldType match {
- case udt: UserDefinedType[_] => {
- createConverter(field.copy(dataType = udt.sqlType), fieldIndex, parent)
- }
- // For native JVM types we use a converter with native arrays
- case ArrayType(elementType: AtomicType, false) => {
- new CatalystNativeArrayConverter(elementType, fieldIndex, parent)
- }
- // This is for other types of arrays, including those with nested fields
- case ArrayType(elementType: DataType, false) => {
- new CatalystArrayConverter(elementType, fieldIndex, parent)
- }
- case ArrayType(elementType: DataType, true) => {
- new CatalystArrayContainsNullConverter(elementType, fieldIndex, parent)
- }
- case StructType(fields: Array[StructField]) => {
- new CatalystStructConverter(fields, fieldIndex, parent)
- }
- case MapType(keyType: DataType, valueType: DataType, valueContainsNull: Boolean) => {
- new CatalystMapConverter(
- Array(
- new FieldType(MAP_KEY_SCHEMA_NAME, keyType, false),
- new FieldType(MAP_VALUE_SCHEMA_NAME, valueType, valueContainsNull)),
- fieldIndex,
- parent)
- }
- // Strings, Shorts and Bytes do not have a corresponding type in Parquet
- // so we need to treat them separately
- case StringType =>
- new CatalystPrimitiveStringConverter(parent, fieldIndex)
- case ShortType => {
- new CatalystPrimitiveConverter(parent, fieldIndex) {
- override def addInt(value: Int): Unit =
- parent.updateShort(fieldIndex, value.asInstanceOf[ShortType.InternalType])
- }
- }
- case ByteType => {
- new CatalystPrimitiveConverter(parent, fieldIndex) {
- override def addInt(value: Int): Unit =
- parent.updateByte(fieldIndex, value.asInstanceOf[ByteType.InternalType])
- }
- }
- case DateType => {
- new CatalystPrimitiveConverter(parent, fieldIndex) {
- override def addInt(value: Int): Unit =
- parent.updateDate(fieldIndex, value.asInstanceOf[DateType.InternalType])
- }
- }
- case d: DecimalType => {
- new CatalystPrimitiveConverter(parent, fieldIndex) {
- override def addBinary(value: Binary): Unit =
- parent.updateDecimal(fieldIndex, value, d)
- }
- }
- case TimestampType => {
- new CatalystPrimitiveConverter(parent, fieldIndex) {
- override def addBinary(value: Binary): Unit =
- parent.updateTimestamp(fieldIndex, value)
- }
- }
- // All other primitive types use the default converter
- case ctype: DataType if ParquetTypesConverter.isPrimitiveType(ctype) => {
- // note: need the type tag here!
- new CatalystPrimitiveConverter(parent, fieldIndex)
- }
- case _ => throw new RuntimeException(
- s"unable to convert datatype ${field.dataType.toString} in CatalystConverter")
- }
- }
-
- protected[parquet] def createRootConverter(
- parquetSchema: MessageType,
- attributes: Seq[Attribute]): CatalystConverter = {
- // For non-nested types we use the optimized Row converter
- if (attributes.forall(a => ParquetTypesConverter.isPrimitiveType(a.dataType))) {
- new CatalystPrimitiveRowConverter(attributes.toArray)
- } else {
- new CatalystGroupConverter(attributes.toArray)
- }
- }
-}
-
-private[parquet] abstract class CatalystConverter extends GroupConverter {
- /**
- * The number of fields this group has
- */
- protected[parquet] val size: Int
-
- /**
- * The index of this converter in the parent
- */
- protected[parquet] val index: Int
-
- /**
- * The parent converter
- */
- protected[parquet] val parent: CatalystConverter
-
- /**
- * Called by child converters to update their value in its parent (this).
- * Note that if possible the more specific update methods below should be used
- * to avoid auto-boxing of native JVM types.
- *
- * @param fieldIndex
- * @param value
- */
- protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit
-
- protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateDate(fieldIndex: Int, value: Int): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit =
- updateField(fieldIndex, value)
-
- protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit =
- updateField(fieldIndex, value.getBytes)
-
- protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit =
- updateField(fieldIndex, UTF8String.fromBytes(value))
-
- protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit =
- updateField(fieldIndex, readTimestamp(value))
-
- protected[parquet] def updateDecimal(fieldIndex: Int, value: Binary, ctype: DecimalType): Unit =
- updateField(fieldIndex, readDecimal(new Decimal(), value, ctype))
-
- protected[parquet] def isRootConverter: Boolean = parent == null
-
- protected[parquet] def clearBuffer(): Unit
-
- /**
- * Should only be called in the root (group) converter!
- *
- * @return
- */
- def getCurrentRecord: InternalRow = throw new UnsupportedOperationException
-
- /**
- * Read a decimal value from a Parquet Binary into "dest". Only supports decimals that fit in
- * a long (i.e. precision <= 18)
- *
- * Returned value is needed by CatalystConverter, which doesn't reuse the Decimal object.
- */
- protected[parquet] def readDecimal(dest: Decimal, value: Binary, ctype: DecimalType): Decimal = {
- val precision = ctype.precisionInfo.get.precision
- val scale = ctype.precisionInfo.get.scale
- val bytes = value.getBytes
- require(bytes.length <= 16, "Decimal field too large to read")
- var unscaled = 0L
- var i = 0
- while (i < bytes.length) {
- unscaled = (unscaled << 8) | (bytes(i) & 0xFF)
- i += 1
- }
- // Make sure unscaled has the right sign, by sign-extending the first bit
- val numBits = 8 * bytes.length
- unscaled = (unscaled << (64 - numBits)) >> (64 - numBits)
- dest.set(unscaled, precision, scale)
- }
-
- /**
- * Read a Timestamp value from a Parquet Int96Value
- */
- protected[parquet] def readTimestamp(value: Binary): Long = {
- Preconditions.checkArgument(value.length() == 12, "Must be 12 bytes")
- val buf = value.toByteBuffer
- buf.order(ByteOrder.LITTLE_ENDIAN)
- val timeOfDayNanos = buf.getLong
- val julianDay = buf.getInt
- DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos)
- }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that is able to convert a Parquet record
- * to a [[org.apache.spark.sql.catalyst.expressions.InternalRow]] object.
- *
- * @param schema The corresponding Catalyst schema in the form of a list of attributes.
- */
-private[parquet] class CatalystGroupConverter(
- protected[parquet] val schema: Array[FieldType],
- protected[parquet] val index: Int,
- protected[parquet] val parent: CatalystConverter,
- protected[parquet] var current: ArrayBuffer[Any],
- protected[parquet] var buffer: ArrayBuffer[InternalRow])
- extends CatalystConverter {
-
- def this(schema: Array[FieldType], index: Int, parent: CatalystConverter) =
- this(
- schema,
- index,
- parent,
- current = null,
- buffer = new ArrayBuffer[InternalRow](
- CatalystArrayConverter.INITIAL_ARRAY_SIZE))
-
- /**
- * This constructor is used for the root converter only!
- */
- def this(attributes: Array[Attribute]) =
- this(attributes.map(a => new FieldType(a.name, a.dataType, a.nullable)), 0, null)
-
- protected [parquet] val converters: Array[Converter] =
- schema.zipWithIndex.map {
- case (field, idx) => CatalystConverter.createConverter(field, idx, this)
- }.toArray
-
- override val size = schema.size
-
- override def getCurrentRecord: InternalRow = {
- assert(isRootConverter, "getCurrentRecord should only be called in root group converter!")
- // TODO: use iterators if possible
- // Note: this will ever only be called in the root converter when the record has been
- // fully processed. Therefore it will be difficult to use mutable rows instead, since
- // any non-root converter never would be sure when it would be safe to re-use the buffer.
- new GenericInternalRow(current.toArray)
- }
-
- override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
-
- // for child converters to update upstream values
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
- current.update(fieldIndex, value)
- }
-
- override protected[parquet] def clearBuffer(): Unit = buffer.clear()
-
- override def start(): Unit = {
- current = ArrayBuffer.fill(size)(null)
- converters.foreach { converter =>
- if (!converter.isPrimitive) {
- converter.asInstanceOf[CatalystConverter].clearBuffer()
- }
- }
- }
-
- override def end(): Unit = {
- if (!isRootConverter) {
- assert(current != null) // there should be no empty groups
- buffer.append(new GenericInternalRow(current.toArray))
- parent.updateField(index, new GenericInternalRow(buffer.toArray.asInstanceOf[Array[Any]]))
- }
- }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that is able to convert a Parquet record
- * to a [[org.apache.spark.sql.catalyst.expressions.InternalRow]] object. Note that his
- * converter is optimized for rows of primitive types (non-nested records).
- */
-private[parquet] class CatalystPrimitiveRowConverter(
- protected[parquet] val schema: Array[FieldType],
- protected[parquet] var current: MutableRow)
- extends CatalystConverter {
-
- // This constructor is used for the root converter only
- def this(attributes: Array[Attribute]) =
- this(
- attributes.map(a => new FieldType(a.name, a.dataType, a.nullable)),
- new SpecificMutableRow(attributes.map(_.dataType)))
-
- protected [parquet] val converters: Array[Converter] =
- schema.zipWithIndex.map {
- case (field, idx) => CatalystConverter.createConverter(field, idx, this)
- }.toArray
-
- override val size = schema.size
-
- override val index = 0
-
- override val parent = null
-
- // Should be only called in root group converter!
- override def getCurrentRecord: InternalRow = current
-
- override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
-
- // for child converters to update upstream values
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
- throw new UnsupportedOperationException // child converters should use the
- // specific update methods below
- }
-
- override protected[parquet] def clearBuffer(): Unit = {}
-
- override def start(): Unit = {
- var i = 0
- while (i < size) {
- current.setNullAt(i)
- i = i + 1
- }
- }
-
- override def end(): Unit = {}
-
- // Overridden here to avoid auto-boxing for primitive types
- override protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit =
- current.setBoolean(fieldIndex, value)
-
- override protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit =
- current.setInt(fieldIndex, value)
-
- override protected[parquet] def updateDate(fieldIndex: Int, value: Int): Unit =
- current.setInt(fieldIndex, value)
-
- override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
- current.setLong(fieldIndex, value)
-
- override protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit =
- current.setShort(fieldIndex, value)
-
- override protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit =
- current.setByte(fieldIndex, value)
-
- override protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit =
- current.setDouble(fieldIndex, value)
-
- override protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit =
- current.setFloat(fieldIndex, value)
-
- override protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit =
- current.update(fieldIndex, value.getBytes)
-
- override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit =
- current.update(fieldIndex, UTF8String.fromBytes(value))
-
- override protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit =
- current.setLong(fieldIndex, readTimestamp(value))
-
- override protected[parquet] def updateDecimal(
- fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = {
- var decimal = current(fieldIndex).asInstanceOf[Decimal]
- if (decimal == null) {
- decimal = new Decimal
- current(fieldIndex) = decimal
- }
- readDecimal(decimal, value, ctype)
- }
-}
-
-/**
- * A `parquet.io.api.PrimitiveConverter` that converts Parquet types to Catalyst types.
- *
- * @param parent The parent group converter.
- * @param fieldIndex The index inside the record.
- */
-private[parquet] class CatalystPrimitiveConverter(
- parent: CatalystConverter,
- fieldIndex: Int) extends PrimitiveConverter {
- override def addBinary(value: Binary): Unit =
- parent.updateBinary(fieldIndex, value)
-
- override def addBoolean(value: Boolean): Unit =
- parent.updateBoolean(fieldIndex, value)
-
- override def addDouble(value: Double): Unit =
- parent.updateDouble(fieldIndex, value)
-
- override def addFloat(value: Float): Unit =
- parent.updateFloat(fieldIndex, value)
-
- override def addInt(value: Int): Unit =
- parent.updateInt(fieldIndex, value)
-
- override def addLong(value: Long): Unit =
- parent.updateLong(fieldIndex, value)
-}
-
-/**
- * A `parquet.io.api.PrimitiveConverter` that converts Parquet Binary to Catalyst String.
- * Supports dictionaries to reduce Binary to String conversion overhead.
- *
- * Follows pattern in Parquet of using dictionaries, where supported, for String conversion.
- *
- * @param parent The parent group converter.
- * @param fieldIndex The index inside the record.
- */
-private[parquet] class CatalystPrimitiveStringConverter(parent: CatalystConverter, fieldIndex: Int)
- extends CatalystPrimitiveConverter(parent, fieldIndex) {
-
- private[this] var dict: Array[Array[Byte]] = null
-
- override def hasDictionarySupport: Boolean = true
-
- override def setDictionary(dictionary: Dictionary): Unit =
- dict = Array.tabulate(dictionary.getMaxId + 1) { dictionary.decodeToBinary(_).getBytes }
-
- override def addValueFromDictionary(dictionaryId: Int): Unit =
- parent.updateString(fieldIndex, dict(dictionaryId))
-
- override def addBinary(value: Binary): Unit =
- parent.updateString(fieldIndex, value.getBytes)
-}
-
-private[parquet] object CatalystArrayConverter {
- val INITIAL_ARRAY_SIZE = 20
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts a single-element groups that
- * match the characteristics of an array (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.ArrayType]].
- *
- * @param elementType The type of the array elements (complex or primitive)
- * @param index The position of this (array) field inside its parent converter
- * @param parent The parent converter
- * @param buffer A data buffer
- */
-private[parquet] class CatalystArrayConverter(
- val elementType: DataType,
- val index: Int,
- protected[parquet] val parent: CatalystConverter,
- protected[parquet] var buffer: Buffer[Any])
- extends CatalystConverter {
-
- def this(elementType: DataType, index: Int, parent: CatalystConverter) =
- this(
- elementType,
- index,
- parent,
- new ArrayBuffer[Any](CatalystArrayConverter.INITIAL_ARRAY_SIZE))
-
- protected[parquet] val converter: Converter = CatalystConverter.createConverter(
- new CatalystConverter.FieldType(
- CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
- elementType,
- false),
- fieldIndex = 0,
- parent = this)
-
- override def getConverter(fieldIndex: Int): Converter = converter
-
- // arrays have only one (repeated) field, which is its elements
- override val size = 1
-
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
- // fieldIndex is ignored (assumed to be zero but not checked)
- if (value == null) {
- throw new IllegalArgumentException("Null values inside Parquet arrays are not supported!")
- }
- buffer += value
- }
-
- override protected[parquet] def clearBuffer(): Unit = {
- buffer.clear()
- }
-
- override def start(): Unit = {
- if (!converter.isPrimitive) {
- converter.asInstanceOf[CatalystConverter].clearBuffer()
- }
- }
-
- override def end(): Unit = {
- assert(parent != null)
- // here we need to make sure to use ArrayScalaType
- parent.updateField(index, buffer.toArray.toSeq)
- clearBuffer()
- }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts a single-element groups that
- * match the characteristics of an array (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.ArrayType]].
- *
- * @param elementType The type of the array elements (native)
- * @param index The position of this (array) field inside its parent converter
- * @param parent The parent converter
- * @param capacity The (initial) capacity of the buffer
- */
-private[parquet] class CatalystNativeArrayConverter(
- val elementType: AtomicType,
- val index: Int,
- protected[parquet] val parent: CatalystConverter,
- protected[parquet] var capacity: Int = CatalystArrayConverter.INITIAL_ARRAY_SIZE)
- extends CatalystConverter {
-
- type NativeType = elementType.InternalType
-
- private var buffer: Array[NativeType] = elementType.classTag.newArray(capacity)
-
- private var elements: Int = 0
-
- protected[parquet] val converter: Converter = CatalystConverter.createConverter(
- new CatalystConverter.FieldType(
- CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
- elementType,
- false),
- fieldIndex = 0,
- parent = this)
-
- override def getConverter(fieldIndex: Int): Converter = converter
-
- // arrays have only one (repeated) field, which is its elements
- override val size = 1
-
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit =
- throw new UnsupportedOperationException
-
- // Overridden here to avoid auto-boxing for primitive types
- override protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit = {
- checkGrowBuffer()
- buffer(elements) = value.getBytes.asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit = {
- checkGrowBuffer()
- buffer(elements) = UTF8String.fromBytes(value).asInstanceOf[NativeType]
- elements += 1
- }
-
- override protected[parquet] def clearBuffer(): Unit = {
- elements = 0
- }
-
- override def start(): Unit = {}
-
- override def end(): Unit = {
- assert(parent != null)
- // here we need to make sure to use ArrayScalaType
- parent.updateField(
- index,
- buffer.slice(0, elements).toSeq)
- clearBuffer()
- }
-
- private def checkGrowBuffer(): Unit = {
- if (elements >= capacity) {
- val newCapacity = 2 * capacity
- val tmp: Array[NativeType] = elementType.classTag.newArray(newCapacity)
- Array.copy(buffer, 0, tmp, 0, capacity)
- buffer = tmp
- capacity = newCapacity
- }
- }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts a single-element groups that
- * match the characteristics of an array contains null (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.ArrayType]].
- *
- * @param elementType The type of the array elements (complex or primitive)
- * @param index The position of this (array) field inside its parent converter
- * @param parent The parent converter
- * @param buffer A data buffer
- */
-private[parquet] class CatalystArrayContainsNullConverter(
- val elementType: DataType,
- val index: Int,
- protected[parquet] val parent: CatalystConverter,
- protected[parquet] var buffer: Buffer[Any])
- extends CatalystConverter {
-
- def this(elementType: DataType, index: Int, parent: CatalystConverter) =
- this(
- elementType,
- index,
- parent,
- new ArrayBuffer[Any](CatalystArrayConverter.INITIAL_ARRAY_SIZE))
-
- protected[parquet] val converter: Converter = new CatalystConverter {
-
- private var current: Any = null
-
- val converter = CatalystConverter.createConverter(
- new CatalystConverter.FieldType(
- CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
- elementType,
- false),
- fieldIndex = 0,
- parent = this)
-
- override def getConverter(fieldIndex: Int): Converter = converter
-
- override def end(): Unit = parent.updateField(index, current)
-
- override def start(): Unit = {
- current = null
- }
-
- override protected[parquet] val size: Int = 1
- override protected[parquet] val index: Int = 0
- override protected[parquet] val parent = CatalystArrayContainsNullConverter.this
-
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
- current = value
- }
-
- override protected[parquet] def clearBuffer(): Unit = {}
- }
-
- override def getConverter(fieldIndex: Int): Converter = converter
-
- // arrays have only one (repeated) field, which is its elements
- override val size = 1
-
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
- buffer += value
- }
-
- override protected[parquet] def clearBuffer(): Unit = {
- buffer.clear()
- }
-
- override def start(): Unit = {}
-
- override def end(): Unit = {
- assert(parent != null)
- // here we need to make sure to use ArrayScalaType
- parent.updateField(index, buffer.toArray.toSeq)
- clearBuffer()
- }
-}
-
-/**
- * This converter is for multi-element groups of primitive or complex types
- * that have repetition level optional or required (so struct fields).
- *
- * @param schema The corresponding Catalyst schema in the form of a list of
- * attributes.
- * @param index
- * @param parent
- */
-private[parquet] class CatalystStructConverter(
- override protected[parquet] val schema: Array[FieldType],
- override protected[parquet] val index: Int,
- override protected[parquet] val parent: CatalystConverter)
- extends CatalystGroupConverter(schema, index, parent) {
-
- override protected[parquet] def clearBuffer(): Unit = {}
-
- // TODO: think about reusing the buffer
- override def end(): Unit = {
- assert(!isRootConverter)
- // here we need to make sure to use StructScalaType
- // Note: we need to actually make a copy of the array since we
- // may be in a nested field
- parent.updateField(index, new GenericInternalRow(current.toArray))
- }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts two-element groups that
- * match the characteristics of a map (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.MapType]].
- *
- * @param schema
- * @param index
- * @param parent
- */
-private[parquet] class CatalystMapConverter(
- protected[parquet] val schema: Array[FieldType],
- override protected[parquet] val index: Int,
- override protected[parquet] val parent: CatalystConverter)
- extends CatalystConverter {
-
- private val map = new HashMap[Any, Any]()
-
- private val keyValueConverter = new CatalystConverter {
- private var currentKey: Any = null
- private var currentValue: Any = null
- val keyConverter = CatalystConverter.createConverter(schema(0), 0, this)
- val valueConverter = CatalystConverter.createConverter(schema(1), 1, this)
-
- override def getConverter(fieldIndex: Int): Converter = {
- if (fieldIndex == 0) keyConverter else valueConverter
- }
-
- override def end(): Unit = CatalystMapConverter.this.map += currentKey -> currentValue
-
- override def start(): Unit = {
- currentKey = null
- currentValue = null
- }
-
- override protected[parquet] val size: Int = 2
- override protected[parquet] val index: Int = 0
- override protected[parquet] val parent: CatalystConverter = CatalystMapConverter.this
-
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
- fieldIndex match {
- case 0 =>
- currentKey = value
- case 1 =>
- currentValue = value
- case _ =>
- new RuntimePermission(s"trying to update Map with fieldIndex $fieldIndex")
- }
- }
-
- override protected[parquet] def clearBuffer(): Unit = {}
- }
-
- override protected[parquet] val size: Int = 1
-
- override protected[parquet] def clearBuffer(): Unit = {}
-
- override def start(): Unit = {
- map.clear()
- }
-
- override def end(): Unit = {
- // here we need to make sure to use MapScalaType
- parent.updateField(index, map.toMap)
- }
-
- override def getConverter(fieldIndex: Int): Converter = keyValueConverter
-
- override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit =
- throw new UnsupportedOperationException
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 8402cd756140d..e8851ddb68026 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -17,14 +17,17 @@
package org.apache.spark.sql.parquet
-import java.nio.{ByteOrder, ByteBuffer}
+import java.nio.{ByteBuffer, ByteOrder}
+import java.util
import java.util.{HashMap => JHashMap}
+import scala.collection.JavaConversions._
+
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
-import org.apache.parquet.hadoop.api.{ReadSupport, WriteSupport}
+import org.apache.parquet.hadoop.api.{InitContext, ReadSupport, WriteSupport}
import org.apache.parquet.io.api._
import org.apache.parquet.schema.MessageType
@@ -36,87 +39,133 @@ import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
- * A `parquet.io.api.RecordMaterializer` for Rows.
+ * A [[RecordMaterializer]] for Catalyst rows.
*
- *@param root The root group converter for the record.
+ * @param parquetSchema Parquet schema of the records to be read
+ * @param catalystSchema Catalyst schema of the rows to be constructed
*/
-private[parquet] class RowRecordMaterializer(root: CatalystConverter)
+private[parquet] class RowRecordMaterializer(parquetSchema: MessageType, catalystSchema: StructType)
extends RecordMaterializer[InternalRow] {
- def this(parquetSchema: MessageType, attributes: Seq[Attribute]) =
- this(CatalystConverter.createRootConverter(parquetSchema, attributes))
+ private val rootConverter = new CatalystRowConverter(parquetSchema, catalystSchema, NoopUpdater)
- override def getCurrentRecord: InternalRow = root.getCurrentRecord
+ override def getCurrentRecord: InternalRow = rootConverter.currentRow
- override def getRootConverter: GroupConverter = root.asInstanceOf[GroupConverter]
+ override def getRootConverter: GroupConverter = rootConverter
}
-/**
- * A `parquet.hadoop.api.ReadSupport` for Row objects.
- */
private[parquet] class RowReadSupport extends ReadSupport[InternalRow] with Logging {
-
override def prepareForRead(
conf: Configuration,
- stringMap: java.util.Map[String, String],
+ keyValueMetaData: util.Map[String, String],
fileSchema: MessageType,
readContext: ReadContext): RecordMaterializer[InternalRow] = {
- log.debug(s"preparing for read with Parquet file schema $fileSchema")
- // Note: this very much imitates AvroParquet
- val parquetSchema = readContext.getRequestedSchema
- var schema: Seq[Attribute] = null
-
- if (readContext.getReadSupportMetadata != null) {
- // first try to find the read schema inside the metadata (can result from projections)
- if (
- readContext
- .getReadSupportMetadata
- .get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA) != null) {
- schema = ParquetTypesConverter.convertFromString(
- readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA))
- } else {
- // if unavailable, try the schema that was read originally from the file or provided
- // during the creation of the Parquet relation
- if (readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_METADATA_KEY) != null) {
- schema = ParquetTypesConverter.convertFromString(
- readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_METADATA_KEY))
- }
+ log.debug(s"Preparing for read Parquet file with message type: $fileSchema")
+
+ val toCatalyst = new CatalystSchemaConverter(conf)
+ val parquetRequestedSchema = readContext.getRequestedSchema
+
+ val catalystRequestedSchema =
+ Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata =>
+ metadata
+ // First tries to read requested schema, which may result from projections
+ .get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA)
+ // If not available, tries to read the Catalyst schema from the file metadata. It's only
+ // available if the target file was written by Spark SQL.
+ .orElse(metadata.get(RowReadSupport.SPARK_METADATA_KEY))
+ }.map(StructType.fromString).getOrElse {
+ logDebug("Catalyst schema not available, falling back to Parquet schema")
+ toCatalyst.convert(parquetRequestedSchema)
}
- }
- // if both unavailable, fall back to deducing the schema from the given Parquet schema
- // TODO: Why it can be null?
- if (schema == null) {
- log.debug("falling back to Parquet read schema")
- schema = ParquetTypesConverter.convertToAttributes(parquetSchema, false, true)
- }
- log.debug(s"list of attributes that will be read: $schema")
- new RowRecordMaterializer(parquetSchema, schema)
+
+ logDebug(s"Catalyst schema used to read Parquet files: $catalystRequestedSchema")
+ new RowRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema)
}
- override def init(
- configuration: Configuration,
- keyValueMetaData: java.util.Map[String, String],
- fileSchema: MessageType): ReadContext = {
- var parquetSchema = fileSchema
- val metadata = new JHashMap[String, String]()
- val requestedAttributes = RowReadSupport.getRequestedSchema(configuration)
-
- if (requestedAttributes != null) {
- // If the parquet file is thrift derived, there is a good chance that
- // it will have the thrift class in metadata.
- val isThriftDerived = keyValueMetaData.keySet().contains("thrift.class")
- parquetSchema = ParquetTypesConverter.convertFromAttributes(requestedAttributes)
- metadata.put(
- RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
- ParquetTypesConverter.convertToString(requestedAttributes))
- }
+ override def init(context: InitContext): ReadContext = {
+ val conf = context.getConfiguration
+
+ // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst
+ // schema of this file from its metadata.
+ val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA))
+
+ // Optional schema of requested columns, in the form of a string serialized from a Catalyst
+ // `StructType` containing all requested columns.
+ val maybeRequestedSchema = Option(conf.get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA))
+
+ // Below we construct a Parquet schema containing all requested columns. This schema tells
+ // Parquet which columns to read.
+ //
+ // If `maybeRequestedSchema` is defined, we assemble an equivalent Parquet schema. Otherwise,
+ // we have to fall back to the full file schema, which contains all columns in the file.
+ // Obviously this may waste I/O bandwidth since it may read more columns than requested.
+ //
+ // Two things to note:
+ //
+ // 1. It's possible that some requested columns don't exist in the target Parquet file. For
+ // example, in the case of schema merging, the globally merged schema may contain extra
+ // columns gathered from other Parquet files. These columns will be simply filled with nulls
+ // when actually reading the target Parquet file.
+ //
+ // 2. When `maybeRequestedSchema` is available, we can't simply convert the Catalyst schema to
+ // Parquet schema using `CatalystSchemaConverter`, because the mapping is not unique due to
+ // non-standard behaviors of some Parquet libraries/tools. For example, a Parquet file
+ // containing a single integer array field `f1` may have the following legacy 2-level
+ // structure:
+ //
+ // message root {
+ // optional group f1 (LIST) {
+ // required INT32 element;
+ // }
+ // }
+ //
+ // while `CatalystSchemaConverter` may generate a standard 3-level structure:
+ //
+ // message root {
+ // optional group f1 (LIST) {
+ // repeated group list {
+ // required INT32 element;
+ // }
+ // }
+ // }
+ //
+ // Clearly, we can't use the second schema to read the target Parquet file, because the two
+ // have different physical structures.
+ val parquetRequestedSchema =
+ maybeRequestedSchema.fold(context.getFileSchema) { schemaString =>
+ val toParquet = new CatalystSchemaConverter(conf)
+ val fileSchema = context.getFileSchema.asGroupType()
+ val fileFieldNames = fileSchema.getFields.map(_.getName).toSet
+
+ StructType
+ // Deserializes the Catalyst schema of requested columns
+ .fromString(schemaString)
+ .map { field =>
+ if (fileFieldNames.contains(field.name)) {
+ // If the field exists in the target Parquet file, extracts the field type from the
+ // full file schema and makes a single-field Parquet schema
+ new MessageType("root", fileSchema.getType(field.name))
+ } else {
+ // Otherwise, just resorts to `CatalystSchemaConverter`
+ toParquet.convert(StructType(Array(field)))
+ }
+ }
+ // Merges all single-field Parquet schemas to form a complete schema for all requested
+ // columns. Note that it's possible that no columns are requested at all (e.g., when counting
+ // a partition column of a partitioned Parquet table). That's why `fold` is used here,
+ // with an empty Parquet schema as the fallback.
+ .fold(new MessageType("root")) {
+ _ union _
+ }
+ }
- val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
- if (origAttributesStr != null) {
- metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr)
- }
+ val metadata =
+ Map.empty[String, String] ++
+ maybeRequestedSchema.map(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++
+ maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _)
- new ReadSupport.ReadContext(parquetSchema, metadata)
+ logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema")
+ new ReadContext(parquetRequestedSchema, metadata)
}
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 6bc69c6ad0847..5464624e58e2f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -257,6 +257,10 @@ private[sql] class ParquetRelation2(
broadcastedConf: Broadcast[SerializableConfiguration]): RDD[Row] = {
val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA)
val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown
+ val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
+ val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
+ val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
+
// Create the function to set variable Parquet confs at both driver and executor side.
val initLocalJobFuncOpt =
ParquetRelation2.initializeLocalJobFunc(
@@ -264,7 +268,11 @@ private[sql] class ParquetRelation2(
filters,
dataSchema,
useMetadataCache,
- parquetFilterPushDown) _
+ parquetFilterPushDown,
+ assumeBinaryIsString,
+ assumeInt96IsTimestamp,
+ followParquetFormatSpec) _
+
// Create the function to set input paths at the driver side.
val setInputPaths = ParquetRelation2.initializeDriverSideJobFunc(inputFiles) _
@@ -469,9 +477,12 @@ private[sql] object ParquetRelation2 extends Logging {
filters: Array[Filter],
dataSchema: StructType,
useMetadataCache: Boolean,
- parquetFilterPushDown: Boolean)(job: Job): Unit = {
+ parquetFilterPushDown: Boolean,
+ assumeBinaryIsString: Boolean,
+ assumeInt96IsTimestamp: Boolean,
+ followParquetFormatSpec: Boolean)(job: Job): Unit = {
val conf = job.getConfiguration
- conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[RowReadSupport].getName())
+ conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[RowReadSupport].getName)
// Try to push down filters when filter push-down is enabled.
if (parquetFilterPushDown) {
@@ -495,6 +506,11 @@ private[sql] object ParquetRelation2 extends Logging {
// Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache)
+
+ // Sets flags for Parquet schema conversion
+ conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString)
+ conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp)
+ conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec)
}
/** This closure sets input paths at the driver side. */
diff --git a/sql/core/src/test/README.md b/sql/core/src/test/README.md
new file mode 100644
index 0000000000000..3dd9861b4896d
--- /dev/null
+++ b/sql/core/src/test/README.md
@@ -0,0 +1,33 @@
+# Notes for Parquet compatibility tests
+
+The following directories and files are used for Parquet compatibility tests:
+
+```
+.
+├── README.md # This file
+├── avro
+│  ├── parquet-compat.avdl # Testing Avro IDL
+│  └── parquet-compat.avpr # !! NO TOUCH !! Protocol file generated from parquet-compat.avdl
+├── gen-java # !! NO TOUCH !! Generated Java code
+├── scripts
+│  └── gen-code.sh # Script used to generate Java code for Thrift and Avro
+└── thrift
+ └── parquet-compat.thrift # Testing Thrift schema
+```
+
+The generated Java code is used in the following test suites:
+
+- `org.apache.spark.sql.parquet.ParquetAvroCompatibilitySuite`
+- `org.apache.spark.sql.parquet.ParquetThriftCompatibilitySuite`
+
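+These suites can be run individually; a minimal sketch, assuming the standard Spark SBT workflow and that `sql` is the SBT project name of the `sql/core` module:
+
+```bash
+$ build/sbt "sql/test-only org.apache.spark.sql.parquet.ParquetAvroCompatibilitySuite"
+```
+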
+To avoid code generation at build time, the Java code generated from the testing Thrift schema and Avro IDL is also checked in.
+
+When updating the testing Thrift schema and Avro IDL, please run `gen-code.sh` to update all the generated Java code.
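+
+For example, a minimal invocation, assuming `gen-code.sh` takes no arguments and is run from the `sql/core/src/test` directory:
+
+```bash
+$ cd sql/core/src/test
+$ ./scripts/gen-code.sh
+```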
+
+## Prerequisites
+
+Please ensure `avro-tools` and `thrift` are installed. You may install these two on Mac OS X via:
+
+```bash
+$ brew install thrift avro-tools
+```
diff --git a/sql/core/src/test/avro/parquet-compat.avdl b/sql/core/src/test/avro/parquet-compat.avdl
new file mode 100644
index 0000000000000..24729f6143e6c
--- /dev/null
+++ b/sql/core/src/test/avro/parquet-compat.avdl
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This protocol is used for testing parquet-avro compatibility.
+@namespace("org.apache.spark.sql.parquet.test.avro")
+protocol CompatibilityTest {
+ record Nested {
+ array<int> nested_ints_column;
+ string nested_string_column;
+ }
+
+ record ParquetAvroCompat {
+ boolean bool_column;
+ int int_column;
+ long long_column;
+ float float_column;
+ double double_column;
+ bytes binary_column;
+ string string_column;
+
+ union { null, boolean } maybe_bool_column;
+ union { null, int } maybe_int_column;
+ union { null, long } maybe_long_column;
+ union { null, float } maybe_float_column;
+ union { null, double } maybe_double_column;
+ union { null, bytes } maybe_binary_column;
+ union { null, string } maybe_string_column;
+
+ array<string> strings_column;
+ map<int> string_to_int_column;
+ map<array<Nested>> complex_column;
+ }
+}
diff --git a/sql/core/src/test/avro/parquet-compat.avpr b/sql/core/src/test/avro/parquet-compat.avpr
new file mode 100644
index 0000000000000..a83b7c990dd2e
--- /dev/null
+++ b/sql/core/src/test/avro/parquet-compat.avpr
@@ -0,0 +1,86 @@
+{
+ "protocol" : "CompatibilityTest",
+ "namespace" : "org.apache.spark.sql.parquet.test.avro",
+ "types" : [ {
+ "type" : "record",
+ "name" : "Nested",
+ "fields" : [ {
+ "name" : "nested_ints_column",
+ "type" : {
+ "type" : "array",
+ "items" : "int"
+ }
+ }, {
+ "name" : "nested_string_column",
+ "type" : "string"
+ } ]
+ }, {
+ "type" : "record",
+ "name" : "ParquetAvroCompat",
+ "fields" : [ {
+ "name" : "bool_column",
+ "type" : "boolean"
+ }, {
+ "name" : "int_column",
+ "type" : "int"
+ }, {
+ "name" : "long_column",
+ "type" : "long"
+ }, {
+ "name" : "float_column",
+ "type" : "float"
+ }, {
+ "name" : "double_column",
+ "type" : "double"
+ }, {
+ "name" : "binary_column",
+ "type" : "bytes"
+ }, {
+ "name" : "string_column",
+ "type" : "string"
+ }, {
+ "name" : "maybe_bool_column",
+ "type" : [ "null", "boolean" ]
+ }, {
+ "name" : "maybe_int_column",
+ "type" : [ "null", "int" ]
+ }, {
+ "name" : "maybe_long_column",
+ "type" : [ "null", "long" ]
+ }, {
+ "name" : "maybe_float_column",
+ "type" : [ "null", "float" ]
+ }, {
+ "name" : "maybe_double_column",
+ "type" : [ "null", "double" ]
+ }, {
+ "name" : "maybe_binary_column",
+ "type" : [ "null", "bytes" ]
+ }, {
+ "name" : "maybe_string_column",
+ "type" : [ "null", "string" ]
+ }, {
+ "name" : "strings_column",
+ "type" : {
+ "type" : "array",
+ "items" : "string"
+ }
+ }, {
+ "name" : "string_to_int_column",
+ "type" : {
+ "type" : "map",
+ "values" : "int"
+ }
+ }, {
+ "name" : "complex_column",
+ "type" : {
+ "type" : "map",
+ "values" : {
+ "type" : "array",
+ "items" : "Nested"
+ }
+ }
+ } ]
+ } ],
+ "messages" : { }
+}
\ No newline at end of file
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java
new file mode 100644
index 0000000000000..daec65a5bbe57
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java
@@ -0,0 +1,17 @@
+/**
+ * Autogenerated by Avro
+ *
+ * DO NOT EDIT DIRECTLY
+ */
+package org.apache.spark.sql.parquet.test.avro;
+
+@SuppressWarnings("all")
+@org.apache.avro.specific.AvroGenerated
+public interface CompatibilityTest {
+ public static final org.apache.avro.Protocol PROTOCOL = org.apache.avro.Protocol.parse("{\"protocol\":\"CompatibilityTest\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"types\":[{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"Nested\"},\"avro.java.string\":\"String\"}}]}],\"messages\":{}}");
+
+ @SuppressWarnings("all")
+ public interface Callback extends CompatibilityTest {
+ public static final org.apache.avro.Protocol PROTOCOL = org.apache.spark.sql.parquet.test.avro.CompatibilityTest.PROTOCOL;
+ }
+}
\ No newline at end of file
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java
new file mode 100644
index 0000000000000..051f1ee903863
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java
@@ -0,0 +1,196 @@
+/**
+ * Autogenerated by Avro
+ *
+ * DO NOT EDIT DIRECTLY
+ */
+package org.apache.spark.sql.parquet.test.avro;
+@SuppressWarnings("all")
+@org.apache.avro.specific.AvroGenerated
+public class Nested extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
+ public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Nested\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}");
+ public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }
+ @Deprecated public java.util.List<java.lang.Integer> nested_ints_column;
+ @Deprecated public java.lang.String nested_string_column;
+
+ /**
+ * Default constructor. Note that this does not initialize fields
+ * to their default values from the schema. If that is desired then
+ * one should use newBuilder().
+ */
+ public Nested() {}
+
+ /**
+ * All-args constructor.
+ */
+ public Nested(java.util.List<java.lang.Integer> nested_ints_column, java.lang.String nested_string_column) {
+ this.nested_ints_column = nested_ints_column;
+ this.nested_string_column = nested_string_column;
+ }
+
+ public org.apache.avro.Schema getSchema() { return SCHEMA$; }
+ // Used by DatumWriter. Applications should not call.
+ public java.lang.Object get(int field$) {
+ switch (field$) {
+ case 0: return nested_ints_column;
+ case 1: return nested_string_column;
+ default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+ }
+ }
+ // Used by DatumReader. Applications should not call.
+ @SuppressWarnings(value="unchecked")
+ public void put(int field$, java.lang.Object value$) {
+ switch (field$) {
+ case 0: nested_ints_column = (java.util.List<java.lang.Integer>)value$; break;
+ case 1: nested_string_column = (java.lang.String)value$; break;
+ default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+ }
+ }
+
+ /**
+ * Gets the value of the 'nested_ints_column' field.
+ */
+ public java.util.List<java.lang.Integer> getNestedIntsColumn() {
+ return nested_ints_column;
+ }
+
+ /**
+ * Sets the value of the 'nested_ints_column' field.
+ * @param value the value to set.
+ */
+ public void setNestedIntsColumn(java.util.List<java.lang.Integer> value) {
+ this.nested_ints_column = value;
+ }
+
+ /**
+ * Gets the value of the 'nested_string_column' field.
+ */
+ public java.lang.String getNestedStringColumn() {
+ return nested_string_column;
+ }
+
+ /**
+ * Sets the value of the 'nested_string_column' field.
+ * @param value the value to set.
+ */
+ public void setNestedStringColumn(java.lang.String value) {
+ this.nested_string_column = value;
+ }
+
+ /** Creates a new Nested RecordBuilder */
+ public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder() {
+ return new org.apache.spark.sql.parquet.test.avro.Nested.Builder();
+ }
+
+ /** Creates a new Nested RecordBuilder by copying an existing Builder */
+ public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.Nested.Builder other) {
+ return new org.apache.spark.sql.parquet.test.avro.Nested.Builder(other);
+ }
+
+ /** Creates a new Nested RecordBuilder by copying an existing Nested instance */
+ public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.Nested other) {
+ return new org.apache.spark.sql.parquet.test.avro.Nested.Builder(other);
+ }
+
+ /**
+ * RecordBuilder for Nested instances.
+ */
+ public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase<Nested>
+ implements org.apache.avro.data.RecordBuilder<Nested> {
+
+ private java.util.List<java.lang.Integer> nested_ints_column;
+ private java.lang.String nested_string_column;
+
+ /** Creates a new Builder */
+ private Builder() {
+ super(org.apache.spark.sql.parquet.test.avro.Nested.SCHEMA$);
+ }
+
+ /** Creates a Builder by copying an existing Builder */
+ private Builder(org.apache.spark.sql.parquet.test.avro.Nested.Builder other) {
+ super(other);
+ if (isValidValue(fields()[0], other.nested_ints_column)) {
+ this.nested_ints_column = data().deepCopy(fields()[0].schema(), other.nested_ints_column);
+ fieldSetFlags()[0] = true;
+ }
+ if (isValidValue(fields()[1], other.nested_string_column)) {
+ this.nested_string_column = data().deepCopy(fields()[1].schema(), other.nested_string_column);
+ fieldSetFlags()[1] = true;
+ }
+ }
+
+ /** Creates a Builder by copying an existing Nested instance */
+ private Builder(org.apache.spark.sql.parquet.test.avro.Nested other) {
+ super(org.apache.spark.sql.parquet.test.avro.Nested.SCHEMA$);
+ if (isValidValue(fields()[0], other.nested_ints_column)) {
+ this.nested_ints_column = data().deepCopy(fields()[0].schema(), other.nested_ints_column);
+ fieldSetFlags()[0] = true;
+ }
+ if (isValidValue(fields()[1], other.nested_string_column)) {
+ this.nested_string_column = data().deepCopy(fields()[1].schema(), other.nested_string_column);
+ fieldSetFlags()[1] = true;
+ }
+ }
+
+ /** Gets the value of the 'nested_ints_column' field */
+ public java.util.List<java.lang.Integer> getNestedIntsColumn() {
+ return nested_ints_column;
+ }
+
+ /** Sets the value of the 'nested_ints_column' field */
+ public org.apache.spark.sql.parquet.test.avro.Nested.Builder setNestedIntsColumn(java.util.List<java.lang.Integer> value) {
+ validate(fields()[0], value);
+ this.nested_ints_column = value;
+ fieldSetFlags()[0] = true;
+ return this;
+ }
+
+ /** Checks whether the 'nested_ints_column' field has been set */
+ public boolean hasNestedIntsColumn() {
+ return fieldSetFlags()[0];
+ }
+
+ /** Clears the value of the 'nested_ints_column' field */
+ public org.apache.spark.sql.parquet.test.avro.Nested.Builder clearNestedIntsColumn() {
+ nested_ints_column = null;
+ fieldSetFlags()[0] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'nested_string_column' field */
+ public java.lang.String getNestedStringColumn() {
+ return nested_string_column;
+ }
+
+ /** Sets the value of the 'nested_string_column' field */
+ public org.apache.spark.sql.parquet.test.avro.Nested.Builder setNestedStringColumn(java.lang.String value) {
+ validate(fields()[1], value);
+ this.nested_string_column = value;
+ fieldSetFlags()[1] = true;
+ return this;
+ }
+
+ /** Checks whether the 'nested_string_column' field has been set */
+ public boolean hasNestedStringColumn() {
+ return fieldSetFlags()[1];
+ }
+
+ /** Clears the value of the 'nested_string_column' field */
+ public org.apache.spark.sql.parquet.test.avro.Nested.Builder clearNestedStringColumn() {
+ nested_string_column = null;
+ fieldSetFlags()[1] = false;
+ return this;
+ }
+
+ @Override
+ public Nested build() {
+ try {
+ Nested record = new Nested();
+ record.nested_ints_column = fieldSetFlags()[0] ? this.nested_ints_column : (java.util.List<java.lang.Integer>) defaultValue(fields()[0]);
+ record.nested_string_column = fieldSetFlags()[1] ? this.nested_string_column : (java.lang.String) defaultValue(fields()[1]);
+ return record;
+ } catch (Exception e) {
+ throw new org.apache.avro.AvroRuntimeException(e);
+ }
+ }
+ }
+}
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java
new file mode 100644
index 0000000000000..354c9d73cca31
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java
@@ -0,0 +1,1001 @@
+/**
+ * Autogenerated by Avro
+ *
+ * DO NOT EDIT DIRECTLY
+ */
+package org.apache.spark.sql.parquet.test.avro;
+@SuppressWarnings("all")
+@org.apache.avro.specific.AvroGenerated
+public class ParquetAvroCompat extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
+ public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}},\"avro.java.string\":\"String\"}}]}");
+ public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }
+ @Deprecated public boolean bool_column;
+ @Deprecated public int int_column;
+ @Deprecated public long long_column;
+ @Deprecated public float float_column;
+ @Deprecated public double double_column;
+ @Deprecated public java.nio.ByteBuffer binary_column;
+ @Deprecated public java.lang.String string_column;
+ @Deprecated public java.lang.Boolean maybe_bool_column;
+ @Deprecated public java.lang.Integer maybe_int_column;
+ @Deprecated public java.lang.Long maybe_long_column;
+ @Deprecated public java.lang.Float maybe_float_column;
+ @Deprecated public java.lang.Double maybe_double_column;
+ @Deprecated public java.nio.ByteBuffer maybe_binary_column;
+ @Deprecated public java.lang.String maybe_string_column;
+ @Deprecated public java.util.List<java.lang.String> strings_column;
+ @Deprecated public java.util.Map<java.lang.String,java.lang.Integer> string_to_int_column;
+ @Deprecated public java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> complex_column;
+
+ /**
+ * Default constructor. Note that this does not initialize fields
+ * to their default values from the schema. If that is desired then
+ * one should use newBuilder().
+ */
+ public ParquetAvroCompat() {}
+
+ /**
+ * All-args constructor.
+ */
+ public ParquetAvroCompat(java.lang.Boolean bool_column, java.lang.Integer int_column, java.lang.Long long_column, java.lang.Float float_column, java.lang.Double double_column, java.nio.ByteBuffer binary_column, java.lang.String string_column, java.lang.Boolean maybe_bool_column, java.lang.Integer maybe_int_column, java.lang.Long maybe_long_column, java.lang.Float maybe_float_column, java.lang.Double maybe_double_column, java.nio.ByteBuffer maybe_binary_column, java.lang.String maybe_string_column, java.util.List<java.lang.String> strings_column, java.util.Map<java.lang.String,java.lang.Integer> string_to_int_column, java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> complex_column) {
+ this.bool_column = bool_column;
+ this.int_column = int_column;
+ this.long_column = long_column;
+ this.float_column = float_column;
+ this.double_column = double_column;
+ this.binary_column = binary_column;
+ this.string_column = string_column;
+ this.maybe_bool_column = maybe_bool_column;
+ this.maybe_int_column = maybe_int_column;
+ this.maybe_long_column = maybe_long_column;
+ this.maybe_float_column = maybe_float_column;
+ this.maybe_double_column = maybe_double_column;
+ this.maybe_binary_column = maybe_binary_column;
+ this.maybe_string_column = maybe_string_column;
+ this.strings_column = strings_column;
+ this.string_to_int_column = string_to_int_column;
+ this.complex_column = complex_column;
+ }
+
+ public org.apache.avro.Schema getSchema() { return SCHEMA$; }
+ // Used by DatumWriter. Applications should not call.
+ public java.lang.Object get(int field$) {
+ switch (field$) {
+ case 0: return bool_column;
+ case 1: return int_column;
+ case 2: return long_column;
+ case 3: return float_column;
+ case 4: return double_column;
+ case 5: return binary_column;
+ case 6: return string_column;
+ case 7: return maybe_bool_column;
+ case 8: return maybe_int_column;
+ case 9: return maybe_long_column;
+ case 10: return maybe_float_column;
+ case 11: return maybe_double_column;
+ case 12: return maybe_binary_column;
+ case 13: return maybe_string_column;
+ case 14: return strings_column;
+ case 15: return string_to_int_column;
+ case 16: return complex_column;
+ default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+ }
+ }
+ // Used by DatumReader. Applications should not call.
+ @SuppressWarnings(value="unchecked")
+ public void put(int field$, java.lang.Object value$) {
+ switch (field$) {
+ case 0: bool_column = (java.lang.Boolean)value$; break;
+ case 1: int_column = (java.lang.Integer)value$; break;
+ case 2: long_column = (java.lang.Long)value$; break;
+ case 3: float_column = (java.lang.Float)value$; break;
+ case 4: double_column = (java.lang.Double)value$; break;
+ case 5: binary_column = (java.nio.ByteBuffer)value$; break;
+ case 6: string_column = (java.lang.String)value$; break;
+ case 7: maybe_bool_column = (java.lang.Boolean)value$; break;
+ case 8: maybe_int_column = (java.lang.Integer)value$; break;
+ case 9: maybe_long_column = (java.lang.Long)value$; break;
+ case 10: maybe_float_column = (java.lang.Float)value$; break;
+ case 11: maybe_double_column = (java.lang.Double)value$; break;
+ case 12: maybe_binary_column = (java.nio.ByteBuffer)value$; break;
+ case 13: maybe_string_column = (java.lang.String)value$; break;
+ case 14: strings_column = (java.util.List<java.lang.String>)value$; break;
+ case 15: string_to_int_column = (java.util.Map<java.lang.String,java.lang.Integer>)value$; break;
+ case 16: complex_column = (java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>>)value$; break;
+ default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+ }
+ }
+
+ /**
+ * Gets the value of the 'bool_column' field.
+ */
+ public java.lang.Boolean getBoolColumn() {
+ return bool_column;
+ }
+
+ /**
+ * Sets the value of the 'bool_column' field.
+ * @param value the value to set.
+ */
+ public void setBoolColumn(java.lang.Boolean value) {
+ this.bool_column = value;
+ }
+
+ /**
+ * Gets the value of the 'int_column' field.
+ */
+ public java.lang.Integer getIntColumn() {
+ return int_column;
+ }
+
+ /**
+ * Sets the value of the 'int_column' field.
+ * @param value the value to set.
+ */
+ public void setIntColumn(java.lang.Integer value) {
+ this.int_column = value;
+ }
+
+ /**
+ * Gets the value of the 'long_column' field.
+ */
+ public java.lang.Long getLongColumn() {
+ return long_column;
+ }
+
+ /**
+ * Sets the value of the 'long_column' field.
+ * @param value the value to set.
+ */
+ public void setLongColumn(java.lang.Long value) {
+ this.long_column = value;
+ }
+
+ /**
+ * Gets the value of the 'float_column' field.
+ */
+ public java.lang.Float getFloatColumn() {
+ return float_column;
+ }
+
+ /**
+ * Sets the value of the 'float_column' field.
+ * @param value the value to set.
+ */
+ public void setFloatColumn(java.lang.Float value) {
+ this.float_column = value;
+ }
+
+ /**
+ * Gets the value of the 'double_column' field.
+ */
+ public java.lang.Double getDoubleColumn() {
+ return double_column;
+ }
+
+ /**
+ * Sets the value of the 'double_column' field.
+ * @param value the value to set.
+ */
+ public void setDoubleColumn(java.lang.Double value) {
+ this.double_column = value;
+ }
+
+ /**
+ * Gets the value of the 'binary_column' field.
+ */
+ public java.nio.ByteBuffer getBinaryColumn() {
+ return binary_column;
+ }
+
+ /**
+ * Sets the value of the 'binary_column' field.
+ * @param value the value to set.
+ */
+ public void setBinaryColumn(java.nio.ByteBuffer value) {
+ this.binary_column = value;
+ }
+
+ /**
+ * Gets the value of the 'string_column' field.
+ */
+ public java.lang.String getStringColumn() {
+ return string_column;
+ }
+
+ /**
+ * Sets the value of the 'string_column' field.
+ * @param value the value to set.
+ */
+ public void setStringColumn(java.lang.String value) {
+ this.string_column = value;
+ }
+
+ /**
+ * Gets the value of the 'maybe_bool_column' field.
+ */
+ public java.lang.Boolean getMaybeBoolColumn() {
+ return maybe_bool_column;
+ }
+
+ /**
+ * Sets the value of the 'maybe_bool_column' field.
+ * @param value the value to set.
+ */
+ public void setMaybeBoolColumn(java.lang.Boolean value) {
+ this.maybe_bool_column = value;
+ }
+
+ /**
+ * Gets the value of the 'maybe_int_column' field.
+ */
+ public java.lang.Integer getMaybeIntColumn() {
+ return maybe_int_column;
+ }
+
+ /**
+ * Sets the value of the 'maybe_int_column' field.
+ * @param value the value to set.
+ */
+ public void setMaybeIntColumn(java.lang.Integer value) {
+ this.maybe_int_column = value;
+ }
+
+ /**
+ * Gets the value of the 'maybe_long_column' field.
+ */
+ public java.lang.Long getMaybeLongColumn() {
+ return maybe_long_column;
+ }
+
+ /**
+ * Sets the value of the 'maybe_long_column' field.
+ * @param value the value to set.
+ */
+ public void setMaybeLongColumn(java.lang.Long value) {
+ this.maybe_long_column = value;
+ }
+
+ /**
+ * Gets the value of the 'maybe_float_column' field.
+ */
+ public java.lang.Float getMaybeFloatColumn() {
+ return maybe_float_column;
+ }
+
+ /**
+ * Sets the value of the 'maybe_float_column' field.
+ * @param value the value to set.
+ */
+ public void setMaybeFloatColumn(java.lang.Float value) {
+ this.maybe_float_column = value;
+ }
+
+ /**
+ * Gets the value of the 'maybe_double_column' field.
+ */
+ public java.lang.Double getMaybeDoubleColumn() {
+ return maybe_double_column;
+ }
+
+ /**
+ * Sets the value of the 'maybe_double_column' field.
+ * @param value the value to set.
+ */
+ public void setMaybeDoubleColumn(java.lang.Double value) {
+ this.maybe_double_column = value;
+ }
+
+ /**
+ * Gets the value of the 'maybe_binary_column' field.
+ */
+ public java.nio.ByteBuffer getMaybeBinaryColumn() {
+ return maybe_binary_column;
+ }
+
+ /**
+ * Sets the value of the 'maybe_binary_column' field.
+ * @param value the value to set.
+ */
+ public void setMaybeBinaryColumn(java.nio.ByteBuffer value) {
+ this.maybe_binary_column = value;
+ }
+
+ /**
+ * Gets the value of the 'maybe_string_column' field.
+ */
+ public java.lang.String getMaybeStringColumn() {
+ return maybe_string_column;
+ }
+
+ /**
+ * Sets the value of the 'maybe_string_column' field.
+ * @param value the value to set.
+ */
+ public void setMaybeStringColumn(java.lang.String value) {
+ this.maybe_string_column = value;
+ }
+
+ /**
+ * Gets the value of the 'strings_column' field.
+ */
+ public java.util.List<java.lang.String> getStringsColumn() {
+ return strings_column;
+ }
+
+ /**
+ * Sets the value of the 'strings_column' field.
+ * @param value the value to set.
+ */
+ public void setStringsColumn(java.util.List<java.lang.String> value) {
+ this.strings_column = value;
+ }
+
+ /**
+ * Gets the value of the 'string_to_int_column' field.
+ */
+ public java.util.Map<java.lang.String,java.lang.Integer> getStringToIntColumn() {
+ return string_to_int_column;
+ }
+
+ /**
+ * Sets the value of the 'string_to_int_column' field.
+ * @param value the value to set.
+ */
+ public void setStringToIntColumn(java.util.Map<java.lang.String,java.lang.Integer> value) {
+ this.string_to_int_column = value;
+ }
+
+ /**
+ * Gets the value of the 'complex_column' field.
+ */
+ public java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> getComplexColumn() {
+ return complex_column;
+ }
+
+ /**
+ * Sets the value of the 'complex_column' field.
+ * @param value the value to set.
+ */
+ public void setComplexColumn(java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> value) {
+ this.complex_column = value;
+ }
+
+ /** Creates a new ParquetAvroCompat RecordBuilder */
+ public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder() {
+ return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder();
+ }
+
+ /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing Builder */
+ public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder other) {
+ return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder(other);
+ }
+
+ /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing ParquetAvroCompat instance */
+ public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat other) {
+ return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder(other);
+ }
+
+ /**
+ * RecordBuilder for ParquetAvroCompat instances.
+ */
+ public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase<ParquetAvroCompat>
+ implements org.apache.avro.data.RecordBuilder<ParquetAvroCompat> {
+
+ private boolean bool_column;
+ private int int_column;
+ private long long_column;
+ private float float_column;
+ private double double_column;
+ private java.nio.ByteBuffer binary_column;
+ private java.lang.String string_column;
+ private java.lang.Boolean maybe_bool_column;
+ private java.lang.Integer maybe_int_column;
+ private java.lang.Long maybe_long_column;
+ private java.lang.Float maybe_float_column;
+ private java.lang.Double maybe_double_column;
+ private java.nio.ByteBuffer maybe_binary_column;
+ private java.lang.String maybe_string_column;
+ private java.util.List<java.lang.String> strings_column;
+ private java.util.Map<java.lang.String,java.lang.Integer> string_to_int_column;
+ private java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> complex_column;
+
+ /** Creates a new Builder */
+ private Builder() {
+ super(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.SCHEMA$);
+ }
+
+ /** Creates a Builder by copying an existing Builder */
+ private Builder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder other) {
+ super(other);
+ if (isValidValue(fields()[0], other.bool_column)) {
+ this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column);
+ fieldSetFlags()[0] = true;
+ }
+ if (isValidValue(fields()[1], other.int_column)) {
+ this.int_column = data().deepCopy(fields()[1].schema(), other.int_column);
+ fieldSetFlags()[1] = true;
+ }
+ if (isValidValue(fields()[2], other.long_column)) {
+ this.long_column = data().deepCopy(fields()[2].schema(), other.long_column);
+ fieldSetFlags()[2] = true;
+ }
+ if (isValidValue(fields()[3], other.float_column)) {
+ this.float_column = data().deepCopy(fields()[3].schema(), other.float_column);
+ fieldSetFlags()[3] = true;
+ }
+ if (isValidValue(fields()[4], other.double_column)) {
+ this.double_column = data().deepCopy(fields()[4].schema(), other.double_column);
+ fieldSetFlags()[4] = true;
+ }
+ if (isValidValue(fields()[5], other.binary_column)) {
+ this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column);
+ fieldSetFlags()[5] = true;
+ }
+ if (isValidValue(fields()[6], other.string_column)) {
+ this.string_column = data().deepCopy(fields()[6].schema(), other.string_column);
+ fieldSetFlags()[6] = true;
+ }
+ if (isValidValue(fields()[7], other.maybe_bool_column)) {
+ this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column);
+ fieldSetFlags()[7] = true;
+ }
+ if (isValidValue(fields()[8], other.maybe_int_column)) {
+ this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column);
+ fieldSetFlags()[8] = true;
+ }
+ if (isValidValue(fields()[9], other.maybe_long_column)) {
+ this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column);
+ fieldSetFlags()[9] = true;
+ }
+ if (isValidValue(fields()[10], other.maybe_float_column)) {
+ this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column);
+ fieldSetFlags()[10] = true;
+ }
+ if (isValidValue(fields()[11], other.maybe_double_column)) {
+ this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column);
+ fieldSetFlags()[11] = true;
+ }
+ if (isValidValue(fields()[12], other.maybe_binary_column)) {
+ this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column);
+ fieldSetFlags()[12] = true;
+ }
+ if (isValidValue(fields()[13], other.maybe_string_column)) {
+ this.maybe_string_column = data().deepCopy(fields()[13].schema(), other.maybe_string_column);
+ fieldSetFlags()[13] = true;
+ }
+ if (isValidValue(fields()[14], other.strings_column)) {
+ this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column);
+ fieldSetFlags()[14] = true;
+ }
+ if (isValidValue(fields()[15], other.string_to_int_column)) {
+ this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column);
+ fieldSetFlags()[15] = true;
+ }
+ if (isValidValue(fields()[16], other.complex_column)) {
+ this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column);
+ fieldSetFlags()[16] = true;
+ }
+ }
+
+ /** Creates a Builder by copying an existing ParquetAvroCompat instance */
+ private Builder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat other) {
+ super(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.SCHEMA$);
+ if (isValidValue(fields()[0], other.bool_column)) {
+ this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column);
+ fieldSetFlags()[0] = true;
+ }
+ if (isValidValue(fields()[1], other.int_column)) {
+ this.int_column = data().deepCopy(fields()[1].schema(), other.int_column);
+ fieldSetFlags()[1] = true;
+ }
+ if (isValidValue(fields()[2], other.long_column)) {
+ this.long_column = data().deepCopy(fields()[2].schema(), other.long_column);
+ fieldSetFlags()[2] = true;
+ }
+ if (isValidValue(fields()[3], other.float_column)) {
+ this.float_column = data().deepCopy(fields()[3].schema(), other.float_column);
+ fieldSetFlags()[3] = true;
+ }
+ if (isValidValue(fields()[4], other.double_column)) {
+ this.double_column = data().deepCopy(fields()[4].schema(), other.double_column);
+ fieldSetFlags()[4] = true;
+ }
+ if (isValidValue(fields()[5], other.binary_column)) {
+ this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column);
+ fieldSetFlags()[5] = true;
+ }
+ if (isValidValue(fields()[6], other.string_column)) {
+ this.string_column = data().deepCopy(fields()[6].schema(), other.string_column);
+ fieldSetFlags()[6] = true;
+ }
+ if (isValidValue(fields()[7], other.maybe_bool_column)) {
+ this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column);
+ fieldSetFlags()[7] = true;
+ }
+ if (isValidValue(fields()[8], other.maybe_int_column)) {
+ this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column);
+ fieldSetFlags()[8] = true;
+ }
+ if (isValidValue(fields()[9], other.maybe_long_column)) {
+ this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column);
+ fieldSetFlags()[9] = true;
+ }
+ if (isValidValue(fields()[10], other.maybe_float_column)) {
+ this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column);
+ fieldSetFlags()[10] = true;
+ }
+ if (isValidValue(fields()[11], other.maybe_double_column)) {
+ this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column);
+ fieldSetFlags()[11] = true;
+ }
+ if (isValidValue(fields()[12], other.maybe_binary_column)) {
+ this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column);
+ fieldSetFlags()[12] = true;
+ }
+ if (isValidValue(fields()[13], other.maybe_string_column)) {
+ this.maybe_string_column = data().deepCopy(fields()[13].schema(), other.maybe_string_column);
+ fieldSetFlags()[13] = true;
+ }
+ if (isValidValue(fields()[14], other.strings_column)) {
+ this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column);
+ fieldSetFlags()[14] = true;
+ }
+ if (isValidValue(fields()[15], other.string_to_int_column)) {
+ this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column);
+ fieldSetFlags()[15] = true;
+ }
+ if (isValidValue(fields()[16], other.complex_column)) {
+ this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column);
+ fieldSetFlags()[16] = true;
+ }
+ }
+
+ /** Gets the value of the 'bool_column' field */
+ public java.lang.Boolean getBoolColumn() {
+ return bool_column;
+ }
+
+ /** Sets the value of the 'bool_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setBoolColumn(boolean value) {
+ validate(fields()[0], value);
+ this.bool_column = value;
+ fieldSetFlags()[0] = true;
+ return this;
+ }
+
+ /** Checks whether the 'bool_column' field has been set */
+ public boolean hasBoolColumn() {
+ return fieldSetFlags()[0];
+ }
+
+ /** Clears the value of the 'bool_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearBoolColumn() {
+ fieldSetFlags()[0] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'int_column' field */
+ public java.lang.Integer getIntColumn() {
+ return int_column;
+ }
+
+ /** Sets the value of the 'int_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setIntColumn(int value) {
+ validate(fields()[1], value);
+ this.int_column = value;
+ fieldSetFlags()[1] = true;
+ return this;
+ }
+
+ /** Checks whether the 'int_column' field has been set */
+ public boolean hasIntColumn() {
+ return fieldSetFlags()[1];
+ }
+
+ /** Clears the value of the 'int_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearIntColumn() {
+ fieldSetFlags()[1] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'long_column' field */
+ public java.lang.Long getLongColumn() {
+ return long_column;
+ }
+
+ /** Sets the value of the 'long_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setLongColumn(long value) {
+ validate(fields()[2], value);
+ this.long_column = value;
+ fieldSetFlags()[2] = true;
+ return this;
+ }
+
+ /** Checks whether the 'long_column' field has been set */
+ public boolean hasLongColumn() {
+ return fieldSetFlags()[2];
+ }
+
+ /** Clears the value of the 'long_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearLongColumn() {
+ fieldSetFlags()[2] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'float_column' field */
+ public java.lang.Float getFloatColumn() {
+ return float_column;
+ }
+
+ /** Sets the value of the 'float_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setFloatColumn(float value) {
+ validate(fields()[3], value);
+ this.float_column = value;
+ fieldSetFlags()[3] = true;
+ return this;
+ }
+
+ /** Checks whether the 'float_column' field has been set */
+ public boolean hasFloatColumn() {
+ return fieldSetFlags()[3];
+ }
+
+ /** Clears the value of the 'float_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearFloatColumn() {
+ fieldSetFlags()[3] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'double_column' field */
+ public java.lang.Double getDoubleColumn() {
+ return double_column;
+ }
+
+ /** Sets the value of the 'double_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setDoubleColumn(double value) {
+ validate(fields()[4], value);
+ this.double_column = value;
+ fieldSetFlags()[4] = true;
+ return this;
+ }
+
+ /** Checks whether the 'double_column' field has been set */
+ public boolean hasDoubleColumn() {
+ return fieldSetFlags()[4];
+ }
+
+ /** Clears the value of the 'double_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearDoubleColumn() {
+ fieldSetFlags()[4] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'binary_column' field */
+ public java.nio.ByteBuffer getBinaryColumn() {
+ return binary_column;
+ }
+
+ /** Sets the value of the 'binary_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setBinaryColumn(java.nio.ByteBuffer value) {
+ validate(fields()[5], value);
+ this.binary_column = value;
+ fieldSetFlags()[5] = true;
+ return this;
+ }
+
+ /** Checks whether the 'binary_column' field has been set */
+ public boolean hasBinaryColumn() {
+ return fieldSetFlags()[5];
+ }
+
+ /** Clears the value of the 'binary_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearBinaryColumn() {
+ binary_column = null;
+ fieldSetFlags()[5] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'string_column' field */
+ public java.lang.String getStringColumn() {
+ return string_column;
+ }
+
+ /** Sets the value of the 'string_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringColumn(java.lang.String value) {
+ validate(fields()[6], value);
+ this.string_column = value;
+ fieldSetFlags()[6] = true;
+ return this;
+ }
+
+ /** Checks whether the 'string_column' field has been set */
+ public boolean hasStringColumn() {
+ return fieldSetFlags()[6];
+ }
+
+ /** Clears the value of the 'string_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringColumn() {
+ string_column = null;
+ fieldSetFlags()[6] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'maybe_bool_column' field */
+ public java.lang.Boolean getMaybeBoolColumn() {
+ return maybe_bool_column;
+ }
+
+ /** Sets the value of the 'maybe_bool_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBoolColumn(java.lang.Boolean value) {
+ validate(fields()[7], value);
+ this.maybe_bool_column = value;
+ fieldSetFlags()[7] = true;
+ return this;
+ }
+
+ /** Checks whether the 'maybe_bool_column' field has been set */
+ public boolean hasMaybeBoolColumn() {
+ return fieldSetFlags()[7];
+ }
+
+ /** Clears the value of the 'maybe_bool_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBoolColumn() {
+ maybe_bool_column = null;
+ fieldSetFlags()[7] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'maybe_int_column' field */
+ public java.lang.Integer getMaybeIntColumn() {
+ return maybe_int_column;
+ }
+
+ /** Sets the value of the 'maybe_int_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeIntColumn(java.lang.Integer value) {
+ validate(fields()[8], value);
+ this.maybe_int_column = value;
+ fieldSetFlags()[8] = true;
+ return this;
+ }
+
+ /** Checks whether the 'maybe_int_column' field has been set */
+ public boolean hasMaybeIntColumn() {
+ return fieldSetFlags()[8];
+ }
+
+ /** Clears the value of the 'maybe_int_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeIntColumn() {
+ maybe_int_column = null;
+ fieldSetFlags()[8] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'maybe_long_column' field */
+ public java.lang.Long getMaybeLongColumn() {
+ return maybe_long_column;
+ }
+
+ /** Sets the value of the 'maybe_long_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeLongColumn(java.lang.Long value) {
+ validate(fields()[9], value);
+ this.maybe_long_column = value;
+ fieldSetFlags()[9] = true;
+ return this;
+ }
+
+ /** Checks whether the 'maybe_long_column' field has been set */
+ public boolean hasMaybeLongColumn() {
+ return fieldSetFlags()[9];
+ }
+
+ /** Clears the value of the 'maybe_long_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeLongColumn() {
+ maybe_long_column = null;
+ fieldSetFlags()[9] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'maybe_float_column' field */
+ public java.lang.Float getMaybeFloatColumn() {
+ return maybe_float_column;
+ }
+
+ /** Sets the value of the 'maybe_float_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeFloatColumn(java.lang.Float value) {
+ validate(fields()[10], value);
+ this.maybe_float_column = value;
+ fieldSetFlags()[10] = true;
+ return this;
+ }
+
+ /** Checks whether the 'maybe_float_column' field has been set */
+ public boolean hasMaybeFloatColumn() {
+ return fieldSetFlags()[10];
+ }
+
+ /** Clears the value of the 'maybe_float_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeFloatColumn() {
+ maybe_float_column = null;
+ fieldSetFlags()[10] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'maybe_double_column' field */
+ public java.lang.Double getMaybeDoubleColumn() {
+ return maybe_double_column;
+ }
+
+ /** Sets the value of the 'maybe_double_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeDoubleColumn(java.lang.Double value) {
+ validate(fields()[11], value);
+ this.maybe_double_column = value;
+ fieldSetFlags()[11] = true;
+ return this;
+ }
+
+ /** Checks whether the 'maybe_double_column' field has been set */
+ public boolean hasMaybeDoubleColumn() {
+ return fieldSetFlags()[11];
+ }
+
+ /** Clears the value of the 'maybe_double_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeDoubleColumn() {
+ maybe_double_column = null;
+ fieldSetFlags()[11] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'maybe_binary_column' field */
+ public java.nio.ByteBuffer getMaybeBinaryColumn() {
+ return maybe_binary_column;
+ }
+
+ /** Sets the value of the 'maybe_binary_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBinaryColumn(java.nio.ByteBuffer value) {
+ validate(fields()[12], value);
+ this.maybe_binary_column = value;
+ fieldSetFlags()[12] = true;
+ return this;
+ }
+
+ /** Checks whether the 'maybe_binary_column' field has been set */
+ public boolean hasMaybeBinaryColumn() {
+ return fieldSetFlags()[12];
+ }
+
+ /** Clears the value of the 'maybe_binary_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBinaryColumn() {
+ maybe_binary_column = null;
+ fieldSetFlags()[12] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'maybe_string_column' field */
+ public java.lang.String getMaybeStringColumn() {
+ return maybe_string_column;
+ }
+
+ /** Sets the value of the 'maybe_string_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeStringColumn(java.lang.String value) {
+ validate(fields()[13], value);
+ this.maybe_string_column = value;
+ fieldSetFlags()[13] = true;
+ return this;
+ }
+
+ /** Checks whether the 'maybe_string_column' field has been set */
+ public boolean hasMaybeStringColumn() {
+ return fieldSetFlags()[13];
+ }
+
+ /** Clears the value of the 'maybe_string_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeStringColumn() {
+ maybe_string_column = null;
+ fieldSetFlags()[13] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'strings_column' field */
+ public java.util.List<java.lang.String> getStringsColumn() {
+ return strings_column;
+ }
+
+ /** Sets the value of the 'strings_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringsColumn(java.util.List<java.lang.String> value) {
+ validate(fields()[14], value);
+ this.strings_column = value;
+ fieldSetFlags()[14] = true;
+ return this;
+ }
+
+ /** Checks whether the 'strings_column' field has been set */
+ public boolean hasStringsColumn() {
+ return fieldSetFlags()[14];
+ }
+
+ /** Clears the value of the 'strings_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringsColumn() {
+ strings_column = null;
+ fieldSetFlags()[14] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'string_to_int_column' field */
+ public java.util.Map<java.lang.String,java.lang.Integer> getStringToIntColumn() {
+ return string_to_int_column;
+ }
+
+ /** Sets the value of the 'string_to_int_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringToIntColumn(java.util.Map<java.lang.String,java.lang.Integer> value) {
+ validate(fields()[15], value);
+ this.string_to_int_column = value;
+ fieldSetFlags()[15] = true;
+ return this;
+ }
+
+ /** Checks whether the 'string_to_int_column' field has been set */
+ public boolean hasStringToIntColumn() {
+ return fieldSetFlags()[15];
+ }
+
+ /** Clears the value of the 'string_to_int_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringToIntColumn() {
+ string_to_int_column = null;
+ fieldSetFlags()[15] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'complex_column' field */
+ public java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> getComplexColumn() {
+ return complex_column;
+ }
+
+ /** Sets the value of the 'complex_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setComplexColumn(java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> value) {
+ validate(fields()[16], value);
+ this.complex_column = value;
+ fieldSetFlags()[16] = true;
+ return this;
+ }
+
+ /** Checks whether the 'complex_column' field has been set */
+ public boolean hasComplexColumn() {
+ return fieldSetFlags()[16];
+ }
+
+ /** Clears the value of the 'complex_column' field */
+ public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearComplexColumn() {
+ complex_column = null;
+ fieldSetFlags()[16] = false;
+ return this;
+ }
+
+ @Override
+ public ParquetAvroCompat build() {
+ try {
+ ParquetAvroCompat record = new ParquetAvroCompat();
+ record.bool_column = fieldSetFlags()[0] ? this.bool_column : (java.lang.Boolean) defaultValue(fields()[0]);
+ record.int_column = fieldSetFlags()[1] ? this.int_column : (java.lang.Integer) defaultValue(fields()[1]);
+ record.long_column = fieldSetFlags()[2] ? this.long_column : (java.lang.Long) defaultValue(fields()[2]);
+ record.float_column = fieldSetFlags()[3] ? this.float_column : (java.lang.Float) defaultValue(fields()[3]);
+ record.double_column = fieldSetFlags()[4] ? this.double_column : (java.lang.Double) defaultValue(fields()[4]);
+ record.binary_column = fieldSetFlags()[5] ? this.binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]);
+ record.string_column = fieldSetFlags()[6] ? this.string_column : (java.lang.String) defaultValue(fields()[6]);
+ record.maybe_bool_column = fieldSetFlags()[7] ? this.maybe_bool_column : (java.lang.Boolean) defaultValue(fields()[7]);
+ record.maybe_int_column = fieldSetFlags()[8] ? this.maybe_int_column : (java.lang.Integer) defaultValue(fields()[8]);
+ record.maybe_long_column = fieldSetFlags()[9] ? this.maybe_long_column : (java.lang.Long) defaultValue(fields()[9]);
+ record.maybe_float_column = fieldSetFlags()[10] ? this.maybe_float_column : (java.lang.Float) defaultValue(fields()[10]);
+ record.maybe_double_column = fieldSetFlags()[11] ? this.maybe_double_column : (java.lang.Double) defaultValue(fields()[11]);
+ record.maybe_binary_column = fieldSetFlags()[12] ? this.maybe_binary_column : (java.nio.ByteBuffer) defaultValue(fields()[12]);
+ record.maybe_string_column = fieldSetFlags()[13] ? this.maybe_string_column : (java.lang.String) defaultValue(fields()[13]);
+ record.strings_column = fieldSetFlags()[14] ? this.strings_column : (java.util.List<java.lang.String>) defaultValue(fields()[14]);
+ record.string_to_int_column = fieldSetFlags()[15] ? this.string_to_int_column : (java.util.Map<java.lang.String,java.lang.Integer>) defaultValue(fields()[15]);
+ record.complex_column = fieldSetFlags()[16] ? this.complex_column : (java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>>) defaultValue(fields()[16]);
+ return record;
+ } catch (Exception e) {
+ throw new org.apache.avro.AvroRuntimeException(e);
+ }
+ }
+ }
+}
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
new file mode 100644
index 0000000000000..281e60cc3ae34
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
@@ -0,0 +1,541 @@
+/**
+ * Autogenerated by Thrift Compiler (0.9.2)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+package org.apache.spark.sql.parquet.test.thrift;
+
+import org.apache.thrift.scheme.IScheme;
+import org.apache.thrift.scheme.SchemeFactory;
+import org.apache.thrift.scheme.StandardScheme;
+
+import org.apache.thrift.scheme.TupleScheme;
+import org.apache.thrift.protocol.TTupleProtocol;
+import org.apache.thrift.protocol.TProtocolException;
+import org.apache.thrift.EncodingUtils;
+import org.apache.thrift.TException;
+import org.apache.thrift.async.AsyncMethodCallback;
+import org.apache.thrift.server.AbstractNonblockingServer.*;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.EnumMap;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.EnumSet;
+import java.util.Collections;
+import java.util.BitSet;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import javax.annotation.Generated;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@SuppressWarnings({"cast", "rawtypes", "serial", "unchecked"})
+@Generated(value = "Autogenerated by Thrift Compiler (0.9.2)", date = "2015-7-7")
+public class Nested implements org.apache.thrift.TBase<Nested, Nested._Fields>, java.io.Serializable, Cloneable, Comparable<Nested> {
+ private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("Nested");
+
+ private static final org.apache.thrift.protocol.TField NESTED_INTS_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("nestedIntsColumn", org.apache.thrift.protocol.TType.LIST, (short)1);
+ private static final org.apache.thrift.protocol.TField NESTED_STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("nestedStringColumn", org.apache.thrift.protocol.TType.STRING, (short)2);
+
+ private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
+ static {
+ schemes.put(StandardScheme.class, new NestedStandardSchemeFactory());
+ schemes.put(TupleScheme.class, new NestedTupleSchemeFactory());
+ }
+
+ public List<Integer> nestedIntsColumn; // required
+ public String nestedStringColumn; // required
+
+ /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
+ public enum _Fields implements org.apache.thrift.TFieldIdEnum {
+ NESTED_INTS_COLUMN((short)1, "nestedIntsColumn"),
+ NESTED_STRING_COLUMN((short)2, "nestedStringColumn");
+
+ private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
+
+ static {
+ for (_Fields field : EnumSet.allOf(_Fields.class)) {
+ byName.put(field.getFieldName(), field);
+ }
+ }
+
+ /**
+ * Find the _Fields constant that matches fieldId, or null if it's not found.
+ */
+ public static _Fields findByThriftId(int fieldId) {
+ switch(fieldId) {
+ case 1: // NESTED_INTS_COLUMN
+ return NESTED_INTS_COLUMN;
+ case 2: // NESTED_STRING_COLUMN
+ return NESTED_STRING_COLUMN;
+ default:
+ return null;
+ }
+ }
+
+ /**
+ * Find the _Fields constant that matches fieldId, throwing an exception
+ * if it is not found.
+ */
+ public static _Fields findByThriftIdOrThrow(int fieldId) {
+ _Fields fields = findByThriftId(fieldId);
+ if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
+ return fields;
+ }
+
+ /**
+ * Find the _Fields constant that matches name, or null if it's not found.
+ */
+ public static _Fields findByName(String name) {
+ return byName.get(name);
+ }
+
+ private final short _thriftId;
+ private final String _fieldName;
+
+ _Fields(short thriftId, String fieldName) {
+ _thriftId = thriftId;
+ _fieldName = fieldName;
+ }
+
+ public short getThriftFieldId() {
+ return _thriftId;
+ }
+
+ public String getFieldName() {
+ return _fieldName;
+ }
+ }
+
+ // isset id assignments
+ public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
+ static {
+ Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
+ tmpMap.put(_Fields.NESTED_INTS_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("nestedIntsColumn", org.apache.thrift.TFieldRequirementType.REQUIRED,
+ new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST,
+ new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32))));
+ tmpMap.put(_Fields.NESTED_STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("nestedStringColumn", org.apache.thrift.TFieldRequirementType.REQUIRED,
+ new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
+ metaDataMap = Collections.unmodifiableMap(tmpMap);
+ org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(Nested.class, metaDataMap);
+ }
+
+ public Nested() {
+ }
+
+ public Nested(
+ List<Integer> nestedIntsColumn,
+ String nestedStringColumn)
+ {
+ this();
+ this.nestedIntsColumn = nestedIntsColumn;
+ this.nestedStringColumn = nestedStringColumn;
+ }
+
+ /**
+ * Performs a deep copy on other.
+ */
+ public Nested(Nested other) {
+ if (other.isSetNestedIntsColumn()) {
+ List<Integer> __this__nestedIntsColumn = new ArrayList<Integer>(other.nestedIntsColumn);
+ this.nestedIntsColumn = __this__nestedIntsColumn;
+ }
+ if (other.isSetNestedStringColumn()) {
+ this.nestedStringColumn = other.nestedStringColumn;
+ }
+ }
+
+ public Nested deepCopy() {
+ return new Nested(this);
+ }
+
+ @Override
+ public void clear() {
+ this.nestedIntsColumn = null;
+ this.nestedStringColumn = null;
+ }
+
+ public int getNestedIntsColumnSize() {
+ return (this.nestedIntsColumn == null) ? 0 : this.nestedIntsColumn.size();
+ }
+
+ public java.util.Iterator<Integer> getNestedIntsColumnIterator() {
+ return (this.nestedIntsColumn == null) ? null : this.nestedIntsColumn.iterator();
+ }
+
+ public void addToNestedIntsColumn(int elem) {
+ if (this.nestedIntsColumn == null) {
+ this.nestedIntsColumn = new ArrayList<Integer>();
+ }
+ this.nestedIntsColumn.add(elem);
+ }
+
+ public List<Integer> getNestedIntsColumn() {
+ return this.nestedIntsColumn;
+ }
+
+ public Nested setNestedIntsColumn(List<Integer> nestedIntsColumn) {
+ this.nestedIntsColumn = nestedIntsColumn;
+ return this;
+ }
+
+ public void unsetNestedIntsColumn() {
+ this.nestedIntsColumn = null;
+ }
+
+ /** Returns true if field nestedIntsColumn is set (has been assigned a value) and false otherwise */
+ public boolean isSetNestedIntsColumn() {
+ return this.nestedIntsColumn != null;
+ }
+
+ public void setNestedIntsColumnIsSet(boolean value) {
+ if (!value) {
+ this.nestedIntsColumn = null;
+ }
+ }
+
+ public String getNestedStringColumn() {
+ return this.nestedStringColumn;
+ }
+
+ public Nested setNestedStringColumn(String nestedStringColumn) {
+ this.nestedStringColumn = nestedStringColumn;
+ return this;
+ }
+
+ public void unsetNestedStringColumn() {
+ this.nestedStringColumn = null;
+ }
+
+ /** Returns true if field nestedStringColumn is set (has been assigned a value) and false otherwise */
+ public boolean isSetNestedStringColumn() {
+ return this.nestedStringColumn != null;
+ }
+
+ public void setNestedStringColumnIsSet(boolean value) {
+ if (!value) {
+ this.nestedStringColumn = null;
+ }
+ }
+
+ public void setFieldValue(_Fields field, Object value) {
+ switch (field) {
+ case NESTED_INTS_COLUMN:
+ if (value == null) {
+ unsetNestedIntsColumn();
+ } else {
+ setNestedIntsColumn((List<Integer>)value);
+ }
+ break;
+
+ case NESTED_STRING_COLUMN:
+ if (value == null) {
+ unsetNestedStringColumn();
+ } else {
+ setNestedStringColumn((String)value);
+ }
+ break;
+
+ }
+ }
+
+ public Object getFieldValue(_Fields field) {
+ switch (field) {
+ case NESTED_INTS_COLUMN:
+ return getNestedIntsColumn();
+
+ case NESTED_STRING_COLUMN:
+ return getNestedStringColumn();
+
+ }
+ throw new IllegalStateException();
+ }
+
+ /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
+ public boolean isSet(_Fields field) {
+ if (field == null) {
+ throw new IllegalArgumentException();
+ }
+
+ switch (field) {
+ case NESTED_INTS_COLUMN:
+ return isSetNestedIntsColumn();
+ case NESTED_STRING_COLUMN:
+ return isSetNestedStringColumn();
+ }
+ throw new IllegalStateException();
+ }
+
+ @Override
+ public boolean equals(Object that) {
+ if (that == null)
+ return false;
+ if (that instanceof Nested)
+ return this.equals((Nested)that);
+ return false;
+ }
+
+ public boolean equals(Nested that) {
+ if (that == null)
+ return false;
+
+ boolean this_present_nestedIntsColumn = true && this.isSetNestedIntsColumn();
+ boolean that_present_nestedIntsColumn = true && that.isSetNestedIntsColumn();
+ if (this_present_nestedIntsColumn || that_present_nestedIntsColumn) {
+ if (!(this_present_nestedIntsColumn && that_present_nestedIntsColumn))
+ return false;
+ if (!this.nestedIntsColumn.equals(that.nestedIntsColumn))
+ return false;
+ }
+
+ boolean this_present_nestedStringColumn = true && this.isSetNestedStringColumn();
+ boolean that_present_nestedStringColumn = true && that.isSetNestedStringColumn();
+ if (this_present_nestedStringColumn || that_present_nestedStringColumn) {
+ if (!(this_present_nestedStringColumn && that_present_nestedStringColumn))
+ return false;
+ if (!this.nestedStringColumn.equals(that.nestedStringColumn))
+ return false;
+ }
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ List