[SPARK-16552] [SQL] Store the Inferred Schemas into External Catalog Tables when Creating Tables #14207

Closed · wants to merge 15 commits
@@ -223,6 +223,9 @@ abstract class Catalog {
* If this table is cached as an InMemoryRelation, drop the original cached version and make the
* new version cached lazily.
*
* If the table's schema is inferred at runtime, infer the schema again and update the schema
@cloud-fan (Contributor) commented on Jul 19, 2016:

cc @rxin, I'm thinking about what the main reason is for allowing the table schema to be inferred at run time. IIRC, it's mainly because we want to save some typing when creating an external data source table by SQL string, which usually has a very long schema, e.g. for JSON files.

If this is true, then the table schema is not supposed to change. If users do want to change it, I'd argue that it's a different table; users should drop this table and create a new one. Then we don't need to make refresh table support schema changes, and thus we don't need to store the DATASOURCE_SCHEMA_ISINFERRED flag.
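For concreteness, a minimal sketch of the kind of statement being discussed, assuming a SparkSession named spark; the table name, format, and path are hypothetical:

// No schema is typed out; it is inferred from the JSON files under the given path.
spark.sql(
  """
    |CREATE TABLE events
    |USING json
    |OPTIONS (path '/data/events')
  """.stripMargin)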

Contributor commented:

refreshTable shouldn't run schema inference. Only run schema inference when creating the table.

And don't make this a config flag. Just run schema inference when creating the table. For managed tables, store the schema explicitly. Users must explicitly change it.

Member (Author) commented:

@rxin @cloud-fan I see. Will make a change

FYI, this will change the existing external behavior.

Contributor commented:

Yes, unfortunately I found out about this one too late. I will add a note to the 2.0 release notes that this change is coming.

Member (Author) commented:

Thanks!

* in the external catalog.
*
* @since 2.0.0
*/
def refreshTable(tableName: String): Unit
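A minimal usage sketch of this API, assuming a SparkSession named spark and a hypothetical table name:

// Refreshes cached metadata/data for the table; per the doc above, an inferred
// schema is re-inferred and written back to the external catalog.
spark.catalog.refreshTable("events")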
@@ -52,7 +52,7 @@ case class CreateDataSourceTableCommand(
userSpecifiedSchema: Option[StructType],
provider: String,
options: Map[String, String],
partitionColumns: Array[String],
userSpecifiedPartitionColumns: Array[String],
bucketSpec: Option[BucketSpec],
ignoreIfExists: Boolean,
managedIfNoPath: Boolean)
@@ -95,17 +95,39 @@ case class CreateDataSourceTableCommand(
}

// Create the relation to validate the arguments before writing the metadata to the metastore.
DataSource(
sparkSession = sparkSession,
userSpecifiedSchema = userSpecifiedSchema,
className = provider,
bucketSpec = None,
options = optionsWithPath).resolveRelation(checkPathExist = false)
val dataSource: HadoopFsRelation =
DataSource(
sparkSession = sparkSession,
userSpecifiedSchema = userSpecifiedSchema,
className = provider,
bucketSpec = None,
options = optionsWithPath)
.resolveRelation(checkPathExist = false).asInstanceOf[HadoopFsRelation]
@cloud-fan (Contributor) commented on Jul 20, 2016:

Is it safe to cast it to HadoopFsRelation?

@cloud-fan (Contributor) commented on Jul 20, 2016:

I think a safer way is to do a pattern match here: if it's a HadoopFsRelation, get its partition columns; otherwise, there are no partition columns.

Member (Author) commented:

Sure, will do it.
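A sketch of the suggested pattern match, reusing the names from the surrounding diff (the val names here are illustrative):

val relation = DataSource(
  sparkSession = sparkSession,
  userSpecifiedSchema = userSpecifiedSchema,
  className = provider,
  bucketSpec = None,
  options = optionsWithPath).resolveRelation(checkPathExist = false)

// Only file-based relations carry partition columns; any other relation contributes none.
val inferredPartitionColumns: Array[String] = relation match {
  case fs: HadoopFsRelation => fs.partitionSchema.fieldNames
  case _ => Array.empty[String]
}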


if (userSpecifiedSchema.isEmpty && userSpecifiedPartitionColumns.length > 0) {
// The table does not have a specified schema, which means that the schema will be inferred
// when we load the table. So, we are not expecting partition columns and we will discover
// partitions when we load the table. However, if there are specified partition columns,
// we simply ignore them and provide a warning message.
logWarning(
s"Specified partition columns (${userSpecifiedPartitionColumns.mkString(",")}) will be " +
s"ignored. The schema and partition columns of table $tableIdent are inferred. " +
s"Schema: ${dataSource.schema.simpleString}; " +
s"Partition columns: ${dataSource.partitionSchema.fieldNames}")
}

val partitionColumns =
if (userSpecifiedSchema.isEmpty) {
dataSource.partitionSchema.fieldNames
} else {
userSpecifiedPartitionColumns
}

CreateDataSourceTableUtils.createDataSourceTable(
sparkSession = sparkSession,
tableIdent = tableIdent,
userSpecifiedSchema = userSpecifiedSchema,
schema = dataSource.schema,
Contributor commented:

seems we should still use the user-specified schema, right?

Contributor commented:

I think that, from the code, it is not very clear that dataSource.schema will be userSpecifiedSchema.

@gatorsmile (Member, Author) commented on Aug 8, 2016:

Here, dataSource.schema could be the inferred schema. Previously, we did not store the inferred schema; after this PR, we do, and thus we use dataSource.schema.

Actually, after re-checking the code, I found that the schema might be adjusted a little even if users specify the schema. For example, the nullability could be changed.

I think we should make such a change, but maybe we should test and log it?
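To illustrate the nullability point, a minimal sketch with a hypothetical schema (not code from this patch):

import org.apache.spark.sql.types.{LongType, StructField, StructType}

// A user-specified, non-nullable column...
val userSpecified = StructType(Seq(StructField("id", LongType, nullable = false)))
// ...may come back from the resolved relation with its nullability relaxed:
val relaxed = StructType(userSpecified.map(_.copy(nullable = true)))
// userSpecified("id").nullable == false, relaxed("id").nullable == true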

isSchemaInferred = userSpecifiedSchema.isEmpty,
partitionColumns = partitionColumns,
bucketSpec = bucketSpec,
provider = provider,
@@ -256,7 +278,8 @@ case class CreateDataSourceTableAsSelectCommand(
CreateDataSourceTableUtils.createDataSourceTable(
sparkSession = sparkSession,
tableIdent = tableIdent,
userSpecifiedSchema = Some(result.schema),
schema = result.schema,
isSchemaInferred = false,
partitionColumns = partitionColumns,
bucketSpec = bucketSpec,
provider = provider,
@@ -270,7 +293,6 @@ case class CreateDataSourceTableAsSelectCommand(
}
}


object CreateDataSourceTableUtils extends Logging {

val DATASOURCE_PREFIX = "spark.sql.sources."
@@ -279,6 +301,7 @@ object CreateDataSourceTableUtils extends Logging {
val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path"
val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema"
val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "."
val DATASOURCE_SCHEMA_ISINFERRED = DATASOURCE_SCHEMA_PREFIX + "isInferred"
val DATASOURCE_SCHEMA_NUMPARTS = DATASOURCE_SCHEMA_PREFIX + "numParts"
val DATASOURCE_SCHEMA_NUMPARTCOLS = DATASOURCE_SCHEMA_PREFIX + "numPartCols"
val DATASOURCE_SCHEMA_NUMSORTCOLS = DATASOURCE_SCHEMA_PREFIX + "numSortCols"
@@ -303,10 +326,40 @@ object CreateDataSourceTableUtils extends Logging {
matcher.matches()
}

/**
* Saves the schema (including partition info) into the table properties.
* Overwrites the schema if it already exists.
*/
def saveSchema(
sparkSession: SparkSession,
schema: StructType,
partitionColumns: Array[String],
tableProperties: mutable.HashMap[String, String]): Unit = {
// Serialized JSON schema string may be too long to be stored into a single
// metastore SerDe property. In this case, we split the JSON string and store each part as
// a separate table property.
val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold
val schemaJsonString = schema.json
// Split the JSON string.
val parts = schemaJsonString.grouped(threshold).toSeq
tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
parts.zipWithIndex.foreach { case (part, index) =>
tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
}

if (partitionColumns.length > 0) {
tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString)
partitionColumns.zipWithIndex.foreach { case (partCol, index) =>
tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
}
}
}
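For reference, a sketch of the reverse direction under the same property keys (hypothetical helper; the counterpart in the codebase is DDLUtils.getSchemaFromTableProperties):

// Assumes: import org.apache.spark.sql.types.{DataType, StructType}
def readSchemaFromProperties(props: Map[String, String]): Option[StructType] = {
  props.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts =>
    // Reassemble the JSON string from its numbered parts, then parse it back.
    val json = (0 until numParts.toInt)
      .map(i => props(s"$DATASOURCE_SCHEMA_PART_PREFIX$i"))
      .mkString
    DataType.fromJson(json).asInstanceOf[StructType]
  }
}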

def createDataSourceTable(
sparkSession: SparkSession,
tableIdent: TableIdentifier,
userSpecifiedSchema: Option[StructType],
schema: StructType,
isSchemaInferred: Boolean,
partitionColumns: Array[String],
bucketSpec: Option[BucketSpec],
provider: String,
@@ -315,28 +368,10 @@ object CreateDataSourceTableUtils extends Logging {
val tableProperties = new mutable.HashMap[String, String]
tableProperties.put(DATASOURCE_PROVIDER, provider)

// Saves optional user specified schema. Serialized JSON schema string may be too long to be
// stored into a single metastore SerDe property. In this case, we split the JSON string and
// store each part as a separate SerDe property.
userSpecifiedSchema.foreach { schema =>
val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold
val schemaJsonString = schema.json
// Split the JSON string.
val parts = schemaJsonString.grouped(threshold).toSeq
tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
parts.zipWithIndex.foreach { case (part, index) =>
tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
}
}
tableProperties.put(DATASOURCE_SCHEMA_ISINFERRED, isSchemaInferred.toString.toUpperCase)
saveSchema(sparkSession, schema, partitionColumns, tableProperties)

if (userSpecifiedSchema.isDefined && partitionColumns.length > 0) {
tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString)
partitionColumns.zipWithIndex.foreach { case (partCol, index) =>
tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
}
}

if (userSpecifiedSchema.isDefined && bucketSpec.isDefined) {
if (bucketSpec.isDefined) {
val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get

tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString)
Expand All @@ -353,16 +388,6 @@ object CreateDataSourceTableUtils extends Logging {
}
}

if (userSpecifiedSchema.isEmpty && partitionColumns.length > 0) {
// The table does not have a specified schema, which means that the schema will be inferred
// when we load the table. So, we are not expecting partition columns and we will discover
// partitions when we load the table. However, if there are specified partition columns,
// we simply ignore them and provide a warning message.
logWarning(
s"The schema and partitions of table $tableIdent will be inferred when it is loaded. " +
s"Specified partition columns (${partitionColumns.mkString(",")}) will be ignored.")
}

val tableType = if (isExternal) {
tableProperties.put("EXTERNAL", "TRUE")
CatalogTableType.EXTERNAL
@@ -375,7 +400,7 @@ object CreateDataSourceTableUtils extends Logging {
val dataSource =
DataSource(
sparkSession,
userSpecifiedSchema = userSpecifiedSchema,
userSpecifiedSchema = Some(schema),
partitionColumns = partitionColumns,
bucketSpec = bucketSpec,
className = provider,
@@ -487,6 +487,10 @@ object DDLUtils {
isDatasourceTable(table.properties)
}

def isSchemaInferred(table: CatalogTable): Boolean = {
table.properties.get(DATASOURCE_SCHEMA_ISINFERRED) == Option(true.toString.toUpperCase)
}

/**
* If the command ALTER VIEW is to alter a table or ALTER TABLE is to alter a view,
* issue an exception [[AnalysisException]].
@@ -413,15 +413,7 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF
} else {
val metadata = catalog.getTableMetadata(table)

if (DDLUtils.isDatasourceTable(metadata)) {
DDLUtils.getSchemaFromTableProperties(metadata) match {
case Some(userSpecifiedSchema) => describeSchema(userSpecifiedSchema, result)
case None => describeSchema(catalog.lookupRelation(table).schema, result)
}
} else {
describeSchema(metadata.schema, result)
}

describeSchema(metadata, result)
if (isExtended) {
describeExtended(metadata, result)
} else if (isFormatted) {
@@ -518,6 +510,19 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF
}
}

private def describeSchema(
tableDesc: CatalogTable,
buffer: ArrayBuffer[Row]): Unit = {
if (DDLUtils.isDatasourceTable(tableDesc)) {
DDLUtils.getSchemaFromTableProperties(tableDesc) match {
Contributor commented:

Now getSchemaFromTableProperties should never return None?

Member (Author) commented:

For all types of data source tables, we store the schema in the table properties. Thus, we should not return None unless the table properties have been modified by users using the ALTER TABLE command.

Sorry, forgot to update the message.

Member (Author) commented:

Now, the message is changed to "# Schema of this table is corrupted"

Contributor commented:

Can we make DDLUtils.getSchemaFromTableProperties always return a schema and throw an exception if it's corrupted? I think that's more consistent with the previous behaviour, i.e. throwing an exception if the expected schema properties don't exist.

Member (Author) commented:

Sure, will do.
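A sketch of that direction (hypothetical helper name and message; assumes org.apache.spark.sql.AnalysisException is in scope):

def getRequiredSchemaFromTableProperties(metadata: CatalogTable): StructType = {
  getSchemaFromTableProperties(metadata).getOrElse {
    // Fail loudly instead of silently falling back when the stored schema is unreadable.
    throw new AnalysisException(
      s"Schema stored in the table properties of ${metadata.identifier} is corrupted or missing.")
  }
}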

case Some(userSpecifiedSchema) => describeSchema(userSpecifiedSchema, buffer)
case None => append(buffer, "# Schema of this table is inferred at runtime", "", "")
}
} else {
describeSchema(tableDesc.schema, buffer)
}
}

private def describeSchema(schema: Seq[CatalogColumn], buffer: ArrayBuffer[Row]): Unit = {
schema.foreach { column =>
append(buffer, column.name, column.dataType.toLowerCase, column.comment.orNull)
@@ -18,6 +18,7 @@
package org.apache.spark.sql.internal

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.annotation.Experimental
@@ -27,7 +28,8 @@ import org.apache.spark.sql.catalyst.{DefinedByConstructorParams, TableIdentifie
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.execution.datasources.CreateTableUsing
import org.apache.spark.sql.execution.command.{CreateDataSourceTableUtils, DDLUtils}
import org.apache.spark.sql.execution.datasources.{CreateTableUsing, DataSource, HadoopFsRelation}
import org.apache.spark.sql.types.StructType


@@ -350,6 +352,54 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
sparkSession.sharedState.cacheManager.lookupCachedData(qName).nonEmpty
}

/**
* Refresh the inferred schema stored in the external catalog for data source tables.
*/
private def refreshInferredSchema(tableIdent: TableIdentifier): Unit = {
val table = sessionCatalog.getTableMetadataOption(tableIdent)
table.foreach { tableDesc =>
if (DDLUtils.isDatasourceTable(tableDesc) && DDLUtils.isSchemaInferred(tableDesc)) {
val partitionColumns = DDLUtils.getPartitionColumnsFromTableProperties(tableDesc)
val bucketSpec = DDLUtils.getBucketSpecFromTableProperties(tableDesc)
val dataSource =
DataSource(
sparkSession,
userSpecifiedSchema = None,
partitionColumns = partitionColumns,
bucketSpec = bucketSpec,
className = tableDesc.properties(CreateDataSourceTableUtils.DATASOURCE_PROVIDER),
options = tableDesc.storage.serdeProperties)
.resolveRelation().asInstanceOf[HadoopFsRelation]

val schemaProperties = new mutable.HashMap[String, String]
CreateDataSourceTableUtils.saveSchema(
sparkSession, dataSource.schema, dataSource.partitionSchema.fieldNames, schemaProperties)

def isPropertyForInferredSchema(key: String): Boolean = {
key match {
case CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTS => true
case CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTCOLS => true
case _
if key.startsWith(CreateDataSourceTableUtils.DATASOURCE_SCHEMA_PART_PREFIX) ||
key.startsWith(CreateDataSourceTableUtils.DATASOURCE_SCHEMA_PARTCOL_PREFIX)
=> true
case _ => false
}
}

// Keep the properties that are not for schema or partition columns
val tablePropertiesWithoutSchema = tableDesc.properties.filterKeys { k =>
!isPropertyForInferredSchema(k)
}

val newTable = tableDesc.copy(properties = tablePropertiesWithoutSchema ++ schemaProperties)

// Alter the schema-related table properties that are stored in external catalog.
sessionCatalog.alterTable(newTable)
}
}
}

/**
* Refresh the cache entry for a table, if any. For Hive metastore table, the metadata
* is refreshed.
@@ -359,6 +409,13 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
*/
override def refreshTable(tableName: String): Unit = {
val tableIdent = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
// Refresh the schema in external catalog, if it is a data source table whose schema is inferred
// at runtime. For user-specified schema, we do not infer and update the schema.
// TODO: Support column-related ALTER TABLE DDL commands, and then users can update
// the user-specified schema.
refreshInferredSchema(tableIdent)
// Temp tables: refresh (or invalidate) any metadata/data cached in the plan recursively.
// Non-temp tables: refresh the metadata cache.
sessionCatalog.refreshTable(tableIdent)

// If this table is cached as an InMemoryRelation, drop the original