[SPARK-25894][SQL] Add a ColumnarFileFormat type which returns the column count for a given schema #22905

Closed
@@ -306,7 +306,15 @@ case class FileSourceScanExec(
      withOptPartitionCount
    }

-   withSelectedBucketsCount
+   val withOptColumnCount = relation.fileFormat match {
+     case columnar: ColumnarFileFormat =>
+       val sqlConf = relation.sparkSession.sessionState.conf
+       val columnCount = columnar.columnCountForSchema(sqlConf, requiredSchema)
+       withSelectedBucketsCount + ("ColumnCount" -> columnCount.toString)
Member

Can't we get the column count from the requiredSchema metadata?

Contributor Author

You can "guess-timate" the physical column count by counting the leaf fields in the ReadSchema metadata value, but the true answer is an implementation detail of the file format. For example, in the implementation of ColumnarFileFormat for Parquet, we convert the Catalyst schema to the Parquet schema before counting columns. I suppose a similar approach would be required for ORC and other columnar formats.

That being said, this new metadata value isn't really meant to provide new and essential information, per se. Its purpose is to provide easy-to-read, practical information that's useful for quickly validating that schema pruning is working as expected. For example, seeing that a query is reading all 423 columns from a table instead of 15 tells us pretty quickly that schema pruning is not working (unless we really are trying to read the entire table schema). I've found the ReadSchema value to be difficult to read in practice because of its terse syntax, and because its printout is truncated.
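For concreteness, here is a rough sketch of that distinction. It is not part of this PR: the schema below is made up, and the snippet assumes it runs somewhere the (package-private) converter is visible, e.g. inside the Parquet data source package or one of its tests.

import org.apache.spark.sql.execution.datasources.parquet.SparkToParquetSchemaConverter
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._

// A map field is a single Catalyst field, but Parquet stores it as two leaf
// columns (key and value), so counting Catalyst leaves under-counts the
// physical columns here.
val catalystSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("attrs", MapType(StringType, StringType))))

val parquetSchema = new SparkToParquetSchemaConverter(new SQLConf).convert(catalystSchema)
// catalystSchema has 2 leaf fields, but parquetSchema.getPaths.size is 3
// (id, plus the map's key and value columns).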

Contributor

Shall we only include this info when the columnar reader is on?

Member

Is this something we really should include in the metadata? If the purpose of this is to check whether column pruning works or not, logging should be good enough. Adding a trait for it sounds like overkill for the current status. Let's not add an abstraction based only on a rough guess that it can be generalised.

Contributor Author

logging should be good enough

What's your basis for this assertion?

Also, what kind of logging are you suggesting?

Member

Who wants that? If someone wants to put metadata somewhere in the physical plan, let them open a PR and make a case for it

No... I don't think we should add it only because it's been requested once. They look like the same kind of request to me. I will have no argument if this one is added and other people later request adding others. We should make it clear why this one specifically should be added. We're not going to add every piece of information to the metadata just because it's requested.

If the purpose of adding it is to check whether the pushdown is actually working or not, logging sounds appropriate for that purpose.

Member

I mean, I really think it's more appropriate to check whether something works as expected or not by logging.

That's speaking from experience, not conjecture.

I am not discounting your statement. Let's be very clear about why it should be put in the metadata rather than in a log. How and why would it be more useful than logging, and in what cases?

For clarification, the scope of this information is narrower than just checking whether column pruning is working or not, since we already print out the requested columns from the Spark side.

Contributor Author

I'll reiterate a sample use case:

Consider also the case of the beeline user connecting to a multiuser thriftserver. They are pretty far from the log file, whereas running an 'explain' is right there in the terminal.

This also matters to users planning or debugging queries in a Jupyter notebook, as we do at VideoAmp. The level of effort for these users to go to a driver log file is quite high compared to inspecting a query plan.

When you refer to logging, which log are you referring to? When would this information be logged? And at what log level?
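To make that use case concrete (illustrative only, not output from this PR, and the exact plan-string formatting may differ), this is roughly the check such a user would run directly in their session, using the contacts table from the test suite below:

// From beeline: EXPLAIN select name.middle from contacts
// From a notebook or spark-shell:
spark.sql("select name.middle from contacts").explain()
// The FileScan node's metadata would then show something like
//   ReadSchema: struct<name:struct<middle:string>>, ColumnCount: 1
// with no need to locate or grep the driver's log file.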

Member

That basically says logging is useless when using beeline. I don't think this info is super important to (non-advanced) users.

I mean log4j, which is Spark's logging module, and I meant the information you're including in the metadata. Maybe at info level? Or debug level.

logInfo("The number of actual column being pruned is blah blah")
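For illustration only (not part of this PR): one way that logging alternative might look if it were placed in ParquetFileFormat, where the converted read schema is available; sqlConf and requiredSchema here are stand-ins for whatever is actually in scope at that point.

// Hypothetical placement inside the Parquet reader path (not in this PR):
val converter = new SparkToParquetSchemaConverter(sqlConf)
val requestedColumnCount = converter.convert(requiredSchema).getPaths.size
logInfo(s"Reading $requestedColumnCount physical Parquet column(s) for the requested schema")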

Contributor Author

That basically says logging is useless when using beeline. I don't think this info is super important to (non-advanced) users.

My experience says otherwise, and advanced users use beeline and Jupyter, too.

+     case _ => withSelectedBucketsCount
+   }
+
+   withOptColumnCount
  }

  private lazy val inputRDD: RDD[InternalRow] = {
@@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/**
* An optional mix-in for columnar [[FileFormat]]s. This trait provides some helpful metadata when
* debugging a physical query plan.
*/
private[sql] trait ColumnarFileFormat {
  _: FileFormat =>

  /** Returns the number of columns required to satisfy the given schema. */
  def columnCountForSchema(conf: SQLConf, schema: StructType): Int
}
@@ -55,6 +55,7 @@ import org.apache.spark.util.{SerializableConfiguration, ThreadUtils}

class ParquetFileFormat
  extends FileFormat
  with ColumnarFileFormat
  with DataSourceRegister
  with Logging
  with Serializable {
@@ -72,6 +73,12 @@ class ParquetFileFormat

  override def equals(other: Any): Boolean = other.isInstanceOf[ParquetFileFormat]

  override def columnCountForSchema(conf: SQLConf, schema: StructType): Int = {
    val converter = new SparkToParquetSchemaConverter(conf)
    val parquetSchema = converter.convert(schema)
    parquetSchema.getPaths.size
  }

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
@@ -217,6 +217,19 @@ class ParquetSchemaPruningSuite
      Row("Y.") :: Nil)
  }

test("ColumnCount metadata value for pruned query should equal the number of columns read") {
withContacts {
val query = sql("select name.middle from contacts")
val fileSourceScans =
query.queryExecution.executedPlan.collect {
case scan: FileSourceScanExec => scan
}
assert(fileSourceScans.size === 1)
val contactsFileScan = fileSourceScans(0)
assert(contactsFileScan.metadata("ColumnCount") === "1")
}
}

  private def testSchemaPruning(testName: String)(testThunk: => Unit) {
    test(s"Spark vectorized reader - without partition data column - $testName") {
      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {