
[SPARK-24626] [SQL] Improve location size calculation in Analyze Table command #21608

Closed · wants to merge 34 commits
Changes from 5 commits
Commits (34)
7007901
Init changes
arajagopal17 Jun 21, 2018
82d5ef3
Removing logs.
arajagopal17 Jun 21, 2018
9f2fed0
testing new approach
arajagopal17 Jul 5, 2018
3358bbe
Refactoring.
arajagopal17 Jul 5, 2018
1e1db66
Using bulkListLeafFiles.
arajagopal17 Jul 5, 2018
00014c0
Iter changes
arajagopal17 Jul 6, 2018
429a793
working iteration
Achuth17 Jul 7, 2018
9d3054a
remove unnecessary filter.
Achuth17 Jul 7, 2018
9acab38
nit fixes
Achuth17 Jul 7, 2018
efa5fbd
nit fixes
Achuth17 Jul 7, 2018
f9b382d
nit fixes
Achuth17 Jul 7, 2018
06a275b
nit fix
Achuth17 Jul 8, 2018
49b878b
Rename and using case class.
Achuth17 Jul 9, 2018
c3421ed
Fixing incorrect table size issue.
Achuth17 Jul 11, 2018
9a306a5
making bulkListLeafFiles private[sql]
Achuth17 Jul 11, 2018
bd7a8fe
nit
Achuth17 Jul 11, 2018
deb29de
Moving pathfilter to anonymous class
Achuth17 Jul 11, 2018
98ee81b
Added a simple test to verify the accuracy of table size calculation
Achuth17 Jul 12, 2018
107f4c6
Fixing test to verify parallelism.
Achuth17 Jul 19, 2018
27b68d3
Using isDataPath for metadatafile filter.
Achuth17 Jul 21, 2018
5b46593
Merge branch 'master' into improveAnalyze
Achuth17 Jul 24, 2018
4c405c5
Moving isDataPath to DataSourceUtils.scala
Achuth17 Jul 24, 2018
da5eaba
Nit fixes.
Achuth17 Jul 24, 2018
99a4a0f
Fix test name.
Achuth17 Jul 24, 2018
e03e2ee
Gating changes behind a flag, Making changes to migration doc.
Achuth17 Jul 27, 2018
31afcfb
Merging master.
Achuth17 Jul 27, 2018
15ff68d
merge fixes.
Achuth17 Jul 27, 2018
7a2aff4
Setting feature enabled by default.
Achuth17 Jul 28, 2018
e727b69
Small doc fix.
Achuth17 Jul 28, 2018
ae64b9d
Doc changes and fixing flag name.
Achuth17 Aug 1, 2018
2e582f6
Addressing review comments.
Achuth17 Aug 2, 2018
1803f26
Merge branch 'master' into improveAnalyze
Achuth17 Aug 2, 2018
253af70
Fixing merge conflicts.
Achuth17 Aug 3, 2018
70eddc8
Revert PartitionAwareFileIndex changes and add a generic method to Da…
Achuth17 Aug 7, 2018
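For context, the code paths changed in the diff below are exercised by the ANALYZE TABLE command; a minimal way to trigger the size calculation (table name is hypothetical, run from a spark-shell session where `spark` is the active SparkSession):

// NOSCAN skips the row count, so only sizeInBytes is computed,
// which is the value produced by CommandUtils.calculateTotalSize below.
spark.sql("ANALYZE TABLE mydb.my_partitioned_table COMPUTE STATISTICS NOSCAN")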
@@ -47,7 +47,7 @@ case class AnalyzeColumnCommand(
if (tableMeta.tableType == CatalogTableType.VIEW) {
throw new AnalysisException("ANALYZE TABLE is not supported on views.")
}
- val sizeInBytes = CommandUtils.calculateTotalSize(sessionState, tableMeta)
+ val sizeInBytes = CommandUtils.calculateTotalSize(sparkSession, tableMeta)

// Compute stats for each column
val (rowCount, newColStats) = computeColumnStats(sparkSession, tableIdentWithDB, columnNames)
@@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, Literal}
import org.apache.spark.sql.execution.datasources.PartitioningUtils
+ import org.apache.spark.util.SerializableConfiguration

/**
* Analyzes a given set of partitions to generate per-partition statistics, which will be used in
@@ -39,7 +39,7 @@ case class AnalyzeTableCommand(
}

// Compute stats for the whole table
- val newTotalSize = CommandUtils.calculateTotalSize(sessionState, tableMeta)
+ val newTotalSize = CommandUtils.calculateTotalSize(sparkSession, tableMeta)
val newRowCount =
if (noscan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count()))

@@ -21,12 +21,13 @@ import java.net.URI

import scala.util.control.NonFatal

- import org.apache.hadoop.fs.{FileSystem, Path}
+ import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, CatalogTablePartition}
+ import org.apache.spark.sql.execution.datasources.InMemoryFileIndex
import org.apache.spark.sql.internal.SessionState


@@ -38,7 +39,7 @@ object CommandUtils extends Logging {
val catalog = sparkSession.sessionState.catalog
if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
val newTable = catalog.getTableMetadata(table.identifier)
- val newSize = CommandUtils.calculateTotalSize(sparkSession.sessionState, newTable)
+ val newSize = CommandUtils.calculateTotalSize(sparkSession, newTable)
val newStats = CatalogStatistics(sizeInBytes = newSize)
catalog.alterTableStats(table.identifier, Some(newStats))
} else {
@@ -47,15 +48,26 @@
}
}

- def calculateTotalSize(sessionState: SessionState, catalogTable: CatalogTable): BigInt = {
+ def calculateTotalSize(spark: SparkSession, catalogTable: CatalogTable): BigInt = {

+ val sessionState = spark.sessionState
+ val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging")

if (catalogTable.partitionColumnNames.isEmpty) {
- calculateLocationSize(sessionState, catalogTable.identifier, catalogTable.storage.locationUri)
+ calculateLocationSize(sessionState, catalogTable.identifier,
+   catalogTable.storage.locationUri)
} else {
// Calculate table size as a sum of the visible partitions. See SPARK-21079
val partitions = sessionState.catalog.listPartitions(catalogTable.identifier)
- partitions.map { p =>
-   calculateLocationSize(sessionState, catalogTable.identifier, p.storage.locationUri)
- }.sum
+ val paths = partitions.map(x => new Path(x.storage.locationUri.get.getPath))
+ val pathFilter = new PathFilter {
+   override def accept(path: Path): Boolean = {
+     !path.getName.startsWith(stagingDir)
+   }
+ }
+ val fileStatusSeq = InMemoryFileIndex.bulkListLeafFiles(paths,
+   sessionState.newHadoopConf(), pathFilter, spark).flatMap(x => x._2)
Contributor Author


The tests are passing but this line is incorrect.

@gatorsmile @maropu, PathFilter is not serializable; how do we pass a PathFilter in a serializable manner? I checked the code, and one other option is to use FileInputFormat.getInputPathFilter/FileInputFormat.setInputPathFilter, but I couldn't get it to work.

Also, is it okay if we filter the Seq[(Path, Seq[FileStatus])] returned by bulkListLeafFiles and remove stagingDir files?
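For illustration, filtering the result of bulkListLeafFiles after the fact (the second option mentioned above) might look roughly like this; a sketch only, with the helper name invented here:

import org.apache.hadoop.fs.{FileStatus, Path}

// Sketch: sum file sizes from the Seq[(Path, Seq[FileStatus])] returned by
// bulkListLeafFiles, dropping anything under the Hive staging directory.
def sizeExcludingStaging(listed: Seq[(Path, Seq[FileStatus])], stagingDir: String): Long =
  listed
    .flatMap { case (_, statuses) => statuses }
    .filterNot(_.getPath.getName.startsWith(stagingDir))
    .map(_.getLen)
    .sum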

@Achuth17 (Contributor Author) · Jul 6, 2018


The above approach might not work either. In the earlier implementation there was a check that prevented recursively listing files from certain directories (stagingDir), so having a pathFilter might not be the right approach.

So I wanted to introduce a list of strings called filterDir as a new parameter to bulkListLeafFiles, which can be used to check whether a particular directory should be recursed into. Let me know if this approach looks okay.
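A minimal sketch of the directory check such a filterDir parameter would perform (names are hypothetical; the thread below discusses a serializable PathFilter as an alternative):

import org.apache.hadoop.fs.Path

// Sketch: recurse into a child directory only if its name does not start with
// any of the configured prefixes, e.g. Seq(".hive-staging").
def shouldRecurse(dir: Path, filterDir: Seq[String]): Boolean =
  !filterDir.exists(prefix => dir.getName.startsWith(prefix))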

@gatorsmile (Member) · Jul 6, 2018


class PathFilterIgnoreNonData(stagingDir: String) extends PathFilter with Serializable {
  override def accept(path: Path): Boolean = {
    val fileName = path.getName
    (!fileName.startsWith(stagingDir) &&
      // Ignore metadata files starting with "_"
      !fileName.startsWith("_"))
  }
}
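For illustration, the suggested filter could be wired into the bulkListLeafFiles call shown in the diff above roughly like this (a sketch; paths, sessionState, stagingDir and spark are the values already in scope in the new calculateTotalSize):

// Sketch: replace the anonymous PathFilter with the serializable one above.
val pathFilter = new PathFilterIgnoreNonData(stagingDir)
val fileStatusSeq = InMemoryFileIndex.bulkListLeafFiles(
  paths, sessionState.newHadoopConf(), pathFilter, spark).flatMap(_._2)
val totalSize = fileStatusSeq.map(_.getLen).sum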

@Achuth17 (Contributor Author) · Jul 7, 2018


Thank you; I have made the changes. Can you review this?
Also, the bulkListLeafFiles approach seems to improve performance compared to my earlier implementation, and I have updated the results.

+ fileStatusSeq.map(fileStatus => fileStatus.getLen).sum
}
}

@@ -162,7 +162,7 @@ object InMemoryFileIndex extends Logging {
*
* @return for each input path, the set of discovered files for the path
*/
- private def bulkListLeafFiles(
+ def bulkListLeafFiles(
Member


Remove the unnecessary indent. Also, private[sql]?

Contributor Author


Done.

paths: Seq[Path],
hadoopConf: Configuration,
filter: PathFilter,