From dd83c209f1692a2e5afb72fa7a2d039fd1e682c8 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 4 Mar 2016 16:18:15 +0800 Subject: [PATCH 01/29] [SPARK-13603][SQL] support SQL generation for subquery ## What changes were proposed in this pull request? This is support SQL generation for subquery expressions, which will be replaced to a SubqueryHolder inside SQLBuilder recursively. ## How was this patch tested? Added unit tests. Author: Davies Liu Closes #11453 from davies/sql_subquery. --- .../sql/catalyst/expressions/subquery.scala | 2 -- .../apache/spark/sql/hive/SQLBuilder.scala | 21 ++++++++++++++----- .../spark/sql/hive/ExpressionToSQLSuite.scala | 5 +++++ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala index ddf214a4b30ac..968bbdb1a5f03 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala @@ -76,6 +76,4 @@ case class ScalarSubquery( override def withNewPlan(plan: LogicalPlan): ScalarSubquery = ScalarSubquery(plan, exprId) override def toString: String = s"subquery#${exprId.id}" - - // TODO: support sql() } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala index 13a78c609e014..9a14ccff57f83 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala @@ -24,13 +24,22 @@ import scala.util.control.NonFatal import org.apache.spark.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, NonSQLExpression, - SortOrder} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.CollapseProject import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} import org.apache.spark.sql.catalyst.util.quoteIdentifier import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.types.{DataType, NullType} + +/** + * A place holder for generated SQL for subquery expression. + */ +case class SubqueryHolder(query: String) extends LeafExpression with Unevaluable { + override def dataType: DataType = NullType + override def nullable: Boolean = true + override def sql: String = s"($query)" +} /** * A builder class used to convert a resolved logical plan into a SQL query string. 
Note that this @@ -46,7 +55,9 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi def toSQL: String = { val canonicalizedPlan = Canonicalizer.execute(logicalPlan) try { - canonicalizedPlan.transformAllExpressions { + val replaced = canonicalizedPlan.transformAllExpressions { + case e: SubqueryExpression => + SubqueryHolder(new SQLBuilder(e.query, sqlContext).toSQL) case e: NonSQLExpression => throw new UnsupportedOperationException( s"Expression $e doesn't have a SQL representation" @@ -54,14 +65,14 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi case e => e } - val generatedSQL = toSQL(canonicalizedPlan, true) + val generatedSQL = toSQL(replaced, true) logDebug( s"""Built SQL query string successfully from given logical plan: | |# Original logical plan: |${logicalPlan.treeString} |# Canonicalized logical plan: - |${canonicalizedPlan.treeString} + |${replaced.treeString} |# Generated SQL: |$generatedSQL """.stripMargin) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ExpressionToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ExpressionToSQLSuite.scala index d68c602a887f7..72765f05e7e49 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ExpressionToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ExpressionToSQLSuite.scala @@ -268,4 +268,9 @@ class ExpressionToSQLSuite extends SQLBuilderTest with SQLTestUtils { checkSqlGeneration("SELECT input_file_name()") checkSqlGeneration("SELECT monotonically_increasing_id()") } + + test("subquery") { + checkSqlGeneration("SELECT 1 + (SELECT 2)") + checkSqlGeneration("SELECT 1 + (SELECT 2 + (SELECT 3 as a))") + } } From 27e88faa058c1364d0e99fffc0c5cb64ef817bd3 Mon Sep 17 00:00:00 2001 From: Abou Haydar Elias Date: Fri, 4 Mar 2016 10:01:52 +0000 Subject: [PATCH 02/29] =?UTF-8?q?[SPARK-13646][MLLIB]=20QuantileDiscretize?= =?UTF-8?q?r=20counts=20dataset=20twice=20in=20get=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? It avoids counting the dataframe twice. Author: Abou Haydar Elias Author: Elie A Closes #11491 from eliasah/quantile-discretizer-patch. 
--- .../scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index d75b3ef420211..18896fcc4d8c1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -118,7 +118,7 @@ object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] wi require(totalSamples > 0, "QuantileDiscretizer requires non-empty input dataset but was given an empty input.") val requiredSamples = math.max(numBins * numBins, minSamplesRequired) - val fraction = math.min(requiredSamples.toDouble / dataset.count(), 1.0) + val fraction = math.min(requiredSamples.toDouble / totalSamples, 1.0) dataset.sample(withReplacement = false, fraction, new XORShiftRandom(seed).nextInt()).collect() } From c04dc27cedd3d75781fda4c24da16b6ada44d3e4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 4 Mar 2016 10:56:58 +0000 Subject: [PATCH 03/29] [SPARK-13398][STREAMING] Move away from thread pool task support to forkjoin ## What changes were proposed in this pull request? Remove old deprecated ThreadPoolExecutor and replace with ExecutionContext using a ForkJoinPool. The downside of this is that scala's ForkJoinPool doesn't give us a way to specify the thread pool name (and is a wrapper of Java's in 2.12) except by providing a custom factory. Note that we can't use Java's ForkJoinPool directly in Scala 2.11 since it uses a ExecutionContext which reports system parallelism. One other implicit change that happens is the old ExecutionContext would have reported a different default parallelism since it used system parallelism rather than threadpool parallelism (this was likely not intended but also likely not a huge difference). The previous version of this PR attempted to use an execution context constructed on the ThreadPool (but not the deprecated ThreadPoolExecutor class) so as to keep the ability to have human readable named threads but this reported system parallelism. ## How was this patch tested? unit tests: streaming/testOnly org.apache.spark.streaming.util.* Author: Holden Karau Closes #11423 from holdenk/SPARK-13398-move-away-from-ThreadPoolTaskSupport-java-forkjoin. --- .../org/apache/spark/util/ThreadUtils.scala | 18 +++++++++++++++ .../util/FileBasedWriteAheadLog.scala | 23 ++++++++++--------- .../streaming/util/WriteAheadLogSuite.scala | 9 +++++--- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index f9fbe2ff858ce..9abbf4a7a3971 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -20,6 +20,7 @@ package org.apache.spark.util import java.util.concurrent._ import scala.concurrent.{ExecutionContext, ExecutionContextExecutor} +import scala.concurrent.forkjoin.{ForkJoinPool => SForkJoinPool, ForkJoinWorkerThread => SForkJoinWorkerThread} import scala.util.control.NonFatal import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} @@ -156,4 +157,21 @@ private[spark] object ThreadUtils { result } } + + /** + * Construct a new Scala ForkJoinPool with a specified max parallelism and name prefix. 
+ */ + def newForkJoinPool(prefix: String, maxThreadNumber: Int): SForkJoinPool = { + // Custom factory to set thread names + val factory = new SForkJoinPool.ForkJoinWorkerThreadFactory { + override def newThread(pool: SForkJoinPool) = + new SForkJoinWorkerThread(pool) { + setName(prefix + "-" + super.getName) + } + } + new SForkJoinPool(maxThreadNumber, factory, + null, // handler + false // asyncMode + ) + } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala index 314263f26ee60..a3b7e783acd8d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala @@ -18,11 +18,11 @@ package org.apache.spark.streaming.util import java.nio.ByteBuffer import java.util.{Iterator => JIterator} -import java.util.concurrent.{RejectedExecutionException, ThreadPoolExecutor} +import java.util.concurrent.RejectedExecutionException import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer -import scala.collection.parallel.ThreadPoolTaskSupport +import scala.collection.parallel.ExecutionContextTaskSupport import scala.concurrent.{Await, ExecutionContext, Future} import scala.language.postfixOps @@ -62,8 +62,8 @@ private[streaming] class FileBasedWriteAheadLog( private val threadpoolName = { "WriteAheadLogManager" + callerName.map(c => s" for $c").getOrElse("") } - private val threadpool = ThreadUtils.newDaemonCachedThreadPool(threadpoolName, 20) - private val executionContext = ExecutionContext.fromExecutorService(threadpool) + private val forkJoinPool = ThreadUtils.newForkJoinPool(threadpoolName, 20) + private val executionContext = ExecutionContext.fromExecutorService(forkJoinPool) override protected def logName = { getClass.getName.stripSuffix("$") + @@ -144,7 +144,7 @@ private[streaming] class FileBasedWriteAheadLog( } else { // For performance gains, it makes sense to parallelize the recovery if // closeFileAfterWrite = true - seqToParIterator(threadpool, logFilesToRead, readFile).asJava + seqToParIterator(executionContext, logFilesToRead, readFile).asJava } } @@ -283,16 +283,17 @@ private[streaming] object FileBasedWriteAheadLog { /** * This creates an iterator from a parallel collection, by keeping at most `n` objects in memory - * at any given time, where `n` is the size of the thread pool. This is crucial for use cases - * where we create `FileBasedWriteAheadLogReader`s during parallel recovery. We don't want to - * open up `k` streams altogether where `k` is the size of the Seq that we want to parallelize. + * at any given time, where `n` is at most the max of the size of the thread pool or 8. This is + * crucial for use cases where we create `FileBasedWriteAheadLogReader`s during parallel recovery. + * We don't want to open up `k` streams altogether where `k` is the size of the Seq that we want + * to parallelize. 
*/ def seqToParIterator[I, O]( - tpool: ThreadPoolExecutor, + executionContext: ExecutionContext, source: Seq[I], handler: I => Iterator[O]): Iterator[O] = { - val taskSupport = new ThreadPoolTaskSupport(tpool) - val groupSize = tpool.getMaximumPoolSize.max(8) + val taskSupport = new ExecutionContextTaskSupport(executionContext) + val groupSize = taskSupport.parallelismLevel.max(8) source.grouped(groupSize).flatMap { group => val parallelCollection = group.par parallelCollection.tasksupport = taskSupport diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala index 7460e8629b696..8c980dee2cc06 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala @@ -228,7 +228,9 @@ class FileBasedWriteAheadLogSuite the list of files. */ val numThreads = 8 - val tpool = ThreadUtils.newDaemonFixedThreadPool(numThreads, "wal-test-thread-pool") + val fpool = ThreadUtils.newForkJoinPool("wal-test-thread-pool", numThreads) + val executionContext = ExecutionContext.fromExecutorService(fpool) + class GetMaxCounter { private val value = new AtomicInteger() @volatile private var max: Int = 0 @@ -258,7 +260,8 @@ class FileBasedWriteAheadLogSuite val t = new Thread() { override def run() { // run the calculation on a separate thread so that we can release the latch - val iterator = FileBasedWriteAheadLog.seqToParIterator[Int, Int](tpool, testSeq, handle) + val iterator = FileBasedWriteAheadLog.seqToParIterator[Int, Int](executionContext, + testSeq, handle) collected = iterator.toSeq } } @@ -273,7 +276,7 @@ class FileBasedWriteAheadLogSuite // make sure we didn't open too many Iterators assert(counter.getMax() <= numThreads) } finally { - tpool.shutdownNow() + fpool.shutdownNow() } } From 204b02b56afe358b7f2d403fb6e2b9e8a7122798 Mon Sep 17 00:00:00 2001 From: Rajesh Balamohan Date: Fri, 4 Mar 2016 10:59:40 +0000 Subject: [PATCH 04/29] =?UTF-8?q?[SPARK-12925]=20Improve=20HiveInspectors.?= =?UTF-8?q?unwrap=20for=20StringObjectInspector.=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier fix did not copy the bytes and it is possible for higher level to reuse Text object. This was causing issues. Proposed fix now copies the bytes from Text. This still avoids the expensive encoding/decoding Author: Rajesh Balamohan Closes #11477 from rajeshbalamohan/SPARK-12925.2. --- .../scala/org/apache/spark/sql/hive/HiveInspectors.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 3e91569109fc4..589862c7c02ee 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -320,9 +320,10 @@ private[hive] trait HiveInspectors { case hvoi: HiveCharObjectInspector => UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue) case x: StringObjectInspector if x.preferWritable() => - // Text is in UTF-8 already. No need to convert again via fromString + // Text is in UTF-8 already. No need to convert again via fromString. 
Copy bytes val wObj = x.getPrimitiveWritableObject(data) - UTF8String.fromBytes(wObj.getBytes, 0, wObj.getLength) + val result = wObj.copyBytes() + UTF8String.fromBytes(result, 0, result.length) case x: StringObjectInspector => UTF8String.fromString(x.getPrimitiveJavaObject(data)) case x: IntObjectInspector if x.preferWritable() => x.get(data) From e617508244b508b59b4debb35cad3258cddbb9cf Mon Sep 17 00:00:00 2001 From: Masayoshi TSUZUKI Date: Fri, 4 Mar 2016 13:53:53 +0000 Subject: [PATCH 05/29] [SPARK-13673][WINDOWS] Fixed not to pollute environment variables. ## What changes were proposed in this pull request? This patch fixes the problem that `bin\beeline.cmd` pollutes environment variables. The similar problem is reported and fixed in https://issues.apache.org/jira/browse/SPARK-3943, but `bin\beeline.cmd` seems to be added later. ## How was this patch tested? manual tests: I executed the new `bin\beeline.cmd` and confirmed that %SPARK_HOME% doesn't remain in the command prompt. Author: Masayoshi TSUZUKI Closes #11516 from tsudukim/feature/SPARK-13673. --- bin/beeline.cmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/beeline.cmd b/bin/beeline.cmd index 8ddaa419967a5..02464bd088792 100644 --- a/bin/beeline.cmd +++ b/bin/beeline.cmd @@ -17,5 +17,4 @@ rem See the License for the specific language governing permissions and rem limitations under the License. rem -set SPARK_HOME=%~dp0.. -cmd /V /E /C "%SPARK_HOME%\bin\spark-class.cmd" org.apache.hive.beeline.BeeLine %* +cmd /V /E /C "%~dp0spark-class.cmd" org.apache.hive.beeline.BeeLine %* From c8f25459ed4ad6b51a5f11665364cfe0b84f7b3c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 4 Mar 2016 08:25:41 -0800 Subject: [PATCH 06/29] [SPARK-13676] Fix mismatched default values for regParam in LogisticRegression ## What changes were proposed in this pull request? The default value of regularization parameter for `LogisticRegression` algorithm is different in Scala and Python. We should provide the same value. **Scala** ``` scala> new org.apache.spark.ml.classification.LogisticRegression().getRegParam res0: Double = 0.0 ``` **Python** ``` >>> from pyspark.ml.classification import LogisticRegression >>> LogisticRegression().getRegParam() 0.1 ``` ## How was this patch tested? manual. Check the following in `pyspark`. ``` >>> from pyspark.ml.classification import LogisticRegression >>> LogisticRegression().getRegParam() 0.0 ``` Author: Dongjoon Hyun Closes #11519 from dongjoon-hyun/SPARK-13676. 
--- python/pyspark/ml/classification.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 253af15cb5cd9..29d1d203f2a81 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -79,12 +79,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None) If the threshold and thresholds Params are both set, they must be equivalent. @@ -92,7 +92,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(LogisticRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.LogisticRegression", self.uid) - self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5) + self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) self._checkThresholdConsistency() @@ -100,12 +100,12 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @keyword_only @since("1.3.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None) Sets params for logistic regression. From 83302c3bff13bd7734426c81d9c83bf4beb211c9 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 4 Mar 2016 08:32:24 -0800 Subject: [PATCH 07/29] [SPARK-13036][SPARK-13318][SPARK-13319] Add save/load for feature.py Add save/load for feature.py. Meanwhile, add save/load for `ElementwiseProduct` in Scala side and fix a bug of missing `setDefault` in `VectorSlicer` and `StopWordsRemover`. In this PR I ignore the `RFormula` and `RFormulaModel` because its Scala implementation is pending in https://github.com/apache/spark/pull/9884. I'll add them in this PR if https://github.com/apache/spark/pull/9884 gets merged first. Or add a follow-up JIRA for `RFormula`. Author: Xusen Yin Closes #11203 from yinxusen/SPARK-13036. 
--- .../spark/ml/feature/ElementwiseProduct.scala | 13 +- .../ml/feature/ElementwiseProductSuite.scala | 35 ++ python/pyspark/ml/feature.py | 341 +++++++++++++++--- 3 files changed, 341 insertions(+), 48 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala index 1e758cb775de7..2c7ffdb7ba697 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala @@ -17,10 +17,10 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.Param -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.types.DataType @@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DataType */ @Experimental class ElementwiseProduct(override val uid: String) - extends UnaryTransformer[Vector, Vector, ElementwiseProduct] { + extends UnaryTransformer[Vector, Vector, ElementwiseProduct] with DefaultParamsWritable { def this() = this(Identifiable.randomUID("elemProd")) @@ -57,3 +57,10 @@ class ElementwiseProduct(override val uid: String) override protected def outputDataType: DataType = new VectorUDT() } + +@Since("2.0.0") +object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { + + @Since("2.0.0") + override def load(path: String): ElementwiseProduct = super.load(path) +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala new file mode 100644 index 0000000000000..fc1c05de233ea --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class ElementwiseProductSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + test("read/write") { + val ep = new ElementwiseProduct() + .setInputCol("myInputCol") + .setOutputCol("myOutputCol") + .setScalingVec(Vectors.dense(0.1, 0.2)) + testDefaultReadWrite(ep) + } +} diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index fb31c7310c0a8..5025493c42c38 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -22,7 +22,7 @@ from pyspark import since from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.param.shared import * -from pyspark.ml.util import keyword_only +from pyspark.ml.util import keyword_only, MLReadable, MLWritable from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector @@ -58,7 +58,7 @@ @inherit_doc -class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): +class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -73,6 +73,11 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): >>> params = {binarizer.threshold: -0.5, binarizer.outputCol: "vector"} >>> binarizer.transform(df, params).head().vector 1.0 + >>> binarizerPath = temp_path + "/binarizer" + >>> binarizer.save(binarizerPath) + >>> loadedBinarizer = Binarizer.load(binarizerPath) + >>> loadedBinarizer.getThreshold() == binarizer.getThreshold() + True .. versionadded:: 1.4.0 """ @@ -118,7 +123,7 @@ def getThreshold(self): @inherit_doc -class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): +class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -138,6 +143,11 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): 2.0 >>> bucketizer.setParams(outputCol="b").transform(df).head().b 0.0 + >>> bucketizerPath = temp_path + "/bucketizer" + >>> bucketizer.save(bucketizerPath) + >>> loadedBucketizer = Bucketizer.load(bucketizerPath) + >>> loadedBucketizer.getSplits() == bucketizer.getSplits() + True .. versionadded:: 1.3.0 """ @@ -188,7 +198,7 @@ def getSplits(self): @inherit_doc -class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): +class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -207,8 +217,22 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])| +-----+---------------+-------------------------+ ... - >>> sorted(map(str, model.vocabulary)) - ['a', 'b', 'c'] + >>> sorted(model.vocabulary) == ['a', 'b', 'c'] + True + >>> countVectorizerPath = temp_path + "/count-vectorizer" + >>> cv.save(countVectorizerPath) + >>> loadedCv = CountVectorizer.load(countVectorizerPath) + >>> loadedCv.getMinDF() == cv.getMinDF() + True + >>> loadedCv.getMinTF() == cv.getMinTF() + True + >>> loadedCv.getVocabSize() == cv.getVocabSize() + True + >>> modelPath = temp_path + "/count-vectorizer-model" + >>> model.save(modelPath) + >>> loadedModel = CountVectorizerModel.load(modelPath) + >>> loadedModel.vocabulary == model.vocabulary + True .. 
versionadded:: 1.6.0 """ @@ -300,7 +324,7 @@ def _create_model(self, java_model): return CountVectorizerModel(java_model) -class CountVectorizerModel(JavaModel): +class CountVectorizerModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -319,7 +343,7 @@ def vocabulary(self): @inherit_doc -class DCT(JavaTransformer, HasInputCol, HasOutputCol): +class DCT(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -341,6 +365,11 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): >>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2) >>> df3.head().origVec DenseVector([5.0, 8.0, 6.0]) + >>> dctPath = temp_path + "/dct" + >>> dct.save(dctPath) + >>> loadedDtc = DCT.load(dctPath) + >>> loadedDtc.getInverse() + False .. versionadded:: 1.6.0 """ @@ -386,7 +415,7 @@ def getInverse(self): @inherit_doc -class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): +class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -402,6 +431,11 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): DenseVector([2.0, 2.0, 9.0]) >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod DenseVector([4.0, 3.0, 15.0]) + >>> elementwiseProductPath = temp_path + "/elementwise-product" + >>> ep.save(elementwiseProductPath) + >>> loadedEp = ElementwiseProduct.load(elementwiseProductPath) + >>> loadedEp.getScalingVec() == ep.getScalingVec() + True .. versionadded:: 1.5.0 """ @@ -447,7 +481,7 @@ def getScalingVec(self): @inherit_doc -class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures): +class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, MLReadable, MLWritable): """ .. note:: Experimental @@ -463,6 +497,11 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures): >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"} >>> hashingTF.transform(df, params).head().vector SparseVector(5, {2: 1.0, 3: 1.0, 4: 1.0}) + >>> hashingTFPath = temp_path + "/hashing-tf" + >>> hashingTF.save(hashingTFPath) + >>> loadedHashingTF = HashingTF.load(hashingTFPath) + >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures() + True .. versionadded:: 1.3.0 """ @@ -490,7 +529,7 @@ def setParams(self, numFeatures=1 << 18, inputCol=None, outputCol=None): @inherit_doc -class IDF(JavaEstimator, HasInputCol, HasOutputCol): +class IDF(JavaEstimator, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -500,13 +539,24 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): >>> df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),), ... 
(DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"]) >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf") - >>> idf.fit(df).transform(df).head().idf + >>> model = idf.fit(df) + >>> model.transform(df).head().idf DenseVector([0.0, 0.0]) >>> idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs DenseVector([0.0, 0.0]) >>> params = {idf.minDocFreq: 1, idf.outputCol: "vector"} >>> idf.fit(df, params).transform(df).head().vector DenseVector([0.2877, 0.0]) + >>> idfPath = temp_path + "/idf" + >>> idf.save(idfPath) + >>> loadedIdf = IDF.load(idfPath) + >>> loadedIdf.getMinDocFreq() == idf.getMinDocFreq() + True + >>> modelPath = temp_path + "/idf-model" + >>> model.save(modelPath) + >>> loadedModel = IDFModel.load(modelPath) + >>> loadedModel.transform(df).head().idf == model.transform(df).head().idf + True .. versionadded:: 1.4.0 """ @@ -554,7 +604,7 @@ def _create_model(self, java_model): return IDFModel(java_model) -class IDFModel(JavaModel): +class IDFModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -565,7 +615,7 @@ class IDFModel(JavaModel): @inherit_doc -class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol): +class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -585,6 +635,18 @@ class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol): |[2.0]| [1.0]| +-----+------+ ... + >>> scalerPath = temp_path + "/max-abs-scaler" + >>> maScaler.save(scalerPath) + >>> loadedMAScaler = MaxAbsScaler.load(scalerPath) + >>> loadedMAScaler.getInputCol() == maScaler.getInputCol() + True + >>> loadedMAScaler.getOutputCol() == maScaler.getOutputCol() + True + >>> modelPath = temp_path + "/max-abs-scaler-model" + >>> model.save(modelPath) + >>> loadedModel = MaxAbsScalerModel.load(modelPath) + >>> loadedModel.maxAbs == model.maxAbs + True .. versionadded:: 2.0.0 """ @@ -614,7 +676,7 @@ def _create_model(self, java_model): return MaxAbsScalerModel(java_model) -class MaxAbsScalerModel(JavaModel): +class MaxAbsScalerModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -623,9 +685,17 @@ class MaxAbsScalerModel(JavaModel): .. versionadded:: 2.0.0 """ + @property + @since("2.0.0") + def maxAbs(self): + """ + Max Abs vector. + """ + return self._call_java("maxAbs") + @inherit_doc -class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): +class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -656,6 +726,20 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): |[2.0]| [1.0]| +-----+------+ ... + >>> minMaxScalerPath = temp_path + "/min-max-scaler" + >>> mmScaler.save(minMaxScalerPath) + >>> loadedMMScaler = MinMaxScaler.load(minMaxScalerPath) + >>> loadedMMScaler.getMin() == mmScaler.getMin() + True + >>> loadedMMScaler.getMax() == mmScaler.getMax() + True + >>> modelPath = temp_path + "/min-max-scaler-model" + >>> model.save(modelPath) + >>> loadedModel = MinMaxScalerModel.load(modelPath) + >>> loadedModel.originalMin == model.originalMin + True + >>> loadedModel.originalMax == model.originalMax + True .. versionadded:: 1.6.0 """ @@ -718,7 +802,7 @@ def _create_model(self, java_model): return MinMaxScalerModel(java_model) -class MinMaxScalerModel(JavaModel): +class MinMaxScalerModel(JavaModel, MLReadable, MLWritable): """ .. 
note:: Experimental @@ -746,7 +830,7 @@ def originalMax(self): @inherit_doc @ignore_unicode_prefix -class NGram(JavaTransformer, HasInputCol, HasOutputCol): +class NGram(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -775,6 +859,11 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol): Traceback (most recent call last): ... TypeError: Method setParams forces keyword arguments. + >>> ngramPath = temp_path + "/ngram" + >>> ngram.save(ngramPath) + >>> loadedNGram = NGram.load(ngramPath) + >>> loadedNGram.getN() == ngram.getN() + True .. versionadded:: 1.5.0 """ @@ -819,7 +908,7 @@ def getN(self): @inherit_doc -class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): +class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -836,6 +925,11 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): >>> params = {normalizer.p: 1.0, normalizer.inputCol: "dense", normalizer.outputCol: "vector"} >>> normalizer.transform(df, params).head().vector DenseVector([0.4286, -0.5714]) + >>> normalizerPath = temp_path + "/normalizer" + >>> normalizer.save(normalizerPath) + >>> loadedNormalizer = Normalizer.load(normalizerPath) + >>> loadedNormalizer.getP() == normalizer.getP() + True .. versionadded:: 1.4.0 """ @@ -880,7 +974,7 @@ def getP(self): @inherit_doc -class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): +class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -913,6 +1007,11 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): >>> params = {encoder.dropLast: False, encoder.outputCol: "test"} >>> encoder.transform(td, params).head().test SparseVector(3, {0: 1.0}) + >>> onehotEncoderPath = temp_path + "/onehot-encoder" + >>> encoder.save(onehotEncoderPath) + >>> loadedEncoder = OneHotEncoder.load(onehotEncoderPath) + >>> loadedEncoder.getDropLast() == encoder.getDropLast() + True .. versionadded:: 1.4.0 """ @@ -957,7 +1056,7 @@ def getDropLast(self): @inherit_doc -class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): +class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -974,6 +1073,11 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]) >>> px.setParams(outputCol="test").transform(df).head().test DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]) + >>> polyExpansionPath = temp_path + "/poly-expansion" + >>> px.save(polyExpansionPath) + >>> loadedPx = PolynomialExpansion.load(polyExpansionPath) + >>> loadedPx.getDegree() == px.getDegree() + True .. versionadded:: 1.4.0 """ @@ -1019,7 +1123,8 @@ def getDegree(self): @inherit_doc -class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed): +class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, MLReadable, + MLWritable): """ .. note:: Experimental @@ -1043,6 +1148,11 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed): >>> bucketed = bucketizer.transform(df).head() >>> bucketed.buckets 0.0 + >>> quantileDiscretizerPath = temp_path + "/quantile-discretizer" + >>> qds.save(quantileDiscretizerPath) + >>> loadedQds = QuantileDiscretizer.load(quantileDiscretizerPath) + >>> loadedQds.getNumBuckets() == qds.getNumBuckets() + True .. 
versionadded:: 2.0.0 """ @@ -1103,7 +1213,7 @@ def _create_model(self, java_model): @inherit_doc @ignore_unicode_prefix -class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): +class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -1131,6 +1241,13 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): Traceback (most recent call last): ... TypeError: Method setParams forces keyword arguments. + >>> regexTokenizerPath = temp_path + "/regex-tokenizer" + >>> reTokenizer.save(regexTokenizerPath) + >>> loadedReTokenizer = RegexTokenizer.load(regexTokenizerPath) + >>> loadedReTokenizer.getMinTokenLength() == reTokenizer.getMinTokenLength() + True + >>> loadedReTokenizer.getGaps() == reTokenizer.getGaps() + True .. versionadded:: 1.4.0 """ @@ -1228,7 +1345,7 @@ def getToLowercase(self): @inherit_doc -class SQLTransformer(JavaTransformer): +class SQLTransformer(JavaTransformer, MLReadable, MLWritable): """ .. note:: Experimental @@ -1241,6 +1358,11 @@ class SQLTransformer(JavaTransformer): ... statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") >>> sqlTrans.transform(df).head() Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0) + >>> sqlTransformerPath = temp_path + "/sql-transformer" + >>> sqlTrans.save(sqlTransformerPath) + >>> loadedSqlTrans = SQLTransformer.load(sqlTransformerPath) + >>> loadedSqlTrans.getStatement() == sqlTrans.getStatement() + True .. versionadded:: 1.6.0 """ @@ -1284,7 +1406,7 @@ def getStatement(self): @inherit_doc -class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): +class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -1301,6 +1423,20 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): DenseVector([1.4142]) >>> model.transform(df).collect()[1].scaled DenseVector([1.4142]) + >>> standardScalerPath = temp_path + "/standard-scaler" + >>> standardScaler.save(standardScalerPath) + >>> loadedStandardScaler = StandardScaler.load(standardScalerPath) + >>> loadedStandardScaler.getWithMean() == standardScaler.getWithMean() + True + >>> loadedStandardScaler.getWithStd() == standardScaler.getWithStd() + True + >>> modelPath = temp_path + "/standard-scaler-model" + >>> model.save(modelPath) + >>> loadedModel = StandardScalerModel.load(modelPath) + >>> loadedModel.std == model.std + True + >>> loadedModel.mean == model.mean + True .. versionadded:: 1.4.0 """ @@ -1363,7 +1499,7 @@ def _create_model(self, java_model): return StandardScalerModel(java_model) -class StandardScalerModel(JavaModel): +class StandardScalerModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -1390,7 +1526,8 @@ def mean(self): @inherit_doc -class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid): +class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, MLReadable, + MLWritable): """ .. note:: Experimental @@ -1410,6 +1547,21 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid): >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), ... 
key=lambda x: x[0]) [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')] + >>> stringIndexerPath = temp_path + "/string-indexer" + >>> stringIndexer.save(stringIndexerPath) + >>> loadedIndexer = StringIndexer.load(stringIndexerPath) + >>> loadedIndexer.getHandleInvalid() == stringIndexer.getHandleInvalid() + True + >>> modelPath = temp_path + "/string-indexer-model" + >>> model.save(modelPath) + >>> loadedModel = StringIndexerModel.load(modelPath) + >>> loadedModel.labels == model.labels + True + >>> indexToStringPath = temp_path + "/index-to-string" + >>> inverter.save(indexToStringPath) + >>> loadedInverter = IndexToString.load(indexToStringPath) + >>> loadedInverter.getLabels() == inverter.getLabels() + True .. versionadded:: 1.4.0 """ @@ -1439,7 +1591,7 @@ def _create_model(self, java_model): return StringIndexerModel(java_model) -class StringIndexerModel(JavaModel): +class StringIndexerModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -1458,7 +1610,7 @@ def labels(self): @inherit_doc -class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): +class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -1512,13 +1664,25 @@ def getLabels(self): return self.getOrDefault(self.labels) -class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): +class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental A feature transformer that filters out stop words from input. Note: null values from input array are preserved unless adding null to stopWords explicitly. + >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["text"]) + >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"]) + >>> remover.transform(df).head().words == ['a', 'c'] + True + >>> stopWordsRemoverPath = temp_path + "/stopwords-remover" + >>> remover.save(stopWordsRemoverPath) + >>> loadedRemover = StopWordsRemover.load(stopWordsRemoverPath) + >>> loadedRemover.getStopWords() == remover.getStopWords() + True + >>> loadedRemover.getCaseSensitive() == remover.getCaseSensitive() + True + .. versionadded:: 1.6.0 """ @@ -1538,7 +1702,7 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, self.uid) stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords defaultStopWords = stopWordsObj.English() - self._setDefault(stopWords=defaultStopWords) + self._setDefault(stopWords=defaultStopWords, caseSensitive=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1587,7 +1751,7 @@ def getCaseSensitive(self): @inherit_doc @ignore_unicode_prefix -class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): +class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -1611,6 +1775,11 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): Traceback (most recent call last): ... TypeError: Method setParams forces keyword arguments. + >>> tokenizerPath = temp_path + "/tokenizer" + >>> tokenizer.save(tokenizerPath) + >>> loadedTokenizer = Tokenizer.load(tokenizerPath) + >>> loadedTokenizer.transform(df).head().tokens == tokenizer.transform(df).head().tokens + True .. versionadded:: 1.3.0 """ @@ -1637,7 +1806,7 @@ def setParams(self, inputCol=None, outputCol=None): @inherit_doc -class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol): +class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol, MLReadable, MLWritable): """ .. 
note:: Experimental @@ -1652,6 +1821,11 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol): >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"} >>> vecAssembler.transform(df, params).head().vector DenseVector([0.0, 1.0]) + >>> vectorAssemblerPath = temp_path + "/vector-assembler" + >>> vecAssembler.save(vectorAssemblerPath) + >>> loadedAssembler = VectorAssembler.load(vectorAssemblerPath) + >>> loadedAssembler.transform(df).head().freqs == vecAssembler.transform(df).head().freqs + True .. versionadded:: 1.4.0 """ @@ -1678,7 +1852,7 @@ def setParams(self, inputCols=None, outputCol=None): @inherit_doc -class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): +class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -1734,6 +1908,18 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): >>> model2 = indexer.fit(df, params) >>> model2.transform(df).head().vector DenseVector([1.0, 0.0]) + >>> vectorIndexerPath = temp_path + "/vector-indexer" + >>> indexer.save(vectorIndexerPath) + >>> loadedIndexer = VectorIndexer.load(vectorIndexerPath) + >>> loadedIndexer.getMaxCategories() == indexer.getMaxCategories() + True + >>> modelPath = temp_path + "/vector-indexer-model" + >>> model.save(modelPath) + >>> loadedModel = VectorIndexerModel.load(modelPath) + >>> loadedModel.numFeatures == model.numFeatures + True + >>> loadedModel.categoryMaps == model.categoryMaps + True .. versionadded:: 1.4.0 """ @@ -1783,7 +1969,7 @@ def _create_model(self, java_model): return VectorIndexerModel(java_model) -class VectorIndexerModel(JavaModel): +class VectorIndexerModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -1812,7 +1998,7 @@ def categoryMaps(self): @inherit_doc -class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): +class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -1834,6 +2020,13 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4]) >>> vs.transform(df).head().sliced DenseVector([2.3, 1.0]) + >>> vectorSlicerPath = temp_path + "/vector-slicer" + >>> vs.save(vectorSlicerPath) + >>> loadedVs = VectorSlicer.load(vectorSlicerPath) + >>> loadedVs.getIndices() == vs.getIndices() + True + >>> loadedVs.getNames() == vs.getNames() + True .. versionadded:: 1.6.0 """ @@ -1852,6 +2045,7 @@ def __init__(self, inputCol=None, outputCol=None, indices=None, names=None): """ super(VectorSlicer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) + self._setDefault(indices=[], names=[]) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1898,7 +2092,8 @@ def getNames(self): @inherit_doc @ignore_unicode_prefix -class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): +class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol, + MLReadable, MLWritable): """ .. 
note:: Experimental @@ -1907,7 +2102,8 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has >>> sent = ("a b " * 100 + "a c " * 10).split(" ") >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"]) - >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model").fit(doc) + >>> word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model") + >>> model = word2Vec.fit(doc) >>> model.getVectors().show() +----+--------------------+ |word| vector| @@ -1927,6 +2123,22 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has ... >>> model.transform(doc).head().model DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461]) + >>> word2vecPath = temp_path + "/word2vec" + >>> word2Vec.save(word2vecPath) + >>> loadedWord2Vec = Word2Vec.load(word2vecPath) + >>> loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize() + True + >>> loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions() + True + >>> loadedWord2Vec.getMinCount() == word2Vec.getMinCount() + True + >>> modelPath = temp_path + "/word2vec-model" + >>> model.save(modelPath) + >>> loadedModel = Word2VecModel.load(modelPath) + >>> loadedModel.getVectors().first().word == model.getVectors().first().word + True + >>> loadedModel.getVectors().first().vector == model.getVectors().first().vector + True .. versionadded:: 1.4.0 """ @@ -2014,7 +2226,7 @@ def _create_model(self, java_model): return Word2VecModel(java_model) -class Word2VecModel(JavaModel): +class Word2VecModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -2045,7 +2257,7 @@ def findSynonyms(self, word, num): @inherit_doc -class PCA(JavaEstimator, HasInputCol, HasOutputCol): +class PCA(JavaEstimator, HasInputCol, HasOutputCol, MLReadable, MLWritable): """ .. note:: Experimental @@ -2062,6 +2274,18 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol): DenseVector([1.648..., -4.013...]) >>> model.explainedVariance DenseVector([0.794..., 0.205...]) + >>> pcaPath = temp_path + "/pca" + >>> pca.save(pcaPath) + >>> loadedPca = PCA.load(pcaPath) + >>> loadedPca.getK() == pca.getK() + True + >>> modelPath = temp_path + "/pca-model" + >>> model.save(modelPath) + >>> loadedModel = PCAModel.load(modelPath) + >>> loadedModel.pc == model.pc + True + >>> loadedModel.explainedVariance == model.explainedVariance + True .. versionadded:: 1.5.0 """ @@ -2107,7 +2331,7 @@ def _create_model(self, java_model): return PCAModel(java_model) -class PCAModel(JavaModel): +class PCAModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -2226,7 +2450,8 @@ class RFormulaModel(JavaModel): @inherit_doc -class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): +class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, MLReadable, + MLWritable): """ .. note:: Experimental @@ -2245,6 +2470,16 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): DenseVector([1.0]) >>> model.selectedFeatures [3] + >>> chiSqSelectorPath = temp_path + "/chi-sq-selector" + >>> selector.save(chiSqSelectorPath) + >>> loadedSelector = ChiSqSelector.load(chiSqSelectorPath) + >>> loadedSelector.getNumTopFeatures() == selector.getNumTopFeatures() + True + >>> modelPath = temp_path + "/chi-sq-selector-model" + >>> model.save(modelPath) + >>> loadedModel = ChiSqSelectorModel.load(modelPath) + >>> loadedModel.selectedFeatures == model.selectedFeatures + True .. 
versionadded:: 2.0.0 """ @@ -2302,7 +2537,7 @@ def _create_model(self, java_model): return ChiSqSelectorModel(java_model) -class ChiSqSelectorModel(JavaModel): +class ChiSqSelectorModel(JavaModel, MLReadable, MLWritable): """ .. note:: Experimental @@ -2322,9 +2557,16 @@ def selectedFeatures(self): if __name__ == "__main__": import doctest + import tempfile + + import pyspark.ml.feature from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext + globs = globals().copy() + features = pyspark.ml.feature.__dict__.copy() + globs.update(features) + # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext("local[2]", "ml.feature tests") @@ -2335,7 +2577,16 @@ def selectedFeatures(self): Row(id=2, label="c"), Row(id=3, label="a"), Row(id=4, label="a"), Row(id=5, label="c")], 2) globs['stringIndDf'] = sqlContext.createDataFrame(testData) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - sc.stop() + temp_path = tempfile.mkdtemp() + globs['temp_path'] = temp_path + try: + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + sc.stop() + finally: + from shutil import rmtree + try: + rmtree(temp_path) + except OSError: + pass if failure_count: exit(-1) From b7d41474216787e9cd38c04a15c43d5d02f02f93 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Fri, 4 Mar 2016 10:32:00 -0800 Subject: [PATCH 08/29] [SPARK-13633][SQL] Move things into catalyst.parser package ## What changes were proposed in this pull request? This patch simply moves things to existing package `o.a.s.sql.catalyst.parser` in an effort to reduce the size of the diff in #11048. This is conceptually the same as a recently merged patch #11482. ## How was this patch tested? Jenkins. Author: Andrew Or Closes #11506 from andrewor14/parser-package. 
--- .../sql/catalyst/{ => parser}/AbstractSparkSQLParser.scala | 2 +- .../apache/spark/sql/catalyst/{ => parser}/CatalystQl.scala | 6 ++++-- .../sql/catalyst/{util => parser}/DataTypeParser.scala | 3 +-- .../catalyst/{util => parser}/LegacyTypeStringParser.scala | 2 +- .../spark/sql/catalyst/{ => parser}/ParserInterface.scala | 3 ++- .../main/scala/org/apache/spark/sql/types/StructType.scala | 3 ++- .../spark/sql/catalyst/{ => parser}/CatalystQlSuite.scala | 3 ++- .../sql/catalyst/{util => parser}/DataTypeParserSuite.scala | 2 +- sql/core/src/main/scala/org/apache/spark/sql/Column.scala | 3 ++- .../src/main/scala/org/apache/spark/sql/SQLContext.scala | 1 + .../main/scala/org/apache/spark/sql/execution/SparkQl.scala | 4 ++-- .../sql/execution/datasources/parquet/ParquetRelation.scala | 2 +- .../src/main/scala/org/apache/spark/sql/functions.scala | 3 ++- .../scala/org/apache/spark/sql/internal/SessionState.scala | 2 +- .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../scala/org/apache/spark/sql/hive/HiveSessionState.scala | 2 +- 16 files changed, 25 insertions(+), 18 deletions(-) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/{ => parser}/AbstractSparkSQLParser.scala (99%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/{ => parser}/CatalystQl.scala (99%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/{util => parser}/DataTypeParser.scala (97%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/{util => parser}/LegacyTypeStringParser.scala (98%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/{ => parser}/ParserInterface.scala (93%) rename sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/{ => parser}/CatalystQlSuite.scala (98%) rename sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/{util => parser}/DataTypeParserSuite.scala (99%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AbstractSparkSQLParser.scala similarity index 99% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AbstractSparkSQLParser.scala index 38fa5cb585ee7..7b456a6de3cd4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AbstractSparkSQLParser.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst +package org.apache.spark.sql.catalyst.parser import scala.language.implicitConversions import scala.util.parsing.combinator.lexical.StdLexical diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala similarity index 99% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala index a0a56d728cde9..d2318417e3e68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala @@ -14,15 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.catalyst + +package org.apache.spark.sql.catalyst.parser import java.sql.Date import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Count -import org.apache.spark.sql.catalyst.parser._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.CurrentOrigin @@ -30,6 +31,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.random.RandomSampler + /** * This class translates SQL to Catalyst [[LogicalPlan]]s or [[Expression]]s. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala similarity index 97% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala index 515c071c283b0..21deb821071e3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala @@ -15,13 +15,12 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.util +package org.apache.spark.sql.catalyst.parser import scala.language.implicitConversions import scala.util.matching.Regex import scala.util.parsing.combinator.syntactical.StandardTokenParsers -import org.apache.spark.sql.catalyst.SqlLexical import org.apache.spark.sql.types._ /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/LegacyTypeStringParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/LegacyTypeStringParser.scala similarity index 98% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/LegacyTypeStringParser.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/LegacyTypeStringParser.scala index e27cf9c1989f3..60d7361242c69 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/LegacyTypeStringParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/LegacyTypeStringParser.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.util +package org.apache.spark.sql.catalyst.parser import scala.util.parsing.combinator.RegexParsers diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ParserInterface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala similarity index 93% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ParserInterface.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala index 24ec452c4d2ef..7f35d650b9571 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ParserInterface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala @@ -15,8 +15,9 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.catalyst +package org.apache.spark.sql.catalyst.parser +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 271ca95a24126..1238eefcb6062 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -25,7 +25,8 @@ import org.json4s.JsonDSL._ import org.apache.spark.SparkException import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InterpretedOrdering} -import org.apache.spark.sql.catalyst.util.{quoteIdentifier, DataTypeParser, LegacyTypeStringParser} +import org.apache.spark.sql.catalyst.parser.{DataTypeParser, LegacyTypeStringParser} +import org.apache.spark.sql.catalyst.util.quoteIdentifier /** * :: DeveloperApi :: diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala similarity index 98% rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala index 53a8d6e53e38a..0660791c4c939 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala @@ -15,9 +15,10 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst +package org.apache.spark.sql.catalyst.parser import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala similarity index 99% rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala index bebf708965474..7d3608033ba59 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.catalyst.util +package org.apache.spark.sql.catalyst.parser import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 6c7929c362270..0fa81594ee187 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -24,7 +24,8 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.util.{usePrettyExpression, DataTypeParser} +import org.apache.spark.sql.catalyst.parser.DataTypeParser +import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 39dad16e405b3..c742bf2f8923f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.Optimizer +import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkQl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkQl.scala index bc690f6634a56..9143258abbc5b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkQl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkQl.scala @@ -17,9 +17,9 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{AnalysisException, SaveMode} -import org.apache.spark.sql.catalyst.{CatalystQl, TableIdentifier} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.parser.{ASTNode, ParserConf, SimpleParserConf} +import org.apache.spark.sql.catalyst.parser.{ASTNode, CatalystQl, ParserConf, SimpleParserConf} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 7ea098c72bf44..b8af832861a0f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -45,7 +45,7 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.{RDD, SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow -import 
org.apache.spark.sql.catalyst.util.LegacyTypeStringParser +import org.apache.spark.sql.catalyst.parser.LegacyTypeStringParser import org.apache.spark.sql.execution.datasources.{PartitionSpec, _} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index b9873d38a664f..86412c34895aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -22,11 +22,12 @@ import scala.reflect.runtime.universe.{typeTag, TypeTag} import scala.util.Try import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.catalyst.{CatalystQl, ScalaReflection} +import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedFunction} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.catalyst.parser.CatalystQl import org.apache.spark.sql.catalyst.plans.logical.BroadcastHint import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.types._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index f93a405f77fc7..f5f36544a702c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -18,9 +18,9 @@ package org.apache.spark.sql.internal import org.apache.spark.sql.{ContinuousQueryManager, SQLContext, UDFRegistration} -import org.apache.spark.sql.catalyst.ParserInterface import org.apache.spark.sql.catalyst.analysis.{Analyzer, Catalog, FunctionRegistry, SimpleCatalog} import org.apache.spark.sql.catalyst.optimizer.Optimizer +import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.datasources.{PreInsertCastAndRename, ResolveDataSource} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index ee8ec2d9f72b9..a053108b7d7f5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -36,10 +36,10 @@ import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.parser.DataTypeParser import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.catalyst.util.DataTypeParser import org.apache.spark.sql.execution.{datasources, FileRelation} import org.apache.spark.sql.execution.datasources.{Partition => ParquetPartition, _} import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala index 0d4b79f5319a8..8207e78b4aa70 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala @@ -18,8 +18,8 @@ package org.apache.spark.sql.hive import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.ParserInterface import org.apache.spark.sql.catalyst.analysis.{Analyzer, FunctionRegistry, OverrideCatalog} +import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.execution.{python, SparkPlanner} import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.internal.{SessionState, SQLConf} From 5f42c28b119b79c0ea4910c478853d451cd1a967 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 4 Mar 2016 17:04:09 -0600 Subject: [PATCH 09/29] [SPARK-13459][WEB UI] Separate Alive and Dead Executors in Executor Totals Table ## What changes were proposed in this pull request? Now that dead executors are shown in the executors table (#10058) the totals table is updated to include the separate totals for alive and dead executors as well as the current total, as originally discussed in #10668 ## How was this patch tested? Manually verified by running the Standalone Web UI in the latest Safari and Firefox ESR Author: Alex Bozarth Closes #11381 from ajbozarth/spark13459. --- .../apache/spark/ui/exec/ExecutorsPage.scala | 84 ++++++++++--------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index eba7a312ba81f..791dbe5c272b5 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -86,7 +86,7 @@ private[ui] class ExecutorsPage( Failed Tasks Complete Tasks Total Tasks - Task Time (GC Time) + Task Time (GC Time) Input Shuffle Read @@ -109,13 +109,8 @@ private[ui] class ExecutorsPage( val content =
       <div class="row">
         <div class="span12">
-          <h4>Dead Executors({deadExecutorInfo.size})</h4>
-        </div>
-      </div>
-      <div class="row">
-        <div class="span12">
-          <h4>Active Executors({activeExecutorInfo.size})</h4>
-          {execSummary(activeExecutorInfo)}
+          <h4>Summary</h4>
+          {execSummary(activeExecutorInfo, deadExecutorInfo)}
         </div>
       </div>
@@ -198,7 +193,7 @@ private[ui] class ExecutorsPage( } - private def execSummary(execInfo: Seq[ExecutorSummary]): Seq[Node] = { + private def execSummaryRow(execInfo: Seq[ExecutorSummary], rowName: String): Seq[Node] = { val maximumMemory = execInfo.map(_.maxMemory).sum val memoryUsed = execInfo.map(_.memoryUsed).sum val diskUsed = execInfo.map(_.diskUsed).sum @@ -207,37 +202,46 @@ private[ui] class ExecutorsPage( val totalShuffleRead = execInfo.map(_.totalShuffleRead).sum val totalShuffleWrite = execInfo.map(_.totalShuffleWrite).sum - val sumContent = - - {execInfo.map(_.rddBlocks).sum} - - {Utils.bytesToString(memoryUsed)} / - {Utils.bytesToString(maximumMemory)} - - - {Utils.bytesToString(diskUsed)} - - {totalCores} - {taskData(execInfo.map(_.maxTasks).sum, - execInfo.map(_.activeTasks).sum, - execInfo.map(_.failedTasks).sum, - execInfo.map(_.completedTasks).sum, - execInfo.map(_.totalTasks).sum, - execInfo.map(_.totalDuration).sum, - execInfo.map(_.totalGCTime).sum)} - - {Utils.bytesToString(totalInputBytes)} - - - {Utils.bytesToString(totalShuffleRead)} - - - {Utils.bytesToString(totalShuffleWrite)} - - ; + + {rowName}({execInfo.size}) + {execInfo.map(_.rddBlocks).sum} + + {Utils.bytesToString(memoryUsed)} / + {Utils.bytesToString(maximumMemory)} + + + {Utils.bytesToString(diskUsed)} + + {totalCores} + {taskData(execInfo.map(_.maxTasks).sum, + execInfo.map(_.activeTasks).sum, + execInfo.map(_.failedTasks).sum, + execInfo.map(_.completedTasks).sum, + execInfo.map(_.totalTasks).sum, + execInfo.map(_.totalDuration).sum, + execInfo.map(_.totalGCTime).sum)} + + {Utils.bytesToString(totalInputBytes)} + + + {Utils.bytesToString(totalShuffleRead)} + + + {Utils.bytesToString(totalShuffleWrite)} + + + } + + private def execSummary(activeExecInfo: Seq[ExecutorSummary], deadExecInfo: Seq[ExecutorSummary]): + Seq[Node] = { + val totalExecInfo = activeExecInfo ++ deadExecInfo + val activeRow = execSummaryRow(activeExecInfo, "Active"); + val deadRow = execSummaryRow(deadExecInfo, "Dead"); + val totalRow = execSummaryRow(totalExecInfo, "Total"); + @@ -246,7 +250,7 @@ private[ui] class ExecutorsPage( - + - {sumContent} + {activeRow} + {deadRow} + {totalRow}
RDD Blocks Storage Memory Disk Used Failed Tasks Complete Tasks Total Tasks Task Time (GC Time) Task Time (GC Time) Input Shuffle Read @@ -256,7 +260,9 @@ private[ui] class ExecutorsPage(
} From a6e2bd31f52f9e9452e52ab5b846de3dee8b98a7 Mon Sep 17 00:00:00 2001 From: Nong Li Date: Fri, 4 Mar 2016 15:15:48 -0800 Subject: [PATCH 10/29] [SPARK-13255] [SQL] Update vectorized reader to directly return ColumnarBatch instead of InternalRows. ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) Currently, the parquet reader returns rows one by one which is bad for performance. This patch updates the reader to directly return ColumnarBatches. This is only enabled with whole stage codegen, which is the only operator currently that is able to consume ColumnarBatches (instead of rows). The current implementation is a bit of a hack to get this to work and we should do more refactoring of these low level interfaces to make this work better. ## How was this patch tested? ``` Results: TPCDS: Best/Avg Time(ms) Rate(M/s) Per Row(ns) --------------------------------------------------------------------------------- q55 (before) 8897 / 9265 12.9 77.2 q55 5486 / 5753 21.0 47.6 ``` Author: Nong Li Closes #11435 from nongli/spark-13255. --- .../parquet/UnsafeRowParquetRecordReader.java | 29 ++++++-- .../vectorized/ColumnVectorUtils.java | 57 +++++++++++++++ .../execution/vectorized/ColumnarBatch.java | 12 +++ .../vectorized/OnHeapColumnVector.java | 3 - .../spark/sql/execution/ExistingRDD.scala | 67 +++++++++++++++-- .../datasources/DataSourceStrategy.scala | 72 ++++++++++++++++-- .../datasources/SqlNewHadoopRDD.scala | 8 +- .../datasources/parquet/ParquetIOSuite.scala | 8 +- .../parquet/ParquetReadBenchmark.scala | 73 ++++++++++++++----- 9 files changed, 284 insertions(+), 45 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/UnsafeRowParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/UnsafeRowParquetRecordReader.java index 57dbd7c2ff56f..7d768b165f833 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/UnsafeRowParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/UnsafeRowParquetRecordReader.java @@ -37,7 +37,6 @@ import org.apache.parquet.schema.Type; import org.apache.spark.memory.MemoryMode; -import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder; import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter; @@ -57,10 +56,14 @@ * * TODO: handle complex types, decimal requiring more than 8 bytes, INT96. Schema mismatch. * All of these can be handled efficiently and easily with codegen. + * + * This class can either return InternalRows or ColumnarBatches. With whole stage codegen + * enabled, this class returns ColumnarBatches which offers significant performance gains. + * TODO: make this always return ColumnarBatches. */ -public class UnsafeRowParquetRecordReader extends SpecificParquetRecordReaderBase { +public class UnsafeRowParquetRecordReader extends SpecificParquetRecordReaderBase { /** - * Batch of unsafe rows that we assemble and the current index we've returned. Everytime this + * Batch of unsafe rows that we assemble and the current index we've returned. Every time this * batch is used up (batchIdx == numBatched), we populated the batch. 
*/ private UnsafeRow[] rows = new UnsafeRow[64]; @@ -115,11 +118,15 @@ public class UnsafeRowParquetRecordReader extends SpecificParquetRecordReaderBas * code between the path that uses the MR decoders and the vectorized ones. * * TODOs: - * - Implement all the encodings to support vectorized. * - Implement v2 page formats (just make sure we create the correct decoders). */ private ColumnarBatch columnarBatch; + /** + * If true, this class returns batches instead of rows. + */ + private boolean returnColumnarBatch; + /** * The default config on whether columnarBatch should be offheap. */ @@ -169,6 +176,8 @@ public void close() throws IOException { @Override public boolean nextKeyValue() throws IOException, InterruptedException { + if (returnColumnarBatch) return nextBatch(); + if (batchIdx >= numBatched) { if (vectorizedDecode()) { if (!nextBatch()) return false; @@ -181,7 +190,9 @@ public boolean nextKeyValue() throws IOException, InterruptedException { } @Override - public InternalRow getCurrentValue() throws IOException, InterruptedException { + public Object getCurrentValue() throws IOException, InterruptedException { + if (returnColumnarBatch) return columnarBatch; + if (vectorizedDecode()) { return columnarBatch.getRow(batchIdx - 1); } else { @@ -210,6 +221,14 @@ public ColumnarBatch resultBatch(MemoryMode memMode) { return columnarBatch; } + /** + * Can be called before any rows are returned to enable returning columnar batches directly. + */ + public void enableReturningBatches() { + assert(vectorizedDecode()); + returnColumnarBatch = true; + } + /** * Advances to the next batch of rows. Returns false if there are no more. */ diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index 681ace3387139..68f146f7a2622 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -26,9 +26,11 @@ import org.apache.spark.memory.MemoryMode; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.util.DateTimeUtils; import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.types.CalendarInterval; +import org.apache.spark.unsafe.types.UTF8String; /** * Utilities to help manipulate data associate with ColumnVectors. These should be used mostly @@ -36,6 +38,61 @@ * These utilities are mostly used to convert ColumnVectors into other formats. 
*/ public class ColumnVectorUtils { + /** + * Populates the entire `col` with `row[fieldIdx]` + */ + public static void populate(ColumnVector col, InternalRow row, int fieldIdx) { + int capacity = col.capacity; + DataType t = col.dataType(); + + if (row.isNullAt(fieldIdx)) { + col.putNulls(0, capacity); + } else { + if (t == DataTypes.BooleanType) { + col.putBooleans(0, capacity, row.getBoolean(fieldIdx)); + } else if (t == DataTypes.ByteType) { + col.putBytes(0, capacity, row.getByte(fieldIdx)); + } else if (t == DataTypes.ShortType) { + col.putShorts(0, capacity, row.getShort(fieldIdx)); + } else if (t == DataTypes.IntegerType) { + col.putInts(0, capacity, row.getInt(fieldIdx)); + } else if (t == DataTypes.LongType) { + col.putLongs(0, capacity, row.getLong(fieldIdx)); + } else if (t == DataTypes.FloatType) { + col.putFloats(0, capacity, row.getFloat(fieldIdx)); + } else if (t == DataTypes.DoubleType) { + col.putDoubles(0, capacity, row.getDouble(fieldIdx)); + } else if (t == DataTypes.StringType) { + UTF8String v = row.getUTF8String(fieldIdx); + byte[] bytes = v.getBytes(); + for (int i = 0; i < capacity; i++) { + col.putByteArray(i, bytes); + } + } else if (t instanceof DecimalType) { + DecimalType dt = (DecimalType)t; + Decimal d = row.getDecimal(fieldIdx, dt.precision(), dt.scale()); + if (dt.precision() <= Decimal.MAX_INT_DIGITS()) { + col.putInts(0, capacity, (int)d.toUnscaledLong()); + } else if (dt.precision() <= Decimal.MAX_LONG_DIGITS()) { + col.putLongs(0, capacity, d.toUnscaledLong()); + } else { + final BigInteger integer = d.toJavaBigDecimal().unscaledValue(); + byte[] bytes = integer.toByteArray(); + for (int i = 0; i < capacity; i++) { + col.putByteArray(i, bytes, 0, bytes.length); + } + } + } else if (t instanceof CalendarIntervalType) { + CalendarInterval c = (CalendarInterval)row.get(fieldIdx, t); + col.getChildColumn(0).putInts(0, capacity, c.months); + col.getChildColumn(1).putLongs(0, capacity, c.microseconds); + } else if (t instanceof DateType) { + Date date = (Date)row.get(fieldIdx, t); + col.putInts(0, capacity, DateTimeUtils.fromJavaDate(date)); + } + } + } + /** * Returns the array data as the java primitive array. * For example, an array of IntegerType will return an int[]. diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java index 2a780588384ed..18763672c6e84 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java @@ -22,6 +22,7 @@ import org.apache.commons.lang.NotImplementedException; import org.apache.spark.memory.MemoryMode; +import org.apache.spark.sql.Column; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericMutableRow; import org.apache.spark.sql.catalyst.expressions.UnsafeRow; @@ -315,6 +316,17 @@ public int numValidRows() { */ public ColumnVector column(int ordinal) { return columns[ordinal]; } + /** + * Sets (replaces) the column at `ordinal` with column. This can be used to do very efficient + * projections. + */ + public void setColumn(int ordinal, ColumnVector column) { + if (column instanceof OffHeapColumnVector) { + throw new NotImplementedException("Need to ref count columns."); + } + columns[ordinal] = column; + } + /** * Returns the row in this batch at `rowId`. Returned row is reused across calls. 
*/ diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 305e84a86bdc7..03160d1ec36ce 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -62,9 +62,6 @@ public final long nullsNativeAddress() { @Override public final void close() { - nulls = null; - intData = null; - doubleData = null; } // diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index 2cbe3f2c94202..36e656b8b6abf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -139,9 +139,14 @@ private[sql] case class PhysicalRDD( // Support codegen so that we can avoid the UnsafeRow conversion in all cases. Codegen // never requires UnsafeRow as input. override protected def doProduce(ctx: CodegenContext): String = { + val columnarBatchClz = "org.apache.spark.sql.execution.vectorized.ColumnarBatch" val input = ctx.freshName("input") + val idx = ctx.freshName("batchIdx") + val batch = ctx.freshName("batch") // PhysicalRDD always just has one input ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + ctx.addMutableState(columnarBatchClz, batch, s"$batch = null;") + ctx.addMutableState("int", idx, s"$idx = 0;") val exprs = output.zipWithIndex.map(x => new BoundReference(x._2, x._1.dataType, true)) val row = ctx.freshName("row") @@ -149,14 +154,62 @@ private[sql] case class PhysicalRDD( ctx.INPUT_ROW = row ctx.currentVars = null val columns = exprs.map(_.gen(ctx)) + + // The input RDD can either return (all) ColumnarBatches or InternalRows. We determine this + // by looking at the first value of the RDD and then calling the function which will process + // the remaining. It is faster to return batches. + // TODO: The abstractions between this class and SqlNewHadoopRDD makes it difficult to know + // here which path to use. Fix this. 
+ + + val scanBatches = ctx.freshName("processBatches") + ctx.addNewFunction(scanBatches, + s""" + | private void $scanBatches() throws java.io.IOException { + | while (true) { + | int numRows = $batch.numRows(); + | if ($idx == 0) $numOutputRows.add(numRows); + | + | while ($idx < numRows) { + | InternalRow $row = $batch.getRow($idx++); + | ${columns.map(_.code).mkString("\n").trim} + | ${consume(ctx, columns).trim} + | if (shouldStop()) return; + | } + | + | if (!$input.hasNext()) { + | $batch = null; + | break; + | } + | $batch = ($columnarBatchClz)$input.next(); + | $idx = 0; + | } + | }""".stripMargin) + + val scanRows = ctx.freshName("processRows") + ctx.addNewFunction(scanRows, + s""" + | private void $scanRows(InternalRow $row) throws java.io.IOException { + | while (true) { + | $numOutputRows.add(1); + | ${columns.map(_.code).mkString("\n").trim} + | ${consume(ctx, columns).trim} + | if (shouldStop()) return; + | if (!$input.hasNext()) break; + | $row = (InternalRow)$input.next(); + | } + | }""".stripMargin) + s""" - | while ($input.hasNext()) { - | InternalRow $row = (InternalRow) $input.next(); - | $numOutputRows.add(1); - | ${columns.map(_.code).mkString("\n").trim} - | ${consume(ctx, columns).trim} - | if (shouldStop()) { - | return; + | if ($batch != null) { + | $scanBatches(); + | } else if ($input.hasNext()) { + | Object value = $input.next(); + | if (value instanceof $columnarBatchClz) { + | $batch = ($columnarBatchClz)value; + | $scanBatches(); + | } else { + | $scanRows((InternalRow)value); | } | } """.stripMargin diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index ceb35107bf7d8..69a6d23203b93 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -23,8 +23,9 @@ import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{expressions, CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala +import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical @@ -33,8 +34,9 @@ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.execution.PhysicalRDD.{INPUT_PATHS, PUSHED_FILTERS} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.ExecutedCommand +import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, ColumnVectorUtils} import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{SerializableConfiguration, Utils} import org.apache.spark.util.collection.BitSet @@ -220,6 +222,44 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { sparkPlan } + /** + * Creates a ColumnarBatch that contains the values for `requiredColumns`. 
These columns can + * either come from `input` (columns scanned from the data source) or from the partitioning + * values (data from `partitionValues`). This is done *once* per physical partition. When + * the column is from `input`, it just references the same underlying column. When using + * partition columns, the column is populated once. + * TODO: there's probably a cleaner way to do this. + */ + private def projectedColumnBatch( + input: ColumnarBatch, + requiredColumns: Seq[Attribute], + dataColumns: Seq[Attribute], + partitionColumnSchema: StructType, + partitionValues: InternalRow) : ColumnarBatch = { + val result = ColumnarBatch.allocate(StructType.fromAttributes(requiredColumns)) + var resultIdx = 0 + var inputIdx = 0 + + while (resultIdx < requiredColumns.length) { + val attr = requiredColumns(resultIdx) + if (inputIdx < dataColumns.length && requiredColumns(resultIdx) == dataColumns(inputIdx)) { + result.setColumn(resultIdx, input.column(inputIdx)) + inputIdx += 1 + } else { + require(partitionColumnSchema.fields.filter(_.name.equals(attr.name)).length == 1) + var partitionIdx = 0 + partitionColumnSchema.fields.foreach { f => { + if (f.name.equals(attr.name)) { + ColumnVectorUtils.populate(result.column(resultIdx), partitionValues, partitionIdx) + } + partitionIdx += 1 + }} + } + resultIdx += 1 + } + result + } + private def mergeWithPartitionValues( requiredColumns: Seq[Attribute], dataColumns: Seq[Attribute], @@ -239,7 +279,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { } } - val mapPartitionsFunc = (_: TaskContext, _: Int, iterator: Iterator[InternalRow]) => { + val mapPartitionsFunc = (_: TaskContext, _: Int, iterator: Iterator[Object]) => { // Note that we can't use an `UnsafeRowJoiner` to replace the following `JoinedRow` and // `UnsafeProjection`. Because the projection may also adjust column order. val mutableJoinedRow = new JoinedRow() @@ -247,9 +287,27 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { val unsafeProjection = UnsafeProjection.create(requiredColumns, dataColumns ++ partitionColumns) - iterator.map { unsafeDataRow => - unsafeProjection(mutableJoinedRow(unsafeDataRow, unsafePartitionValues)) - } + // If we are returning batches directly, we need to augment them with the partitioning + // columns. We want to do this without a row by row operation. + var columnBatch: ColumnarBatch = null + var mergedBatch: ColumnarBatch = null + + iterator.map { input => { + if (input.isInstanceOf[InternalRow]) { + unsafeProjection(mutableJoinedRow( + input.asInstanceOf[InternalRow], unsafePartitionValues)) + } else { + require(input.isInstanceOf[ColumnarBatch]) + val inputBatch = input.asInstanceOf[ColumnarBatch] + if (inputBatch != mergedBatch) { + mergedBatch = inputBatch + columnBatch = projectedColumnBatch(inputBatch, requiredColumns, + dataColumns, partitionColumnSchema, partitionValues) + } + columnBatch.setNumRows(inputBatch.numRows()) + columnBatch + } + }} } // This is an internal RDD whose call site the user should not be concerned with @@ -257,7 +315,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { // the call site may add up. 
Utils.withDummyCallSite(dataRows.sparkContext) { new MapPartitionsRDD(dataRows, mapPartitionsFunc, preservesPartitioning = false) - } + }.asInstanceOf[RDD[InternalRow]] } else { dataRows } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SqlNewHadoopRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SqlNewHadoopRDD.scala index f4271d165c9bd..c4c7eccab6f69 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SqlNewHadoopRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SqlNewHadoopRDD.scala @@ -102,6 +102,8 @@ private[spark] class SqlNewHadoopRDD[V: ClassTag]( sqlContext.getConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key).toBoolean protected val enableVectorizedParquetReader: Boolean = sqlContext.getConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key).toBoolean + protected val enableWholestageCodegen: Boolean = + sqlContext.getConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key).toBoolean override def getPartitions: Array[SparkPartition] = { val conf = getConf(isDriverSide = true) @@ -179,7 +181,11 @@ private[spark] class SqlNewHadoopRDD[V: ClassTag]( parquetReader.close() } else { reader = parquetReader.asInstanceOf[RecordReader[Void, V]] - if (enableVectorizedParquetReader) parquetReader.resultBatch() + if (enableVectorizedParquetReader) { + parquetReader.resultBatch() + // Whole stage codegen (PhysicalRDD) is able to deal with batches directly + if (enableWholestageCodegen) parquetReader.enableReturningBatches(); + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index c85eeddc2c6d9..cf8a9fdd46fca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -37,7 +37,7 @@ import org.apache.parquet.schema.{MessageType, MessageTypeParser} import org.apache.spark.SparkException import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf @@ -683,7 +683,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { reader.initialize(file, null) val result = mutable.ArrayBuffer.empty[(Int, String)] while (reader.nextKeyValue()) { - val row = reader.getCurrentValue + val row = reader.getCurrentValue.asInstanceOf[InternalRow] val v = (row.getInt(0), row.getString(1)) result += v } @@ -700,7 +700,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { reader.initialize(file, ("_2" :: Nil).asJava) val result = mutable.ArrayBuffer.empty[(String)] while (reader.nextKeyValue()) { - val row = reader.getCurrentValue + val row = reader.getCurrentValue.asInstanceOf[InternalRow] result += row.getString(0) } assert(data.map(_._2) == result) @@ -716,7 +716,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { reader.initialize(file, ("_2" :: "_1" :: Nil).asJava) val result = mutable.ArrayBuffer.empty[(String, Int)] while (reader.nextKeyValue()) { - val row = reader.getCurrentValue + val row = reader.getCurrentValue.asInstanceOf[InternalRow] val v = 
(row.getString(0), row.getInt(1)) result += v } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala index 14dbdf34093e9..38c3618a82ef9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala @@ -22,8 +22,9 @@ import scala.collection.JavaConverters._ import scala.util.Try import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.SQLContext import org.apache.spark.util.{Benchmark, Utils} /** @@ -94,14 +95,14 @@ object ParquetReadBenchmark { val files = SpecificParquetRecordReaderBase.listDirectory(dir).toArray // Driving the parquet reader directly without Spark. - parquetReaderBenchmark.addCase("ParquetReader") { num => + parquetReaderBenchmark.addCase("ParquetReader Non-Vectorized") { num => var sum = 0L files.map(_.asInstanceOf[String]).foreach { p => val reader = new UnsafeRowParquetRecordReader reader.initialize(p, ("id" :: Nil).asJava) while (reader.nextKeyValue()) { - val record = reader.getCurrentValue + val record = reader.getCurrentValue.asInstanceOf[InternalRow] if (!record.isNullAt(0)) sum += record.getInt(0) } reader.close() @@ -109,7 +110,7 @@ object ParquetReadBenchmark { } // Driving the parquet reader in batch mode directly. - parquetReaderBenchmark.addCase("ParquetReader(Batched)") { num => + parquetReaderBenchmark.addCase("ParquetReader Vectorized") { num => var sum = 0L files.map(_.asInstanceOf[String]).foreach { p => val reader = new UnsafeRowParquetRecordReader @@ -132,7 +133,7 @@ object ParquetReadBenchmark { } // Decoding in vectorized but having the reader return rows. 
- parquetReaderBenchmark.addCase("ParquetReader(Batch -> Row)") { num => + parquetReaderBenchmark.addCase("ParquetReader Vectorized -> Row") { num => var sum = 0L files.map(_.asInstanceOf[String]).foreach { p => val reader = new UnsafeRowParquetRecordReader @@ -156,9 +157,9 @@ object ParquetReadBenchmark { Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz SQL Single Int Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------- - SQL Parquet Vectorized 657 / 778 23.9 41.8 1.0X - SQL Parquet MR 1606 / 1731 9.8 102.1 0.4X - SQL Parquet Non-Vectorized 1133 / 1216 13.9 72.1 0.6X + SQL Parquet Vectorized 215 / 262 73.0 13.7 1.0X + SQL Parquet MR 1946 / 2083 8.1 123.7 0.1X + SQL Parquet Non-Vectorized 1079 / 1213 14.6 68.6 0.2X */ sqlBenchmark.run() @@ -166,9 +167,9 @@ object ParquetReadBenchmark { Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz Parquet Reader Single Int Column Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------- - ParquetReader 565 / 609 27.8 35.9 1.0X - ParquetReader(Batched) 165 / 174 95.3 10.5 3.4X - ParquetReader(Batch -> Row) 158 / 188 99.3 10.1 3.6X + ParquetReader Non-Vectorized 610 / 737 25.8 38.8 1.0X + ParquetReader Vectorized 123 / 152 127.8 7.8 5.0X + ParquetReader Vectorized -> Row 165 / 180 95.2 10.5 3.7X */ parquetReaderBenchmark.run() } @@ -209,7 +210,7 @@ object ParquetReadBenchmark { val reader = new UnsafeRowParquetRecordReader reader.initialize(p, null) while (reader.nextKeyValue()) { - val record = reader.getCurrentValue + val record = reader.getCurrentValue.asInstanceOf[InternalRow] if (!record.isNullAt(0)) sum1 += record.getInt(0) if (!record.isNullAt(1)) sum2 += record.getUTF8String(1).numBytes() } @@ -221,10 +222,10 @@ object ParquetReadBenchmark { Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------- - SQL Parquet Vectorized 1025 / 1180 10.2 97.8 1.0X - SQL Parquet MR 2157 / 2222 4.9 205.7 0.5X - SQL Parquet Non-vectorized 1450 / 1466 7.2 138.3 0.7X - ParquetReader Non-vectorized 1005 / 1022 10.4 95.9 1.0X + SQL Parquet Vectorized 628 / 720 16.7 59.9 1.0X + SQL Parquet MR 1905 / 2239 5.5 181.7 0.3X + SQL Parquet Non-vectorized 1429 / 1732 7.3 136.3 0.4X + ParquetReader Non-vectorized 989 / 1357 10.6 94.3 0.6X */ benchmark.run() } @@ -255,17 +256,53 @@ object ParquetReadBenchmark { Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz String Dictionary: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------- - SQL Parquet Vectorized 578 / 593 18.1 55.1 1.0X - SQL Parquet MR 1021 / 1032 10.3 97.4 0.6X + SQL Parquet Vectorized 329 / 337 31.9 31.4 1.0X + SQL Parquet MR 1131 / 1325 9.3 107.8 0.3X */ benchmark.run() } } } + def partitionTableScanBenchmark(values: Int): Unit = { + withTempPath { dir => + withTempTable("t1", "tempTable") { + sqlContext.range(values).registerTempTable("t1") + sqlContext.sql("select id % 2 as p, cast(id as INT) as id from t1") + .write.partitionBy("p").parquet(dir.getCanonicalPath) + sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable") + + val benchmark = new Benchmark("Partitioned Table", values) + + benchmark.addCase("Read data column") { iter => + sqlContext.sql("select sum(id) from tempTable").collect + } + + 
benchmark.addCase("Read partition column") { iter => + sqlContext.sql("select sum(p) from tempTable").collect + } + + benchmark.addCase("Read both columns") { iter => + sqlContext.sql("select sum(p), sum(id) from tempTable").collect + } + + /* + Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz + Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------- + Read data column 191 / 250 82.1 12.2 1.0X + Read partition column 82 / 86 192.4 5.2 2.3X + Read both columns 220 / 248 71.5 14.0 0.9X + */ + benchmark.run() + } + } + } + def main(args: Array[String]): Unit = { intScanBenchmark(1024 * 1024 * 15) intStringScanBenchmark(1024 * 1024 * 10) stringDictionaryScanBenchmark(1024 * 1024 * 10) + partitionTableScanBenchmark(1024 * 1024 * 15) } } From f19228eed89cf8e22a07a7ef7f37a5f6f8a3d455 Mon Sep 17 00:00:00 2001 From: Jason White Date: Fri, 4 Mar 2016 16:04:56 -0800 Subject: [PATCH 11/29] =?UTF-8?q?[SPARK-12073][STREAMING]=20backpressure?= =?UTF-8?q?=20rate=20controller=20consumes=20events=20preferentially=20fro?= =?UTF-8?q?m=20lagg=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ing partitions I'm pretty sure this is the reason we couldn't easily recover from an unbalanced Kafka partition under heavy load when using backpressure. `maxMessagesPerPartition` calculates an appropriate limit for the message rate from all partitions, and then divides by the number of partitions to determine how many messages to retrieve per partition. The problem with this approach is that when one partition is behind by millions of records (due to random Kafka issues), but the rate estimator calculates only 100k total messages can be retrieved, each partition (out of say 32) only retrieves max 100k/32=3125 messages. This PR (still needing a test) determines a per-partition desired message count by using the current lag for each partition to preferentially weight the total message limit among the partitions. In this situation, if each partition gets 1k messages, but 1 partition starts 1M behind, then the total number of messages to retrieve is (32 * 1k + 1M) = 1032000 messages, of which the one partition needs 1001000. So, it gets (1001000 / 1032000) = 97% of the 100k messages, and the other 31 partitions share the remaining 3%. Assuming all of 100k the messages are retrieved and processed within the batch window, the rate calculator will increase the number of messages to retrieve in the next batch, until it reaches a new stable point or the backlog is finished processed. We're going to try deploying this internally at Shopify to see if this resolves our issue. tdas koeninger holdenk Author: Jason White Closes #10089 from JasonMWhite/rate_controller_offsets. 
--- .../kafka/DirectKafkaInputDStream.scala | 44 +++++++----- .../streaming/kafka/KafkaTestUtils.scala | 9 ++- .../kafka/JavaDirectKafkaStreamSuite.java | 2 +- .../streaming/kafka/JavaKafkaRDDSuite.java | 2 +- .../streaming/kafka/JavaKafkaStreamSuite.java | 2 +- .../kafka/DirectKafkaStreamSuite.scala | 68 ++++++++++++++++--- project/MimaExcludes.scala | 4 ++ 7 files changed, 101 insertions(+), 30 deletions(-) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 54d8c8b03f206..0eaaf408c0112 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -89,23 +89,32 @@ class DirectKafkaInputDStream[ private val maxRateLimitPerPartition: Int = context.sparkContext.getConf.getInt( "spark.streaming.kafka.maxRatePerPartition", 0) - protected def maxMessagesPerPartition: Option[Long] = { + + protected[streaming] def maxMessagesPerPartition( + offsets: Map[TopicAndPartition, Long]): Option[Map[TopicAndPartition, Long]] = { val estimatedRateLimit = rateController.map(_.getLatestRate().toInt) - val numPartitions = currentOffsets.keys.size - - val effectiveRateLimitPerPartition = estimatedRateLimit - .filter(_ > 0) - .map { limit => - if (maxRateLimitPerPartition > 0) { - Math.min(maxRateLimitPerPartition, (limit / numPartitions)) - } else { - limit / numPartitions + + // calculate a per-partition rate limit based on current lag + val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match { + case Some(rate) => + val lagPerPartition = offsets.map { case (tp, offset) => + tp -> Math.max(offset - currentOffsets(tp), 0) + } + val totalLag = lagPerPartition.values.sum + + lagPerPartition.map { case (tp, lag) => + val backpressureRate = Math.round(lag / totalLag.toFloat * rate) + tp -> (if (maxRateLimitPerPartition > 0) { + Math.min(backpressureRate, maxRateLimitPerPartition)} else backpressureRate) } - }.getOrElse(maxRateLimitPerPartition) + case None => offsets.map { case (tp, offset) => tp -> maxRateLimitPerPartition } + } - if (effectiveRateLimitPerPartition > 0) { + if (effectiveRateLimitPerPartition.values.sum > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000 - Some((secsPerBatch * effectiveRateLimitPerPartition).toLong) + Some(effectiveRateLimitPerPartition.map { + case (tp, limit) => tp -> (secsPerBatch * limit).toLong + }) } else { None } @@ -134,9 +143,12 @@ class DirectKafkaInputDStream[ // limits the maximum number of messages per partition protected def clamp( leaderOffsets: Map[TopicAndPartition, LeaderOffset]): Map[TopicAndPartition, LeaderOffset] = { - maxMessagesPerPartition.map { mmp => - leaderOffsets.map { case (tp, lo) => - tp -> lo.copy(offset = Math.min(currentOffsets(tp) + mmp, lo.offset)) + val offsets = leaderOffsets.mapValues(lo => lo.offset) + + maxMessagesPerPartition(offsets).map { mmp => + mmp.map { case (tp, messages) => + val lo = leaderOffsets(tp) + tp -> lo.copy(offset = Math.min(currentOffsets(tp) + messages, lo.offset)) } }.getOrElse(leaderOffsets) } diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala index a76fa6671a4b0..a5ea1d6d2848d 100644 --- 
a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala @@ -152,12 +152,15 @@ private[kafka] class KafkaTestUtils extends Logging { } /** Create a Kafka topic and wait until it is propagated to the whole cluster */ - def createTopic(topic: String): Unit = { - AdminUtils.createTopic(zkClient, topic, 1, 1) + def createTopic(topic: String, partitions: Int): Unit = { + AdminUtils.createTopic(zkClient, topic, partitions, 1) // wait until metadata is propagated - waitUntilMetadataIsPropagated(topic, 0) + (0 until partitions).foreach { p => waitUntilMetadataIsPropagated(topic, p) } } + /** Single-argument version for backwards compatibility */ + def createTopic(topic: String): Unit = createTopic(topic, 1) + /** Java-friendly function for sending messages to the Kafka broker */ def sendMessages(topic: String, messageToFreq: JMap[String, JInt]): Unit = { sendMessages(topic, Map(messageToFreq.asScala.mapValues(_.intValue()).toSeq: _*)) diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java index 4891e4f4a17bc..fa6b0dbc8c219 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java @@ -168,7 +168,7 @@ private static Map topicOffsetToMap(String topic, Long private String[] createTopicAndSendData(String topic) { String[] data = { topic + "-1", topic + "-2", topic + "-3"}; - kafkaTestUtils.createTopic(topic); + kafkaTestUtils.createTopic(topic, 1); kafkaTestUtils.sendMessages(topic, data); return data; } diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java index afcc6cfccd39a..c41b6297b0481 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java @@ -149,7 +149,7 @@ public String call(MessageAndMetadata msgAndMd) { private String[] createTopicAndSendData(String topic) { String[] data = { topic + "-1", topic + "-2", topic + "-3"}; - kafkaTestUtils.createTopic(topic); + kafkaTestUtils.createTopic(topic, 1); kafkaTestUtils.sendMessages(topic, data); return data; } diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java index 617c92a008fc5..868df64e8c944 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java @@ -76,7 +76,7 @@ public void testKafkaStream() throws InterruptedException { sent.put("b", 3); sent.put("c", 10); - kafkaTestUtils.createTopic(topic); + kafkaTestUtils.createTopic(topic, 1); kafkaTestUtils.sendMessages(topic, sent); Map kafkaParams = new HashMap<>(); diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala index 8398178e9b79b..b2c81d1534ee6 100644 --- 
a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -353,10 +353,38 @@ class DirectKafkaStreamSuite ssc.stop() } + test("maxMessagesPerPartition with backpressure disabled") { + val topic = "maxMessagesPerPartition" + val kafkaStream = getDirectKafkaStream(topic, None) + + val input = Map(TopicAndPartition(topic, 0) -> 50L, TopicAndPartition(topic, 1) -> 50L) + assert(kafkaStream.maxMessagesPerPartition(input).get == + Map(TopicAndPartition(topic, 0) -> 10L, TopicAndPartition(topic, 1) -> 10L)) + } + + test("maxMessagesPerPartition with no lag") { + val topic = "maxMessagesPerPartition" + val rateController = Some(new ConstantRateController(0, new ConstantEstimator(100), 100)) + val kafkaStream = getDirectKafkaStream(topic, rateController) + + val input = Map(TopicAndPartition(topic, 0) -> 0L, TopicAndPartition(topic, 1) -> 0L) + assert(kafkaStream.maxMessagesPerPartition(input).isEmpty) + } + + test("maxMessagesPerPartition respects max rate") { + val topic = "maxMessagesPerPartition" + val rateController = Some(new ConstantRateController(0, new ConstantEstimator(100), 1000)) + val kafkaStream = getDirectKafkaStream(topic, rateController) + + val input = Map(TopicAndPartition(topic, 0) -> 1000L, TopicAndPartition(topic, 1) -> 1000L) + assert(kafkaStream.maxMessagesPerPartition(input).get == + Map(TopicAndPartition(topic, 0) -> 10L, TopicAndPartition(topic, 1) -> 10L)) + } + test("using rate controller") { val topic = "backpressure" - val topicPartition = TopicAndPartition(topic, 0) - kafkaTestUtils.createTopic(topic) + val topicPartitions = Set(TopicAndPartition(topic, 0), TopicAndPartition(topic, 1)) + kafkaTestUtils.createTopic(topic, 2) val kafkaParams = Map( "metadata.broker.list" -> kafkaTestUtils.brokerAddress, "auto.offset.reset" -> "smallest" @@ -364,8 +392,8 @@ class DirectKafkaStreamSuite val batchIntervalMilliseconds = 100 val estimator = new ConstantEstimator(100) - val messageKeys = (1 to 200).map(_.toString) - val messages = messageKeys.map((_, 1)).toMap + val messages = Map("foo" -> 200) + kafkaTestUtils.sendMessages(topic, messages) val sparkConf = new SparkConf() // Safe, even with streaming, because we're using the direct API. @@ -380,11 +408,11 @@ class DirectKafkaStreamSuite val kafkaStream = withClue("Error creating direct stream") { val kc = new KafkaCluster(kafkaParams) val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message) - val m = kc.getEarliestLeaderOffsets(Set(topicPartition)) + val m = kc.getEarliestLeaderOffsets(topicPartitions) .fold(e => Map.empty[TopicAndPartition, Long], m => m.mapValues(lo => lo.offset)) new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)]( - ssc, kafkaParams, m, messageHandler) { + ssc, kafkaParams, m, messageHandler) { override protected[streaming] val rateController = Some(new DirectKafkaRateController(id, estimator)) } @@ -405,13 +433,12 @@ class DirectKafkaStreamSuite ssc.start() // Try different rate limits. - // Send data to Kafka and wait for arrays of data to appear matching the rate. + // Wait for arrays of data to appear matching the rate. Seq(100, 50, 20).foreach { rate => collectedData.clear() // Empty this buffer on each pass. estimator.updateRate(rate) // Set a new rate. // Expect blocks of data equal to "rate", scaled by the interval length in secs. 
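        // e.g. rate = 100 records/sec with the 100 ms batch interval used here gives
        // Math.round(100 * 100 * 0.001) = 10 records per batch.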
val expectedSize = Math.round(rate * batchIntervalMilliseconds * 0.001) - kafkaTestUtils.sendMessages(topic, messages) eventually(timeout(5.seconds), interval(batchIntervalMilliseconds.milliseconds)) { // Assert that rate estimator values are used to determine maxMessagesPerPartition. // Funky "-" in message makes the complete assertion message read better. @@ -430,6 +457,25 @@ class DirectKafkaStreamSuite rdd.asInstanceOf[KafkaRDD[K, V, _, _, (K, V)]].offsetRanges }.toSeq.sortBy { _._1 } } + + private def getDirectKafkaStream(topic: String, mockRateController: Option[RateController]) = { + val batchIntervalMilliseconds = 100 + + val sparkConf = new SparkConf() + .setMaster("local[1]") + .setAppName(this.getClass.getSimpleName) + .set("spark.streaming.kafka.maxRatePerPartition", "100") + + // Setup the streaming context + ssc = new StreamingContext(sparkConf, Milliseconds(batchIntervalMilliseconds)) + + val earliestOffsets = Map(TopicAndPartition(topic, 0) -> 0L, TopicAndPartition(topic, 1) -> 0L) + val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message) + new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)]( + ssc, Map[String, String](), earliestOffsets, messageHandler) { + override protected[streaming] val rateController = mockRateController + } + } } object DirectKafkaStreamSuite { @@ -468,3 +514,9 @@ private[streaming] class ConstantEstimator(@volatile private var rate: Long) processingDelay: Long, schedulingDelay: Long): Option[Double] = Some(rate) } + +private[streaming] class ConstantRateController(id: Int, estimator: RateEstimator, rate: Long) + extends RateController(id, estimator) { + override def publish(rate: Long): Unit = () + override def getLatestRate(): Long = rate +} diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 9ce37fc753c46..983f71684c38b 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -288,6 +288,10 @@ object MimaExcludes { ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SQLConf$SQLConfEntry"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SQLConf$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.SQLConf$SQLConfEntry$") + ) ++ Seq( + // SPARK-12073: backpressure rate controller consumes events preferentially from lagging partitions + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.streaming.kafka.KafkaTestUtils.createTopic"), + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.streaming.kafka.DirectKafkaInputDStream.maxMessagesPerPartition") ) case v if v.startsWith("1.6") => Seq( From adce5ee721c6a844ff21dfcd8515859458fe611d Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sat, 5 Mar 2016 19:25:03 +0800 Subject: [PATCH 12/29] [SPARK-12720][SQL] SQL Generation Support for Cube, Rollup, and Grouping Sets #### What changes were proposed in this pull request? This PR is for supporting SQL generation for cube, rollup and grouping sets. 
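(As background: `ROLLUP` and `CUBE` are shorthands for particular `GROUPING SETS`; for instance, `GROUP BY key % 5 WITH ROLLUP` is equivalent to `GROUP BY key % 5 GROUPING SETS ((key % 5), ())`, which is why the generated SQL shown below uses an explicit GROUPING SETS clause.)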
For example, a query using rollup: ```SQL SELECT count(*) as cnt, key % 5, grouping_id() FROM t1 GROUP BY key % 5 WITH ROLLUP ``` Original logical plan: ``` Aggregate [(key#17L % cast(5 as bigint))#47L,grouping__id#46], [(count(1),mode=Complete,isDistinct=false) AS cnt#43L, (key#17L % cast(5 as bigint))#47L AS _c1#45L, grouping__id#46 AS _c2#44] +- Expand [List(key#17L, value#18, (key#17L % cast(5 as bigint))#47L, 0), List(key#17L, value#18, null, 1)], [key#17L,value#18,(key#17L % cast(5 as bigint))#47L,grouping__id#46] +- Project [key#17L, value#18, (key#17L % cast(5 as bigint)) AS (key#17L % cast(5 as bigint))#47L] +- Subquery t1 +- Relation[key#17L,value#18] ParquetRelation ``` Converted SQL: ```SQL SELECT count( 1) AS `cnt`, (`t1`.`key` % CAST(5 AS BIGINT)), grouping_id() AS `_c2` FROM `default`.`t1` GROUP BY (`t1`.`key` % CAST(5 AS BIGINT)) GROUPING SETS (((`t1`.`key` % CAST(5 AS BIGINT))), ()) ``` #### How was the this patch tested? Added eight test cases in `LogicalPlanToSQLSuite`. Author: gatorsmile Author: xiaoli Author: Xiao Li Closes #11283 from gatorsmile/groupingSetsToSQL. --- python/pyspark/sql/functions.py | 14 +- .../sql/catalyst/expressions/grouping.scala | 1 + .../apache/spark/sql/hive/SQLBuilder.scala | 76 +++++++++- .../sql/hive/LogicalPlanToSQLSuite.scala | 143 ++++++++++++++++++ 4 files changed, 226 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 92e724fef4963..88924e2981fbb 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -348,13 +348,13 @@ def grouping_id(*cols): grouping columns). >>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show() - +-----+------------+--------+ - | name|groupingid()|sum(age)| - +-----+------------+--------+ - | null| 1| 7| - |Alice| 0| 2| - | Bob| 0| 5| - +-----+------------+--------+ + +-----+-------------+--------+ + | name|grouping_id()|sum(age)| + +-----+-------------+--------+ + | null| 1| 7| + |Alice| 0| 2| + | Bob| 0| 5| + +-----+-------------+--------+ """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.grouping_id(_to_seq(sc, cols, _to_java_column)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala index a204060630050..437e417266fb3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala @@ -63,4 +63,5 @@ case class GroupingID(groupByExprs: Seq[Expression]) extends Expression with Une override def children: Seq[Expression] = groupByExprs override def dataType: DataType = IntegerType override def nullable: Boolean = false + override def prettyName: String = "grouping_id" } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala index 9a14ccff57f83..8d411a9a40a1f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} import org.apache.spark.sql.catalyst.util.quoteIdentifier import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.types.{DataType, NullType} +import 
org.apache.spark.sql.types.{ByteType, DataType, IntegerType, NullType} /** * A place holder for generated SQL for subquery expression. @@ -118,6 +118,9 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi case p: Project => projectToSQL(p, isDistinct = false) + case a @ Aggregate(_, _, e @ Expand(_, _, p: Project)) if isGroupingSet(a, e, p) => + groupingSetToSQL(a, e, p) + case p: Aggregate => aggregateToSQL(p) @@ -244,6 +247,77 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi ) } + private def sameOutput(output1: Seq[Attribute], output2: Seq[Attribute]): Boolean = + output1.size == output2.size && + output1.zip(output2).forall(pair => pair._1.semanticEquals(pair._2)) + + private def isGroupingSet(a: Aggregate, e: Expand, p: Project): Boolean = { + assert(a.child == e && e.child == p) + a.groupingExpressions.forall(_.isInstanceOf[Attribute]) && + sameOutput(e.output, p.child.output ++ a.groupingExpressions.map(_.asInstanceOf[Attribute])) + } + + private def groupingSetToSQL( + agg: Aggregate, + expand: Expand, + project: Project): String = { + assert(agg.groupingExpressions.length > 1) + + // The last column of Expand is always grouping ID + val gid = expand.output.last + + val numOriginalOutput = project.child.output.length + // Assumption: Aggregate's groupingExpressions is composed of + // 1) the attributes of aliased group by expressions + // 2) gid, which is always the last one + val groupByAttributes = agg.groupingExpressions.dropRight(1).map(_.asInstanceOf[Attribute]) + // Assumption: Project's projectList is composed of + // 1) the original output (Project's child.output), + // 2) the aliased group by expressions. + val groupByExprs = project.projectList.drop(numOriginalOutput).map(_.asInstanceOf[Alias].child) + val groupingSQL = groupByExprs.map(_.sql).mkString(", ") + + // a map from group by attributes to the original group by expressions. + val groupByAttrMap = AttributeMap(groupByAttributes.zip(groupByExprs)) + + val groupingSet: Seq[Seq[Expression]] = expand.projections.map { project => + // Assumption: expand.projections is composed of + // 1) the original output (Project's child.output), + // 2) group by attributes(or null literal) + // 3) gid, which is always the last one in each project in Expand + project.drop(numOriginalOutput).dropRight(1).collect { + case attr: Attribute if groupByAttrMap.contains(attr) => groupByAttrMap(attr) + } + } + val groupingSetSQL = + "GROUPING SETS(" + + groupingSet.map(e => s"(${e.map(_.sql).mkString(", ")})").mkString(", ") + ")" + + val aggExprs = agg.aggregateExpressions.map { case expr => + expr.transformDown { + // grouping_id() is converted to VirtualColumn.groupingIdName by Analyzer. Revert it back. 
+ case ar: AttributeReference if ar == gid => GroupingID(Nil) + case ar: AttributeReference if groupByAttrMap.contains(ar) => groupByAttrMap(ar) + case a @ Cast(BitwiseAnd( + ShiftRight(ar: AttributeReference, Literal(value: Any, IntegerType)), + Literal(1, IntegerType)), ByteType) if ar == gid => + // for converting an expression to its original SQL format grouping(col) + val idx = groupByExprs.length - 1 - value.asInstanceOf[Int] + groupByExprs.lift(idx).map(Grouping).getOrElse(a) + } + } + + build( + "SELECT", + aggExprs.map(_.sql).mkString(", "), + if (agg.child == OneRowRelation) "" else "FROM", + toSQL(project.child), + "GROUP BY", + groupingSQL, + groupingSetSQL + ) + } + object Canonicalizer extends RuleExecutor[LogicalPlan] { override protected def batches: Seq[Batch] = Seq( Batch("Canonicalizer", FixedPoint(100), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala index d708fcf8dd4d9..f457d43e19a50 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala @@ -218,6 +218,149 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { checkHiveQl("SELECT DISTINCT id FROM parquet_t0") } + test("rollup/cube #1") { + // Original logical plan: + // Aggregate [(key#17L % cast(5 as bigint))#47L,grouping__id#46], + // [(count(1),mode=Complete,isDistinct=false) AS cnt#43L, + // (key#17L % cast(5 as bigint))#47L AS _c1#45L, + // grouping__id#46 AS _c2#44] + // +- Expand [List(key#17L, value#18, (key#17L % cast(5 as bigint))#47L, 0), + // List(key#17L, value#18, null, 1)], + // [key#17L,value#18,(key#17L % cast(5 as bigint))#47L,grouping__id#46] + // +- Project [key#17L, + // value#18, + // (key#17L % cast(5 as bigint)) AS (key#17L % cast(5 as bigint))#47L] + // +- Subquery t1 + // +- Relation[key#17L,value#18] ParquetRelation + // Converted SQL: + // SELECT count( 1) AS `cnt`, + // (`t1`.`key` % CAST(5 AS BIGINT)), + // grouping_id() AS `_c2` + // FROM `default`.`t1` + // GROUP BY (`t1`.`key` % CAST(5 AS BIGINT)) + // GROUPING SETS (((`t1`.`key` % CAST(5 AS BIGINT))), ()) + checkHiveQl( + "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH ROLLUP") + checkHiveQl( + "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH CUBE") + } + + test("rollup/cube #2") { + checkHiveQl("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH ROLLUP") + checkHiveQl("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH CUBE") + } + + test("rollup/cube #3") { + checkHiveQl( + "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH ROLLUP") + checkHiveQl( + "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH CUBE") + } + + test("rollup/cube #4") { + checkHiveQl( + s""" + |SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1 + |GROUP BY key % 5, key - 5 WITH ROLLUP + """.stripMargin) + checkHiveQl( + s""" + |SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1 + |GROUP BY key % 5, key - 5 WITH CUBE + """.stripMargin) + } + + test("rollup/cube #5") { + checkHiveQl( + s""" + |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3 + |FROM (SELECT key, key%2, key - 5 FROM parquet_t1) t GROUP BY key%5, key-5 + |WITH ROLLUP 
+ """.stripMargin) + checkHiveQl( + s""" + |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3 + |FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5 + |WITH CUBE + """.stripMargin) + } + + test("rollup/cube #6") { + checkHiveQl("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b") + checkHiveQl("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b") + checkHiveQl("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b") + checkHiveQl("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b") + checkHiveQl("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH ROLLUP") + checkHiveQl("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH CUBE") + } + + test("rollup/cube #7") { + checkHiveQl("SELECT a, b, grouping_id(a, b) FROM parquet_t2 GROUP BY cube(a, b)") + checkHiveQl("SELECT a, b, grouping(b) FROM parquet_t2 GROUP BY cube(a, b)") + checkHiveQl("SELECT a, b, grouping(a) FROM parquet_t2 GROUP BY cube(a, b)") + } + + test("rollup/cube #8") { + // grouping_id() is part of another expression + checkHiveQl( + s""" + |SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid + |FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5 + |WITH ROLLUP + """.stripMargin) + checkHiveQl( + s""" + |SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid + |FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5 + |WITH CUBE + """.stripMargin) + } + + test("rollup/cube #9") { + // self join is used as the child node of ROLLUP/CUBE with replaced quantifiers + checkHiveQl( + s""" + |SELECT t.key - 5, cnt, SUM(cnt) + |FROM (SELECT x.key, COUNT(*) as cnt + |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t + |GROUP BY cnt, t.key - 5 + |WITH ROLLUP + """.stripMargin) + checkHiveQl( + s""" + |SELECT t.key - 5, cnt, SUM(cnt) + |FROM (SELECT x.key, COUNT(*) as cnt + |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t + |GROUP BY cnt, t.key - 5 + |WITH CUBE + """.stripMargin) + } + + test("grouping sets #1") { + checkHiveQl( + s""" + |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id() AS k3 + |FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5 + |GROUPING SETS (key % 5, key - 5) + """.stripMargin) + } + + test("grouping sets #2") { + checkHiveQl( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a, b) ORDER BY a, b") + checkHiveQl( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a) ORDER BY a, b") + checkHiveQl( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (b) ORDER BY a, b") + checkHiveQl( + "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (()) ORDER BY a, b") + checkHiveQl( + s""" + |SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b + |GROUPING SETS ((), (a), (a, b)) ORDER BY a, b + """.stripMargin) + } + test("cluster by") { checkHiveQl("SELECT id FROM parquet_t0 CLUSTER BY id") } From 8290004d94760c22d6d3ca8dda3003ac8644422f Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Sat, 5 Mar 2016 15:26:27 -0800 Subject: [PATCH 13/29] [SPARK-13693][STREAMING][TESTS] Stop StreamingContext before deleting checkpoint dir ## What changes were proposed in this pull request? 
Stop StreamingContext before deleting checkpoint dir to avoid the race condition that deleting the checkpoint dir and writing checkpoint happen at the same time. The flaky test log is here: https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/256/testReport/junit/org.apache.spark.streaming/MapWithStateSuite/_It_is_not_a_test_/ ## How was this patch tested? unit tests Author: Shixiong Zhu Closes #11531 from zsxwing/SPARK-13693. --- .../scala/org/apache/spark/streaming/MapWithStateSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala index b6d6585bd8244..403400904bac2 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala @@ -43,10 +43,10 @@ class MapWithStateSuite extends SparkFunSuite } after { + StreamingContext.getActive().foreach { _.stop(stopSparkContext = false) } if (checkpointDir != null) { Utils.deleteRecursively(checkpointDir) } - StreamingContext.getActive().foreach { _.stop(stopSparkContext = false) } } override def beforeAll(): Unit = { From 8ff88094daa4945e7d718baa7b20703fd8087ab0 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 6 Mar 2016 12:54:04 +0800 Subject: [PATCH 14/29] Revert "[SPARK-13616][SQL] Let SQLBuilder convert logical plan without a project on top of it" This reverts commit f87ce0504ea0697969ac3e67690c78697b76e94a. According to discussion in #11466, let's revert PR #11466 for safe. Author: Cheng Lian Closes #11539 from liancheng/revert-pr-11466. --- .../apache/spark/sql/hive/SQLBuilder.scala | 23 +---------- .../sql/hive/LogicalPlanToSQLSuite.scala | 41 ------------------- 2 files changed, 1 insertion(+), 63 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala index 8d411a9a40a1f..683f738054c5a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala @@ -65,7 +65,7 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi case e => e } - val generatedSQL = toSQL(replaced, true) + val generatedSQL = toSQL(replaced) logDebug( s"""Built SQL query string successfully from given logical plan: | @@ -90,27 +90,6 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi } } - private def toSQL(node: LogicalPlan, topNode: Boolean): String = { - if (topNode) { - node match { - case d: Distinct => toSQL(node) - case p: Project => toSQL(node) - case a: Aggregate => toSQL(node) - case s: Sort => toSQL(node) - case r: RepartitionByExpression => toSQL(node) - case _ => - build( - "SELECT", - node.output.map(_.sql).mkString(", "), - "FROM", - toSQL(node) - ) - } - } else { - toSQL(node) - } - } - private def toSQL(node: LogicalPlan): String = node match { case Distinct(p: Project) => projectToSQL(p, isDistinct = true) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala index f457d43e19a50..ed85856f017df 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala @@ -19,8 +19,6 @@ 
package org.apache.spark.sql.hive import scala.util.control.NonFatal -import org.apache.spark.sql.{DataFrame, SQLContext} -import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SQLTestUtils @@ -56,33 +54,6 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { sql("DROP TABLE IF EXISTS t0") } - private def checkPlan(plan: LogicalPlan, sqlContext: SQLContext, expected: String): Unit = { - val convertedSQL = try new SQLBuilder(plan, sqlContext).toSQL catch { - case NonFatal(e) => - fail( - s"""Cannot convert the following logical query plan back to SQL query string: - | - |# Original logical query plan: - |${plan.treeString} - """.stripMargin, e) - } - - try { - checkAnswer(sql(convertedSQL), DataFrame(sqlContext, plan)) - } catch { case cause: Throwable => - fail( - s"""Failed to execute converted SQL string or got wrong answer: - | - |# Converted SQL query string: - |$convertedSQL - | - |# Original logical query plan: - |${plan.treeString} - """.stripMargin, - cause) - } - } - private def checkHiveQl(hiveQl: String): Unit = { val df = sql(hiveQl) @@ -186,18 +157,6 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { "SELECT x.key, COUNT(*) FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key group by x.key") } - test("join plan") { - val expectedSql = "SELECT x.key FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key" - - val df1 = sqlContext.table("parquet_t1").as("x") - val df2 = sqlContext.table("parquet_t1").as("y") - val joinPlan = df1.join(df2).queryExecution.analyzed - - // Make sure we have a plain Join operator without Project on top of it. - assert(joinPlan.isInstanceOf[Join]) - checkPlan(joinPlan, sqlContext, expectedSql) - } - test("case") { checkHiveQl("SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM parquet_t0") } From ee913e6e2d58dfac20f3f06ff306081bd0e48066 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Sun, 6 Mar 2016 08:57:01 -0800 Subject: [PATCH 15/29] [SPARK-13697] [PYSPARK] Fix the missing module name of TransformFunctionSerializer.loads ## What changes were proposed in this pull request? Set the function's module name to `__main__` if it's missing in `TransformFunctionSerializer.loads`. ## How was this patch tested? Manually test in the shell. Before this patch: ``` >>> from pyspark.streaming import StreamingContext >>> from pyspark.streaming.util import TransformFunction >>> ssc = StreamingContext(sc, 1) >>> func = TransformFunction(sc, lambda x: x, sc.serializer) >>> func.rdd_wrapper(lambda x: x) TransformFunction( at 0x106ac8b18>) >>> bytes = bytearray(ssc._transformerSerializer.serializer.dumps((func.func, func.rdd_wrap_func, func.deserializers))) >>> func2 = ssc._transformerSerializer.loads(bytes) >>> print(func2.func.__module__) None >>> print(func2.rdd_wrap_func.__module__) None >>> ``` After this patch: ``` >>> from pyspark.streaming import StreamingContext >>> from pyspark.streaming.util import TransformFunction >>> ssc = StreamingContext(sc, 1) >>> func = TransformFunction(sc, lambda x: x, sc.serializer) >>> func.rdd_wrapper(lambda x: x) TransformFunction( at 0x108bf1b90>) >>> bytes = bytearray(ssc._transformerSerializer.serializer.dumps((func.func, func.rdd_wrap_func, func.deserializers))) >>> func2 = ssc._transformerSerializer.loads(bytes) >>> print(func2.func.__module__) __main__ >>> print(func2.rdd_wrap_func.__module__) __main__ >>> ``` Author: Shixiong Zhu Closes #11535 from zsxwing/loads-module. 
--- python/pyspark/cloudpickle.py | 4 +++- python/pyspark/tests.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 95b3abc74244b..e56e22a9b920e 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -241,6 +241,7 @@ def save_function_tuple(self, func): save(f_globals) save(defaults) save(dct) + save(func.__module__) write(pickle.TUPLE) write(pickle.REDUCE) # applies _fill_function on the tuple @@ -698,13 +699,14 @@ def _genpartial(func, args, kwds): return partial(func, *args, **kwds) -def _fill_function(func, globals, defaults, dict): +def _fill_function(func, globals, defaults, dict, module): """ Fills in the rest of function data into the skeleton function object that were created via _make_skel_func(). """ func.__globals__.update(globals) func.__defaults__ = defaults func.__dict__ = dict + func.__module__ = module return func diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 23720502a82c8..a5a83c7e38b3c 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -228,6 +228,12 @@ def test_itemgetter(self): getter2 = ser.loads(ser.dumps(getter)) self.assertEqual(getter(d), getter2(d)) + def test_function_module_name(self): + ser = CloudPickleSerializer() + func = lambda x: x + func2 = ser.loads(ser.dumps(func)) + self.assertEqual(func.__module__, func2.__module__) + def test_attrgetter(self): from operator import attrgetter ser = CloudPickleSerializer() From bc7a3ec290904f2d8802583bb0557bca1b8b01ff Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 7 Mar 2016 00:14:40 -0800 Subject: [PATCH 16/29] [SPARK-13685][SQL] Rename catalog.Catalog to ExternalCatalog ## What changes were proposed in this pull request? Today we have `analysis.Catalog` and `catalog.Catalog`. In the future the former will call the latter. When that happens, if both of them are still called `Catalog` it will be very confusing. This patch renames the latter `ExternalCatalog` because it is expected to talk to external systems. ## How was this patch tested? Jenkins. Author: Andrew Or Closes #11526 from andrewor14/rename-catalog. 
--- .../sql/catalyst/analysis/NoSuchItemException.scala | 2 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 8 ++++++-- .../spark/sql/catalyst/catalog/interface.scala | 13 ++++++++----- .../sql/catalyst/catalog/CatalogTestCases.scala | 10 +++++----- .../sql/catalyst/catalog/InMemoryCatalogSuite.scala | 2 +- .../org/apache/spark/sql/hive/HiveCatalog.scala | 4 ++-- .../apache/spark/sql/hive/client/HiveClient.scala | 12 ++++++------ .../spark/sql/hive/client/HiveClientImpl.scala | 8 ++++---- .../apache/spark/sql/hive/HiveCatalogSuite.scala | 2 +- 9 files changed, 34 insertions(+), 27 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NoSuchItemException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NoSuchItemException.scala index 81399db9bc070..e9f04eecf8d70 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NoSuchItemException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NoSuchItemException.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.catalyst.catalog.Catalog.TablePartitionSpec +import org.apache.spark.sql.catalyst.catalog.ExternalCatalog.TablePartitionSpec /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index cba4de34f2b44..f3fa7958db41b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -25,10 +25,14 @@ import org.apache.spark.sql.AnalysisException /** * An in-memory (ephemeral) implementation of the system catalog. * + * This is a dummy implementation that does not require setting up external systems. + * It is intended for testing or exploration purposes only and should not be used + * in production. + * * All public methods should be synchronized for thread-safety. */ -class InMemoryCatalog extends Catalog { - import Catalog._ +class InMemoryCatalog extends ExternalCatalog { + import ExternalCatalog._ private class TableDesc(var table: CatalogTable) { val partitions = new mutable.HashMap[TablePartitionSpec, CatalogTablePartition] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index dac5f023d1f58..db34af3d26fc5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -26,12 +26,13 @@ import org.apache.spark.sql.AnalysisException * Interface for the system catalog (of columns, partitions, tables, and databases). * * This is only used for non-temporary items, and implementations must be thread-safe as they - * can be accessed in multiple threads. + * can be accessed in multiple threads. This is an external catalog because it is expected to + * interact with external systems. * * Implementations should throw [[AnalysisException]] when table or database don't exist. 
*/ -abstract class Catalog { - import Catalog._ +abstract class ExternalCatalog { + import ExternalCatalog._ protected def requireDbExists(db: String): Unit = { if (!databaseExists(db)) { @@ -198,7 +199,9 @@ case class CatalogColumn( * @param spec partition spec values indexed by column name * @param storage storage format of the partition */ -case class CatalogTablePartition(spec: Catalog.TablePartitionSpec, storage: CatalogStorageFormat) +case class CatalogTablePartition( + spec: ExternalCatalog.TablePartitionSpec, + storage: CatalogStorageFormat) /** @@ -263,7 +266,7 @@ case class CatalogDatabase( properties: Map[String, String]) -object Catalog { +object ExternalCatalog { /** * Specifications of a table partition. Mapping column name to column value. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestCases.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestCases.scala index e0d1220d13e7c..b03ba81b50572 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestCases.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestCases.scala @@ -24,9 +24,9 @@ import org.apache.spark.sql.AnalysisException /** - * A reasonable complete test suite (i.e. behaviors) for a [[Catalog]]. + * A reasonable complete test suite (i.e. behaviors) for a [[ExternalCatalog]]. * - * Implementations of the [[Catalog]] interface can create test suites by extending this. + * Implementations of the [[ExternalCatalog]] interface can create test suites by extending this. */ abstract class CatalogTestCases extends SparkFunSuite with BeforeAndAfterEach { private lazy val storageFormat = CatalogStorageFormat( @@ -45,7 +45,7 @@ abstract class CatalogTestCases extends SparkFunSuite with BeforeAndAfterEach { protected val tableOutputFormat: String = "org.apache.park.serde.MyOutputFormat" protected def newUriForDatabase(): String = "uri" protected def resetState(): Unit = { } - protected def newEmptyCatalog(): Catalog + protected def newEmptyCatalog(): ExternalCatalog // Clear all state after each test override def afterEach(): Unit = { @@ -68,7 +68,7 @@ abstract class CatalogTestCases extends SparkFunSuite with BeforeAndAfterEach { * - part2 * - func1 */ - private def newBasicCatalog(): Catalog = { + private def newBasicCatalog(): ExternalCatalog = { val catalog = newEmptyCatalog() // When testing against a real catalog, the default database may already exist catalog.createDatabase(newDb("default"), ignoreIfExists = true) @@ -104,7 +104,7 @@ abstract class CatalogTestCases extends SparkFunSuite with BeforeAndAfterEach { * Note: Hive sets some random serde things, so we just compare the specs here. */ private def catalogPartitionsEqual( - catalog: Catalog, + catalog: ExternalCatalog, db: String, table: String, parts: Seq[CatalogTablePartition]): Boolean = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalogSuite.scala index 871f0a0f46a22..9531758ffd597 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalogSuite.scala @@ -19,5 +19,5 @@ package org.apache.spark.sql.catalyst.catalog /** Test suite for the [[InMemoryCatalog]]. 
*/ class InMemoryCatalogSuite extends CatalogTestCases { - override protected def newEmptyCatalog(): Catalog = new InMemoryCatalog + override protected def newEmptyCatalog(): ExternalCatalog = new InMemoryCatalog } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveCatalog.scala index 21b9cfb820eaa..5185e9aac05f0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveCatalog.scala @@ -33,8 +33,8 @@ import org.apache.spark.sql.hive.client.HiveClient * A persistent implementation of the system catalog using Hive. * All public methods must be synchronized for thread-safety. */ -private[spark] class HiveCatalog(client: HiveClient) extends Catalog with Logging { - import Catalog._ +private[spark] class HiveCatalog(client: HiveClient) extends ExternalCatalog with Logging { + import ExternalCatalog._ // Exceptions thrown by the hive client that we would like to wrap private val clientExceptions = Set( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 6a0a089fd1f44..b32aff25be68d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -132,7 +132,7 @@ private[hive] trait HiveClient { def dropPartitions( db: String, table: String, - specs: Seq[Catalog.TablePartitionSpec]): Unit + specs: Seq[ExternalCatalog.TablePartitionSpec]): Unit /** * Rename one or many existing table partitions, assuming they exist. @@ -140,8 +140,8 @@ private[hive] trait HiveClient { def renamePartitions( db: String, table: String, - specs: Seq[Catalog.TablePartitionSpec], - newSpecs: Seq[Catalog.TablePartitionSpec]): Unit + specs: Seq[ExternalCatalog.TablePartitionSpec], + newSpecs: Seq[ExternalCatalog.TablePartitionSpec]): Unit /** * Alter one or more table partitions whose specs match the ones specified in `newParts`, @@ -156,7 +156,7 @@ private[hive] trait HiveClient { final def getPartition( dbName: String, tableName: String, - spec: Catalog.TablePartitionSpec): CatalogTablePartition = { + spec: ExternalCatalog.TablePartitionSpec): CatalogTablePartition = { getPartitionOption(dbName, tableName, spec).getOrElse { throw new NoSuchPartitionException(dbName, tableName, spec) } @@ -166,14 +166,14 @@ private[hive] trait HiveClient { final def getPartitionOption( db: String, table: String, - spec: Catalog.TablePartitionSpec): Option[CatalogTablePartition] = { + spec: ExternalCatalog.TablePartitionSpec): Option[CatalogTablePartition] = { getPartitionOption(getTable(db, table), spec) } /** Returns the specified partition or None if it does not exist. */ def getPartitionOption( table: CatalogTable, - spec: Catalog.TablePartitionSpec): Option[CatalogTablePartition] + spec: ExternalCatalog.TablePartitionSpec): Option[CatalogTablePartition] /** Returns all partitions for the given table. 
*/ final def getAllPartitions(db: String, table: String): Seq[CatalogTablePartition] = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 5d62854c40c5d..c1c8e631ee740 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -366,7 +366,7 @@ private[hive] class HiveClientImpl( override def dropPartitions( db: String, table: String, - specs: Seq[Catalog.TablePartitionSpec]): Unit = withHiveState { + specs: Seq[ExternalCatalog.TablePartitionSpec]): Unit = withHiveState { // TODO: figure out how to drop multiple partitions in one call specs.foreach { s => client.dropPartition(db, table, s.values.toList.asJava, true) } } @@ -374,8 +374,8 @@ private[hive] class HiveClientImpl( override def renamePartitions( db: String, table: String, - specs: Seq[Catalog.TablePartitionSpec], - newSpecs: Seq[Catalog.TablePartitionSpec]): Unit = withHiveState { + specs: Seq[ExternalCatalog.TablePartitionSpec], + newSpecs: Seq[ExternalCatalog.TablePartitionSpec]): Unit = withHiveState { require(specs.size == newSpecs.size, "number of old and new partition specs differ") val catalogTable = getTable(db, table) val hiveTable = toHiveTable(catalogTable) @@ -397,7 +397,7 @@ private[hive] class HiveClientImpl( override def getPartitionOption( table: CatalogTable, - spec: Catalog.TablePartitionSpec): Option[CatalogTablePartition] = withHiveState { + spec: ExternalCatalog.TablePartitionSpec): Option[CatalogTablePartition] = withHiveState { val hiveTable = toHiveTable(table) val hivePartition = client.getPartition(hiveTable, spec.asJava, false) Option(hivePartition).map(fromHivePartition) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveCatalogSuite.scala index f73e7e2351447..f557abcd522e6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveCatalogSuite.scala @@ -44,6 +44,6 @@ class HiveCatalogSuite extends CatalogTestCases { protected override def resetState(): Unit = client.reset() - protected override def newEmptyCatalog(): Catalog = new HiveCatalog(client) + protected override def newEmptyCatalog(): ExternalCatalog = new HiveCatalog(client) } From 4b13896ebf7cecf9d50514a62165b612ee18124a Mon Sep 17 00:00:00 2001 From: rmishra Date: Mon, 7 Mar 2016 09:55:49 +0000 Subject: [PATCH 17/29] [SPARK-13705][DOCS] UpdateStateByKey Operation documentation incorrectly refers to StatefulNetworkWordCount ## What changes were proposed in this pull request? The reference to StatefulNetworkWordCount.scala from updateStatesByKey documentation should be removed, till there is a example for updateStatesByKey. ## How was this patch tested? Have tested the new documentation with jekyll build. Author: rmishra Closes #11545 from rishitesh/SPARK-13705. 
--- docs/streaming-programming-guide.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 5d67a0a9a986a..e92b01aa7774a 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -872,10 +872,7 @@ val runningCounts = pairs.updateStateByKey[Int](updateFunction _) {% endhighlight %} The update function will be called for each word, with `newValues` having a sequence of 1's (from -the `(word, 1)` pairs) and the `runningCount` having the previous count. For the complete -Scala code, take a look at the example -[StatefulNetworkWordCount.scala]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache -/spark/examples/streaming/StatefulNetworkWordCount.scala). +the `(word, 1)` pairs) and the `runningCount` having the previous count.
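Since the guide no longer links to a complete example, the following is a minimal self-contained sketch of the update function described above, wired into a streaming word count. The `local[2]` master, the socket source on localhost:9999, and the local `checkpoint` directory are assumptions made only for illustration.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StatefulWordCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("StatefulWordCountSketch")
    val ssc = new StreamingContext(conf, Seconds(1))
    // updateStateByKey requires a checkpoint directory for the state DStream.
    ssc.checkpoint("checkpoint")

    // Add this batch's counts for a word to its running count from earlier batches.
    def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] =
      Some(newValues.sum + runningCount.getOrElse(0))

    val lines = ssc.socketTextStream("localhost", 9999)
    val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
    val runningCounts = pairs.updateStateByKey[Int](updateFunction _)

    runningCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
```

With a local Netcat server (`nc -lk 9999`) feeding lines of text, each batch prints the cumulative count per word seen so far.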
From 03f57a6c2dd6ffd4038ca9cecbfc221deaf52393 Mon Sep 17 00:00:00 2001 From: Yury Liavitski Date: Mon, 7 Mar 2016 10:54:33 +0000 Subject: [PATCH 18/29] Fixing the type of the sentiment happiness value ## What changes were proposed in this pull request? Added the conversion to int for the 'happiness value' read from the file. Otherwise, later on line 75 the multiplication will multiply a string by a number, yielding values like "-2-2" instead of -4. ## How was this patch tested? Tested manually. Author: Yury Liavitski Author: Yury Liavitski Closes #11540 from heliocentrist/fix-sentiment-value-type. --- .../examples/streaming/TwitterHashTagJoinSentiments.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterHashTagJoinSentiments.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterHashTagJoinSentiments.scala index edf0e0b7b2b46..a8d392ca35b40 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterHashTagJoinSentiments.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterHashTagJoinSentiments.scala @@ -56,8 +56,8 @@ object TwitterHashTagJoinSentiments { val wordSentimentFilePath = "data/streaming/AFINN-111.txt" val wordSentiments = ssc.sparkContext.textFile(wordSentimentFilePath).map { line => val Array(word, happinessValue) = line.split("\t") - (word, happinessValue) - } cache() + (word, happinessValue.toInt) + }.cache() // Determine the hash tags with the highest sentiment values by joining the streaming RDD // with the static RDD inside the transform() method and then multiplying From d7eac9d7951c19302ed41fe03eaa38394aeb9c1a Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Mon, 7 Mar 2016 09:46:28 -0800 Subject: [PATCH 19/29] [SPARK-13651] Generator outputs are not resolved correctly resulting in run time error ## What changes were proposed in this pull request? ``` Seq(("id1", "value1")).toDF("key", "value").registerTempTable("src") sqlContext.sql("SELECT t1.* FROM src LATERAL VIEW explode(map('key1', 100, 'key2', 200)) t1 AS key, value") ``` Results in following logical plan ``` Project [key#2,value#3] +- Generate explode(HiveGenericUDF#org.apache.hadoop.hive.ql.udf.generic.GenericUDFMap(key1,100,key2,200)), true, false, Some(genoutput), [key#2,value#3] +- SubqueryAlias src +- Project [_1#0 AS key#2,_2#1 AS value#3] +- LocalRelation [_1#0,_2#1], [[id1,value1]] ``` The above query fails with following runtime error. 
``` java.lang.ClassCastException: java.lang.Integer cannot be cast to org.apache.spark.unsafe.types.UTF8String at org.apache.spark.sql.catalyst.expressions.BaseGenericInternalRow$class.getUTF8String(rows.scala:46) at org.apache.spark.sql.catalyst.expressions.GenericInternalRow.getUTF8String(rows.scala:221) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(generated.java:42) at org.apache.spark.sql.execution.Generate$$anonfun$doExecute$1$$anonfun$apply$9.apply(Generate.scala:98) at org.apache.spark.sql.execution.Generate$$anonfun$doExecute$1$$anonfun$apply$9.apply(Generate.scala:96) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$class.foreach(Iterator.scala:742) at scala.collection.AbstractIterator.foreach(Iterator.scala:1194) ``` In this case the generated outputs are wrongly resolved from its child (LocalRelation) due to https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala#L537-L548 ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Added unit tests in hive/SQLQuerySuite and AnalysisSuite Author: Dilip Biswal Closes #11497 from dilipbiswal/spark-13651. --- .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 5 +++-- .../spark/sql/hive/execution/SQLQuerySuite.scala | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index fbbc3ee891c6b..b5fa372643bd4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -512,8 +512,9 @@ class Analyzer( // A special case for Generate, because the output of Generate should not be resolved by // ResolveReferences. Attributes in the output will be resolved by ResolveGenerate. 
- case g @ Generate(generator, join, outer, qualifier, output, child) - if child.resolved && !generator.resolved => + case g @ Generate(generator, _, _, _, _, _) if generator.resolved => g + + case g @ Generate(generator, join, outer, qualifier, output, child) => val newG = resolveExpression(generator, child, throws = true) if (newG.fastEquals(generator)) { g diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index e478bcd0ed653..2f8c2beb17f4b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -92,6 +92,16 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { checkAnswer(query, Row(1, 1) :: Row(1, 2) :: Row(1, 3) :: Nil) } + test("SPARK-13651: generator outputs shouldn't be resolved from its child's output") { + withTempTable("src") { + Seq(("id1", "value1")).toDF("key", "value").registerTempTable("src") + val query = + sql("SELECT genoutput.* FROM src " + + "LATERAL VIEW explode(map('key1', 100, 'key2', 200)) genoutput AS key, value") + checkAnswer(query, Row("key1", 100) :: Row("key2", 200) :: Nil) + } + } + test("SPARK-6851: Self-joined converted parquet tables") { val orders = Seq( Order(1, "Atlas", "MTB", 234, "2015-01-07", "John D", "Pacifica", "CA", 20151), From 489641117651d11806d2773b7ded7c163d0260e5 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 7 Mar 2016 10:32:34 -0800 Subject: [PATCH 20/29] [SPARK-13694][SQL] QueryPlan.expressions should always include all expressions ## What changes were proposed in this pull request? It's weird that expressions don't always have all the expressions in it. This PR marks `QueryPlan.expressions` final to forbid sub classes overriding it to exclude some expressions. Currently only `Generate` override it, we can use `producedAttributes` to fix the unresolved attribute problem for it. Note that this PR doesn't fix the problem in #11497 ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #11532 from cloud-fan/generate. --- .../scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala | 2 +- .../spark/sql/catalyst/plans/logical/basicOperators.scala | 4 +--- .../org/apache/spark/sql/catalyst/plans/logical/object.scala | 2 -- .../main/scala/org/apache/spark/sql/execution/Generate.scala | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 0e0453b517d92..c62d5ead86925 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -194,7 +194,7 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy } /** Returns all of the expressions present in this query plan operator. */ - def expressions: Seq[Expression] = { + final def expressions: Seq[Expression] = { // Recursively find all expressions from a traversable. 
def seqToExpressions(seq: Traversable[Any]): Traversable[Expression] = seq.flatMap { case e: Expression => e :: Nil diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index 522348735aadf..411594c95166c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -89,9 +89,7 @@ case class Generate( generatorOutput.forall(_.resolved) } - // we don't want the gOutput to be taken as part of the expressions - // as that will cause exceptions like unresolved attributes etc. - override def expressions: Seq[Expression] = generator :: Nil + override def producedAttributes: AttributeSet = AttributeSet(generatorOutput) def output: Seq[Attribute] = { val qualified = qualifier.map(q => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index 3f97662957b8e..da7f81c785461 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -208,8 +208,6 @@ case class CoGroup( left: LogicalPlan, right: LogicalPlan) extends BinaryNode with ObjectOperator { - override def producedAttributes: AttributeSet = outputSet - override def deserializers: Seq[(Expression, Seq[Attribute])] = // The `leftGroup` and `rightGroup` are guaranteed te be of same schema, so it's safe to resolve // the `keyDeserializer` based on either of them, here we pick the left one. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala index 6bc4649d432ae..9938d2169f1c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala @@ -58,7 +58,7 @@ case class Generate( private[sql] override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) - override def expressions: Seq[Expression] = generator :: Nil + override def producedAttributes: AttributeSet = AttributeSet(output) val boundGenerator = BindReferences.bindReference(generator, child.output) From ef77003178eb5cdcb4fe519fc540917656c5d577 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Mon, 7 Mar 2016 12:04:59 -0800 Subject: [PATCH 21/29] [SPARK-13495][SQL] Add Null Filters in the query plan for Filters/Joins based on their data constraints ## What changes were proposed in this pull request? This PR adds an optimizer rule to eliminate reading (unnecessary) NULL values if they are not required for correctness by inserting `isNotNull` filters is the query plan. These filters are currently inserted beneath existing `Filter` and `Join` operators and are inferred based on their data constraints. Note: While this optimization is applicable to all types of join, it primarily benefits `Inner` and `LeftSemi` joins. ## How was this patch tested? 1. Added a new `NullFilteringSuite` that tests for `IsNotNull` filters in the query plan for joins and filters. Also, tests interaction with the `CombineFilters` optimizer rules. 2. Test generated ExpressionTrees via `OrcFilterSuite` 3. 
Test filter source pushdown logic via `SimpleTextHadoopFsRelationSuite` cc yhuai nongli Author: Sameer Agarwal Closes #11372 from sameeragarwal/gen-isnotnull. --- .../sql/catalyst/optimizer/Optimizer.scala | 49 ++++++++++ .../optimizer/NullFilteringSuite.scala | 95 +++++++++++++++++++ .../spark/sql/catalyst/plans/PlanTest.scala | 18 +++- .../spark/sql/execution/PlannerSuite.scala | 2 +- .../parquet/ParquetFilterSuite.scala | 6 +- .../spark/sql/hive/orc/OrcFilterSuite.scala | 16 ++-- .../SimpleTextHadoopFsRelationSuite.scala | 4 +- .../sql/sources/SimpleTextRelation.scala | 12 ++- 8 files changed, 182 insertions(+), 20 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NullFilteringSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index c83ec0fcb54b7..69ceea632977d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -77,6 +77,7 @@ abstract class Optimizer extends RuleExecutor[LogicalPlan] { CombineLimits, CombineUnions, // Constant folding and strength reduction + NullFiltering, NullPropagation, OptimizeIn, ConstantFolding, @@ -593,6 +594,54 @@ object NullPropagation extends Rule[LogicalPlan] { } } +/** + * Attempts to eliminate reading (unnecessary) NULL values if they are not required for correctness + * by inserting isNotNull filters in the query plan. These filters are currently inserted beneath + * existing Filters and Join operators and are inferred based on their data constraints. + * + * Note: While this optimization is applicable to all types of join, it primarily benefits Inner and + * LeftSemi joins. + */ +object NullFiltering extends Rule[LogicalPlan] with PredicateHelper { + def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case filter @ Filter(condition, child) => + // We generate a list of additional isNotNull filters from the operator's existing constraints + // but remove those that are either already part of the filter condition or are part of the + // operator's child constraints. + val newIsNotNullConstraints = filter.constraints.filter(_.isInstanceOf[IsNotNull]) -- + (child.constraints ++ splitConjunctivePredicates(condition)) + if (newIsNotNullConstraints.nonEmpty) { + Filter(And(newIsNotNullConstraints.reduce(And), condition), child) + } else { + filter + } + + case join @ Join(left, right, joinType, condition) => + val leftIsNotNullConstraints = join.constraints + .filter(_.isInstanceOf[IsNotNull]) + .filter(_.references.subsetOf(left.outputSet)) -- left.constraints + val rightIsNotNullConstraints = + join.constraints + .filter(_.isInstanceOf[IsNotNull]) + .filter(_.references.subsetOf(right.outputSet)) -- right.constraints + val newLeftChild = if (leftIsNotNullConstraints.nonEmpty) { + Filter(leftIsNotNullConstraints.reduce(And), left) + } else { + left + } + val newRightChild = if (rightIsNotNullConstraints.nonEmpty) { + Filter(rightIsNotNullConstraints.reduce(And), right) + } else { + right + } + if (newLeftChild != left || newRightChild != right) { + Join(newLeftChild, newRightChild, joinType, condition) + } else { + join + } + } +} + /** * Replaces [[Expression Expressions]] that can be statically evaluated with * equivalent [[Literal]] values. 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NullFilteringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NullFilteringSuite.scala new file mode 100644 index 0000000000000..7e52d5ef6749c --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NullFilteringSuite.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules._ + +class NullFilteringSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("NullFiltering", Once, NullFiltering) :: + Batch("CombineFilters", FixedPoint(5), CombineFilters) :: Nil + } + + val testRelation = LocalRelation('a.int, 'b.int, 'c.int) + + test("filter: filter out nulls in condition") { + val originalQuery = testRelation.where('a === 1).analyze + val correctAnswer = testRelation.where(IsNotNull('a) && 'a === 1).analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } + + test("single inner join: filter out nulls on either side on equi-join keys") { + val x = testRelation.subquery('x) + val y = testRelation.subquery('y) + val originalQuery = x.join(y, + condition = Some("x.a".attr === "y.a".attr && "x.b".attr === 1 && "y.c".attr > 5)).analyze + val left = x.where(IsNotNull('a) && IsNotNull('b)) + val right = y.where(IsNotNull('a) && IsNotNull('c)) + val correctAnswer = left.join(right, + condition = Some("x.a".attr === "y.a".attr && "x.b".attr === 1 && "y.c".attr > 5)).analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } + + test("single inner join with pre-existing filters: filter out nulls on either side") { + val x = testRelation.subquery('x) + val y = testRelation.subquery('y) + val originalQuery = x.where('b > 5).join(y.where('c === 10), + condition = Some("x.a".attr === "y.a".attr)).analyze + val left = x.where(IsNotNull('a) && IsNotNull('b) && 'b > 5) + val right = y.where(IsNotNull('a) && IsNotNull('c) && 'c === 10) + val correctAnswer = left.join(right, + condition = Some("x.a".attr === "y.a".attr)).analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } + + test("single outer join: no null filters are generated") { + val x = testRelation.subquery('x) + val y = testRelation.subquery('y) + val originalQuery = x.join(y, FullOuter, + condition = Some("x.a".attr 
=== "y.a".attr)).analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, originalQuery) + } + + test("multiple inner joins: filter out nulls on all sides on equi-join keys") { + val t1 = testRelation.subquery('t1) + val t2 = testRelation.subquery('t2) + val t3 = testRelation.subquery('t3) + val t4 = testRelation.subquery('t4) + + val originalQuery = t1 + .join(t2, condition = Some("t1.b".attr === "t2.b".attr)) + .join(t3, condition = Some("t2.b".attr === "t3.b".attr)) + .join(t4, condition = Some("t3.b".attr === "t4.b".attr)).analyze + val correctAnswer = t1.where(IsNotNull('b)) + .join(t2.where(IsNotNull('b)), condition = Some("t1.b".attr === "t2.b".attr)) + .join(t3.where(IsNotNull('b)), condition = Some("t2.b".attr === "t3.b".attr)) + .join(t4.where(IsNotNull('b)), condition = Some("t3.b".attr === "t4.b".attr)).analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index f9874088b5884..0541844e0bfcd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.util._ /** * Provides helper methods for comparing plans. */ -abstract class PlanTest extends SparkFunSuite { +abstract class PlanTest extends SparkFunSuite with PredicateHelper { /** * Since attribute references are given globally unique ids during analysis, * we must normalize them to check if two different queries are identical. @@ -39,10 +39,22 @@ abstract class PlanTest extends SparkFunSuite { } } + /** + * Normalizes the filter conditions that appear in the plan. For instance, + * ((expr 1 && expr 2) && expr 3), (expr 1 && expr 2 && expr 3), (expr 3 && (expr 1 && expr 2) + * etc., will all now be equivalent. 
+ */ + private def normalizeFilters(plan: LogicalPlan) = { + plan transform { + case filter @ Filter(condition: Expression, child: LogicalPlan) => + Filter(splitConjunctivePredicates(condition).sortBy(_.hashCode()).reduce(And), child) + } + } + /** Fails the test if the two plans do not match */ protected def comparePlans(plan1: LogicalPlan, plan2: LogicalPlan) { - val normalized1 = normalizeExprIds(plan1) - val normalized2 = normalizeExprIds(plan2) + val normalized1 = normalizeFilters(normalizeExprIds(plan1)) + val normalized2 = normalizeFilters(normalizeExprIds(plan2)) if (normalized1 != normalized2) { fail( s""" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index f66e08e6ca5c8..a733237a5e717 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -159,7 +159,7 @@ class PlannerSuite extends SharedSQLContext { withTempTable("testPushed") { val exp = sql("select * from testPushed where key = 15").queryExecution.sparkPlan - assert(exp.toString.contains("PushedFilters: [EqualTo(key,15)]")) + assert(exp.toString.contains("PushedFilters: [IsNotNull(key), EqualTo(key,15)]")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index bd51154c58aa6..d2947676a0e58 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -74,10 +74,8 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex selectedFilters.foreach { pred => val maybeFilter = ParquetFilters.createFilter(df.schema, pred) assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pred") - maybeFilter.foreach { f => - // Doesn't bother checking type parameters here (e.g. `Eq[Integer]`) - assert(f.getClass === filterClass) - } + // Doesn't bother checking type parameters here (e.g. 
`Eq[Integer]`) + maybeFilter.exists(_.getClass === filterClass) } checker(stripSparkFilter(query), expected) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala index c94e73c4aa300..6ca334dc6d5fe 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala @@ -61,8 +61,8 @@ class OrcFilterSuite extends QueryTest with OrcTest { (predicate: Predicate, filterOperator: PredicateLeaf.Operator) (implicit df: DataFrame): Unit = { def checkComparisonOperator(filter: SearchArgument) = { - val operator = filter.getLeaves.asScala.head.getOperator - assert(operator === filterOperator) + val operator = filter.getLeaves.asScala + assert(operator.map(_.getOperator).contains(filterOperator)) } checkFilterPredicate(df, predicate, checkComparisonOperator) } @@ -216,8 +216,9 @@ class OrcFilterSuite extends QueryTest with OrcTest { ) checkFilterPredicate( !('_1 < 4), - """leaf-0 = (LESS_THAN _1 4) - |expr = (not leaf-0)""".stripMargin.trim + """leaf-0 = (IS_NULL _1) + |leaf-1 = (LESS_THAN _1 4) + |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim ) checkFilterPredicate( '_1 < 2 || '_1 > 3, @@ -227,9 +228,10 @@ class OrcFilterSuite extends QueryTest with OrcTest { ) checkFilterPredicate( '_1 < 2 && '_1 > 3, - """leaf-0 = (LESS_THAN _1 2) - |leaf-1 = (LESS_THAN_EQUALS _1 3) - |expr = (and leaf-0 (not leaf-1))""".stripMargin.trim + """leaf-0 = (IS_NULL _1) + |leaf-1 = (LESS_THAN _1 2) + |leaf-2 = (LESS_THAN_EQUALS _1 3) + |expr = (and (not leaf-0) leaf-1 (not leaf-2))""".stripMargin.trim ) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala index 9ab3e11609cec..e64bb77a03a58 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala @@ -192,14 +192,14 @@ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with Predicat } markup("Checking pushed filters") - assert(SimpleTextRelation.pushedFilters === pushedFilters.toSet) + assert(pushedFilters.toSet.subsetOf(SimpleTextRelation.pushedFilters)) val expectedInconvertibleFilters = inconvertibleFilters.map(_.expr).toSet val expectedUnhandledFilters = unhandledFilters.map(_.expr).toSet val expectedPartitioningFilters = partitioningFilters.map(_.expr).toSet markup("Checking unhandled and inconvertible filters") - assert(expectedInconvertibleFilters ++ expectedUnhandledFilters === nonPushedFilters) + assert((expectedInconvertibleFilters ++ expectedUnhandledFilters).subsetOf(nonPushedFilters)) markup("Checking partitioning filters") val actualPartitioningFilters = splitConjunctivePredicates(filter.expr).filter { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index 9cdf1fc585866..bb552d6aa3e3f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -141,12 +141,17 @@ class SimpleTextRelation( // Constructs a filter predicate to simulate filter push-down val predicate = { val filterCondition: Expression = 
filters.collect { - // According to `unhandledFilters`, `SimpleTextRelation` only handles `GreaterThan` filter + // According to `unhandledFilters`, `SimpleTextRelation` only handles `GreaterThan` and + // `isNotNull` filters case sources.GreaterThan(column, value) => val dataType = dataSchema(column).dataType val literal = Literal.create(value, dataType) val attribute = inputAttributes.find(_.name == column).get expressions.GreaterThan(attribute, literal) + case sources.IsNotNull(column) => + val dataType = dataSchema(column).dataType + val attribute = inputAttributes.find(_.name == column).get + expressions.IsNotNull(attribute) }.reduceOption(expressions.And).getOrElse(Literal(true)) InterpretedPredicate.create(filterCondition, inputAttributes) } @@ -184,11 +189,12 @@ class SimpleTextRelation( } } - // `SimpleTextRelation` only handles `GreaterThan` filter. This is used to test filter push-down - // and `BaseRelation.unhandledFilters()`. + // `SimpleTextRelation` only handles `GreaterThan` and `IsNotNull` filters. This is used to test + // filter push-down and `BaseRelation.unhandledFilters()`. override def unhandledFilters(filters: Array[Filter]): Array[Filter] = { filters.filter { case _: GreaterThan => false + case _: IsNotNull => false case _ => true } } From e72914f37de85519fc2aa131bac69d7582de98c8 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 7 Mar 2016 12:06:46 -0800 Subject: [PATCH 22/29] [SPARK-12243][BUILD][PYTHON] PySpark tests are slow in Jenkins. ## What changes were proposed in this pull request? In the Jenkins pull request builder, PySpark tests take around [962 seconds ](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/52530/console) of end-to-end time to run, despite the fact that we run four Python test suites in parallel. According to the log, the basic reason is that the long running test starts at the end due to FIFO queue. We first try to reduce the test time by just starting some long running tests first with simple priority queue. ``` ======================================================================== Running PySpark tests ======================================================================== ... Finished test(python3.4): pyspark.streaming.tests (213s) Finished test(pypy): pyspark.sql.tests (92s) Finished test(pypy): pyspark.streaming.tests (280s) Tests passed in 962 seconds ``` ## How was this patch tested? Manual check. Check 'Running PySpark tests' part of the Jenkins log. Author: Dongjoon Hyun Closes #11551 from dongjoon-hyun/SPARK-12243. 
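The scheduling idea, sketched in Scala for illustration only (the patch itself changes Python's `Queue.PriorityQueue` usage in `run-tests.py`): long-running suites get a smaller priority number so they are dequeued and started first and overlap with the short suites. The long-running set below mirrors the names hard-coded in the patch; the other goal names are hypothetical.

```scala
import scala.collection.mutable

object PrioritySchedulingSketch {
  // The long-running suites the patch starts first; the other goal names are made up.
  private val longRunning = Set(
    "pyspark.streaming.tests", "pyspark.mllib.tests", "pyspark.tests", "pyspark.sql.tests")

  def main(args: Array[String]): Unit = {
    val goals = Seq(
      "pyspark.accumulators", "pyspark.streaming.tests", "pyspark.sql.tests", "pyspark.profiler")

    // Lower number = higher priority. Scala's PriorityQueue dequeues the largest
    // element first, so the ordering on the priority field is reversed.
    val queue = mutable.PriorityQueue.empty[(Int, String)](
      Ordering.by[(Int, String), Int](_._1).reverse)

    goals.foreach { goal =>
      val priority = if (longRunning.contains(goal)) 0 else 100
      queue.enqueue((priority, goal))
    }

    // Long-running suites come out first, e.g. (0, pyspark.streaming.tests), (0, pyspark.sql.tests), ...
    while (queue.nonEmpty) println(queue.dequeue())
  }
}
```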
--- python/run-tests.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/run-tests.py b/python/run-tests.py index ee73eb1506ca4..a9f8854e6f66a 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -157,7 +157,7 @@ def main(): LOGGER.info("Will test against the following Python executables: %s", python_execs) LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test]) - task_queue = Queue.Queue() + task_queue = Queue.PriorityQueue() for python_exec in python_execs: python_implementation = subprocess_check_output( [python_exec, "-c", "import platform; print(platform.python_implementation())"], @@ -168,12 +168,17 @@ def main(): for module in modules_to_test: if python_implementation not in module.blacklisted_python_implementations: for test_goal in module.python_test_goals: - task_queue.put((python_exec, test_goal)) + if test_goal in ('pyspark.streaming.tests', 'pyspark.mllib.tests', + 'pyspark.tests', 'pyspark.sql.tests'): + priority = 0 + else: + priority = 100 + task_queue.put((priority, (python_exec, test_goal))) def process_queue(task_queue): while True: try: - (python_exec, test_goal) = task_queue.get_nowait() + (priority, (python_exec, test_goal)) = task_queue.get_nowait() except Queue.Empty: break try: From a3ec50a4bc867aec7c0796457c4442c14d1bcc2c Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 7 Mar 2016 12:07:50 -0800 Subject: [PATCH 23/29] [MINOR][DOC] improve the doc for "spark.memory.offHeap.size" The description of "spark.memory.offHeap.size" in the current document does not clearly state that memory is counted with bytes.... This PR contains a small fix for this tiny issue document fix Author: CodingCat Closes #11561 from CodingCat/master. --- docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index e9b66238bd189..937852ffdecda 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -769,7 +769,7 @@ Apart from these, the following properties are also available, and may be useful spark.memory.offHeap.size 0 - The absolute amount of memory which can be used for off-heap allocation. + The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors' total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true. From b6071a7001aff7a8319e13b31c59e3cc86aad523 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 7 Mar 2016 12:09:27 -0800 Subject: [PATCH 24/29] [SPARK-13722][SQL] No Push Down for Non-deterministics Predicates through Generate #### What changes were proposed in this pull request? Non-deterministic predicates should not be pushed through Generate. #### How was this patch tested? Added a test case in `FilterPushdownSuite.scala` Author: gatorsmile Closes #11562 from gatorsmile/pushPredicateDownWindow. 
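The rule change in `Optimizer.scala` below boils down to one extra conjunct check: a predicate may only be pushed below `Generate` if it references nothing produced by the generator and is deterministic. A self-contained sketch over simplified stand-in types (not the actual Catalyst classes):

```scala
// Minimal stand-in for a Catalyst predicate, for illustration only.
case class Pred(references: Set[String], deterministic: Boolean)

object PushThroughGenerateSketch {
  /** Split conjuncts into those safe to push below Generate and those that must stay above it. */
  def split(conjuncts: Seq[Pred], childOutput: Set[String]): (Seq[Pred], Seq[Pred]) =
    conjuncts.partition { cond =>
      // Pushing a non-deterministic conjunct (e.g. one containing rand()) would change
      // how often and on which rows it is evaluated, so it must stay above Generate.
      cond.references.subsetOf(childOutput) && cond.deterministic
    }

  def main(args: Array[String]): Unit = {
    val childOutput = Set("a", "b", "c_arr")
    val conjuncts = Seq(
      Pred(Set("b"), deterministic = true),    // pushed: deterministic, child columns only
      Pred(Set("a"), deterministic = false),   // stays: non-deterministic
      Pred(Set("arr"), deterministic = true))  // stays: references a generated column
    val (pushDown, stayUp) = split(conjuncts, childOutput)
    println(s"pushDown = $pushDown")
    println(s"stayUp   = $stayUp")
  }
}
```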
--- .../sql/catalyst/optimizer/Optimizer.scala | 2 +- .../optimizer/FilterPushdownSuite.scala | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 69ceea632977d..deea7238f564c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -901,7 +901,7 @@ object PushPredicateThroughGenerate extends Rule[LogicalPlan] with PredicateHelp // Predicates that reference attributes produced by the `Generate` operator cannot // be pushed below the operator. val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { cond => - cond.references subsetOf g.child.outputSet + cond.references.subsetOf(g.child.outputSet) && cond.deterministic } if (pushDown.nonEmpty) { val pushDownPredicate = pushDown.reduce(And) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 1292aa0003dd7..97a0cde381233 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -496,6 +496,24 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("generate: non-deterministic predicate referenced no generated column") { + val originalQuery = { + testRelationWithArrayType + .generate(Explode('c_arr), true, false, Some("arr")) + .where(('b >= 5) && ('a + Rand(10).as("rnd") > 6)) + } + val optimized = Optimize.execute(originalQuery.analyze) + val correctAnswer = { + testRelationWithArrayType + .where('b >= 5) + .generate(Explode('c_arr), true, false, Some("arr")) + .where('a + Rand(10).as("rnd") > 6) + .analyze + } + + comparePlans(optimized, correctAnswer) + } + test("generate: part of conjuncts referenced generated column") { val generator = Explode('c_arr) val originalQuery = { From e9e67b39abb23a88d8be2d0fea5b5fd93184a25b Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 7 Mar 2016 13:45:45 -0800 Subject: [PATCH 25/29] [SPARK-13655] Improve isolation between tests in KinesisBackedBlockRDDSuite This patch modifies `KinesisBackedBlockRDDTests` to increase the isolation between tests in order to fix a bug which causes the tests to hang. See #11558 for more details. /cc zsxwing srowen Author: Josh Rosen Closes #11564 from JoshRosen/SPARK-13655. 
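The isolation pattern applied below, reduced to a generic ScalaTest sketch: instead of one shared `SparkContext` for the whole suite, each test gets a fresh local context that is stopped in `afterEach`. The suite name and assertion here are placeholders; the real patch additionally reuses Spark's `LocalSparkContext` helper trait for the teardown.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterEach, FunSuite}

class PerTestContextSuite extends FunSuite with BeforeAndAfterEach {
  private var sc: SparkContext = _

  override def beforeEach(): Unit = {
    super.beforeEach()
    // A fresh context per test keeps block-manager and executor state from leaking between tests.
    sc = new SparkContext(new SparkConf().setMaster("local[4]").setAppName("per-test-context"))
  }

  override def afterEach(): Unit = {
    try {
      if (sc != null) {
        sc.stop()
        sc = null
      }
    } finally {
      super.afterEach()
    }
  }

  test("each test sees its own context") {
    assert(sc.parallelize(1 to 8).count() == 8)
  }
}
```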
--- .../kinesis/KinesisBackedBlockRDDSuite.scala | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala index e916f1ee0893b..2555332d222da 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala @@ -17,13 +17,13 @@ package org.apache.spark.streaming.kinesis -import org.scalatest.BeforeAndAfterAll +import org.scalatest.BeforeAndAfterEach -import org.apache.spark.{SparkConf, SparkContext, SparkException} +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException} import org.apache.spark.storage.{BlockId, BlockManager, StorageLevel, StreamBlockId} abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean) - extends KinesisFunSuite with BeforeAndAfterAll { + extends KinesisFunSuite with BeforeAndAfterEach with LocalSparkContext { private val testData = 1 to 8 @@ -35,10 +35,10 @@ abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean) private var shardIdToRange: Map[String, SequenceNumberRange] = null private var allRanges: Seq[SequenceNumberRange] = null - private var sc: SparkContext = null private var blockManager: BlockManager = null override def beforeAll(): Unit = { + super.beforeAll() runIfTestsEnabled("Prepare KinesisTestUtils") { testUtils = new KPLBasedKinesisTestUtils() testUtils.createStream() @@ -55,19 +55,23 @@ abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean) (shardId, seqNumRange) } allRanges = shardIdToRange.values.toSeq - - val conf = new SparkConf().setMaster("local[4]").setAppName("KinesisBackedBlockRDDSuite") - sc = new SparkContext(conf) - blockManager = sc.env.blockManager } } + override def beforeEach(): Unit = { + super.beforeEach() + val conf = new SparkConf().setMaster("local[4]").setAppName("KinesisBackedBlockRDDSuite") + sc = new SparkContext(conf) + blockManager = sc.env.blockManager + } + override def afterAll(): Unit = { - if (testUtils != null) { - testUtils.deleteStream() - } - if (sc != null) { - sc.stop() + try { + if (testUtils != null) { + testUtils.deleteStream() + } + } finally { + super.afterAll() } } From e1fb857992074164dcaa02498c5a9604fac6f57e Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 7 Mar 2016 14:13:44 -0800 Subject: [PATCH 26/29] [SPARK-529][CORE][YARN] Add type-safe config keys to SparkConf. This is, in a way, the basics to enable SPARK-529 (which was closed as won't fix but I think is still valuable). In fact, Spark SQL created something for that, and this change basically factors out that code and inserts it into SparkConf, with some extra bells and whistles. To showcase the usage of this pattern, I modified the YARN backend to use the new config keys (defined in the new `config` package object under `o.a.s.deploy.yarn`). Most of the changes are mechanic, although logic had to be slightly modified in a handful of places. Author: Marcelo Vanzin Closes #10205 from vanzin/conf-opts. 
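To make the new pattern concrete, a small usage sketch built on the API introduced below (`ConfigBuilder`, `withDefault`/`withDefaultString`, `optional`, and the typed `SparkConf.set`/`get` overloads). The entry names are invented for illustration, and the object is placed inside the `org.apache.spark` package tree because the new overloads are `private[spark]`.

```scala
package org.apache.spark.internal.config

import java.util.concurrent.TimeUnit

import org.apache.spark.SparkConf

object TypedConfigSketch {
  // Hypothetical keys, declared once instead of scattering raw strings through the code base.
  val MAX_RETRIES = ConfigBuilder("spark.example.maxRetries").intConf.withDefault(3)
  val CHECKPOINT_DIR = ConfigBuilder("spark.example.checkpointDir").stringConf.optional
  val HEARTBEAT = ConfigBuilder("spark.example.heartbeat")
    .timeConf(TimeUnit.MILLISECONDS).withDefaultString("3s")

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(false).set(MAX_RETRIES, 5)
    assert(conf.get(MAX_RETRIES) == 5)        // typed read, no parsing at the call site
    assert(conf.get(CHECKPOINT_DIR).isEmpty)  // optional entries come back as Option[T]
    assert(conf.get(HEARTBEAT) == 3000L)      // "3s" is parsed into the declared unit
  }
}
```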
--- .../apache/spark/network/util/JavaUtils.java | 25 +- .../scala/org/apache/spark/SparkConf.scala | 39 ++- .../spark/internal/config/ConfigBuilder.scala | 184 +++++++++++++ .../spark/internal/config/ConfigEntry.scala | 111 ++++++++ .../spark/internal/config/package.scala | 76 ++++++ .../internal/config/ConfigEntrySuite.scala | 155 +++++++++++ .../yarn/AMDelegationTokenRenewer.scala | 14 +- .../spark/deploy/yarn/ApplicationMaster.scala | 28 +- .../org/apache/spark/deploy/yarn/Client.scala | 230 ++++++++--------- .../spark/deploy/yarn/ClientArguments.scala | 53 ++-- .../yarn/ExecutorDelegationTokenUpdater.scala | 3 +- .../spark/deploy/yarn/ExecutorRunnable.scala | 14 +- ...yPreferredContainerPlacementStrategy.scala | 6 +- .../spark/deploy/yarn/YarnAllocator.scala | 10 +- .../spark/deploy/yarn/YarnRMClient.scala | 3 +- .../deploy/yarn/YarnSparkHadoopUtil.scala | 18 +- .../org/apache/spark/deploy/yarn/config.scala | 243 ++++++++++++++++++ .../cluster/SchedulerExtensionService.scala | 32 +-- .../spark/deploy/yarn/ClientSuite.scala | 26 +- .../ExtensionServiceIntegrationSuite.scala | 4 +- 20 files changed, 1019 insertions(+), 255 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala create mode 100644 core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala create mode 100644 core/src/main/scala/org/apache/spark/internal/config/package.scala create mode 100644 core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala create mode 100644 yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/JavaUtils.java b/common/network-common/src/main/java/org/apache/spark/network/util/JavaUtils.java index b3d8e0cd7cdcd..ccc527306d920 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/JavaUtils.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/JavaUtils.java @@ -159,10 +159,10 @@ private static boolean isSymlink(File file) throws IOException { .build(); /** - * Convert a passed time string (e.g. 50s, 100ms, or 250us) to a time count for - * internal use. If no suffix is provided a direct conversion is attempted. + * Convert a passed time string (e.g. 50s, 100ms, or 250us) to a time count in the given unit. + * The unit is also considered the default if the given string does not specify a unit. */ - private static long parseTimeString(String str, TimeUnit unit) { + public static long timeStringAs(String str, TimeUnit unit) { String lower = str.toLowerCase().trim(); try { @@ -195,7 +195,7 @@ private static long parseTimeString(String str, TimeUnit unit) { * no suffix is provided, the passed number is assumed to be in ms. */ public static long timeStringAsMs(String str) { - return parseTimeString(str, TimeUnit.MILLISECONDS); + return timeStringAs(str, TimeUnit.MILLISECONDS); } /** @@ -203,15 +203,14 @@ public static long timeStringAsMs(String str) { * no suffix is provided, the passed number is assumed to be in seconds. */ public static long timeStringAsSec(String str) { - return parseTimeString(str, TimeUnit.SECONDS); + return timeStringAs(str, TimeUnit.SECONDS); } /** - * Convert a passed byte string (e.g. 50b, 100kb, or 250mb) to a ByteUnit for - * internal use. If no suffix is provided a direct conversion of the provided default is - * attempted. + * Convert a passed byte string (e.g. 50b, 100kb, or 250mb) to the given. 
If no suffix is + * provided, a direct conversion to the provided unit is attempted. */ - private static long parseByteString(String str, ByteUnit unit) { + public static long byteStringAs(String str, ByteUnit unit) { String lower = str.toLowerCase().trim(); try { @@ -252,7 +251,7 @@ private static long parseByteString(String str, ByteUnit unit) { * If no suffix is provided, the passed number is assumed to be in bytes. */ public static long byteStringAsBytes(String str) { - return parseByteString(str, ByteUnit.BYTE); + return byteStringAs(str, ByteUnit.BYTE); } /** @@ -262,7 +261,7 @@ public static long byteStringAsBytes(String str) { * If no suffix is provided, the passed number is assumed to be in kibibytes. */ public static long byteStringAsKb(String str) { - return parseByteString(str, ByteUnit.KiB); + return byteStringAs(str, ByteUnit.KiB); } /** @@ -272,7 +271,7 @@ public static long byteStringAsKb(String str) { * If no suffix is provided, the passed number is assumed to be in mebibytes. */ public static long byteStringAsMb(String str) { - return parseByteString(str, ByteUnit.MiB); + return byteStringAs(str, ByteUnit.MiB); } /** @@ -282,7 +281,7 @@ public static long byteStringAsMb(String str) { * If no suffix is provided, the passed number is assumed to be in gibibytes. */ public static long byteStringAsGb(String str) { - return parseByteString(str, ByteUnit.GiB); + return byteStringAs(str, ByteUnit.GiB); } /** diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index b81bfb3182212..16423e771a3de 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -17,13 +17,15 @@ package org.apache.spark -import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import scala.collection.JavaConverters._ import scala.collection.mutable.LinkedHashSet import org.apache.avro.{Schema, SchemaNormalization} +import org.apache.spark.internal.config.{ConfigEntry, OptionalConfigEntry} +import org.apache.spark.network.util.JavaUtils import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.Utils @@ -74,6 +76,16 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { this } + private[spark] def set[T](entry: ConfigEntry[T], value: T): SparkConf = { + set(entry.key, entry.stringConverter(value)) + this + } + + private[spark] def set[T](entry: OptionalConfigEntry[T], value: T): SparkConf = { + set(entry.key, entry.rawStringConverter(value)) + this + } + /** * The master URL to connect to, such as "local" to run locally with one thread, "local[4]" to * run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster. @@ -148,6 +160,20 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { this } + private[spark] def setIfMissing[T](entry: ConfigEntry[T], value: T): SparkConf = { + if (settings.putIfAbsent(entry.key, entry.stringConverter(value)) == null) { + logDeprecationWarning(entry.key) + } + this + } + + private[spark] def setIfMissing[T](entry: OptionalConfigEntry[T], value: T): SparkConf = { + if (settings.putIfAbsent(entry.key, entry.rawStringConverter(value)) == null) { + logDeprecationWarning(entry.key) + } + this + } + /** * Use Kryo serialization and register the given set of classes with Kryo. * If called multiple times, this will append the classes from all calls together. 
@@ -198,6 +224,17 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { getOption(key).getOrElse(defaultValue) } + /** + * Retrieves the value of a pre-defined configuration entry. + * + * - This is an internal Spark API. + * - The return type if defined by the configuration entry. + * - This will throw an exception is the config is not optional and the value is not set. + */ + private[spark] def get[T](entry: ConfigEntry[T]): T = { + entry.readFrom(this) + } + /** * Get a time parameter as seconds; throws a NoSuchElementException if it's not set. If no * suffix is provided then seconds are assumed. diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala new file mode 100644 index 0000000000000..770b43697a176 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.config + +import java.util.concurrent.TimeUnit + +import org.apache.spark.network.util.{ByteUnit, JavaUtils} + +private object ConfigHelpers { + + def toNumber[T](s: String, converter: String => T, key: String, configType: String): T = { + try { + converter(s) + } catch { + case _: NumberFormatException => + throw new IllegalArgumentException(s"$key should be $configType, but was $s") + } + } + + def toBoolean(s: String, key: String): Boolean = { + try { + s.toBoolean + } catch { + case _: IllegalArgumentException => + throw new IllegalArgumentException(s"$key should be boolean, but was $s") + } + } + + def stringToSeq[T](str: String, converter: String => T): Seq[T] = { + str.split(",").map(_.trim()).filter(_.nonEmpty).map(converter) + } + + def seqToString[T](v: Seq[T], stringConverter: T => String): String = { + v.map(stringConverter).mkString(",") + } + + def timeFromString(str: String, unit: TimeUnit): Long = JavaUtils.timeStringAs(str, unit) + + def timeToString(v: Long, unit: TimeUnit): String = TimeUnit.MILLISECONDS.convert(v, unit) + "ms" + + def byteFromString(str: String, unit: ByteUnit): Long = { + val (input, multiplier) = + if (str.length() > 0 && str.charAt(0) == '-') { + (str.substring(1), -1) + } else { + (str, 1) + } + multiplier * JavaUtils.byteStringAs(input, unit) + } + + def byteToString(v: Long, unit: ByteUnit): String = unit.convertTo(v, ByteUnit.BYTE) + "b" + +} + +/** + * A type-safe config builder. Provides methods for transforming the input data (which can be + * used, e.g., for validation) and creating the final config entry. + * + * One of the methods that return a [[ConfigEntry]] must be called to create a config entry that + * can be used with [[SparkConf]]. 
+ */ +private[spark] class TypedConfigBuilder[T]( + val parent: ConfigBuilder, + val converter: String => T, + val stringConverter: T => String) { + + import ConfigHelpers._ + + def this(parent: ConfigBuilder, converter: String => T) = { + this(parent, converter, Option(_).map(_.toString).orNull) + } + + def transform(fn: T => T): TypedConfigBuilder[T] = { + new TypedConfigBuilder(parent, s => fn(converter(s)), stringConverter) + } + + def checkValues(validValues: Set[T]): TypedConfigBuilder[T] = { + transform { v => + if (!validValues.contains(v)) { + throw new IllegalArgumentException( + s"The value of ${parent.key} should be one of ${validValues.mkString(", ")}, but was $v") + } + v + } + } + + def toSequence: TypedConfigBuilder[Seq[T]] = { + new TypedConfigBuilder(parent, stringToSeq(_, converter), seqToString(_, stringConverter)) + } + + /** Creates a [[ConfigEntry]] that does not require a default value. */ + def optional: OptionalConfigEntry[T] = { + new OptionalConfigEntry[T](parent.key, converter, stringConverter, parent._doc, parent._public) + } + + /** Creates a [[ConfigEntry]] that has a default value. */ + def withDefault(default: T): ConfigEntry[T] = { + val transformedDefault = converter(stringConverter(default)) + new ConfigEntryWithDefault[T](parent.key, transformedDefault, converter, stringConverter, + parent._doc, parent._public) + } + + /** + * Creates a [[ConfigEntry]] that has a default value. The default value is provided as a + * [[String]] and must be a valid value for the entry. + */ + def withDefaultString(default: String): ConfigEntry[T] = { + val typedDefault = converter(default) + new ConfigEntryWithDefault[T](parent.key, typedDefault, converter, stringConverter, parent._doc, + parent._public) + } + +} + +/** + * Basic builder for Spark configurations. Provides methods for creating type-specific builders. 
+ * + * @see TypedConfigBuilder + */ +private[spark] case class ConfigBuilder(key: String) { + + import ConfigHelpers._ + + var _public = true + var _doc = "" + + def internal: ConfigBuilder = { + _public = false + this + } + + def doc(s: String): ConfigBuilder = { + _doc = s + this + } + + def intConf: TypedConfigBuilder[Int] = { + new TypedConfigBuilder(this, toNumber(_, _.toInt, key, "int")) + } + + def longConf: TypedConfigBuilder[Long] = { + new TypedConfigBuilder(this, toNumber(_, _.toLong, key, "long")) + } + + def doubleConf: TypedConfigBuilder[Double] = { + new TypedConfigBuilder(this, toNumber(_, _.toDouble, key, "double")) + } + + def booleanConf: TypedConfigBuilder[Boolean] = { + new TypedConfigBuilder(this, toBoolean(_, key)) + } + + def stringConf: TypedConfigBuilder[String] = { + new TypedConfigBuilder(this, v => v) + } + + def timeConf(unit: TimeUnit): TypedConfigBuilder[Long] = { + new TypedConfigBuilder(this, timeFromString(_, unit), timeToString(_, unit)) + } + + def bytesConf(unit: ByteUnit): TypedConfigBuilder[Long] = { + new TypedConfigBuilder(this, byteFromString(_, unit), byteToString(_, unit)) + } + + def fallbackConf[T](fallback: ConfigEntry[T]): ConfigEntry[T] = { + new FallbackConfigEntry(key, _doc, _public, fallback) + } + +} diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala new file mode 100644 index 0000000000000..f7296b487c0e9 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.config + +import org.apache.spark.SparkConf + +/** + * An entry contains all meta information for a configuration. + * + * @param key the key for the configuration + * @param defaultValue the default value for the configuration + * @param valueConverter how to convert a string to the value. It should throw an exception if the + * string does not have the required format. + * @param stringConverter how to convert a value to a string that the user can use it as a valid + * string value. It's usually `toString`. But sometimes, a custom converter + * is necessary. E.g., if T is List[String], `a, b, c` is better than + * `List(a, b, c)`. + * @param doc the documentation for the configuration + * @param isPublic if this configuration is public to the user. If it's `false`, this + * configuration is only used internally and we should not expose it to users. 
+ * @tparam T the value type + */ +private[spark] abstract class ConfigEntry[T] ( + val key: String, + val valueConverter: String => T, + val stringConverter: T => String, + val doc: String, + val isPublic: Boolean) { + + def defaultValueString: String + + def readFrom(conf: SparkConf): T + + // This is used by SQLConf, since it doesn't use SparkConf to store settings and thus cannot + // use readFrom(). + def defaultValue: Option[T] = None + + override def toString: String = { + s"ConfigEntry(key=$key, defaultValue=$defaultValueString, doc=$doc, public=$isPublic)" + } +} + +private class ConfigEntryWithDefault[T] ( + key: String, + _defaultValue: T, + valueConverter: String => T, + stringConverter: T => String, + doc: String, + isPublic: Boolean) + extends ConfigEntry(key, valueConverter, stringConverter, doc, isPublic) { + + override def defaultValue: Option[T] = Some(_defaultValue) + + override def defaultValueString: String = stringConverter(_defaultValue) + + override def readFrom(conf: SparkConf): T = { + conf.getOption(key).map(valueConverter).getOrElse(_defaultValue) + } + +} + +/** + * A config entry that does not have a default value. + */ +private[spark] class OptionalConfigEntry[T]( + key: String, + val rawValueConverter: String => T, + val rawStringConverter: T => String, + doc: String, + isPublic: Boolean) + extends ConfigEntry[Option[T]](key, s => Some(rawValueConverter(s)), + v => v.map(rawStringConverter).orNull, doc, isPublic) { + + override def defaultValueString: String = "" + + override def readFrom(conf: SparkConf): Option[T] = conf.getOption(key).map(rawValueConverter) + +} + +/** + * A config entry whose default value is defined by another config entry. + */ +private class FallbackConfigEntry[T] ( + key: String, + doc: String, + isPublic: Boolean, + private val fallback: ConfigEntry[T]) + extends ConfigEntry[T](key, fallback.valueConverter, fallback.stringConverter, doc, isPublic) { + + override def defaultValueString: String = s"" + + override def readFrom(conf: SparkConf): T = { + conf.getOption(key).map(valueConverter).getOrElse(fallback.readFrom(conf)) + } + +} diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala new file mode 100644 index 0000000000000..f2f20b3207577 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.internal + +import org.apache.spark.launcher.SparkLauncher + +package object config { + + private[spark] val DRIVER_CLASS_PATH = + ConfigBuilder(SparkLauncher.DRIVER_EXTRA_CLASSPATH).stringConf.optional + + private[spark] val DRIVER_JAVA_OPTIONS = + ConfigBuilder(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS).stringConf.optional + + private[spark] val DRIVER_LIBRARY_PATH = + ConfigBuilder(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH).stringConf.optional + + private[spark] val DRIVER_USER_CLASS_PATH_FIRST = + ConfigBuilder("spark.driver.userClassPathFirst").booleanConf.withDefault(false) + + private[spark] val EXECUTOR_CLASS_PATH = + ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_CLASSPATH).stringConf.optional + + private[spark] val EXECUTOR_JAVA_OPTIONS = + ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS).stringConf.optional + + private[spark] val EXECUTOR_LIBRARY_PATH = + ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_LIBRARY_PATH).stringConf.optional + + private[spark] val EXECUTOR_USER_CLASS_PATH_FIRST = + ConfigBuilder("spark.executor.userClassPathFirst").booleanConf.withDefault(false) + + private[spark] val IS_PYTHON_APP = ConfigBuilder("spark.yarn.isPython").internal + .booleanConf.withDefault(false) + + private[spark] val CPUS_PER_TASK = ConfigBuilder("spark.task.cpus").intConf.withDefault(1) + + private[spark] val DYN_ALLOCATION_MIN_EXECUTORS = + ConfigBuilder("spark.dynamicAllocation.minExecutors").intConf.withDefault(0) + + private[spark] val DYN_ALLOCATION_INITIAL_EXECUTORS = + ConfigBuilder("spark.dynamicAllocation.initialExecutors") + .fallbackConf(DYN_ALLOCATION_MIN_EXECUTORS) + + private[spark] val DYN_ALLOCATION_MAX_EXECUTORS = + ConfigBuilder("spark.dynamicAllocation.maxExecutors").intConf.withDefault(Int.MaxValue) + + private[spark] val SHUFFLE_SERVICE_ENABLED = + ConfigBuilder("spark.shuffle.service.enabled").booleanConf.withDefault(false) + + private[spark] val KEYTAB = ConfigBuilder("spark.yarn.keytab") + .doc("Location of user's keytab.") + .stringConf.optional + + private[spark] val PRINCIPAL = ConfigBuilder("spark.yarn.principal") + .doc("Name of the Kerberos principal.") + .stringConf.optional + + private[spark] val EXECUTOR_INSTANCES = ConfigBuilder("spark.executor.instances").intConf.optional + +} diff --git a/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala new file mode 100644 index 0000000000000..0644148eaea56 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.internal.config + +import java.util.concurrent.TimeUnit + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.network.util.ByteUnit + +class ConfigEntrySuite extends SparkFunSuite { + + test("conf entry: int") { + val conf = new SparkConf() + val iConf = ConfigBuilder("spark.int").intConf.withDefault(1) + assert(conf.get(iConf) === 1) + conf.set(iConf, 2) + assert(conf.get(iConf) === 2) + } + + test("conf entry: long") { + val conf = new SparkConf() + val lConf = ConfigBuilder("spark.long").longConf.withDefault(0L) + conf.set(lConf, 1234L) + assert(conf.get(lConf) === 1234L) + } + + test("conf entry: double") { + val conf = new SparkConf() + val dConf = ConfigBuilder("spark.double").doubleConf.withDefault(0.0) + conf.set(dConf, 20.0) + assert(conf.get(dConf) === 20.0) + } + + test("conf entry: boolean") { + val conf = new SparkConf() + val bConf = ConfigBuilder("spark.boolean").booleanConf.withDefault(false) + assert(!conf.get(bConf)) + conf.set(bConf, true) + assert(conf.get(bConf)) + } + + test("conf entry: optional") { + val conf = new SparkConf() + val optionalConf = ConfigBuilder("spark.optional").intConf.optional + assert(conf.get(optionalConf) === None) + conf.set(optionalConf, 1) + assert(conf.get(optionalConf) === Some(1)) + } + + test("conf entry: fallback") { + val conf = new SparkConf() + val parentConf = ConfigBuilder("spark.int").intConf.withDefault(1) + val confWithFallback = ConfigBuilder("spark.fallback").fallbackConf(parentConf) + assert(conf.get(confWithFallback) === 1) + conf.set(confWithFallback, 2) + assert(conf.get(parentConf) === 1) + assert(conf.get(confWithFallback) === 2) + } + + test("conf entry: time") { + val conf = new SparkConf() + val time = ConfigBuilder("spark.time").timeConf(TimeUnit.SECONDS).withDefaultString("1h") + assert(conf.get(time) === 3600L) + conf.set(time.key, "1m") + assert(conf.get(time) === 60L) + } + + test("conf entry: bytes") { + val conf = new SparkConf() + val bytes = ConfigBuilder("spark.bytes").bytesConf(ByteUnit.KiB).withDefaultString("1m") + assert(conf.get(bytes) === 1024L) + conf.set(bytes.key, "1k") + assert(conf.get(bytes) === 1L) + } + + test("conf entry: string seq") { + val conf = new SparkConf() + val seq = ConfigBuilder("spark.seq").stringConf.toSequence.withDefault(Seq()) + conf.set(seq.key, "1,,2, 3 , , 4") + assert(conf.get(seq) === Seq("1", "2", "3", "4")) + conf.set(seq, Seq("1", "2")) + assert(conf.get(seq) === Seq("1", "2")) + } + + test("conf entry: int seq") { + val conf = new SparkConf() + val seq = ConfigBuilder("spark.seq").intConf.toSequence.withDefault(Seq()) + conf.set(seq.key, "1,,2, 3 , , 4") + assert(conf.get(seq) === Seq(1, 2, 3, 4)) + conf.set(seq, Seq(1, 2)) + assert(conf.get(seq) === Seq(1, 2)) + } + + test("conf entry: transformation") { + val conf = new SparkConf() + val transformationConf = ConfigBuilder("spark.transformation") + .stringConf + .transform(_.toLowerCase()) + .withDefault("FOO") + + assert(conf.get(transformationConf) === "foo") + conf.set(transformationConf, "BAR") + assert(conf.get(transformationConf) === "bar") + } + + test("conf entry: valid values check") { + val conf = new SparkConf() + val enum = ConfigBuilder("spark.enum") + .stringConf + .checkValues(Set("a", "b", "c")) + .withDefault("a") + assert(conf.get(enum) === "a") + + conf.set(enum, "b") + assert(conf.get(enum) === "b") + + conf.set(enum, "d") + val enumError = intercept[IllegalArgumentException] { + conf.get(enum) + } + assert(enumError.getMessage === s"The 
value of ${enum.key} should be one of a, b, c, but was d") + } + + test("conf entry: conversion error") { + val conf = new SparkConf() + val conversionTest = ConfigBuilder("spark.conversionTest").doubleConf.optional + conf.set(conversionTest.key, "abc") + val conversionError = intercept[IllegalArgumentException] { + conf.get(conversionTest) + } + assert(conversionError.getMessage === s"${conversionTest.key} should be double, but was abc") + } + + test("default value handling is null-safe") { + val conf = new SparkConf() + val stringConf = ConfigBuilder("spark.string").stringConf.withDefault(null) + assert(conf.get(stringConf) === null) + } + +} diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala index 70b67d21ecc78..6e95bb97105fd 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/AMDelegationTokenRenewer.scala @@ -27,6 +27,8 @@ import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.yarn.config._ +import org.apache.spark.internal.config._ import org.apache.spark.util.ThreadUtils /* @@ -60,11 +62,9 @@ private[yarn] class AMDelegationTokenRenewer( private val hadoopUtil = YarnSparkHadoopUtil.get - private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") - private val daysToKeepFiles = - sparkConf.getInt("spark.yarn.credentials.file.retention.days", 5) - private val numFilesToKeep = - sparkConf.getInt("spark.yarn.credentials.file.retention.count", 5) + private val credentialsFile = sparkConf.get(CREDENTIALS_FILE_PATH) + private val daysToKeepFiles = sparkConf.get(CREDENTIALS_FILE_MAX_RETENTION) + private val numFilesToKeep = sparkConf.get(CREDENTIAL_FILE_MAX_COUNT) private val freshHadoopConf = hadoopUtil.getConfBypassingFSCache(hadoopConf, new Path(credentialsFile).toUri.getScheme) @@ -76,8 +76,8 @@ private[yarn] class AMDelegationTokenRenewer( * */ private[spark] def scheduleLoginFromKeytab(): Unit = { - val principal = sparkConf.get("spark.yarn.principal") - val keytab = sparkConf.get("spark.yarn.keytab") + val principal = sparkConf.get(PRINCIPAL).get + val keytab = sparkConf.get(KEYTAB).get /** * Schedule re-login and creation of new tokens. If tokens have already expired, this method diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 9f586bf4c1979..7d7bf88b9eb12 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -32,6 +32,8 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.history.HistoryServer +import org.apache.spark.deploy.yarn.config._ +import org.apache.spark.internal.config._ import org.apache.spark.rpc._ import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, YarnSchedulerBackend} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ @@ -65,16 +67,15 @@ private[spark] class ApplicationMaster( // allocation is enabled), with a minimum of 3. 
private val maxNumExecutorFailures = { - val defaultKey = + val effectiveNumExecutors = if (Utils.isDynamicAllocationEnabled(sparkConf)) { - "spark.dynamicAllocation.maxExecutors" + sparkConf.get(DYN_ALLOCATION_MAX_EXECUTORS) } else { - "spark.executor.instances" + sparkConf.get(EXECUTOR_INSTANCES).getOrElse(0) } - val effectiveNumExecutors = sparkConf.getInt(defaultKey, 0) val defaultMaxNumExecutorFailures = math.max(3, 2 * effectiveNumExecutors) - sparkConf.getInt("spark.yarn.max.executor.failures", defaultMaxNumExecutorFailures) + sparkConf.get(MAX_EXECUTOR_FAILURES).getOrElse(defaultMaxNumExecutorFailures) } @volatile private var exitCode = 0 @@ -95,14 +96,13 @@ private[spark] class ApplicationMaster( private val heartbeatInterval = { // Ensure that progress is sent before YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS elapses. val expiryInterval = yarnConf.getInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 120000) - math.max(0, math.min(expiryInterval / 2, - sparkConf.getTimeAsMs("spark.yarn.scheduler.heartbeat.interval-ms", "3s"))) + math.max(0, math.min(expiryInterval / 2, sparkConf.get(RM_HEARTBEAT_INTERVAL))) } // Initial wait interval before allocator poll, to allow for quicker ramp up when executors are // being requested. private val initialAllocationInterval = math.min(heartbeatInterval, - sparkConf.getTimeAsMs("spark.yarn.scheduler.initial-allocation.interval", "200ms")) + sparkConf.get(INITIAL_HEARTBEAT_INTERVAL)) // Next wait interval before allocator poll. private var nextAllocationInterval = initialAllocationInterval @@ -178,7 +178,7 @@ private[spark] class ApplicationMaster( // If the credentials file config is present, we must periodically renew tokens. So create // a new AMDelegationTokenRenewer - if (sparkConf.contains("spark.yarn.credentials.file")) { + if (sparkConf.contains(CREDENTIALS_FILE_PATH.key)) { delegationTokenRenewerOption = Some(new AMDelegationTokenRenewer(sparkConf, yarnConf)) // If a principal and keytab have been set, use that to create new credentials for executors // periodically @@ -275,7 +275,7 @@ private[spark] class ApplicationMaster( val appId = client.getAttemptId().getApplicationId().toString() val attemptId = client.getAttemptId().getAttemptId().toString() val historyAddress = - sparkConf.getOption("spark.yarn.historyServer.address") + sparkConf.get(HISTORY_SERVER_ADDRESS) .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) } .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" } .getOrElse("") @@ -355,7 +355,7 @@ private[spark] class ApplicationMaster( private def launchReporterThread(): Thread = { // The number of failures in a row until Reporter thread give up - val reporterMaxFailures = sparkConf.getInt("spark.yarn.scheduler.reporterThread.maxFailures", 5) + val reporterMaxFailures = sparkConf.get(MAX_REPORTER_THREAD_FAILURES) val t = new Thread { override def run() { @@ -429,7 +429,7 @@ private[spark] class ApplicationMaster( private def cleanupStagingDir(fs: FileSystem) { var stagingDirPath: Path = null try { - val preserveFiles = sparkConf.getBoolean("spark.yarn.preserve.staging.files", false) + val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES) if (!preserveFiles) { stagingDirPath = new Path(System.getenv("SPARK_YARN_STAGING_DIR")) if (stagingDirPath == null) { @@ -448,7 +448,7 @@ private[spark] class ApplicationMaster( private def waitForSparkContextInitialized(): SparkContext = { logInfo("Waiting for spark context initialization") sparkContextRef.synchronized { - val 
totalWaitTime = sparkConf.getTimeAsMs("spark.yarn.am.waitTime", "100s") + val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME) val deadline = System.currentTimeMillis() + totalWaitTime while (sparkContextRef.get() == null && System.currentTimeMillis < deadline && !finished) { @@ -473,7 +473,7 @@ private[spark] class ApplicationMaster( // Spark driver should already be up since it launched us, but we don't want to // wait forever, so wait 100 seconds max to match the cluster mode setting. - val totalWaitTimeMs = sparkConf.getTimeAsMs("spark.yarn.am.waitTime", "100s") + val totalWaitTimeMs = sparkConf.get(AM_MAX_WAIT_TIME) val deadline = System.currentTimeMillis + totalWaitTimeMs while (!driverUp && !finished && System.currentTimeMillis < deadline) { diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index be45e9597f301..36073de90d6af 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -51,6 +51,8 @@ import org.apache.hadoop.yarn.util.Records import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext, SparkException} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.yarn.config._ +import org.apache.spark.internal.config._ import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle, YarnCommandBuilderUtils} import org.apache.spark.util.Utils @@ -87,8 +89,7 @@ private[spark] class Client( } } } - private val fireAndForget = isClusterMode && - !sparkConf.getBoolean("spark.yarn.submit.waitAppCompletion", true) + private val fireAndForget = isClusterMode && sparkConf.get(WAIT_FOR_APP_COMPLETION) private var appId: ApplicationId = null @@ -156,7 +157,7 @@ private[spark] class Client( private def cleanupStagingDir(appId: ApplicationId): Unit = { val appStagingDir = getAppStagingDir(appId) try { - val preserveFiles = sparkConf.getBoolean("spark.yarn.preserve.staging.files", false) + val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES) val stagingDirPath = new Path(appStagingDir) val fs = FileSystem.get(hadoopConf) if (!preserveFiles && fs.exists(stagingDirPath)) { @@ -181,39 +182,36 @@ private[spark] class Client( appContext.setQueue(args.amQueue) appContext.setAMContainerSpec(containerContext) appContext.setApplicationType("SPARK") - sparkConf.getOption(CONF_SPARK_YARN_APPLICATION_TAGS) - .map(StringUtils.getTrimmedStringCollection(_)) - .filter(!_.isEmpty()) - .foreach { tagCollection => - try { - // The setApplicationTags method was only introduced in Hadoop 2.4+, so we need to use - // reflection to set it, printing a warning if a tag was specified but the YARN version - // doesn't support it. - val method = appContext.getClass().getMethod( - "setApplicationTags", classOf[java.util.Set[String]]) - method.invoke(appContext, new java.util.HashSet[String](tagCollection)) - } catch { - case e: NoSuchMethodException => - logWarning(s"Ignoring $CONF_SPARK_YARN_APPLICATION_TAGS because this version of " + - "YARN does not support it") - } + + sparkConf.get(APPLICATION_TAGS).foreach { tags => + try { + // The setApplicationTags method was only introduced in Hadoop 2.4+, so we need to use + // reflection to set it, printing a warning if a tag was specified but the YARN version + // doesn't support it. 
+ val method = appContext.getClass().getMethod( + "setApplicationTags", classOf[java.util.Set[String]]) + method.invoke(appContext, new java.util.HashSet[String](tags.asJava)) + } catch { + case e: NoSuchMethodException => + logWarning(s"Ignoring ${APPLICATION_TAGS.key} because this version of " + + "YARN does not support it") } - sparkConf.getOption("spark.yarn.maxAppAttempts").map(_.toInt) match { + } + sparkConf.get(MAX_APP_ATTEMPTS) match { case Some(v) => appContext.setMaxAppAttempts(v) - case None => logDebug("spark.yarn.maxAppAttempts is not set. " + + case None => logDebug(s"${MAX_APP_ATTEMPTS.key} is not set. " + "Cluster's default value will be used.") } - if (sparkConf.contains("spark.yarn.am.attemptFailuresValidityInterval")) { + sparkConf.get(ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).foreach { interval => try { - val interval = sparkConf.getTimeAsMs("spark.yarn.am.attemptFailuresValidityInterval") val method = appContext.getClass().getMethod( "setAttemptFailuresValidityInterval", classOf[Long]) method.invoke(appContext, interval: java.lang.Long) } catch { case e: NoSuchMethodException => - logWarning("Ignoring spark.yarn.am.attemptFailuresValidityInterval because the version " + - "of YARN does not support it") + logWarning(s"Ignoring ${ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS.key} because " + + "the version of YARN does not support it") } } @@ -221,28 +219,28 @@ private[spark] class Client( capability.setMemory(args.amMemory + amMemoryOverhead) capability.setVirtualCores(args.amCores) - if (sparkConf.contains("spark.yarn.am.nodeLabelExpression")) { - try { - val amRequest = Records.newRecord(classOf[ResourceRequest]) - amRequest.setResourceName(ResourceRequest.ANY) - amRequest.setPriority(Priority.newInstance(0)) - amRequest.setCapability(capability) - amRequest.setNumContainers(1) - val amLabelExpression = sparkConf.get("spark.yarn.am.nodeLabelExpression") - val method = amRequest.getClass.getMethod("setNodeLabelExpression", classOf[String]) - method.invoke(amRequest, amLabelExpression) - - val setResourceRequestMethod = - appContext.getClass.getMethod("setAMContainerResourceRequest", classOf[ResourceRequest]) - setResourceRequestMethod.invoke(appContext, amRequest) - } catch { - case e: NoSuchMethodException => - logWarning("Ignoring spark.yarn.am.nodeLabelExpression because the version " + - "of YARN does not support it") - appContext.setResource(capability) - } - } else { - appContext.setResource(capability) + sparkConf.get(AM_NODE_LABEL_EXPRESSION) match { + case Some(expr) => + try { + val amRequest = Records.newRecord(classOf[ResourceRequest]) + amRequest.setResourceName(ResourceRequest.ANY) + amRequest.setPriority(Priority.newInstance(0)) + amRequest.setCapability(capability) + amRequest.setNumContainers(1) + val method = amRequest.getClass.getMethod("setNodeLabelExpression", classOf[String]) + method.invoke(amRequest, expr) + + val setResourceRequestMethod = + appContext.getClass.getMethod("setAMContainerResourceRequest", classOf[ResourceRequest]) + setResourceRequestMethod.invoke(appContext, amRequest) + } catch { + case e: NoSuchMethodException => + logWarning(s"Ignoring ${AM_NODE_LABEL_EXPRESSION.key} because the version " + + "of YARN does not support it") + appContext.setResource(capability) + } + case None => + appContext.setResource(capability) } appContext @@ -345,8 +343,8 @@ private[spark] class Client( YarnSparkHadoopUtil.get.obtainTokenForHiveMetastore(sparkConf, hadoopConf, credentials) YarnSparkHadoopUtil.get.obtainTokenForHBase(sparkConf, hadoopConf, 
credentials) - val replication = sparkConf.getInt("spark.yarn.submit.file.replication", - fs.getDefaultReplication(dst)).toShort + val replication = sparkConf.get(STAGING_FILE_REPLICATION).map(_.toShort) + .getOrElse(fs.getDefaultReplication(dst)) val localResources = HashMap[String, LocalResource]() FileSystem.mkdirs(fs, dst, new FsPermission(STAGING_DIR_PERMISSION)) @@ -419,7 +417,7 @@ private[spark] class Client( logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" + " via the YARN Secure Distributed Cache.") val (_, localizedPath) = distribute(keytab, - destName = Some(sparkConf.get("spark.yarn.keytab")), + destName = sparkConf.get(KEYTAB), appMasterOnly = true) require(localizedPath != null, "Keytab file already distributed.") } @@ -433,8 +431,8 @@ private[spark] class Client( * (3) Spark property key to set if the scheme is not local */ List( - (SPARK_JAR, sparkJar(sparkConf), CONF_SPARK_JAR), - (APP_JAR, args.userJar, CONF_SPARK_USER_JAR), + (SPARK_JAR_NAME, sparkJar(sparkConf), SPARK_JAR.key), + (APP_JAR_NAME, args.userJar, APP_JAR.key), ("log4j.properties", oldLog4jConf.orNull, null) ).foreach { case (destName, path, confKey) => if (path != null && !path.trim().isEmpty()) { @@ -472,7 +470,7 @@ private[spark] class Client( } } if (cachedSecondaryJarLinks.nonEmpty) { - sparkConf.set(CONF_SPARK_YARN_SECONDARY_JARS, cachedSecondaryJarLinks.mkString(",")) + sparkConf.set(SECONDARY_JARS, cachedSecondaryJarLinks) } if (isClusterMode && args.primaryPyFile != null) { @@ -586,7 +584,7 @@ private[spark] class Client( val creds = new Credentials() val nns = YarnSparkHadoopUtil.get.getNameNodesToAccess(sparkConf) + stagingDirPath YarnSparkHadoopUtil.get.obtainTokensForNamenodes( - nns, hadoopConf, creds, Some(sparkConf.get("spark.yarn.principal"))) + nns, hadoopConf, creds, sparkConf.get(PRINCIPAL)) val t = creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .head @@ -606,8 +604,7 @@ private[spark] class Client( pySparkArchives: Seq[String]): HashMap[String, String] = { logInfo("Setting up the launch environment for our AM container") val env = new HashMap[String, String]() - val extraCp = sparkConf.getOption("spark.driver.extraClassPath") - populateClasspath(args, yarnConf, sparkConf, env, true, extraCp) + populateClasspath(args, yarnConf, sparkConf, env, true, sparkConf.get(DRIVER_CLASS_PATH)) env("SPARK_YARN_MODE") = "true" env("SPARK_YARN_STAGING_DIR") = stagingDir env("SPARK_USER") = UserGroupInformation.getCurrentUser().getShortUserName() @@ -615,11 +612,10 @@ private[spark] class Client( val remoteFs = FileSystem.get(hadoopConf) val stagingDirPath = new Path(remoteFs.getHomeDirectory, stagingDir) val credentialsFile = "credentials-" + UUID.randomUUID().toString - sparkConf.set( - "spark.yarn.credentials.file", new Path(stagingDirPath, credentialsFile).toString) + sparkConf.set(CREDENTIALS_FILE_PATH, new Path(stagingDirPath, credentialsFile).toString) logInfo(s"Credentials file set to: $credentialsFile") val renewalInterval = getTokenRenewalInterval(stagingDirPath) - sparkConf.set("spark.yarn.token.renewal.interval", renewalInterval.toString) + sparkConf.set(TOKEN_RENEWAL_INTERVAL, renewalInterval) } // Pick up any environment variables for the AM provided through spark.yarn.appMasterEnv.* @@ -713,7 +709,7 @@ private[spark] class Client( val appId = newAppResponse.getApplicationId val appStagingDir = getAppStagingDir(appId) val pySparkArchives = - if (sparkConf.getBoolean("spark.yarn.isPython", false)) { 
+ if (sparkConf.get(IS_PYTHON_APP)) { findPySparkArchives() } else { Nil @@ -766,36 +762,33 @@ private[spark] class Client( // Include driver-specific java options if we are launching a driver if (isClusterMode) { - val driverOpts = sparkConf.getOption("spark.driver.extraJavaOptions") - .orElse(sys.env.get("SPARK_JAVA_OPTS")) + val driverOpts = sparkConf.get(DRIVER_JAVA_OPTIONS).orElse(sys.env.get("SPARK_JAVA_OPTS")) driverOpts.foreach { opts => javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) } - val libraryPaths = Seq(sys.props.get("spark.driver.extraLibraryPath"), + val libraryPaths = Seq(sparkConf.get(DRIVER_LIBRARY_PATH), sys.props.get("spark.driver.libraryPath")).flatten if (libraryPaths.nonEmpty) { prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(libraryPaths))) } - if (sparkConf.getOption("spark.yarn.am.extraJavaOptions").isDefined) { - logWarning("spark.yarn.am.extraJavaOptions will not take effect in cluster mode") + if (sparkConf.get(AM_JAVA_OPTIONS).isDefined) { + logWarning(s"${AM_JAVA_OPTIONS.key} will not take effect in cluster mode") } } else { // Validate and include yarn am specific java options in yarn-client mode. - val amOptsKey = "spark.yarn.am.extraJavaOptions" - val amOpts = sparkConf.getOption(amOptsKey) - amOpts.foreach { opts => + sparkConf.get(AM_JAVA_OPTIONS).foreach { opts => if (opts.contains("-Dspark")) { - val msg = s"$amOptsKey is not allowed to set Spark options (was '$opts'). " + val msg = s"$${amJavaOptions.key} is not allowed to set Spark options (was '$opts'). " throw new SparkException(msg) } if (opts.contains("-Xmx") || opts.contains("-Xms")) { - val msg = s"$amOptsKey is not allowed to alter memory settings (was '$opts')." + val msg = s"$${amJavaOptions.key} is not allowed to alter memory settings (was '$opts')." throw new SparkException(msg) } javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) } - sparkConf.getOption("spark.yarn.am.extraLibraryPath").foreach { paths => + sparkConf.get(AM_LIBRARY_PATH).foreach { paths => prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(paths)))) } } @@ -883,17 +876,10 @@ private[spark] class Client( } def setupCredentials(): Unit = { - loginFromKeytab = args.principal != null || sparkConf.contains("spark.yarn.principal") + loginFromKeytab = args.principal != null || sparkConf.contains(PRINCIPAL.key) if (loginFromKeytab) { - principal = - if (args.principal != null) args.principal else sparkConf.get("spark.yarn.principal") - keytab = { - if (args.keytab != null) { - args.keytab - } else { - sparkConf.getOption("spark.yarn.keytab").orNull - } - } + principal = Option(args.principal).orElse(sparkConf.get(PRINCIPAL)).get + keytab = Option(args.keytab).orElse(sparkConf.get(KEYTAB)).orNull require(keytab != null, "Keytab must be specified when principal is specified.") logInfo("Attempting to login to the Kerberos" + @@ -902,8 +888,8 @@ private[spark] class Client( // Generate a file name that can be used for the keytab file, that does not conflict // with any user file. 
val keytabFileName = f.getName + "-" + UUID.randomUUID().toString - sparkConf.set("spark.yarn.keytab", keytabFileName) - sparkConf.set("spark.yarn.principal", principal) + sparkConf.set(KEYTAB.key, keytabFileName) + sparkConf.set(PRINCIPAL.key, principal) } credentials = UserGroupInformation.getCurrentUser.getCredentials } @@ -923,7 +909,7 @@ private[spark] class Client( appId: ApplicationId, returnOnRunning: Boolean = false, logApplicationReport: Boolean = true): (YarnApplicationState, FinalApplicationStatus) = { - val interval = sparkConf.getLong("spark.yarn.report.interval", 1000) + val interval = sparkConf.get(REPORT_INTERVAL) var lastState: YarnApplicationState = null while (true) { Thread.sleep(interval) @@ -1071,14 +1057,14 @@ object Client extends Logging { val args = new ClientArguments(argStrings, sparkConf) // to maintain backwards-compatibility if (!Utils.isDynamicAllocationEnabled(sparkConf)) { - sparkConf.setIfMissing("spark.executor.instances", args.numExecutors.toString) + sparkConf.setIfMissing(EXECUTOR_INSTANCES, args.numExecutors) } new Client(args, sparkConf).run() } // Alias for the Spark assembly jar and the user jar - val SPARK_JAR: String = "__spark__.jar" - val APP_JAR: String = "__app__.jar" + val SPARK_JAR_NAME: String = "__spark__.jar" + val APP_JAR_NAME: String = "__app__.jar" // URI scheme that identifies local resources val LOCAL_SCHEME = "local" @@ -1087,20 +1073,8 @@ object Client extends Logging { val SPARK_STAGING: String = ".sparkStaging" // Location of any user-defined Spark jars - val CONF_SPARK_JAR = "spark.yarn.jar" val ENV_SPARK_JAR = "SPARK_JAR" - // Internal config to propagate the location of the user's jar to the driver/executors - val CONF_SPARK_USER_JAR = "spark.yarn.user.jar" - - // Internal config to propagate the locations of any extra jars to add to the classpath - // of the executors - val CONF_SPARK_YARN_SECONDARY_JARS = "spark.yarn.secondary.jars" - - // Comma-separated list of strings to pass through as YARN application tags appearing - // in YARN ApplicationReports, which can be used for filtering when querying YARN. - val CONF_SPARK_YARN_APPLICATION_TAGS = "spark.yarn.tags" - // Staging directory is private! -> rwx-------- val STAGING_DIR_PERMISSION: FsPermission = FsPermission.createImmutable(Integer.parseInt("700", 8).toShort) @@ -1125,23 +1099,23 @@ object Client extends Logging { * Find the user-defined Spark jar if configured, or return the jar containing this * class if not. * - * This method first looks in the SparkConf object for the CONF_SPARK_JAR key, and in the + * This method first looks in the SparkConf object for the spark.yarn.jar key, and in the * user environment if that is not found (for backwards compatibility). */ private def sparkJar(conf: SparkConf): String = { - if (conf.contains(CONF_SPARK_JAR)) { - conf.get(CONF_SPARK_JAR) - } else if (System.getenv(ENV_SPARK_JAR) != null) { - logWarning( - s"$ENV_SPARK_JAR detected in the system environment. This variable has been deprecated " + - s"in favor of the $CONF_SPARK_JAR configuration variable.") - System.getenv(ENV_SPARK_JAR) - } else { - SparkContext.jarOfClass(this.getClass).getOrElse(throw new SparkException("Could not " - + "find jar containing Spark classes. The jar can be defined using the " - + "spark.yarn.jar configuration option. 
If testing Spark, either set that option or " - + "make sure SPARK_PREPEND_CLASSES is not set.")) - } + conf.get(SPARK_JAR).getOrElse( + if (System.getenv(ENV_SPARK_JAR) != null) { + logWarning( + s"$ENV_SPARK_JAR detected in the system environment. This variable has been deprecated " + + s"in favor of the ${SPARK_JAR.key} configuration variable.") + System.getenv(ENV_SPARK_JAR) + } else { + SparkContext.jarOfClass(this.getClass).getOrElse(throw new SparkException("Could not " + + "find jar containing Spark classes. The jar can be defined using the " + + s"${SPARK_JAR.key} configuration option. If testing Spark, either set that option " + + "or make sure SPARK_PREPEND_CLASSES is not set.")) + } + ) } /** @@ -1240,7 +1214,7 @@ object Client extends Logging { LOCALIZED_CONF_DIR, env) } - if (sparkConf.getBoolean("spark.yarn.user.classpath.first", false)) { + if (sparkConf.get(USER_CLASS_PATH_FIRST)) { // in order to properly add the app jar when user classpath is first // we have to do the mainJar separate in order to send the right thing // into addFileToClasspath @@ -1248,21 +1222,21 @@ object Client extends Logging { if (args != null) { getMainJarUri(Option(args.userJar)) } else { - getMainJarUri(sparkConf.getOption(CONF_SPARK_USER_JAR)) + getMainJarUri(sparkConf.get(APP_JAR)) } - mainJar.foreach(addFileToClasspath(sparkConf, conf, _, APP_JAR, env)) + mainJar.foreach(addFileToClasspath(sparkConf, conf, _, APP_JAR_NAME, env)) val secondaryJars = if (args != null) { - getSecondaryJarUris(Option(args.addJars)) + getSecondaryJarUris(Option(args.addJars).map(_.split(",").toSeq)) } else { - getSecondaryJarUris(sparkConf.getOption(CONF_SPARK_YARN_SECONDARY_JARS)) + getSecondaryJarUris(sparkConf.get(SECONDARY_JARS)) } secondaryJars.foreach { x => addFileToClasspath(sparkConf, conf, x, null, env) } } - addFileToClasspath(sparkConf, conf, new URI(sparkJar(sparkConf)), SPARK_JAR, env) + addFileToClasspath(sparkConf, conf, new URI(sparkJar(sparkConf)), SPARK_JAR_NAME, env) populateHadoopClasspath(conf, env) sys.env.get(ENV_DIST_CLASSPATH).foreach { cp => addClasspathEntry(getClusterPath(sparkConf, cp), env) @@ -1275,8 +1249,8 @@ object Client extends Logging { * @param conf Spark configuration. */ def getUserClasspath(conf: SparkConf): Array[URI] = { - val mainUri = getMainJarUri(conf.getOption(CONF_SPARK_USER_JAR)) - val secondaryUris = getSecondaryJarUris(conf.getOption(CONF_SPARK_YARN_SECONDARY_JARS)) + val mainUri = getMainJarUri(conf.get(APP_JAR)) + val secondaryUris = getSecondaryJarUris(conf.get(SECONDARY_JARS)) (mainUri ++ secondaryUris).toArray } @@ -1284,11 +1258,11 @@ object Client extends Logging { mainJar.flatMap { path => val uri = Utils.resolveURI(path) if (uri.getScheme == LOCAL_SCHEME) Some(uri) else None - }.orElse(Some(new URI(APP_JAR))) + }.orElse(Some(new URI(APP_JAR_NAME))) } - private def getSecondaryJarUris(secondaryJars: Option[String]): Seq[URI] = { - secondaryJars.map(_.split(",")).toSeq.flatten.map(new URI(_)) + private def getSecondaryJarUris(secondaryJars: Option[Seq[String]]): Seq[URI] = { + secondaryJars.getOrElse(Nil).map(new URI(_)) } /** @@ -1345,8 +1319,8 @@ object Client extends Logging { * If either config is not available, the input path is returned. 
*/ def getClusterPath(conf: SparkConf, path: String): String = { - val localPath = conf.get("spark.yarn.config.gatewayPath", null) - val clusterPath = conf.get("spark.yarn.config.replacementPath", null) + val localPath = conf.get(GATEWAY_ROOT_PATH) + val clusterPath = conf.get(REPLACEMENT_ROOT_PATH) if (localPath != null && clusterPath != null) { path.replace(localPath, clusterPath) } else { @@ -1405,9 +1379,9 @@ object Client extends Logging { */ def isUserClassPathFirst(conf: SparkConf, isDriver: Boolean): Boolean = { if (isDriver) { - conf.getBoolean("spark.driver.userClassPathFirst", false) + conf.get(DRIVER_USER_CLASS_PATH_FIRST) } else { - conf.getBoolean("spark.executor.userClassPathFirst", false) + conf.get(EXECUTOR_USER_CLASS_PATH_FIRST) } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala index a9f4374357356..47b4cc300907b 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala @@ -21,10 +21,15 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ +import org.apache.spark.deploy.yarn.config._ +import org.apache.spark.internal.config._ import org.apache.spark.util.{IntParam, MemoryParam, Utils} // TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware ! -private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) { +private[spark] class ClientArguments( + args: Array[String], + sparkConf: SparkConf) { + var addJars: String = null var files: String = null var archives: String = null @@ -37,9 +42,9 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) var executorMemory = 1024 // MB var executorCores = 1 var numExecutors = DEFAULT_NUMBER_EXECUTORS - var amQueue = sparkConf.get("spark.yarn.queue", "default") - var amMemory: Int = 512 // MB - var amCores: Int = 1 + var amQueue = sparkConf.get(QUEUE_NAME) + var amMemory: Int = _ + var amCores: Int = _ var appName: String = "Spark" var priority = 0 var principal: String = null @@ -48,11 +53,6 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) private var driverMemory: Int = Utils.DEFAULT_DRIVER_MEM_MB // MB private var driverCores: Int = 1 - private val driverMemOverheadKey = "spark.yarn.driver.memoryOverhead" - private val amMemKey = "spark.yarn.am.memory" - private val amMemOverheadKey = "spark.yarn.am.memoryOverhead" - private val driverCoresKey = "spark.driver.cores" - private val amCoresKey = "spark.yarn.am.cores" private val isDynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(sparkConf) parseArgs(args.toList) @@ -60,33 +60,33 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) validateArgs() // Additional memory to allocate to containers - val amMemoryOverheadConf = if (isClusterMode) driverMemOverheadKey else amMemOverheadKey - val amMemoryOverhead = sparkConf.getInt(amMemoryOverheadConf, - math.max((MEMORY_OVERHEAD_FACTOR * amMemory).toInt, MEMORY_OVERHEAD_MIN)) + val amMemoryOverheadEntry = if (isClusterMode) DRIVER_MEMORY_OVERHEAD else AM_MEMORY_OVERHEAD + val amMemoryOverhead = sparkConf.get(amMemoryOverheadEntry).getOrElse( + math.max((MEMORY_OVERHEAD_FACTOR * amMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt - val executorMemoryOverhead = 
sparkConf.getInt("spark.yarn.executor.memoryOverhead", - math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN)) + val executorMemoryOverhead = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse( + math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt /** Load any default arguments provided through environment variables and Spark properties. */ private def loadEnvironmentArgs(): Unit = { // For backward compatibility, SPARK_YARN_DIST_{ARCHIVES/FILES} should be resolved to hdfs://, // while spark.yarn.dist.{archives/files} should be resolved to file:// (SPARK-2051). files = Option(files) - .orElse(sparkConf.getOption("spark.yarn.dist.files").map(p => Utils.resolveURIs(p))) + .orElse(sparkConf.get(FILES_TO_DISTRIBUTE).map(p => Utils.resolveURIs(p))) .orElse(sys.env.get("SPARK_YARN_DIST_FILES")) .orNull archives = Option(archives) - .orElse(sparkConf.getOption("spark.yarn.dist.archives").map(p => Utils.resolveURIs(p))) + .orElse(sparkConf.get(ARCHIVES_TO_DISTRIBUTE).map(p => Utils.resolveURIs(p))) .orElse(sys.env.get("SPARK_YARN_DIST_ARCHIVES")) .orNull // If dynamic allocation is enabled, start at the configured initial number of executors. // Default to minExecutors if no initialExecutors is set. numExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf, numExecutors) principal = Option(principal) - .orElse(sparkConf.getOption("spark.yarn.principal")) + .orElse(sparkConf.get(PRINCIPAL)) .orNull keytab = Option(keytab) - .orElse(sparkConf.getOption("spark.yarn.keytab")) + .orElse(sparkConf.get(KEYTAB)) .orNull } @@ -103,13 +103,12 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) |${getUsageMessage()} """.stripMargin) } - if (executorCores < sparkConf.getInt("spark.task.cpus", 1)) { - throw new SparkException("Executor cores must not be less than " + - "spark.task.cpus.") + if (executorCores < sparkConf.get(CPUS_PER_TASK)) { + throw new SparkException(s"Executor cores must not be less than ${CPUS_PER_TASK.key}.") } // scalastyle:off println if (isClusterMode) { - for (key <- Seq(amMemKey, amMemOverheadKey, amCoresKey)) { + for (key <- Seq(AM_MEMORY.key, AM_MEMORY_OVERHEAD.key, AM_CORES.key)) { if (sparkConf.contains(key)) { println(s"$key is set but does not apply in cluster mode.") } @@ -117,17 +116,13 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) amMemory = driverMemory amCores = driverCores } else { - for (key <- Seq(driverMemOverheadKey, driverCoresKey)) { + for (key <- Seq(DRIVER_MEMORY_OVERHEAD.key, DRIVER_CORES.key)) { if (sparkConf.contains(key)) { println(s"$key is set but does not apply in client mode.") } } - sparkConf.getOption(amMemKey) - .map(Utils.memoryStringToMb) - .foreach { mem => amMemory = mem } - sparkConf.getOption(amCoresKey) - .map(_.toInt) - .foreach { cores => amCores = cores } + amMemory = sparkConf.get(AM_MEMORY).toInt + amCores = sparkConf.get(AM_CORES) } // scalastyle:on println } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorDelegationTokenUpdater.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorDelegationTokenUpdater.scala index 6474acc3dc9d2..1ae278d76f027 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorDelegationTokenUpdater.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorDelegationTokenUpdater.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.{Logging, SparkConf} 
import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.yarn.config._ import org.apache.spark.util.{ThreadUtils, Utils} private[spark] class ExecutorDelegationTokenUpdater( @@ -34,7 +35,7 @@ private[spark] class ExecutorDelegationTokenUpdater( @volatile private var lastCredentialsFileSuffix = 0 - private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") + private val credentialsFile = sparkConf.get(CREDENTIALS_FILE_PATH) private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 21ac04dc76c32..9f91d182ebc32 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -38,11 +38,13 @@ import org.apache.hadoop.yarn.ipc.YarnRPC import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} +import org.apache.spark.deploy.yarn.config._ +import org.apache.spark.internal.config._ import org.apache.spark.launcher.YarnCommandBuilderUtils import org.apache.spark.network.util.JavaUtils import org.apache.spark.util.Utils -class ExecutorRunnable( +private[yarn] class ExecutorRunnable( container: Container, conf: Configuration, sparkConf: SparkConf, @@ -104,7 +106,7 @@ class ExecutorRunnable( // If external shuffle service is enabled, register with the Yarn shuffle service already // started on the NodeManager and, if authentication is enabled, provide it with our secret // key for fetching shuffle files later - if (sparkConf.getBoolean("spark.shuffle.service.enabled", false)) { + if (sparkConf.get(SHUFFLE_SERVICE_ENABLED)) { val secretString = securityMgr.getSecretKey() val secretBytes = if (secretString != null) { @@ -148,13 +150,13 @@ class ExecutorRunnable( javaOpts += "-Xmx" + executorMemoryString // Set extra Java options for the executor, if defined - sys.props.get("spark.executor.extraJavaOptions").foreach { opts => + sparkConf.get(EXECUTOR_JAVA_OPTIONS).foreach { opts => javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) } sys.env.get("SPARK_JAVA_OPTS").foreach { opts => javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) } - sys.props.get("spark.executor.extraLibraryPath").foreach { p => + sparkConf.get(EXECUTOR_LIBRARY_PATH).foreach { p => prefixEnv = Some(Client.getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(p)))) } @@ -286,8 +288,8 @@ class ExecutorRunnable( private def prepareEnvironment(container: Container): HashMap[String, String] = { val env = new HashMap[String, String]() - val extraCp = sparkConf.getOption("spark.executor.extraClassPath") - Client.populateClasspath(null, yarnConf, sparkConf, env, false, extraCp) + Client.populateClasspath(null, yarnConf, sparkConf, env, false, + sparkConf.get(EXECUTOR_CLASS_PATH)) sparkConf.getExecutorEnv.foreach { case (key, value) => // This assumes each executor environment variable set here is a path diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala index 2ec189de7c914..8772e26f4314d 100644 --- 
a/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest import org.apache.hadoop.yarn.util.RackResolver import org.apache.spark.SparkConf +import org.apache.spark.internal.config._ private[yarn] case class ContainerLocalityPreferences(nodes: Array[String], racks: Array[String]) @@ -84,9 +85,6 @@ private[yarn] class LocalityPreferredContainerPlacementStrategy( val yarnConf: Configuration, val resource: Resource) { - // Number of CPUs per task - private val CPUS_PER_TASK = sparkConf.getInt("spark.task.cpus", 1) - /** * Calculate each container's node locality and rack locality * @param numContainer number of containers to calculate @@ -159,7 +157,7 @@ private[yarn] class LocalityPreferredContainerPlacementStrategy( */ private def numExecutorsPending(numTasksPending: Int): Int = { val coresPerExecutor = resource.getVirtualCores - (numTasksPending * CPUS_PER_TASK + coresPerExecutor - 1) / coresPerExecutor + (numTasksPending * sparkConf.get(CPUS_PER_TASK) + coresPerExecutor - 1) / coresPerExecutor } /** diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 11426eb07c7ed..a96cb4957be88 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -34,6 +34,7 @@ import org.apache.log4j.{Level, Logger} import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ +import org.apache.spark.deploy.yarn.config._ import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef} import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor @@ -107,21 +108,20 @@ private[yarn] class YarnAllocator( // Executor memory in MB. protected val executorMemory = args.executorMemory // Additional memory overhead. - protected val memoryOverhead: Int = sparkConf.getInt("spark.yarn.executor.memoryOverhead", - math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN)) + protected val memoryOverhead: Int = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse( + math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN)).toInt // Number of cores per executor. protected val executorCores = args.executorCores // Resource capability requested for each executors private[yarn] val resource = Resource.newInstance(executorMemory + memoryOverhead, executorCores) private val launcherPool = ThreadUtils.newDaemonCachedThreadPool( - "ContainerLauncher", - sparkConf.getInt("spark.yarn.containerLauncherMaxThreads", 25)) + "ContainerLauncher", sparkConf.get(CONTAINER_LAUNCH_MAX_THREADS)) // For testing private val launchContainers = sparkConf.getBoolean("spark.yarn.launchContainers", true) - private val labelExpression = sparkConf.getOption("spark.yarn.executor.nodeLabelExpression") + private val labelExpression = sparkConf.get(EXECUTOR_NODE_LABEL_EXPRESSION) // ContainerRequest constructor that can take a node label expression. We grab it through // reflection because it's only available in later versions of YARN. 
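As an illustration (not part of the patch), the hunks above all converge on the same pattern: a raw string key plus ad-hoc parsing (`getInt`, `getBoolean`, `getOption`) is replaced by a typed `ConfigEntry` defined once, either in the new `org.apache.spark.deploy.yarn.config` package object added later in this patch or in `org.apache.spark.internal.config`. A minimal sketch of the resulting read pattern follows; the object and method names are illustrative only.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.config._

object ConfigReadSketch {
  // Illustrative only: how the YARN allocator code reads settings after this change.
  def describe(sparkConf: SparkConf, executorMemory: Int): String = {
    // Optional entry (Option[Long]); the caller still supplies the computed fallback.
    val memoryOverhead = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
      math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt
    // Entry with a default: an Int comes back directly, no literal 25 at the call site.
    val launcherThreads = sparkConf.get(CONTAINER_LAUNCH_MAX_THREADS)
    // Optional string entry replaces getOption on a raw "spark.yarn.*" key.
    val labelExpression = sparkConf.get(EXECUTOR_NODE_LABEL_EXPRESSION)
    // Shared core entry replacing the hard-coded "spark.task.cpus" lookup.
    val cpusPerTask = sparkConf.get(CPUS_PER_TASK)
    s"overhead=${memoryOverhead}MiB threads=$launcherThreads " +
      s"cpusPerTask=$cpusPerTask label=${labelExpression.getOrElse("<none>")}"
  }
}
```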
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala index 98505b93dda36..968f63527616a 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala @@ -31,6 +31,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.webapp.util.WebAppUtils import org.apache.spark.{Logging, SecurityManager, SparkConf} +import org.apache.spark.deploy.yarn.config._ import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.util.Utils @@ -117,7 +118,7 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg /** Returns the maximum number of attempts to register the AM. */ def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = { - val sparkMaxAttempts = sparkConf.getOption("spark.yarn.maxAppAttempts").map(_.toInt) + val sparkMaxAttempts = sparkConf.get(MAX_APP_ATTEMPTS).map(_.toInt) val yarnMaxAttempts = yarnConf.getInt( YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS) val retval: Int = sparkMaxAttempts match { diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index aef78fdfd4c57..ed56d4bd44fe8 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -44,6 +44,8 @@ import org.apache.hadoop.yarn.util.ConverterUtils import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.yarn.config._ +import org.apache.spark.internal.config._ import org.apache.spark.launcher.YarnCommandBuilderUtils import org.apache.spark.util.Utils @@ -97,10 +99,7 @@ class YarnSparkHadoopUtil extends SparkHadoopUtil { * Get the list of namenodes the user may access. */ def getNameNodesToAccess(sparkConf: SparkConf): Set[Path] = { - sparkConf.get("spark.yarn.access.namenodes", "") - .split(",") - .map(_.trim()) - .filter(!_.isEmpty) + sparkConf.get(NAMENODES_TO_ACCESS) .map(new Path(_)) .toSet } @@ -335,7 +334,7 @@ object YarnSparkHadoopUtil { // the common cases. Memory overhead tends to grow with container size. 
val MEMORY_OVERHEAD_FACTOR = 0.10 - val MEMORY_OVERHEAD_MIN = 384 + val MEMORY_OVERHEAD_MIN = 384L val ANY_HOST = "*" @@ -509,10 +508,9 @@ object YarnSparkHadoopUtil { conf: SparkConf, numExecutors: Int = DEFAULT_NUMBER_EXECUTORS): Int = { if (Utils.isDynamicAllocationEnabled(conf)) { - val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", 0) - val initialNumExecutors = - conf.getInt("spark.dynamicAllocation.initialExecutors", minNumExecutors) - val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors", Int.MaxValue) + val minNumExecutors = conf.get(DYN_ALLOCATION_MIN_EXECUTORS) + val initialNumExecutors = conf.get(DYN_ALLOCATION_INITIAL_EXECUTORS) + val maxNumExecutors = conf.get(DYN_ALLOCATION_MAX_EXECUTORS) require(initialNumExecutors >= minNumExecutors && initialNumExecutors <= maxNumExecutors, s"initial executor number $initialNumExecutors must between min executor number" + s"$minNumExecutors and max executor number $maxNumExecutors") @@ -522,7 +520,7 @@ object YarnSparkHadoopUtil { val targetNumExecutors = sys.env.get("SPARK_EXECUTOR_INSTANCES").map(_.toInt).getOrElse(numExecutors) // System property can override environment variable. - conf.getInt("spark.executor.instances", targetNumExecutors) + conf.get(EXECUTOR_INSTANCES).getOrElse(targetNumExecutors) } } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala new file mode 100644 index 0000000000000..06c1be9bf0e07 --- /dev/null +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn + +import java.util.concurrent.TimeUnit + +import org.apache.spark.internal.config.ConfigBuilder +import org.apache.spark.network.util.ByteUnit + +package object config { + + /* Common app configuration. 
*/ + + private[spark] val APPLICATION_TAGS = ConfigBuilder("spark.yarn.tags") + .doc("Comma-separated list of strings to pass through as YARN application tags appearing " + + "in YARN Application Reports, which can be used for filtering when querying YARN.") + .stringConf + .toSequence + .optional + + private[spark] val ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS = + ConfigBuilder("spark.yarn.am.attemptFailuresValidityInterval") + .doc("Interval after which AM failures will be considered independent and " + + "not accumulate towards the attempt count.") + .timeConf(TimeUnit.MILLISECONDS) + .optional + + private[spark] val MAX_APP_ATTEMPTS = ConfigBuilder("spark.yarn.maxAppAttempts") + .doc("Maximum number of AM attempts before failing the app.") + .intConf + .optional + + private[spark] val USER_CLASS_PATH_FIRST = ConfigBuilder("spark.yarn.user.classpath.first") + .doc("Whether to place user jars in front of Spark's classpath.") + .booleanConf + .withDefault(false) + + private[spark] val GATEWAY_ROOT_PATH = ConfigBuilder("spark.yarn.config.gatewayPath") + .doc("Root of configuration paths that is present on gateway nodes, and will be replaced " + + "with the corresponding path in cluster machines.") + .stringConf + .withDefault(null) + + private[spark] val REPLACEMENT_ROOT_PATH = ConfigBuilder("spark.yarn.config.replacementPath") + .doc(s"Path to use as a replacement for ${GATEWAY_ROOT_PATH.key} when launching processes " + + "in the YARN cluster.") + .stringConf + .withDefault(null) + + private[spark] val QUEUE_NAME = ConfigBuilder("spark.yarn.queue") + .stringConf + .withDefault("default") + + private[spark] val HISTORY_SERVER_ADDRESS = ConfigBuilder("spark.yarn.historyServer.address") + .stringConf + .optional + + /* File distribution. */ + + private[spark] val SPARK_JAR = ConfigBuilder("spark.yarn.jar") + .doc("Location of the Spark jar to use.") + .stringConf + .optional + + private[spark] val ARCHIVES_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.archives") + .stringConf + .optional + + private[spark] val FILES_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.files") + .stringConf + .optional + + private[spark] val PRESERVE_STAGING_FILES = ConfigBuilder("spark.yarn.preserve.staging.files") + .doc("Whether to preserve temporary files created by the job in HDFS.") + .booleanConf + .withDefault(false) + + private[spark] val STAGING_FILE_REPLICATION = ConfigBuilder("spark.yarn.submit.file.replication") + .doc("Replication factor for files uploaded by Spark to HDFS.") + .intConf + .optional + + /* Cluster-mode launcher configuration. */ + + private[spark] val WAIT_FOR_APP_COMPLETION = ConfigBuilder("spark.yarn.submit.waitAppCompletion") + .doc("In cluster mode, whether to wait for the application to finishe before exiting the " + + "launcher process.") + .booleanConf + .withDefault(true) + + private[spark] val REPORT_INTERVAL = ConfigBuilder("spark.yarn.report.interval") + .doc("Interval between reports of the current app status in cluster mode.") + .timeConf(TimeUnit.MILLISECONDS) + .withDefaultString("1s") + + /* Shared Client-mode AM / Driver configuration. 
*/ + + private[spark] val AM_MAX_WAIT_TIME = ConfigBuilder("spark.yarn.am.waitTime") + .timeConf(TimeUnit.MILLISECONDS) + .withDefaultString("100s") + + private[spark] val AM_NODE_LABEL_EXPRESSION = ConfigBuilder("spark.yarn.am.nodeLabelExpression") + .doc("Node label expression for the AM.") + .stringConf + .optional + + private[spark] val CONTAINER_LAUNCH_MAX_THREADS = + ConfigBuilder("spark.yarn.containerLauncherMaxThreads") + .intConf + .withDefault(25) + + private[spark] val MAX_EXECUTOR_FAILURES = ConfigBuilder("spark.yarn.max.executor.failures") + .intConf + .optional + + private[spark] val MAX_REPORTER_THREAD_FAILURES = + ConfigBuilder("spark.yarn.scheduler.reporterThread.maxFailures") + .intConf + .withDefault(5) + + private[spark] val RM_HEARTBEAT_INTERVAL = + ConfigBuilder("spark.yarn.scheduler.heartbeat.interval-ms") + .timeConf(TimeUnit.MILLISECONDS) + .withDefaultString("3s") + + private[spark] val INITIAL_HEARTBEAT_INTERVAL = + ConfigBuilder("spark.yarn.scheduler.initial-allocation.interval") + .timeConf(TimeUnit.MILLISECONDS) + .withDefaultString("200ms") + + private[spark] val SCHEDULER_SERVICES = ConfigBuilder("spark.yarn.services") + .doc("A comma-separated list of class names of services to add to the scheduler.") + .stringConf + .toSequence + .withDefault(Nil) + + /* Client-mode AM configuration. */ + + private[spark] val AM_CORES = ConfigBuilder("spark.yarn.am.cores") + .intConf + .withDefault(1) + + private[spark] val AM_JAVA_OPTIONS = ConfigBuilder("spark.yarn.am.extraJavaOptions") + .doc("Extra Java options for the client-mode AM.") + .stringConf + .optional + + private[spark] val AM_LIBRARY_PATH = ConfigBuilder("spark.yarn.am.extraLibraryPath") + .doc("Extra native library path for the client-mode AM.") + .stringConf + .optional + + private[spark] val AM_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.am.memoryOverhead") + .bytesConf(ByteUnit.MiB) + .optional + + private[spark] val AM_MEMORY = ConfigBuilder("spark.yarn.am.memory") + .bytesConf(ByteUnit.MiB) + .withDefaultString("512m") + + /* Driver configuration. */ + + private[spark] val DRIVER_CORES = ConfigBuilder("spark.driver.cores") + .intConf + .optional + + private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.driver.memoryOverhead") + .bytesConf(ByteUnit.MiB) + .optional + + /* Executor configuration. */ + + private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.executor.memoryOverhead") + .bytesConf(ByteUnit.MiB) + .optional + + private[spark] val EXECUTOR_NODE_LABEL_EXPRESSION = + ConfigBuilder("spark.yarn.executor.nodeLabelExpression") + .doc("Node label expression for executors.") + .stringConf + .optional + + /* Security configuration. */ + + private[spark] val CREDENTIAL_FILE_MAX_COUNT = + ConfigBuilder("spark.yarn.credentials.file.retention.count") + .intConf + .withDefault(5) + + private[spark] val CREDENTIALS_FILE_MAX_RETENTION = + ConfigBuilder("spark.yarn.credentials.file.retention.days") + .intConf + .withDefault(5) + + private[spark] val NAMENODES_TO_ACCESS = ConfigBuilder("spark.yarn.access.namenodes") + .doc("Extra NameNode URLs for which to request delegation tokens. The NameNode that hosts " + + "fs.defaultFS does not need to be listed here.") + .stringConf + .toSequence + .withDefault(Nil) + + private[spark] val TOKEN_RENEWAL_INTERVAL = ConfigBuilder("spark.yarn.token.renewal.interval") + .internal + .timeConf(TimeUnit.MILLISECONDS) + .optional + + /* Private configs. 
*/ + + private[spark] val CREDENTIALS_FILE_PATH = ConfigBuilder("spark.yarn.credentials.file") + .internal + .stringConf + .withDefault(null) + + // Internal config to propagate the location of the user's jar to the driver/executors + private[spark] val APP_JAR = ConfigBuilder("spark.yarn.user.jar") + .internal + .stringConf + .optional + + // Internal config to propagate the locations of any extra jars to add to the classpath + // of the executors + private[spark] val SECONDARY_JARS = ConfigBuilder("spark.yarn.secondary.jars") + .internal + .stringConf + .toSequence + .optional + +} diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala index c064521845399..c4757e335b6c6 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala @@ -22,6 +22,7 @@ import java.util.concurrent.atomic.AtomicBoolean import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId} import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.deploy.yarn.config._ import org.apache.spark.util.Utils /** @@ -103,20 +104,15 @@ private[spark] class SchedulerExtensionServices extends SchedulerExtensionServic val attemptId = binding.attemptId logInfo(s"Starting Yarn extension services with app $appId and attemptId $attemptId") - serviceOption = sparkContext.getConf.getOption(SchedulerExtensionServices.SPARK_YARN_SERVICES) - services = serviceOption - .map { s => - s.split(",").map(_.trim()).filter(!_.isEmpty) - .map { sClass => - val instance = Utils.classForName(sClass) - .newInstance() - .asInstanceOf[SchedulerExtensionService] - // bind this service - instance.start(binding) - logInfo(s"Service $sClass started") - instance - }.toList - }.getOrElse(Nil) + services = sparkContext.conf.get(SCHEDULER_SERVICES).map { sClass => + val instance = Utils.classForName(sClass) + .newInstance() + .asInstanceOf[SchedulerExtensionService] + // bind this service + instance.start(binding) + logInfo(s"Service $sClass started") + instance + }.toList } /** @@ -144,11 +140,3 @@ private[spark] class SchedulerExtensionServices extends SchedulerExtensionServic | services=$services, | started=$started)""".stripMargin } - -private[spark] object SchedulerExtensionServices { - - /** - * A list of comma separated services to instantiate in the scheduler - */ - val SPARK_YARN_SERVICES = "spark.yarn.services" -} diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 19065373c6d55..b57c179d89bd2 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -41,6 +41,7 @@ import org.mockito.Mockito._ import org.scalatest.{BeforeAndAfterAll, Matchers} import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.yarn.config._ import org.apache.spark.util.{ResetSystemProperties, Utils} class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll @@ -103,8 +104,9 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll test("Local jar URIs") { val conf = new Configuration() - val sparkConf = new SparkConf().set(Client.CONF_SPARK_JAR, SPARK) - .set("spark.yarn.user.classpath.first", "true") + val sparkConf = new 
SparkConf() + .set(SPARK_JAR, SPARK) + .set(USER_CLASS_PATH_FIRST, true) val env = new MutableHashMap[String, String]() val args = new ClientArguments(Array("--jar", USER, "--addJars", ADDED), sparkConf) @@ -129,13 +131,13 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll } cp should contain(pwdVar) cp should contain (s"$pwdVar${Path.SEPARATOR}${Client.LOCALIZED_CONF_DIR}") - cp should not contain (Client.SPARK_JAR) - cp should not contain (Client.APP_JAR) + cp should not contain (Client.SPARK_JAR_NAME) + cp should not contain (Client.APP_JAR_NAME) } test("Jar path propagation through SparkConf") { val conf = new Configuration() - val sparkConf = new SparkConf().set(Client.CONF_SPARK_JAR, SPARK) + val sparkConf = new SparkConf().set(SPARK_JAR, SPARK) val args = new ClientArguments(Array("--jar", USER, "--addJars", ADDED), sparkConf) val client = spy(new Client(args, conf, sparkConf)) @@ -145,7 +147,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll val tempDir = Utils.createTempDir() try { client.prepareLocalResources(tempDir.getAbsolutePath(), Nil) - sparkConf.getOption(Client.CONF_SPARK_USER_JAR) should be (Some(USER)) + sparkConf.get(APP_JAR) should be (Some(USER)) // The non-local path should be propagated by name only, since it will end up in the app's // staging dir. @@ -160,7 +162,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll }) .mkString(",") - sparkConf.getOption(Client.CONF_SPARK_YARN_SECONDARY_JARS) should be (Some(expected)) + sparkConf.get(SECONDARY_JARS) should be (Some(expected.split(",").toSeq)) } finally { Utils.deleteRecursively(tempDir) } @@ -169,9 +171,9 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll test("Cluster path translation") { val conf = new Configuration() val sparkConf = new SparkConf() - .set(Client.CONF_SPARK_JAR, "local:/localPath/spark.jar") - .set("spark.yarn.config.gatewayPath", "/localPath") - .set("spark.yarn.config.replacementPath", "/remotePath") + .set(SPARK_JAR.key, "local:/localPath/spark.jar") + .set(GATEWAY_ROOT_PATH, "/localPath") + .set(REPLACEMENT_ROOT_PATH, "/remotePath") Client.getClusterPath(sparkConf, "/localPath") should be ("/remotePath") Client.getClusterPath(sparkConf, "/localPath/1:/localPath/2") should be ( @@ -191,8 +193,8 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll // Spaces between non-comma strings should be preserved as single tags. Empty strings may or // may not be removed depending on the version of Hadoop being used. 
val sparkConf = new SparkConf() - .set(Client.CONF_SPARK_YARN_APPLICATION_TAGS, ",tag1, dup,tag2 , ,multi word , dup") - .set("spark.yarn.maxAppAttempts", "42") + .set(APPLICATION_TAGS.key, ",tag1, dup,tag2 , ,multi word , dup") + .set(MAX_APP_ATTEMPTS, 42) val args = new ClientArguments(Array( "--name", "foo-test-app", "--queue", "staging-queue"), sparkConf) diff --git a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala index b4d1b0a3d22a7..338fbe2ef47fd 100644 --- a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.scheduler.cluster import org.scalatest.BeforeAndAfter import org.apache.spark.{LocalSparkContext, Logging, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.deploy.yarn.config._ /** * Test the integration with [[SchedulerExtensionServices]] @@ -36,8 +37,7 @@ class ExtensionServiceIntegrationSuite extends SparkFunSuite */ before { val sparkConf = new SparkConf() - sparkConf.set(SchedulerExtensionServices.SPARK_YARN_SERVICES, - classOf[SimpleExtensionService].getName()) + sparkConf.set(SCHEDULER_SERVICES, Seq(classOf[SimpleExtensionService].getName())) sparkConf.setMaster("local").setAppName("ExtensionServiceIntegrationSuite") sc = new SparkContext(sparkConf) } From 8577260abdc908ac08d28ddd3f07a2411fdc82b7 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 7 Mar 2016 14:32:01 -0800 Subject: [PATCH 27/29] [SPARK-13442][SQL] Make type inference recognize boolean types ## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-13442 This PR adds the support for inferring `BooleanType` for schema. It supports to infer case-insensitive `true` / `false` as `BooleanType`. Unittests were added for `CSVInferSchemaSuite` and `CSVSuite` for end-to-end test. ## How was the this patch tested? This was tested with unittests and with `dev/run_tests` for coding style Author: hyukjinkwon Closes #11315 from HyukjinKwon/SPARK-13442. 
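As an illustration (not part of the patch), the end-to-end effect mirrors the `CSVSuite` test added below; the path is hypothetical and stands in for a file shaped like the new `sql/core/src/test/resources/bool.csv`, and `sqlContext` is the `SQLContext` provided by `spark-shell`.

```scala
// Illustrative spark-shell snippet; /tmp/bool.csv is a hypothetical copy of bool.csv
// (a "bool" header followed by values such as "True", "False" and "true").
val df = sqlContext.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/tmp/bool.csv")

df.printSchema()
// expected after this patch: the column is inferred as boolean, e.g.
// root
//  |-- bool: boolean (nullable = true)
```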
--- .../execution/datasources/csv/CSVInferSchema.scala | 9 +++++++++ sql/core/src/test/resources/bool.csv | 5 +++++ .../datasources/csv/CSVInferSchemaSuite.scala | 11 +++++++++++ .../sql/execution/datasources/csv/CSVSuite.scala | 13 +++++++++++++ 4 files changed, 38 insertions(+) create mode 100644 sql/core/src/test/resources/bool.csv diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index 7f1ed28046b1d..edead9b21b21c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -87,6 +87,7 @@ private[csv] object CSVInferSchema { case LongType => tryParseLong(field) case DoubleType => tryParseDouble(field) case TimestampType => tryParseTimestamp(field) + case BooleanType => tryParseBoolean(field) case StringType => StringType case other: DataType => throw new UnsupportedOperationException(s"Unexpected data type $other") @@ -117,6 +118,14 @@ private[csv] object CSVInferSchema { def tryParseTimestamp(field: String): DataType = { if ((allCatch opt Timestamp.valueOf(field)).isDefined) { TimestampType + } else { + tryParseBoolean(field) + } + } + + def tryParseBoolean(field: String): DataType = { + if ((allCatch opt field.toBoolean).isDefined) { + BooleanType } else { stringType() } diff --git a/sql/core/src/test/resources/bool.csv b/sql/core/src/test/resources/bool.csv new file mode 100644 index 0000000000000..94b2d49506e0d --- /dev/null +++ b/sql/core/src/test/resources/bool.csv @@ -0,0 +1,5 @@ +bool +"True" +"False" + +"true" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala index 412f1b89beee7..7af3f94aefea2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala @@ -30,6 +30,8 @@ class InferSchemaSuite extends SparkFunSuite { assert(CSVInferSchema.inferField(NullType, "3.5") == DoubleType) assert(CSVInferSchema.inferField(NullType, "test") == StringType) assert(CSVInferSchema.inferField(NullType, "2015-08-20 15:57:00") == TimestampType) + assert(CSVInferSchema.inferField(NullType, "True") == BooleanType) + assert(CSVInferSchema.inferField(NullType, "FAlSE") == BooleanType) } test("String fields types are inferred correctly from other types") { @@ -40,6 +42,9 @@ class InferSchemaSuite extends SparkFunSuite { assert(CSVInferSchema.inferField(DoubleType, "test") == StringType) assert(CSVInferSchema.inferField(LongType, "2015-08-20 14:57:00") == TimestampType) assert(CSVInferSchema.inferField(DoubleType, "2015-08-20 15:57:00") == TimestampType) + assert(CSVInferSchema.inferField(LongType, "True") == BooleanType) + assert(CSVInferSchema.inferField(IntegerType, "FALSE") == BooleanType) + assert(CSVInferSchema.inferField(TimestampType, "FALSE") == BooleanType) } test("Timestamp field types are inferred correctly from other types") { @@ -48,6 +53,11 @@ class InferSchemaSuite extends SparkFunSuite { assert(CSVInferSchema.inferField(LongType, "2015-08 14:49:00") == StringType) } + test("Boolean fields types are inferred correctly from other types") { + assert(CSVInferSchema.inferField(LongType, "Fale") 
== StringType) + assert(CSVInferSchema.inferField(DoubleType, "TRUEe") == StringType) + } + test("Type arrays are merged to highest common type") { assert( CSVInferSchema.mergeRowTypes(Array(StringType), @@ -67,6 +77,7 @@ class InferSchemaSuite extends SparkFunSuite { assert(CSVInferSchema.inferField(IntegerType, "\\N", "\\N") == IntegerType) assert(CSVInferSchema.inferField(DoubleType, "\\N", "\\N") == DoubleType) assert(CSVInferSchema.inferField(TimestampType, "\\N", "\\N") == TimestampType) + assert(CSVInferSchema.inferField(BooleanType, "\\N", "\\N") == BooleanType) } test("Merging Nulltypes should yeild Nulltype.") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 9cd3a9ab952b4..53027bb698bf8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -43,6 +43,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { private val emptyFile = "empty.csv" private val commentsFile = "comments.csv" private val disableCommentsFile = "disable_comments.csv" + private val boolFile = "bool.csv" private val simpleSparseFile = "simple_sparse.csv" private def testFile(fileName: String): String = { @@ -118,6 +119,18 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { verifyCars(cars, withHeader = true, checkTypes = true) } + test("test inferring booleans") { + val result = sqlContext.read + .format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load(testFile(boolFile)) + + val expectedSchema = StructType(List( + StructField("bool", BooleanType, nullable = true))) + assert(result.schema === expectedSchema) + } + test("test with alternative delimiter and quote") { val cars = sqlContext.read .format("csv") From 0eea12a3d956b54bbbd73d21b296868852a04494 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 7 Mar 2016 14:48:02 -0800 Subject: [PATCH 28/29] [SPARK-13596][BUILD] Move misc top-level build files into appropriate subdirs ## What changes were proposed in this pull request? Move many top-level files in dev/ or other appropriate directory. In particular, put `make-distribution.sh` in `dev` and update docs accordingly. Remove deprecated `sbt/sbt`. I was (so far) unable to figure out how to move `tox.ini`. `scalastyle-config.xml` should be movable but edits to the project `.sbt` files didn't work; config file location is updatable for compile but not test scope. ## How was this patch tested? `./dev/run-tests` to verify RAT and checkstyle work. Jenkins tests for the rest. Author: Sean Owen Closes #11522 from srowen/SPARK-13596. 
--- .gitignore | 3 -- .rat-excludes => dev/.rat-excludes | 0 dev/check-license | 7 +++-- .../checkstyle-suppressions.xml | 0 checkstyle.xml => dev/checkstyle.xml | 0 dev/create-release/release-build.sh | 2 +- dev/lint-python | 6 ++-- .../make-distribution.sh | 6 ++-- tox.ini => dev/tox.ini | 0 docs/building-spark.md | 6 ++-- docs/running-on-mesos.md | 4 +-- pom.xml | 4 +-- pylintrc => python/pylintrc | 0 sbt/sbt | 29 ------------------- 14 files changed, 18 insertions(+), 49 deletions(-) rename .rat-excludes => dev/.rat-excludes (100%) rename checkstyle-suppressions.xml => dev/checkstyle-suppressions.xml (100%) rename checkstyle.xml => dev/checkstyle.xml (100%) rename make-distribution.sh => dev/make-distribution.sh (97%) rename tox.ini => dev/tox.ini (100%) rename pylintrc => python/pylintrc (100%) delete mode 100755 sbt/sbt diff --git a/.gitignore b/.gitignore index 8ecf536e79a5f..05afbb5e5ed69 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,6 @@ cache work/ out/ .DS_Store -third_party/libmesos.so -third_party/libmesos.dylib build/apache-maven* build/zinc* build/scala* @@ -60,7 +58,6 @@ dev/create-release/*final spark-*-bin-*.tgz unit-tests.log /lib/ -rat-results.txt scalastyle.txt scalastyle-output.xml R-unit-tests.log diff --git a/.rat-excludes b/dev/.rat-excludes similarity index 100% rename from .rat-excludes rename to dev/.rat-excludes diff --git a/dev/check-license b/dev/check-license index 10740cfdc5242..678e73fd60f1f 100755 --- a/dev/check-license +++ b/dev/check-license @@ -58,7 +58,7 @@ else declare java_cmd=java fi -export RAT_VERSION=0.10 +export RAT_VERSION=0.11 export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar mkdir -p "$FWDIR"/lib @@ -67,14 +67,15 @@ mkdir -p "$FWDIR"/lib exit 1 } -$java_cmd -jar "$rat_jar" -E "$FWDIR"/.rat-excludes -d "$FWDIR" > rat-results.txt +mkdir target +$java_cmd -jar "$rat_jar" -E "$FWDIR"/dev/.rat-excludes -d "$FWDIR" > target/rat-results.txt if [ $? -ne 0 ]; then echo "RAT exited abnormally" exit 1 fi -ERRORS="$(cat rat-results.txt | grep -e "??")" +ERRORS="$(cat target/rat-results.txt | grep -e "??")" if test ! -z "$ERRORS"; then echo "Could not find Apache license headers in the following files:" diff --git a/checkstyle-suppressions.xml b/dev/checkstyle-suppressions.xml similarity index 100% rename from checkstyle-suppressions.xml rename to dev/checkstyle-suppressions.xml diff --git a/checkstyle.xml b/dev/checkstyle.xml similarity index 100% rename from checkstyle.xml rename to dev/checkstyle.xml diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c08b6d7de6fe0..65e80fc76056a 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -165,7 +165,7 @@ if [[ "$1" == "package" ]]; then # Get maven home set by MVN MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` - ./make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ + ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log cd .. cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . diff --git a/dev/lint-python b/dev/lint-python index 068337d273f82..477ac0ef6d294 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -37,7 +37,7 @@ compile_status="${PIPESTATUS[0]}" #+ See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 #+ TODOs: #+ - Download pep8 from PyPI. It's more "official". 
-PEP8_VERSION="1.6.2" +PEP8_VERSION="1.7.0" PEP8_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pep8-$PEP8_VERSION.py" PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/$PEP8_VERSION/pep8.py" @@ -80,7 +80,7 @@ export "PATH=$PYTHONPATH:$PATH" #+ first, but we do so so that the check status can #+ be output before the report, like with the #+ scalastyle and RAT checks. -python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH" +python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 --config=dev/tox.ini $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH" pep8_status="${PIPESTATUS[0]}" if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then @@ -122,7 +122,7 @@ fi # for to_be_checked in "$PATHS_TO_CHECK" # do -# pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH" +# pylint --rcfile="$SPARK_ROOT_DIR/python/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH" # done # if [ "${PIPESTATUS[0]}" -ne 0 ]; then diff --git a/make-distribution.sh b/dev/make-distribution.sh similarity index 97% rename from make-distribution.sh rename to dev/make-distribution.sh index ac90ea317a6fc..ac4e9b90f0177 100755 --- a/make-distribution.sh +++ b/dev/make-distribution.sh @@ -29,7 +29,7 @@ set -e set -x # Figure out where the Spark framework is installed -SPARK_HOME="$(cd "`dirname "$0"`"; pwd)" +SPARK_HOME="$(cd "`dirname "$0"`/.."; pwd)" DISTDIR="$SPARK_HOME/dist" MAKE_TGZ=false @@ -41,7 +41,7 @@ function exit_with_usage { echo "" echo "usage:" cl_options="[--name] [--tgz] [--mvn ]" - echo "./make-distribution.sh $cl_options " + echo "make-distribution.sh $cl_options " echo "See Spark's \"Building Spark\" doc for correct Maven options." echo "" exit 1 @@ -104,7 +104,7 @@ fi if [ $(command -v git) ]; then GITREV=$(git rev-parse --short HEAD 2>/dev/null || :) if [ ! -z "$GITREV" ]; then - GITREVSTRING=" (git revision $GITREV)" + GITREVSTRING=" (git revision $GITREV)" fi unset GITREV fi diff --git a/tox.ini b/dev/tox.ini similarity index 100% rename from tox.ini rename to dev/tox.ini diff --git a/docs/building-spark.md b/docs/building-spark.md index adf798847c3c3..2c6294133e863 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -35,12 +35,12 @@ to the `sharedSettings` val. See also [this PR](https://github.com/apache/spark/ To create a Spark distribution like those distributed by the [Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as -to be runnable, use `make-distribution.sh` in the project root directory. It can be configured +to be runnable, use `./dev/make-distribution.sh` in the project root directory. It can be configured with Maven profile settings and so on like the direct Maven build. 
Example: - ./make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn + ./dev/make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn -For more information on usage, run `./make-distribution.sh --help` +For more information on usage, run `./dev/make-distribution.sh --help` # Setting up Maven's Memory Usage diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 9816d030e90ac..912a0108129c2 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -98,10 +98,10 @@ To host on HDFS, use the Hadoop fs put command: `hadoop fs -put spark-{{site.SPA Or if you are using a custom-compiled version of Spark, you will need to create a package using -the `make-distribution.sh` script included in a Spark source tarball/checkout. +the `dev/make-distribution.sh` script included in a Spark source tarball/checkout. 1. Download and build Spark using the instructions [here](index.html) -2. Create a binary package using `make-distribution.sh --tgz`. +2. Create a binary package using `./dev/make-distribution.sh --tgz`. 3. Upload archive to http/s3/hdfs diff --git a/pom.xml b/pom.xml index 85c0131b4e38c..e7f3442d44f4b 100644 --- a/pom.xml +++ b/pom.xml @@ -2246,7 +2246,7 @@ false ${basedir}/src/main/scala ${basedir}/src/test/scala - scalastyle-config.xml + dev/scalastyle-config.xml ${basedir}/target/scalastyle-output.xml ${project.build.sourceEncoding} ${project.reporting.outputEncoding} @@ -2270,7 +2270,7 @@ false ${basedir}/src/main/java ${basedir}/src/test/java - checkstyle.xml + dev/checkstyle.xml ${basedir}/target/checkstyle-output.xml ${project.build.sourceEncoding} ${project.reporting.outputEncoding} diff --git a/pylintrc b/python/pylintrc similarity index 100% rename from pylintrc rename to python/pylintrc diff --git a/sbt/sbt b/sbt/sbt deleted file mode 100755 index 41438251f681e..0000000000000 --- a/sbt/sbt +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Determine the current working directory -_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -echo "NOTE: The sbt/sbt script has been relocated to build/sbt." >&2 -echo " Please update references to point to the new location." >&2 -echo "" >&2 -echo " Invoking 'build/sbt $@' now ..." >&2 -echo "" >&2 - -${_DIR}/../build/sbt "$@" From e720dda42e806229ccfd970055c7b8a93eb447bf Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Mon, 7 Mar 2016 15:15:10 -0800 Subject: [PATCH 29/29] [SPARK-13665][SQL] Separate the concerns of HadoopFsRelation `HadoopFsRelation` is used for reading most files into Spark SQL. 
However today this class mixes the concerns of file management, schema reconciliation, scan building, bucketing, partitioning, and writing data. As a result, many data sources are forced to reimplement the same functionality and the various layers have accumulated a fair bit of inefficiency. This PR is a first cut at separating this into several components / interfaces that are each described below. Additionally, all implementations inside of Spark (parquet, csv, json, text, orc, svmlib) have been ported to the new API `FileFormat`. External libraries, such as spark-avro will also need to be ported to work with Spark 2.0. ### HadoopFsRelation A simple `case class` that acts as a container for all of the metadata required to read from a datasource. All discovery, resolution and merging logic for schemas and partitions has been removed. This an internal representation that no longer needs to be exposed to developers. ```scala case class HadoopFsRelation( sqlContext: SQLContext, location: FileCatalog, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, options: Map[String, String]) extends BaseRelation ``` ### FileFormat The primary interface that will be implemented by each different format including external libraries. Implementors are responsible for reading a given format and converting it into `InternalRow` as well as writing out an `InternalRow`. A format can optionally return a schema that is inferred from a set of files. ```scala trait FileFormat { def inferSchema( sqlContext: SQLContext, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] def prepareWrite( sqlContext: SQLContext, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory def buildInternalScan( sqlContext: SQLContext, dataSchema: StructType, requiredColumns: Array[String], filters: Array[Filter], bucketSet: Option[BitSet], inputFiles: Array[FileStatus], broadcastedConf: Broadcast[SerializableConfiguration], options: Map[String, String]): RDD[InternalRow] } ``` The current interface is based on what was required to get all the tests passing again, but still mixes a couple of concerns (i.e. `bucketSet` is passed down to the scan instead of being resolved by the planner). Additionally, scans are still returning `RDD`s instead of iterators for single files. In a future PR, bucketing should be removed from this interface and the scan should be isolated to a single file. ### FileCatalog This interface is used to list the files that make up a given relation, as well as handle directory based partitioning. ```scala trait FileCatalog { def paths: Seq[Path] def partitionSpec(schema: Option[StructType]): PartitionSpec def allFiles(): Seq[FileStatus] def getStatus(path: Path): Array[FileStatus] def refresh(): Unit } ``` Currently there are two implementations: - `HDFSFileCatalog` - based on code from the old `HadoopFsRelation`. Infers partitioning by recursive listing and caches this data for performance - `HiveFileCatalog` - based on the above, but it uses the partition spec from the Hive Metastore. 
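
To make the `FileCatalog` contract above more concrete, the following is a minimal, illustrative sketch (not part of this patch) of an implementation that serves a fixed, in-memory list of files and reports no partitioning. The `StaticFileCatalog`, `SimpleFileCatalog`, `SimplePartition`, and `SimplePartitionSpec` names are hypothetical stand-ins introduced only so the example is self-contained; nothing beyond the method signatures quoted above is assumed about the internal classes.

```scala
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.types.StructType

// Simplified stand-ins for the internal Partition/PartitionSpec classes,
// defined here only so the sketch compiles on its own.
case class SimplePartition(values: Seq[Any], path: Path)
case class SimplePartitionSpec(partitionColumns: StructType, partitions: Seq[SimplePartition])

// Mirrors the FileCatalog contract described above, using the stand-in types.
trait SimpleFileCatalog {
  def paths: Seq[Path]
  def partitionSpec(schema: Option[StructType]): SimplePartitionSpec
  def allFiles(): Seq[FileStatus]
  def getStatus(path: Path): Array[FileStatus]
  def refresh(): Unit
}

/** Serves a static list of files; there is no directory-based partitioning. */
class StaticFileCatalog(files: Seq[FileStatus]) extends SimpleFileCatalog {
  override def paths: Seq[Path] = files.map(_.getPath)

  // No partition columns are discovered, so the spec is empty.
  override def partitionSpec(schema: Option[StructType]): SimplePartitionSpec =
    SimplePartitionSpec(StructType(Nil), Nil)

  override def allFiles(): Seq[FileStatus] = files

  // Listing a directory is just a filter over the cached statuses.
  override def getStatus(path: Path): Array[FileStatus] =
    files.filter(_.getPath.getParent == path).toArray

  // The listing is static, so there is nothing to refresh.
  override def refresh(): Unit = ()
}
```

A real implementation such as the `HDFSFileCatalog` above would instead list the given paths through the Hadoop `FileSystem` API, cache the resulting statuses, and derive `partitionSpec` from the directory layout, with `refresh()` invalidating that cache.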
### ResolvedDataSource Produces a logical plan given the following description of a Data Source (which can come from DataFrameReader or a metastore): - `paths: Seq[String] = Nil` - `userSpecifiedSchema: Option[StructType] = None` - `partitionColumns: Array[String] = Array.empty` - `bucketSpec: Option[BucketSpec] = None` - `provider: String` - `options: Map[String, String]` This class is responsible for deciding which of the Data Source APIs a given provider is using (including the non-file based ones). All reconciliation of partitions, buckets, schema from metastores or inference is done here. ### DataSourceAnalysis / DataSourceStrategy Responsible for analyzing and planning reading/writing of data using any of the Data Source APIs, including: - pruning the files from partitions that will be read based on filters. - appending partition columns* - applying additional filters when a data source can not evaluate them internally. - constructing an RDD that is bucketed correctly when required* - sanity checking schema match-up and other analysis when writing. *In the future we should do that following: - Break out file handling into its own Strategy as its sufficiently complex / isolated. - Push the appending of partition columns down in to `FileFormat` to avoid an extra copy / unvectorization. - Use a custom RDD for scans instead of `SQLNewNewHadoopRDD2` Author: Michael Armbrust Author: Wenchen Fan Closes #11509 from marmbrus/fileDataSource. --- .../spark/rdd/ZippedPartitionsRDD.scala | 3 +- .../ml/source/libsvm/LibSVMRelation.scala | 135 ++-- .../source/libsvm/LibSVMRelationSuite.scala | 8 +- project/MimaExcludes.scala | 6 +- .../org/apache/spark/sql/types/DataType.scala | 2 +- .../apache/spark/sql/DataFrameReader.scala | 59 +- .../apache/spark/sql/DataFrameWriter.scala | 7 - .../spark/sql/execution/ExistingRDD.scala | 17 +- .../datasources/DataSourceStrategy.scala | 235 ++++-- .../InsertIntoHadoopFsRelation.scala | 77 +- .../datasources/PartitioningUtils.scala | 16 +- .../datasources/ResolvedDataSource.scala | 261 ++++--- .../datasources/WriterContainer.scala | 24 +- .../sql/execution/datasources/bucket.scala | 24 - .../datasources/csv/CSVRelation.scala | 136 +--- .../datasources/csv/DefaultSource.scala | 157 +++- .../spark/sql/execution/datasources/ddl.scala | 5 +- .../datasources/json/InferSchema.scala | 2 +- .../datasources/json/JSONRelation.scala | 176 ++--- .../datasources/parquet/ParquetRelation.scala | 503 +++++-------- .../sql/execution/datasources/rules.scala | 3 +- .../datasources/text/DefaultSource.scala | 122 ++- .../spark/sql/internal/SessionState.scala | 7 +- .../apache/spark/sql/sources/interfaces.scala | 701 +++++------------- .../org/apache/spark/sql/DataFrameSuite.scala | 2 - .../org/apache/spark/sql/SQLQuerySuite.scala | 2 +- .../datasources/json/JsonSuite.scala | 71 -- .../parquet/ParquetFilterSuite.scala | 5 +- .../datasources/parquet/ParquetIOSuite.scala | 4 +- .../ParquetPartitionDiscoverySuite.scala | 3 +- .../spark/sql/sources/InsertSuite.scala | 2 +- .../sql/streaming/FileStreamSourceSuite.scala | 16 +- .../apache/spark/sql/test/SQLTestUtils.scala | 9 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 111 ++- .../spark/sql/hive/HiveSessionState.scala | 1 + .../spark/sql/hive/execution/commands.scala | 40 +- .../spark/sql/hive/orc/OrcFileOperator.scala | 25 +- .../spark/sql/hive/orc/OrcRelation.scala | 206 +++-- .../apache/spark/sql/hive/test/TestHive.scala | 2 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 49 +- .../sql/hive/execution/SQLQuerySuite.scala | 8 +- 
.../spark/sql/hive/orc/OrcFilterSuite.scala | 5 +- .../spark/sql/hive/orc/OrcQuerySuite.scala | 9 +- .../apache/spark/sql/hive/parquetSuites.scala | 43 +- .../spark/sql/sources/BucketedReadSuite.scala | 43 +- .../sql/sources/BucketedWriteSuite.scala | 3 +- .../CommitFailureTestRelationSuite.scala | 104 --- .../SimpleTextHadoopFsRelationSuite.scala | 382 ---------- .../sql/sources/SimpleTextRelation.scala | 271 ------- .../sql/sources/hadoopFsRelationSuites.scala | 4 +- 50 files changed, 1450 insertions(+), 2656 deletions(-) delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala index 4333a679c8aae..3cb1231bd3477 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala @@ -54,7 +54,8 @@ private[spark] abstract class ZippedPartitionsBaseRDD[V: ClassTag]( override def getPartitions: Array[Partition] = { val numParts = rdds.head.partitions.length if (!rdds.forall(rdd => rdd.partitions.length == numParts)) { - throw new IllegalArgumentException("Can't zip RDDs with unequal numbers of partitions") + throw new IllegalArgumentException( + s"Can't zip RDDs with unequal numbers of partitions: ${rdds.map(_.partitions.length)}") } Array.tabulate[Partition](numParts) { i => val prefs = rdds.map(rdd => rdd.preferredLocations(rdd.partitions(i))) diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index b9c364b05dc11..976343ed961c5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -19,74 +19,23 @@ package org.apache.spark.ml.source.libsvm import java.io.IOException -import com.google.common.base.Objects import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{NullWritable, Text} -import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} +import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.annotation.Since +import org.apache.spark.broadcast.Broadcast import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, DataFrameReader, Row, SQLContext} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ - -/** - * LibSVMRelation provides the DataFrame constructed from LibSVM format data. - * @param path File path of LibSVM format - * @param numFeatures The number of features - * @param vectorType The type of vector. 
It can be 'sparse' or 'dense' - * @param sqlContext The Spark SQLContext - */ -private[libsvm] class LibSVMRelation(val path: String, val numFeatures: Int, val vectorType: String) - (@transient val sqlContext: SQLContext) - extends HadoopFsRelation with Serializable { - - override def buildScan(requiredColumns: Array[String], inputFiles: Array[FileStatus]) - : RDD[Row] = { - val sc = sqlContext.sparkContext - val baseRdd = MLUtils.loadLibSVMFile(sc, path, numFeatures) - val sparse = vectorType == "sparse" - baseRdd.map { pt => - val features = if (sparse) pt.features.toSparse else pt.features.toDense - Row(pt.label, features) - } - } - - override def hashCode(): Int = { - Objects.hashCode(path, Double.box(numFeatures), vectorType) - } - - override def equals(other: Any): Boolean = other match { - case that: LibSVMRelation => - path == that.path && - numFeatures == that.numFeatures && - vectorType == that.vectorType - case _ => - false - } - - override def prepareJobForWrite(job: _root_.org.apache.hadoop.mapreduce.Job): - _root_.org.apache.spark.sql.sources.OutputWriterFactory = { - new OutputWriterFactory { - override def newInstance( - path: String, - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - new LibSVMOutputWriter(path, dataSchema, context) - } - } - } - - override def paths: Array[String] = Array(path) - - override def dataSchema: StructType = StructType( - StructField("label", DoubleType, nullable = false) :: - StructField("features", new VectorUDT(), nullable = false) :: Nil) -} - +import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.collection.BitSet private[libsvm] class LibSVMOutputWriter( path: String, @@ -124,6 +73,7 @@ private[libsvm] class LibSVMOutputWriter( recordWriter.close(context) } } + /** * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as [[DataFrame]]. 
* The loaded [[DataFrame]] has two columns: `label` containing labels stored as doubles and @@ -155,7 +105,7 @@ private[libsvm] class LibSVMOutputWriter( * @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]] */ @Since("1.6.0") -class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { +class DefaultSource extends FileFormat with DataSourceRegister { @Since("1.6.0") override def shortName(): String = "libsvm" @@ -167,22 +117,63 @@ class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { throw new IOException(s"Illegal schema for libsvm data, schema=${dataSchema}") } } + override def inferSchema( + sqlContext: SQLContext, + options: Map[String, String], + files: Seq[FileStatus]): Option[StructType] = { + Some( + StructType( + StructField("label", DoubleType, nullable = false) :: + StructField("features", new VectorUDT(), nullable = false) :: Nil)) + } - override def createRelation( + override def prepareWrite( sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - parameters: Map[String, String]): HadoopFsRelation = { - val path = if (paths.length == 1) paths(0) - else if (paths.isEmpty) throw new IOException("No input path specified for libsvm data") - else throw new IOException("Multiple input paths are not supported for libsvm data") - if (partitionColumns.isDefined && !partitionColumns.get.isEmpty) { - throw new IOException("Partition is not supported for libsvm data") + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { + new OutputWriterFactory { + override def newInstance( + path: String, + bucketId: Option[Int], + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter = { + if (bucketId.isDefined) { sys.error("LibSVM doesn't support bucketing") } + new LibSVMOutputWriter(path, dataSchema, context) + } + } + } + + override def buildInternalScan( + sqlContext: SQLContext, + dataSchema: StructType, + requiredColumns: Array[String], + filters: Array[Filter], + bucketSet: Option[BitSet], + inputFiles: Array[FileStatus], + broadcastedConf: Broadcast[SerializableConfiguration], + options: Map[String, String]): RDD[InternalRow] = { + // TODO: This does not handle cases where column pruning has been performed. 
+ + verifySchema(dataSchema) + val dataFiles = inputFiles.filterNot(_.getPath.getName startsWith "_") + + val path = if (dataFiles.length == 1) dataFiles(0).getPath.toUri.toString + else if (dataFiles.isEmpty) throw new IOException("No input path specified for libsvm data") + else throw new IOException("Multiple input paths are not supported for libsvm data.") + + val numFeatures = options.getOrElse("numFeatures", "-1").toInt + val vectorType = options.getOrElse("vectorType", "sparse") + + val sc = sqlContext.sparkContext + val baseRdd = MLUtils.loadLibSVMFile(sc, path, numFeatures) + val sparse = vectorType == "sparse" + baseRdd.map { pt => + val features = if (sparse) pt.features.toSparse else pt.features.toDense + Row(pt.label, features) + }.mapPartitions { externalRows => + val converter = RowEncoder(dataSchema) + externalRows.map(converter.toRow) } - dataSchema.foreach(verifySchema(_)) - val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt - val vectorType = parameters.getOrElse("vectorType", "sparse") - new LibSVMRelation(path, numFeatures, vectorType)(sqlContext) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala index 528d9e21cb1fd..84fc08be09ee7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala @@ -22,7 +22,7 @@ import java.io.{File, IOException} import com.google.common.base.Charsets import com.google.common.io.Files -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.SaveMode @@ -88,7 +88,8 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { val df = sqlContext.read.format("libsvm").load(path) val tempDir2 = Utils.createTempDir() val writepath = tempDir2.toURI.toString - df.write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) + // TODO: Remove requirement to coalesce by supporting mutiple reads. 
+ df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = sqlContext.read.format("libsvm").load(writepath) val row1 = df2.first() @@ -98,9 +99,8 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { test("write libsvm data failed due to invalid schema") { val df = sqlContext.read.format("text").load(path) - val e = intercept[IOException] { + val e = intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } - assert(e.getMessage.contains("Illegal schema for libsvm data")) } } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 983f71684c38b..45776fbb9f336 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -60,7 +60,11 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.sql.SQLContext.jsonRDD"), ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.sql.SQLContext.load"), ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.sql.SQLContext.dialectClassName"), - ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.sql.SQLContext.getSQLDialect") + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.sql.SQLContext.getSQLDialect"), + // SPARK-13664 Replace HadoopFsRelation with FileFormat + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.ml.source.libsvm.LibSVMRelation"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.HadoopFsRelationProvider"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.HadoopFsRelation$FileStatusCache") ) ++ Seq( ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.SparkContext.emptyRDD"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.broadcast.HttpBroadcastFactory") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index 92cf8d4c46bda..3d4a02b0ffebd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -103,7 +103,7 @@ object DataType { /** Given the string representation of a type, return its DataType */ private def nameToType(name: String): DataType = { - val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)""".r + val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r name match { case "decimal" => DecimalType.USER_DEFAULT case FIXED_DECIMAL(precision, scale) => DecimalType(precision.toInt, scale.toInt) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 20c861de23778..fd92e526e1529 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -21,18 +21,14 @@ import java.util.Properties import scala.collection.JavaConverters._ -import org.apache.hadoop.fs.Path -import org.apache.hadoop.util.StringUtils - import org.apache.spark.{Logging, Partition} import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.RDD +import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.execution.datasources.{LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.execution.datasources.jdbc.{JDBCPartition, JDBCPartitioningInfo, 
JDBCRelation} -import org.apache.spark.sql.execution.datasources.json.JSONRelation -import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation +import org.apache.spark.sql.execution.datasources.json.{InferSchema, JacksonParser, JSONOptions} import org.apache.spark.sql.execution.streaming.StreamingRelation import org.apache.spark.sql.types.StructType @@ -129,8 +125,6 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { val resolved = ResolvedDataSource( sqlContext, userSpecifiedSchema = userSpecifiedSchema, - partitionColumns = Array.empty[String], - bucketSpec = None, provider = source, options = extraOptions.toMap) DataFrame(sqlContext, LogicalRelation(resolved.relation)) @@ -154,7 +148,17 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { */ @scala.annotation.varargs def load(paths: String*): DataFrame = { - option("paths", paths.map(StringUtils.escapeString(_, '\\', ',')).mkString(",")).load() + if (paths.isEmpty) { + sqlContext.emptyDataFrame + } else { + sqlContext.baseRelationToDataFrame( + ResolvedDataSource.apply( + sqlContext, + paths = paths, + userSpecifiedSchema = userSpecifiedSchema, + provider = source, + options = extraOptions.toMap).relation) + } } /** @@ -334,14 +338,20 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { * @since 1.4.0 */ def json(jsonRDD: RDD[String]): DataFrame = { - sqlContext.baseRelationToDataFrame( - new JSONRelation( - Some(jsonRDD), - maybeDataSchema = userSpecifiedSchema, - maybePartitionSpec = None, - userDefinedPartitionColumns = None, - parameters = extraOptions.toMap)(sqlContext) - ) + val parsedOptions: JSONOptions = new JSONOptions(extraOptions.toMap) + val schema = userSpecifiedSchema.getOrElse { + InferSchema.infer(jsonRDD, sqlContext.conf.columnNameOfCorruptRecord, parsedOptions) + } + + new DataFrame( + sqlContext, + LogicalRDD( + schema.toAttributes, + JacksonParser.parse( + jsonRDD, + schema, + sqlContext.conf.columnNameOfCorruptRecord, + parsedOptions))(sqlContext)) } /** @@ -363,20 +373,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { */ @scala.annotation.varargs def parquet(paths: String*): DataFrame = { - if (paths.isEmpty) { - sqlContext.emptyDataFrame - } else { - val globbedPaths = paths.flatMap { path => - val hdfsPath = new Path(path) - val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - SparkHadoopUtil.get.globPathIfNecessary(qualified) - }.toArray - - sqlContext.baseRelationToDataFrame( - new ParquetRelation( - globbedPaths.map(_.toString), userSpecifiedSchema, None, extraOptions.toMap)(sqlContext)) - } + format("parquet").load(paths: _*) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index c373606a2e07e..6d8c8f6b4f979 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -366,13 +366,6 @@ final class DataFrameWriter private[sql](df: DataFrame) { case (true, SaveMode.ErrorIfExists) => throw new AnalysisException(s"Table $tableIdent already exists.") - case (true, SaveMode.Append) => - // If it is Append, we just ask insertInto to handle it. We will not use insertInto - // to handle saveAsTable with Overwrite because saveAsTable can change the schema of - // the table. 
But, insertInto with Overwrite requires the schema of data be the same - // the schema of the table. - insertInto(tableIdent) - case _ => val cmd = CreateTableUsingAsSelect( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index 36e656b8b6abf..4ad07508ca429 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, UnknownPartitioning} -import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation +import org.apache.spark.sql.execution.datasources.parquet.{DefaultSource => ParquetSource} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{BaseRelation, HadoopFsRelation} @@ -226,16 +226,17 @@ private[sql] object PhysicalRDD { rdd: RDD[InternalRow], relation: BaseRelation, metadata: Map[String, String] = Map.empty): PhysicalRDD = { - val outputUnsafeRows = if (relation.isInstanceOf[ParquetRelation]) { - // The vectorized parquet reader does not produce unsafe rows. - !SQLContext.getActive().get.conf.getConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED) - } else { - // All HadoopFsRelations output UnsafeRows - relation.isInstanceOf[HadoopFsRelation] + + val outputUnsafeRows = relation match { + case r: HadoopFsRelation if r.fileFormat.isInstanceOf[ParquetSource] => + !SQLContext.getActive().get.conf.getConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED) + case _: HadoopFsRelation => true + case _ => false } val bucketSpec = relation match { - case r: HadoopFsRelation => r.getBucketSpec + // TODO: this should be closer to bucket planning. 
+ case r: HadoopFsRelation if r.sqlContext.conf.bucketingEnabled() => r.bucketSpec case _ => None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 69a6d23203b93..2944a8f86f169 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -25,12 +25,14 @@ import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.PhysicalRDD.{INPUT_PATHS, PUSHED_FILTERS} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.ExecutedCommand @@ -41,6 +43,45 @@ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{SerializableConfiguration, Utils} import org.apache.spark.util.collection.BitSet +/** + * Replaces generic operations with specific variants that are designed to work with Spark + * SQL Data Sources. + */ +private[sql] object DataSourceAnalysis extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case i @ logical.InsertIntoTable( + l @ LogicalRelation(t: HadoopFsRelation, _, _), part, query, overwrite, false) + if query.resolved && t.schema.asNullable == query.schema.asNullable => + + // Sanity checks + if (t.location.paths.size != 1) { + throw new AnalysisException( + "Can only write data to relations with a single path.") + } + + val outputPath = t.location.paths.head + val inputPaths = query.collect { + case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.paths + }.flatten + + val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append + if (overwrite && inputPaths.contains(outputPath)) { + throw new AnalysisException( + "Cannot overwrite a path that is also being read from.") + } + + InsertIntoHadoopFsRelation( + outputPath, + t.partitionSchema.fields.map(_.name).map(UnresolvedAttribute(_)), + t.bucketSpec, + t.fileFormat, + () => t.refresh(), + t.options, + query, + mode) + } +} + /** * A Strategy for planning scans over data sources defined using the sources API. 
*/ @@ -70,10 +111,10 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { // Scanning partitioned HadoopFsRelation case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _, _)) - if t.partitionSpec.partitionColumns.nonEmpty => + if t.partitionSchema.nonEmpty => // We divide the filter expressions into 3 parts val partitionColumns = AttributeSet( - t.partitionColumns.map(c => l.output.find(_.name == c.name).get)) + t.partitionSchema.map(c => l.output.find(_.name == c.name).get)) // Only pruning the partition keys val partitionFilters = filters.filter(_.references.subsetOf(partitionColumns)) @@ -104,15 +145,15 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { // Prune the buckets based on the pushed filters that do not contain partitioning key // since the bucketing key is not allowed to use the columns in partitioning key - val bucketSet = getBuckets(pushedFilters, t.getBucketSpec) - + val bucketSet = getBuckets(pushedFilters, t.bucketSpec) val scan = buildPartitionedTableScan( l, partitionAndNormalColumnProjs, pushedFilters, bucketSet, t.partitionSpec.partitionColumns, - selectedPartitions) + selectedPartitions, + t.options) // Add a Projection to guarantee the original projection: // this is because "partitionAndNormalColumnAttrs" may be different @@ -127,6 +168,9 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { } ).getOrElse(scan) :: Nil + // TODO: The code for planning bucketed/unbucketed/partitioned/unpartitioned tables contains + // a lot of duplication and produces overly complicated RDDs. + // Scanning non-partitioned HadoopFsRelation case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _, _)) => // See buildPartitionedTableScan for the reason that we need to create a shard @@ -134,14 +178,65 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { val sharedHadoopConf = SparkHadoopUtil.get.conf val confBroadcast = t.sqlContext.sparkContext.broadcast(new SerializableConfiguration(sharedHadoopConf)) - // Prune the buckets based on the filters - val bucketSet = getBuckets(filters, t.getBucketSpec) - pruneFilterProject( - l, - projects, - filters, - (a, f) => - t.buildInternalScan(a.map(_.name).toArray, f, bucketSet, t.paths, confBroadcast)) :: Nil + + t.bucketSpec match { + case Some(spec) if t.sqlContext.conf.bucketingEnabled() => + val scanBuilder: (Seq[Attribute], Array[Filter]) => RDD[InternalRow] = { + (requiredColumns: Seq[Attribute], filters: Array[Filter]) => { + val bucketed = + t.location + .allFiles() + .filterNot(_.getPath.getName startsWith "_") + .groupBy { f => + BucketingUtils + .getBucketId(f.getPath.getName) + .getOrElse(sys.error(s"Invalid bucket file ${f.getPath}")) + } + + val bucketedDataMap = bucketed.mapValues { bucketFiles => + t.fileFormat.buildInternalScan( + t.sqlContext, + t.dataSchema, + requiredColumns.map(_.name).toArray, + filters, + None, + bucketFiles.toArray, + confBroadcast, + t.options).coalesce(1) + } + + val bucketedRDD = new UnionRDD(t.sqlContext.sparkContext, + (0 until spec.numBuckets).map { bucketId => + bucketedDataMap.get(bucketId).getOrElse { + t.sqlContext.emptyResult: RDD[InternalRow] + } + }) + bucketedRDD + } + } + + pruneFilterProject( + l, + projects, + filters, + scanBuilder) :: Nil + + case _ => + pruneFilterProject( + l, + projects, + filters, + (a, f) => + t.fileFormat.buildInternalScan( + t.sqlContext, + t.dataSchema, + a.map(_.name).toArray, + f, + None, + 
t.location.allFiles().toArray, + confBroadcast, + t.options)) :: Nil + } case l @ LogicalRelation(baseRelation: TableScan, _, _) => execution.PhysicalRDD.createFromDataSource( @@ -151,11 +246,6 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { part, query, overwrite, false) if part.isEmpty => ExecutedCommand(InsertIntoDataSource(l, query, overwrite)) :: Nil - case i @ logical.InsertIntoTable( - l @ LogicalRelation(t: HadoopFsRelation, _, _), part, query, overwrite, false) => - val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append - ExecutedCommand(InsertIntoHadoopFsRelation(t, query, mode)) :: Nil - case _ => Nil } @@ -165,7 +255,8 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { filters: Seq[Expression], buckets: Option[BitSet], partitionColumns: StructType, - partitions: Array[Partition]): SparkPlan = { + partitions: Array[Partition], + options: Map[String, String]): SparkPlan = { val relation = logicalRelation.relation.asInstanceOf[HadoopFsRelation] // Because we are creating one RDD per partition, we need to have a shared HadoopConf. @@ -177,36 +268,86 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { // Now, we create a scan builder, which will be used by pruneFilterProject. This scan builder // will union all partitions and attach partition values if needed. - val scanBuilder = { + val scanBuilder: (Seq[Attribute], Array[Filter]) => RDD[InternalRow] = { (requiredColumns: Seq[Attribute], filters: Array[Filter]) => { - val requiredDataColumns = - requiredColumns.filterNot(c => partitionColumnNames.contains(c.name)) - - // Builds RDD[Row]s for each selected partition. - val perPartitionRows = partitions.map { case Partition(partitionValues, dir) => - // Don't scan any partition columns to save I/O. Here we are being optimistic and - // assuming partition columns data stored in data files are always consistent with those - // partition values encoded in partition directory paths. - val dataRows = relation.buildInternalScan( - requiredDataColumns.map(_.name).toArray, filters, buckets, Array(dir), confBroadcast) - - // Merges data values with partition values. - mergeWithPartitionValues( - requiredColumns, - requiredDataColumns, - partitionColumns, - partitionValues, - dataRows) - } - val unionedRows = - if (perPartitionRows.length == 0) { - relation.sqlContext.emptyResult - } else { - new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows) - } + relation.bucketSpec match { + case Some(spec) if relation.sqlContext.conf.bucketingEnabled() => + val requiredDataColumns = + requiredColumns.filterNot(c => partitionColumnNames.contains(c.name)) + + // Builds RDD[Row]s for each selected partition. + val perPartitionRows: Seq[(Int, RDD[InternalRow])] = partitions.flatMap { + case Partition(partitionValues, dir) => + val files = relation.location.getStatus(dir) + val bucketed = files.groupBy { f => + BucketingUtils + .getBucketId(f.getPath.getName) + .getOrElse(sys.error(s"Invalid bucket file ${f.getPath}")) + } + + bucketed.map { bucketFiles => + // Don't scan any partition columns to save I/O. Here we are being optimistic and + // assuming partition columns data stored in data files are always consistent with + // those partition values encoded in partition directory paths. 
+ val dataRows = relation.fileFormat.buildInternalScan( + relation.sqlContext, + relation.dataSchema, + requiredDataColumns.map(_.name).toArray, + filters, + buckets, + bucketFiles._2, + confBroadcast, + options) + + // Merges data values with partition values. + bucketFiles._1 -> mergeWithPartitionValues( + requiredColumns, + requiredDataColumns, + partitionColumns, + partitionValues, + dataRows) + } + } - unionedRows + val bucketedDataMap: Map[Int, Seq[RDD[InternalRow]]] = + perPartitionRows.groupBy(_._1).mapValues(_.map(_._2)) + + val bucketed = new UnionRDD(relation.sqlContext.sparkContext, + (0 until spec.numBuckets).map { bucketId => + bucketedDataMap.get(bucketId).map(i => i.reduce(_ ++ _).coalesce(1)).getOrElse { + relation.sqlContext.emptyResult: RDD[InternalRow] + } + }) + bucketed + + case _ => + val requiredDataColumns = + requiredColumns.filterNot(c => partitionColumnNames.contains(c.name)) + + // Builds RDD[Row]s for each selected partition. + val perPartitionRows = partitions.map { + case Partition(partitionValues, dir) => + val dataRows = relation.fileFormat.buildInternalScan( + relation.sqlContext, + relation.dataSchema, + requiredDataColumns.map(_.name).toArray, + filters, + buckets, + relation.location.getStatus(dir), + confBroadcast, + options) + + // Merges data values with partition values. + mergeWithPartitionValues( + requiredColumns, + requiredDataColumns, + partitionColumns, + partitionValues, + dataRows) + } + new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows) + } } } @@ -477,7 +618,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { } relation.relation match { - case r: HadoopFsRelation => pairs += INPUT_PATHS -> r.paths.mkString(", ") + case r: HadoopFsRelation => pairs += INPUT_PATHS -> r.location.paths.mkString(", ") case _ => } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala index d4cc20b06fd3d..fb52730104f58 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala @@ -25,8 +25,8 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.spark._ import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.RunnableCommand @@ -34,7 +34,6 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.util.Utils - /** * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. * Writing to dynamic partitions is also supported. Each [[InsertIntoHadoopFsRelation]] issues a @@ -58,18 +57,29 @@ import org.apache.spark.util.Utils * thrown during job commitment, also aborts the job. 
*/ private[sql] case class InsertIntoHadoopFsRelation( - @transient relation: HadoopFsRelation, + outputPath: Path, + partitionColumns: Seq[Attribute], + bucketSpec: Option[BucketSpec], + fileFormat: FileFormat, + refreshFunction: () => Unit, + options: Map[String, String], @transient query: LogicalPlan, mode: SaveMode) extends RunnableCommand { + override def children: Seq[LogicalPlan] = query :: Nil + override def run(sqlContext: SQLContext): Seq[Row] = { - require( - relation.paths.length == 1, - s"Cannot write to multiple destinations: ${relation.paths.mkString(",")}") + // Most formats don't do well with duplicate columns, so lets not allow that + if (query.schema.fieldNames.length != query.schema.fieldNames.distinct.length) { + val duplicateColumns = query.schema.fieldNames.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => "\"" + x + "\"" + }.mkString(", ") + throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + + s"cannot save to file.") + } val hadoopConf = sqlContext.sparkContext.hadoopConfiguration - val outputPath = new Path(relation.paths.head) val fs = outputPath.getFileSystem(hadoopConf) val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory) @@ -101,45 +111,28 @@ private[sql] case class InsertIntoHadoopFsRelation( job.setOutputValueClass(classOf[InternalRow]) FileOutputFormat.setOutputPath(job, qualifiedOutputPath) - // A partitioned relation schema's can be different from the input logicalPlan, since - // partition columns are all moved after data column. We Project to adjust the ordering. - // TODO: this belongs in the analyzer. - val project = Project( - relation.schema.map(field => UnresolvedAttribute.quoted(field.name)), query) - val queryExecution = DataFrame(sqlContext, project).queryExecution + val partitionSet = AttributeSet(partitionColumns) + val dataColumns = query.output.filterNot(partitionSet.contains) + val queryExecution = DataFrame(sqlContext, query).queryExecution SQLExecution.withNewExecutionId(sqlContext, queryExecution) { - val df = sqlContext.internalCreateDataFrame(queryExecution.toRdd, relation.schema) - val partitionColumns = relation.partitionColumns.fieldNames - - // Some pre-flight checks. - require( - df.schema == relation.schema, - s"""DataFrame must have the same schema as the relation to which is inserted. - |DataFrame schema: ${df.schema} - |Relation schema: ${relation.schema} - """.stripMargin) - val partitionColumnsInSpec = relation.partitionColumns.fieldNames - require( - partitionColumnsInSpec.sameElements(partitionColumns), - s"""Partition columns mismatch. 
- |Expected: ${partitionColumnsInSpec.mkString(", ")} - |Actual: ${partitionColumns.mkString(", ")} - """.stripMargin) - - val writerContainer = if (partitionColumns.isEmpty && relation.maybeBucketSpec.isEmpty) { + val relation = + WriteRelation( + sqlContext, + dataColumns.toStructType, + qualifiedOutputPath.toString, + fileFormat.prepareWrite(sqlContext, _, options, dataColumns.toStructType), + bucketSpec) + + val writerContainer = if (partitionColumns.isEmpty && bucketSpec.isEmpty) { new DefaultWriterContainer(relation, job, isAppend) } else { - val output = df.queryExecution.executedPlan.output - val (partitionOutput, dataOutput) = - output.partition(a => partitionColumns.contains(a.name)) - new DynamicPartitionWriterContainer( relation, job, - partitionOutput, - dataOutput, - output, + partitionColumns = partitionColumns, + dataColumns = dataColumns, + inputSchema = query.output, PartitioningUtils.DEFAULT_PARTITION_NAME, sqlContext.conf.getConf(SQLConf.PARTITION_MAX_FILES), isAppend) @@ -150,9 +143,9 @@ private[sql] case class InsertIntoHadoopFsRelation( writerContainer.driverSideSetup() try { - sqlContext.sparkContext.runJob(df.queryExecution.toRdd, writerContainer.writeRows _) + sqlContext.sparkContext.runJob(queryExecution.toRdd, writerContainer.writeRows _) writerContainer.commitJob() - relation.refresh() + refreshFunction() } catch { case cause: Throwable => logError("Aborting job.", cause) writerContainer.abortJob() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 65a715caf1cee..eda3c366745ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -32,7 +32,12 @@ import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.types._ -private[sql] case class Partition(values: InternalRow, path: String) +object Partition { + def apply(values: InternalRow, path: String): Partition = + apply(values, new Path(path)) +} + +private[sql] case class Partition(values: InternalRow, path: Path) private[sql] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition]) @@ -102,7 +107,8 @@ private[sql] object PartitioningUtils { // It will be recognised as conflicting directory structure: // "hdfs://host:9000/invalidPath" // "hdfs://host:9000/path" - val discoveredBasePaths = optDiscoveredBasePaths.flatMap(x => x) + // TODO: Selective case sensitivity. + val discoveredBasePaths = optDiscoveredBasePaths.flatMap(x => x).map(_.toString.toLowerCase()) assert( discoveredBasePaths.distinct.size == 1, "Conflicting directory structures detected. Suspicious paths:\b" + @@ -127,7 +133,7 @@ private[sql] object PartitioningUtils { // Finally, we create `Partition`s based on paths and resolved partition values. val partitions = resolvedPartitionValues.zip(pathsWithPartitionValues).map { case (PartitionValues(_, literals), (path, _)) => - Partition(InternalRow.fromSeq(literals.map(_.value)), path.toString) + Partition(InternalRow.fromSeq(literals.map(_.value)), path) } PartitionSpec(StructType(fields), partitions) @@ -242,7 +248,9 @@ private[sql] object PartitioningUtils { if (pathsWithPartitionValues.isEmpty) { Seq.empty } else { - val distinctPartColNames = pathsWithPartitionValues.map(_._2.columnNames).distinct + // TODO: Selective case sensitivity. 
+ val distinctPartColNames = + pathsWithPartitionValues.map(_._2.columnNames.map(_.toLowerCase())).distinct assert( distinctPartColNames.size == 1, listConflictingPartitionColumns(pathsWithPartitionValues)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala index eec9070beed65..8dd975ed4123b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala @@ -24,19 +24,23 @@ import scala.language.{existentials, implicitConversions} import scala.util.{Failure, Success, Try} import org.apache.hadoop.fs.Path -import org.apache.hadoop.util.StringUtils import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} -import org.apache.spark.sql.execution.streaming.{Sink, Source} +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.execution.streaming.{FileStreamSource, Sink, Source} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{CalendarIntervalType, StructType} import org.apache.spark.util.Utils case class ResolvedDataSource(provider: Class[_], relation: BaseRelation) - +/** + * Responsible for taking a description of a datasource (either from + * [[org.apache.spark.sql.DataFrameReader]], or a metastore) and converting it into a logical + * relation that can be used in a query plan. + */ object ResolvedDataSource extends Logging { /** A map to maintain backward compatibility in case we move data sources around. */ @@ -92,19 +96,61 @@ object ResolvedDataSource extends Logging { } } + // TODO: Combine with apply? def createSource( sqlContext: SQLContext, userSpecifiedSchema: Option[StructType], providerName: String, options: Map[String, String]): Source = { val provider = lookupDataSource(providerName).newInstance() match { - case s: StreamSourceProvider => s + case s: StreamSourceProvider => + s.createSource(sqlContext, userSpecifiedSchema, providerName, options) + + case format: FileFormat => + val caseInsensitiveOptions = new CaseInsensitiveMap(options) + val path = caseInsensitiveOptions.getOrElse("path", { + throw new IllegalArgumentException("'path' is not specified") + }) + val metadataPath = caseInsensitiveOptions.getOrElse("metadataPath", s"$path/_metadata") + + val allPaths = caseInsensitiveOptions.get("path") + val globbedPaths = allPaths.toSeq.flatMap { path => + val hdfsPath = new Path(path) + val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + SparkHadoopUtil.get.globPathIfNecessary(qualified) + }.toArray + + val fileCatalog: FileCatalog = new HDFSFileCatalog(sqlContext, options, globbedPaths) + val dataSchema = userSpecifiedSchema.orElse { + format.inferSchema( + sqlContext, + caseInsensitiveOptions, + fileCatalog.allFiles()) + }.getOrElse { + throw new AnalysisException("Unable to infer schema. 
It must be specified manually.") + } + + def dataFrameBuilder(files: Array[String]): DataFrame = { + new DataFrame( + sqlContext, + LogicalRelation( + apply( + sqlContext, + paths = files, + userSpecifiedSchema = Some(dataSchema), + provider = providerName, + options = options.filterKeys(_ != "path")).relation)) + } + + new FileStreamSource( + sqlContext, metadataPath, path, Some(dataSchema), providerName, dataFrameBuilder) case _ => throw new UnsupportedOperationException( s"Data source $providerName does not support streamed reading") } - provider.createSource(sqlContext, userSpecifiedSchema, providerName, options) + provider } def createSink( @@ -125,98 +171,72 @@ object ResolvedDataSource extends Logging { /** Create a [[ResolvedDataSource]] for reading data in. */ def apply( sqlContext: SQLContext, - userSpecifiedSchema: Option[StructType], - partitionColumns: Array[String], - bucketSpec: Option[BucketSpec], + paths: Seq[String] = Nil, + userSpecifiedSchema: Option[StructType] = None, + partitionColumns: Array[String] = Array.empty, + bucketSpec: Option[BucketSpec] = None, provider: String, options: Map[String, String]): ResolvedDataSource = { val clazz: Class[_] = lookupDataSource(provider) def className: String = clazz.getCanonicalName - val relation = userSpecifiedSchema match { - case Some(schema: StructType) => clazz.newInstance() match { - case dataSource: SchemaRelationProvider => - val caseInsensitiveOptions = new CaseInsensitiveMap(options) - if (caseInsensitiveOptions.contains("paths")) { - throw new AnalysisException(s"$className does not support paths option.") - } - dataSource.createRelation(sqlContext, caseInsensitiveOptions, schema) - case dataSource: HadoopFsRelationProvider => - val maybePartitionsSchema = if (partitionColumns.isEmpty) { - None - } else { - Some(partitionColumnsSchema( - schema, partitionColumns, sqlContext.conf.caseSensitiveAnalysis)) - } - val caseInsensitiveOptions = new CaseInsensitiveMap(options) - val paths = { - if (caseInsensitiveOptions.contains("paths") && - caseInsensitiveOptions.contains("path")) { - throw new AnalysisException(s"Both path and paths options are present.") - } - caseInsensitiveOptions.get("paths") - .map(_.split("(? - val hdfsPath = new Path(pathString) - val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString) - } - } + val caseInsensitiveOptions = new CaseInsensitiveMap(options) + val relation = (clazz.newInstance(), userSpecifiedSchema) match { + // TODO: Throw when too much is given. 
+ case (dataSource: SchemaRelationProvider, Some(schema)) => + dataSource.createRelation(sqlContext, caseInsensitiveOptions, schema) + case (dataSource: RelationProvider, None) => + dataSource.createRelation(sqlContext, caseInsensitiveOptions) + case (_: SchemaRelationProvider, None) => + throw new AnalysisException(s"A schema needs to be specified when using $className.") + case (_: RelationProvider, Some(_)) => + throw new AnalysisException(s"$className does not allow user-specified schemas.") - val dataSchema = - StructType(schema.filterNot(f => partitionColumns.contains(f.name))).asNullable + case (format: FileFormat, _) => + val allPaths = caseInsensitiveOptions.get("path") ++ paths + val globbedPaths = allPaths.flatMap { path => + val hdfsPath = new Path(path) + val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + SparkHadoopUtil.get.globPathIfNecessary(qualified) + }.toArray - dataSource.createRelation( + val fileCatalog: FileCatalog = new HDFSFileCatalog(sqlContext, options, globbedPaths) + val dataSchema = userSpecifiedSchema.orElse { + format.inferSchema( sqlContext, - paths, - Some(dataSchema), - maybePartitionsSchema, - bucketSpec, - caseInsensitiveOptions) - case dataSource: org.apache.spark.sql.sources.RelationProvider => - throw new AnalysisException(s"$className does not allow user-specified schemas.") - case _ => - throw new AnalysisException(s"$className is not a RelationProvider.") - } - - case None => clazz.newInstance() match { - case dataSource: RelationProvider => - val caseInsensitiveOptions = new CaseInsensitiveMap(options) - if (caseInsensitiveOptions.contains("paths")) { - throw new AnalysisException(s"$className does not support paths option.") - } - dataSource.createRelation(sqlContext, caseInsensitiveOptions) - case dataSource: HadoopFsRelationProvider => - val caseInsensitiveOptions = new CaseInsensitiveMap(options) - val paths = { - if (caseInsensitiveOptions.contains("paths") && - caseInsensitiveOptions.contains("path")) { - throw new AnalysisException(s"Both path and paths options are present.") - } - caseInsensitiveOptions.get("paths") - .map(_.split("(? - val hdfsPath = new Path(pathString) - val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString) - } - } - dataSource.createRelation(sqlContext, paths, None, None, None, caseInsensitiveOptions) - case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => + caseInsensitiveOptions, + fileCatalog.allFiles()) + }.getOrElse { throw new AnalysisException( - s"A schema needs to be specified when using $className.") - case _ => - throw new AnalysisException( - s"$className is neither a RelationProvider nor a FSBasedRelationProvider.") - } + s"Unable to infer schema for $format at ${allPaths.take(2).mkString(",")}. " + + "It must be specified manually") + } + + // If they gave a schema, then we try and figure out the types of the partition columns + // from that schema. + val partitionSchema = userSpecifiedSchema.map { schema => + StructType( + partitionColumns.map { c => + // TODO: Case sensitivity. 
+ schema + .find(_.name.toLowerCase() == c.toLowerCase()) + .getOrElse(throw new AnalysisException(s"Invalid partition column '$c'")) + }) + }.getOrElse(fileCatalog.partitionSpec(None).partitionColumns) + + HadoopFsRelation( + sqlContext, + fileCatalog, + partitionSchema = partitionSchema, + dataSchema = dataSchema.asNullable, + bucketSpec = bucketSpec, + format, + options) + + case _ => + throw new AnalysisException( + s"$className is not a valid Spark SQL Data Source.") } new ResolvedDataSource(clazz, relation) } @@ -254,10 +274,10 @@ object ResolvedDataSource extends Logging { throw new AnalysisException("Cannot save interval data type into external storage.") } val clazz: Class[_] = lookupDataSource(provider) - val relation = clazz.newInstance() match { + clazz.newInstance() match { case dataSource: CreatableRelationProvider => dataSource.createRelation(sqlContext, mode, options, data) - case dataSource: HadoopFsRelationProvider => + case format: FileFormat => // Don't glob path for the write path. The contracts here are: // 1. Only one output path can be specified on the write path; // 2. Output path must be a legal HDFS style file system path; @@ -278,26 +298,63 @@ object ResolvedDataSource extends Logging { val equality = columnNameEquality(caseSensitive) val dataSchema = StructType( data.schema.filterNot(f => partitionColumns.exists(equality(_, f.name)))) - val r = dataSource.createRelation( - sqlContext, - Array(outputPath.toString), - Some(dataSchema.asNullable), - Some(partitionColumnsSchema(data.schema, partitionColumns, caseSensitive)), - bucketSpec, - caseInsensitiveOptions) + + // If we are appending to a table that already exists, make sure the partitioning matches + // up. If we fail to load the table for whatever reason, ignore the check. + if (mode == SaveMode.Append) { + val existingPartitionColumnSet = try { + val resolved = apply( + sqlContext, + userSpecifiedSchema = Some(data.schema.asNullable), + provider = provider, + options = options) + + Some(resolved.relation + .asInstanceOf[HadoopFsRelation] + .location + .partitionSpec(None) + .partitionColumns + .fieldNames + .toSet) + } catch { + case e: Exception => + None + } + + existingPartitionColumnSet.foreach { ex => + if (ex.map(_.toLowerCase) != partitionColumns.map(_.toLowerCase()).toSet) { + throw new AnalysisException( + s"Requested partitioning does not equal existing partitioning: " + + s"$ex != ${partitionColumns.toSet}.") + } + } + } // For partitioned relation r, r.schema's column ordering can be different from the column // ordering of data.logicalPlan (partition columns are all moved after data column). This // will be adjusted within InsertIntoHadoopFsRelation. - sqlContext.executePlan( + val plan = InsertIntoHadoopFsRelation( - r, + outputPath, + partitionColumns.map(UnresolvedAttribute.quoted), + bucketSpec, + format, + () => Unit, // No existing table needs to be refreshed. 
+ options, data.logicalPlan, - mode)).toRdd - r + mode) + sqlContext.executePlan(plan).toRdd + case _ => sys.error(s"${clazz.getCanonicalName} does not allow create table as select.") } - ResolvedDataSource(clazz, relation) + + apply( + sqlContext, + userSpecifiedSchema = Some(data.schema.asNullable), + partitionColumns = partitionColumns, + bucketSpec = bucketSpec, + provider = provider, + options = options) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 3653aca994f78..d8aad5efe39d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark._ import org.apache.spark.mapred.SparkHadoopMapRedUtil +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.catalyst.InternalRow @@ -35,9 +36,16 @@ import org.apache.spark.sql.sources.{HadoopFsRelation, OutputWriter, OutputWrite import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.util.SerializableConfiguration +/** A container for all the details required when writing to a table. */ +case class WriteRelation( + sqlContext: SQLContext, + dataSchema: StructType, + path: String, + prepareJobForWrite: Job => OutputWriterFactory, + bucketSpec: Option[BucketSpec]) private[sql] abstract class BaseWriterContainer( - @transient val relation: HadoopFsRelation, + @transient val relation: WriteRelation, @transient private val job: Job, isAppend: Boolean) extends Logging with Serializable { @@ -67,12 +75,7 @@ private[sql] abstract class BaseWriterContainer( @transient private var taskAttemptId: TaskAttemptID = _ @transient protected var taskAttemptContext: TaskAttemptContext = _ - protected val outputPath: String = { - assert( - relation.paths.length == 1, - s"Cannot write to multiple destinations: ${relation.paths.mkString(",")}") - relation.paths.head - } + protected val outputPath: String = relation.path protected var outputWriterFactory: OutputWriterFactory = _ @@ -237,7 +240,7 @@ private[sql] abstract class BaseWriterContainer( * A writer that writes all of the rows in a partition to a single file. */ private[sql] class DefaultWriterContainer( - relation: HadoopFsRelation, + relation: WriteRelation, job: Job, isAppend: Boolean) extends BaseWriterContainer(relation, job, isAppend) { @@ -299,7 +302,7 @@ private[sql] class DefaultWriterContainer( * writer externally sorts the remaining rows and then writes out them out one file at a time. 
*/ private[sql] class DynamicPartitionWriterContainer( - relation: HadoopFsRelation, + relation: WriteRelation, job: Job, partitionColumns: Seq[Attribute], dataColumns: Seq[Attribute], @@ -309,7 +312,7 @@ private[sql] class DynamicPartitionWriterContainer( isAppend: Boolean) extends BaseWriterContainer(relation, job, isAppend) { - private val bucketSpec = relation.maybeBucketSpec + private val bucketSpec = relation.bucketSpec private val bucketColumns: Seq[Attribute] = bucketSpec.toSeq.flatMap { spec => spec.bucketColumnNames.map(c => inputSchema.find(_.name == c).get) @@ -374,7 +377,6 @@ private[sql] class DynamicPartitionWriterContainer( // We should first sort by partition columns, then bucket id, and finally sorting columns. val sortingExpressions: Seq[Expression] = partitionColumns ++ bucketIdExpression ++ sortColumns - val getSortingKey = UnsafeProjection.create(sortingExpressions, inputSchema) val sortingKeySchema = StructType(sortingExpressions.map { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala index 3e0d484b74cfe..6008d73717f77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala @@ -17,12 +17,6 @@ package org.apache.spark.sql.execution.datasources -import org.apache.hadoop.mapreduce.TaskAttemptContext - -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.sources.{HadoopFsRelation, HadoopFsRelationProvider, OutputWriter, OutputWriterFactory} -import org.apache.spark.sql.types.StructType - /** * A container for bucketing information. * Bucketing is a technology for decomposing data sets into more manageable parts, and the number @@ -37,24 +31,6 @@ private[sql] case class BucketSpec( bucketColumnNames: Seq[String], sortColumnNames: Seq[String]) -private[sql] trait BucketedHadoopFsRelationProvider extends HadoopFsRelationProvider { - final override def createRelation( - sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - parameters: Map[String, String]): HadoopFsRelation = - throw new UnsupportedOperationException("use the overload version with bucketSpec parameter") -} - -private[sql] abstract class BucketedOutputWriterFactory extends OutputWriterFactory { - final override def newInstance( - path: String, - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = - throw new UnsupportedOperationException("use the overload version with bucketSpec parameter") -} - private[sql] object BucketingUtils { // The file name of bucketed data should have 3 parts: // 1. 
some other information in the head of file name diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala index d2d7996f56273..d7ce9a0ce8894 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala @@ -17,151 +17,21 @@ package org.apache.spark.sql.execution.datasources.csv -import java.nio.charset.Charset - import scala.util.control.NonFatal -import com.google.common.base.Objects import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hadoop.io.{LongWritable, NullWritable, Text} -import org.apache.hadoop.mapred.TextInputFormat -import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.RecordWriter +import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.CompressionCodecs import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ -private[sql] class CSVRelation( - private val inputRDD: Option[RDD[String]], - override val paths: Array[String] = Array.empty[String], - private val maybeDataSchema: Option[StructType], - override val userDefinedPartitionColumns: Option[StructType], - private val parameters: Map[String, String]) - (@transient val sqlContext: SQLContext) extends HadoopFsRelation { - - override lazy val dataSchema: StructType = maybeDataSchema match { - case Some(structType) => structType - case None => inferSchema(paths) - } - - private val options = new CSVOptions(parameters) - - @transient - private var cachedRDD: Option[RDD[String]] = None - - private def readText(location: String): RDD[String] = { - if (Charset.forName(options.charset) == Charset.forName("UTF-8")) { - sqlContext.sparkContext.textFile(location) - } else { - val charset = options.charset - sqlContext.sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](location) - .mapPartitions { _.map { pair => - new String(pair._2.getBytes, 0, pair._2.getLength, charset) - } - } - } - } - - private def baseRdd(inputPaths: Array[String]): RDD[String] = { - inputRDD.getOrElse { - cachedRDD.getOrElse { - val rdd = readText(inputPaths.mkString(",")) - cachedRDD = Some(rdd) - rdd - } - } - } - - private def tokenRdd(header: Array[String], inputPaths: Array[String]): RDD[Array[String]] = { - val rdd = baseRdd(inputPaths) - // Make sure firstLine is materialized before sending to executors - val firstLine = if (options.headerFlag) findFirstLine(rdd) else null - CSVRelation.univocityTokenizer(rdd, header, firstLine, options) - } - - /** - * This supports to eliminate unneeded columns before producing an RDD - * containing all of its tuples as Row objects. This reads all the tokens of each line - * and then drop unneeded tokens without casting and type-checking by mapping - * both the indices produced by `requiredColumns` and the ones of tokens. 
- * TODO: Switch to using buildInternalScan - */ - override def buildScan(requiredColumns: Array[String], inputs: Array[FileStatus]): RDD[Row] = { - val pathsString = inputs.map(_.getPath.toUri.toString) - val header = schema.fields.map(_.name) - val tokenizedRdd = tokenRdd(header, pathsString) - CSVRelation.parseCsv(tokenizedRdd, schema, requiredColumns, inputs, sqlContext, options) - } - - override def prepareJobForWrite(job: Job): OutputWriterFactory = { - val conf = job.getConfiguration - options.compressionCodec.foreach { codec => - CompressionCodecs.setCodecConfiguration(conf, codec) - } - - new CSVOutputWriterFactory(options) - } - - override def hashCode(): Int = Objects.hashCode(paths.toSet, dataSchema, schema, partitionColumns) - - override def equals(other: Any): Boolean = other match { - case that: CSVRelation => { - val equalPath = paths.toSet == that.paths.toSet - val equalDataSchema = dataSchema == that.dataSchema - val equalSchema = schema == that.schema - val equalPartitionColums = partitionColumns == that.partitionColumns - - equalPath && equalDataSchema && equalSchema && equalPartitionColums - } - case _ => false - } - - private def inferSchema(paths: Array[String]): StructType = { - val rdd = baseRdd(paths) - val firstLine = findFirstLine(rdd) - val firstRow = new LineCsvReader(options).parseLine(firstLine) - - val header = if (options.headerFlag) { - firstRow - } else { - firstRow.zipWithIndex.map { case (value, index) => s"C$index" } - } - - val parsedRdd = tokenRdd(header, paths) - if (options.inferSchemaFlag) { - CSVInferSchema.infer(parsedRdd, header, options.nullValue) - } else { - // By default fields are assumed to be StringType - val schemaFields = header.map { fieldName => - StructField(fieldName.toString, StringType, nullable = true) - } - StructType(schemaFields) - } - } - - /** - * Returns the first line of the first non-empty file in path - */ - private def findFirstLine(rdd: RDD[String]): String = { - if (options.isCommentSet) { - val comment = options.comment.toString - rdd.filter { line => - line.trim.nonEmpty && !line.startsWith(comment) - }.first() - } else { - rdd.filter { line => - line.trim.nonEmpty - }.first() - } - } -} - object CSVRelation extends Logging { def univocityTokenizer( @@ -246,8 +116,10 @@ object CSVRelation extends Logging { private[sql] class CSVOutputWriterFactory(params: CSVOptions) extends OutputWriterFactory { override def newInstance( path: String, + bucketId: Option[Int], dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { + if (bucketId.isDefined) sys.error("csv doesn't support bucketing") new CsvOutputWriter(path, dataSchema, context, params) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala index 2fffae452c2f7..aff672281d640 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala @@ -17,32 +17,157 @@ package org.apache.spark.sql.execution.datasources.csv +import java.nio.charset.Charset + +import org.apache.hadoop.fs.FileStatus +import org.apache.hadoop.io.{LongWritable, Text} +import org.apache.hadoop.mapred.TextInputFormat +import org.apache.hadoop.mapreduce.Job + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext +import 
org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.execution.datasources.CompressionCodecs import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.collection.BitSet /** * Provides access to CSV data from pure SQL statements. */ -class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { +class DefaultSource extends FileFormat with DataSourceRegister { override def shortName(): String = "csv" + override def toString: String = "CSV" + + override def equals(other: Any): Boolean = other.isInstanceOf[DefaultSource] + + override def inferSchema( + sqlContext: SQLContext, + options: Map[String, String], + files: Seq[FileStatus]): Option[StructType] = { + val csvOptions = new CSVOptions(options) + + // TODO: Move filtering. + val paths = files.filterNot(_.getPath.getName startsWith "_").map(_.getPath.toString) + val rdd = baseRdd(sqlContext, csvOptions, paths) + val firstLine = findFirstLine(csvOptions, rdd) + val firstRow = new LineCsvReader(csvOptions).parseLine(firstLine) + + val header = if (csvOptions.headerFlag) { + firstRow + } else { + firstRow.zipWithIndex.map { case (value, index) => s"C$index" } + } + + val parsedRdd = tokenRdd(sqlContext, csvOptions, header, paths) + val schema = if (csvOptions.inferSchemaFlag) { + CSVInferSchema.infer(parsedRdd, header, csvOptions.nullValue) + } else { + // By default fields are assumed to be StringType + val schemaFields = header.map { fieldName => + StructField(fieldName.toString, StringType, nullable = true) + } + StructType(schemaFields) + } + Some(schema) + } + + override def prepareWrite( + sqlContext: SQLContext, + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { + val conf = job.getConfiguration + val csvOptions = new CSVOptions(options) + csvOptions.compressionCodec.foreach { codec => + CompressionCodecs.setCodecConfiguration(conf, codec) + } + + new CSVOutputWriterFactory(csvOptions) + } + /** - * Creates a new relation for data store in CSV given parameters and user supported schema. - */ - override def createRelation( + * This supports to eliminate unneeded columns before producing an RDD + * containing all of its tuples as Row objects. This reads all the tokens of each line + * and then drop unneeded tokens without casting and type-checking by mapping + * both the indices produced by `requiredColumns` and the ones of tokens. + */ + override def buildInternalScan( + sqlContext: SQLContext, + dataSchema: StructType, + requiredColumns: Array[String], + filters: Array[Filter], + bucketSet: Option[BitSet], + inputFiles: Array[FileStatus], + broadcastedConf: Broadcast[SerializableConfiguration], + options: Map[String, String]): RDD[InternalRow] = { + // TODO: Filter before calling buildInternalScan. 
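// [Editor's note] Illustrative sketch only, not part of the patch: further down this method
// converts the externally-typed rows back to Catalyst rows via `external.map(encoder.toRow)`.
// The snippet below shows that conversion in isolation with a made-up schema and row; the API
// names (RowEncoder, toRow) are the ones already used in this file.
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object RowEncoderSketch {
  def main(args: Array[String]): Unit = {
    // Projected output schema, analogous to the `outputSchema` built from requiredColumns below.
    val outputSchema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true)))
    val encoder = RowEncoder(outputSchema)                      // ExpressionEncoder[Row]
    val internal: InternalRow = encoder.toRow(Row("alice", 30)) // external Row -> InternalRow
    println(internal.numFields)                                 // 2
    println(internal.getInt(1))                                 // 30
  }
}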
+ val csvFiles = inputFiles.filterNot(_.getPath.getName startsWith "_") + + val csvOptions = new CSVOptions(options) + val pathsString = csvFiles.map(_.getPath.toUri.toString) + val header = dataSchema.fields.map(_.name) + val tokenizedRdd = tokenRdd(sqlContext, csvOptions, header, pathsString) + val external = CSVRelation.parseCsv( + tokenizedRdd, dataSchema, requiredColumns, csvFiles, sqlContext, csvOptions) + + // TODO: Generate InternalRow in parseCsv + val outputSchema = StructType(requiredColumns.map(c => dataSchema.find(_.name == c).get)) + val encoder = RowEncoder(outputSchema) + external.map(encoder.toRow) + } + + + private def baseRdd( + sqlContext: SQLContext, + options: CSVOptions, + inputPaths: Seq[String]): RDD[String] = { + readText(sqlContext, options, inputPaths.mkString(",")) + } + + private def tokenRdd( + sqlContext: SQLContext, + options: CSVOptions, + header: Array[String], + inputPaths: Seq[String]): RDD[Array[String]] = { + val rdd = baseRdd(sqlContext, options, inputPaths) + // Make sure firstLine is materialized before sending to executors + val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null + CSVRelation.univocityTokenizer(rdd, header, firstLine, options) + } + + /** + * Returns the first line of the first non-empty file in path + */ + private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = { + if (options.isCommentSet) { + val comment = options.comment.toString + rdd.filter { line => + line.trim.nonEmpty && !line.startsWith(comment) + }.first() + } else { + rdd.filter { line => + line.trim.nonEmpty + }.first() + } + } + + private def readText( sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - parameters: Map[String, String]): HadoopFsRelation = { - - new CSVRelation( - None, - paths, - dataSchema, - partitionColumns, - parameters)(sqlContext) + options: CSVOptions, + location: String): RDD[String] = { + if (Charset.forName(options.charset) == Charset.forName("UTF-8")) { + sqlContext.sparkContext.textFile(location) + } else { + val charset = options.charset + sqlContext.sparkContext + .hadoopFile[LongWritable, Text, TextInputFormat](location) + .mapPartitions(_.map(pair => new String(pair._2.getBytes, 0, pair._2.getLength, charset))) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala index fb9618804d9aa..3d7c6a6a5ea1a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala @@ -92,7 +92,10 @@ case class CreateTempTableUsing( def run(sqlContext: SQLContext): Seq[Row] = { val resolved = ResolvedDataSource( - sqlContext, userSpecifiedSchema, Array.empty[String], bucketSpec = None, provider, options) + sqlContext, + userSpecifiedSchema = userSpecifiedSchema, + provider = provider, + options = options) sqlContext.catalog.registerTable( tableIdent, DataFrame(sqlContext, LogicalRelation(resolved.relation)).logicalPlan) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala index 8b773ddfcb656..0937a213c984f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -private[json] object InferSchema { +private[sql] object InferSchema { /** * Infer the type of a collection of json records in three stages: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index 2eba52f3266b4..497e3c59e9ef0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -38,101 +38,76 @@ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.collection.BitSet - -class DefaultSource extends BucketedHadoopFsRelationProvider with DataSourceRegister { +class DefaultSource extends FileFormat with DataSourceRegister { override def shortName(): String = "json" - override def createRelation( + override def inferSchema( sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - bucketSpec: Option[BucketSpec], - parameters: Map[String, String]): HadoopFsRelation = { - - new JSONRelation( - inputRDD = None, - maybeDataSchema = dataSchema, - maybePartitionSpec = None, - userDefinedPartitionColumns = partitionColumns, - maybeBucketSpec = bucketSpec, - paths = paths, - parameters = parameters)(sqlContext) - } -} - -private[sql] class JSONRelation( - val inputRDD: Option[RDD[String]], - val maybeDataSchema: Option[StructType], - val maybePartitionSpec: Option[PartitionSpec], - override val userDefinedPartitionColumns: Option[StructType], - override val maybeBucketSpec: Option[BucketSpec] = None, - override val paths: Array[String] = Array.empty[String], - parameters: Map[String, String] = Map.empty[String, String]) - (@transient val sqlContext: SQLContext) - extends HadoopFsRelation(maybePartitionSpec, parameters) { + options: Map[String, String], + files: Seq[FileStatus]): Option[StructType] = { + if (files.isEmpty) { + None + } else { + val parsedOptions: JSONOptions = new JSONOptions(options) + val jsonFiles = files.filterNot { status => + val name = status.getPath.getName + name.startsWith("_") || name.startsWith(".") + }.toArray - val options: JSONOptions = new JSONOptions(parameters) + val jsonSchema = InferSchema.infer( + createBaseRdd(sqlContext, jsonFiles), + sqlContext.conf.columnNameOfCorruptRecord, + parsedOptions) + checkConstraints(jsonSchema) - /** Constraints to be imposed on schema to be stored. 
*/ - private def checkConstraints(schema: StructType): Unit = { - if (schema.fieldNames.length != schema.fieldNames.distinct.length) { - val duplicateColumns = schema.fieldNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => "\"" + x + "\"" - }.mkString(", ") - throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + - s"cannot save to JSON format") + Some(jsonSchema) } } - override val needConversion: Boolean = false - - private def createBaseRdd(inputPaths: Array[FileStatus]): RDD[String] = { - val job = Job.getInstance(sqlContext.sparkContext.hadoopConfiguration) + override def prepareWrite( + sqlContext: SQLContext, + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { val conf = job.getConfiguration - - val paths = inputPaths.map(_.getPath) - - if (paths.nonEmpty) { - FileInputFormat.setInputPaths(job, paths: _*) + val parsedOptions: JSONOptions = new JSONOptions(options) + parsedOptions.compressionCodec.foreach { codec => + CompressionCodecs.setCodecConfiguration(conf, codec) } - sqlContext.sparkContext.hadoopRDD( - conf.asInstanceOf[JobConf], - classOf[TextInputFormat], - classOf[LongWritable], - classOf[Text]).map(_._2.toString) // get the text line - } - - override lazy val dataSchema: StructType = { - val jsonSchema = maybeDataSchema.getOrElse { - val files = cachedLeafStatuses().filterNot { status => - val name = status.getPath.getName - name.startsWith("_") || name.startsWith(".") - }.toArray - InferSchema.infer( - inputRDD.getOrElse(createBaseRdd(files)), - sqlContext.conf.columnNameOfCorruptRecord, - options) + new OutputWriterFactory { + override def newInstance( + path: String, + bucketId: Option[Int], + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter = { + new JsonOutputWriter(path, bucketId, dataSchema, context) + } } - checkConstraints(jsonSchema) - - jsonSchema } - override private[sql] def buildInternalScan( + override def buildInternalScan( + sqlContext: SQLContext, + dataSchema: StructType, requiredColumns: Array[String], filters: Array[Filter], - inputPaths: Array[FileStatus], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = { + bucketSet: Option[BitSet], + inputFiles: Array[FileStatus], + broadcastedConf: Broadcast[SerializableConfiguration], + options: Map[String, String]): RDD[InternalRow] = { + // TODO: Filter files for all formats before calling buildInternalScan. 
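// [Editor's note] Illustrative sketch only, not part of the patch: later in this method the
// parsed rows are wrapped with `UnsafeProjection.create(requiredDataSchema)`, which copies a
// generic InternalRow into the compact UnsafeRow format. This standalone snippet shows that
// conversion with a made-up one-column schema and value.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

object UnsafeProjectionSketch {
  def main(args: Array[String]): Unit = {
    val requiredDataSchema = StructType(Seq(StructField("value", StringType, nullable = true)))
    val toUnsafe = UnsafeProjection.create(requiredDataSchema)
    val genericRow = InternalRow(UTF8String.fromString("hello")) // string columns use UTF8String
    val unsafeRow = toUnsafe(genericRow)                         // returns an UnsafeRow
    println(unsafeRow.getUTF8String(0))                          // hello
  }
}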
+ val jsonFiles = inputFiles.filterNot(_.getPath.getName startsWith "_") + + val parsedOptions: JSONOptions = new JSONOptions(options) val requiredDataSchema = StructType(requiredColumns.map(dataSchema(_))) val rows = JacksonParser.parse( - inputRDD.getOrElse(createBaseRdd(inputPaths)), + createBaseRdd(sqlContext, jsonFiles), requiredDataSchema, sqlContext.conf.columnNameOfCorruptRecord, - options) + parsedOptions) rows.mapPartitions { iterator => val unsafeProjection = UnsafeProjection.create(requiredDataSchema) @@ -140,43 +115,36 @@ private[sql] class JSONRelation( } } - override def equals(other: Any): Boolean = other match { - case that: JSONRelation => - ((inputRDD, that.inputRDD) match { - case (Some(thizRdd), Some(thatRdd)) => thizRdd eq thatRdd - case (None, None) => true - case _ => false - }) && paths.toSet == that.paths.toSet && - dataSchema == that.dataSchema && - schema == that.schema - case _ => false - } + private def createBaseRdd(sqlContext: SQLContext, inputPaths: Array[FileStatus]): RDD[String] = { + val job = Job.getInstance(sqlContext.sparkContext.hadoopConfiguration) + val conf = job.getConfiguration - override def hashCode(): Int = { - Objects.hashCode( - inputRDD, - paths.toSet, - dataSchema, - schema, - partitionColumns) - } + val paths = inputPaths.map(_.getPath) - override def prepareJobForWrite(job: Job): BucketedOutputWriterFactory = { - val conf = job.getConfiguration - options.compressionCodec.foreach { codec => - CompressionCodecs.setCodecConfiguration(conf, codec) + if (paths.nonEmpty) { + FileInputFormat.setInputPaths(job, paths: _*) } - new BucketedOutputWriterFactory { - override def newInstance( - path: String, - bucketId: Option[Int], - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - new JsonOutputWriter(path, bucketId, dataSchema, context) - } + sqlContext.sparkContext.hadoopRDD( + conf.asInstanceOf[JobConf], + classOf[TextInputFormat], + classOf[LongWritable], + classOf[Text]).map(_._2.toString) // get the text line + } + + /** Constraints to be imposed on schema to be stored. 
*/ + private def checkConstraints(schema: StructType): Unit = { + if (schema.fieldNames.length != schema.fieldNames.distinct.length) { + val duplicateColumns = schema.fieldNames.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => "\"" + x + "\"" + }.mkString(", ") + throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + + s"cannot save to JSON format") } } + + override def toString: String = "JSON" + override def equals(other: Any): Boolean = other.isInstanceOf[DefaultSource] } private[json] class JsonOutputWriter( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index b8af832861a0f..82404b8499163 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -25,7 +25,6 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.{Failure, Try} -import com.google.common.base.Objects import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.Writable @@ -51,193 +50,23 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.util.collection.BitSet -private[sql] class DefaultSource extends BucketedHadoopFsRelationProvider with DataSourceRegister { - override def shortName(): String = "parquet" - - override def createRelation( - sqlContext: SQLContext, - paths: Array[String], - schema: Option[StructType], - partitionColumns: Option[StructType], - bucketSpec: Option[BucketSpec], - parameters: Map[String, String]): HadoopFsRelation = { - new ParquetRelation(paths, schema, None, partitionColumns, bucketSpec, parameters)(sqlContext) - } -} - -// NOTE: This class is instantiated and used on executor side only, no need to be serializable. -private[sql] class ParquetOutputWriter( - path: String, - bucketId: Option[Int], - context: TaskAttemptContext) - extends OutputWriter { - - private val recordWriter: RecordWriter[Void, InternalRow] = { - val outputFormat = { - new ParquetOutputFormat[InternalRow]() { - // Here we override `getDefaultWorkFile` for two reasons: - // - // 1. To allow appending. We need to generate unique output file names to avoid - // overwriting existing files (either exist before the write job, or are just written - // by other tasks within the same write job). - // - // 2. To allow dynamic partitioning. Default `getDefaultWorkFile` uses - // `FileOutputCommitter.getWorkPath()`, which points to the base directory of all - // partitions in the case of dynamic partitioning. 
- override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") - new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension") - } - } - } - - outputFormat.getRecordWriter(context) - } +private[sql] class DefaultSource extends FileFormat with DataSourceRegister with Logging { - override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") - - override protected[sql] def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row) - - override def close(): Unit = recordWriter.close(context) -} - -private[sql] class ParquetRelation( - override val paths: Array[String], - private val maybeDataSchema: Option[StructType], - // This is for metastore conversion. - private val maybePartitionSpec: Option[PartitionSpec], - override val userDefinedPartitionColumns: Option[StructType], - override val maybeBucketSpec: Option[BucketSpec], - parameters: Map[String, String])( - val sqlContext: SQLContext) - extends HadoopFsRelation(maybePartitionSpec, parameters) - with Logging { - - private[sql] def this( - paths: Array[String], - maybeDataSchema: Option[StructType], - maybePartitionSpec: Option[PartitionSpec], - parameters: Map[String, String])( - sqlContext: SQLContext) = { - this( - paths, - maybeDataSchema, - maybePartitionSpec, - maybePartitionSpec.map(_.partitionColumns), - None, - parameters)(sqlContext) - } - - // Should we merge schemas from all Parquet part-files? - private val shouldMergeSchemas = - parameters - .get(ParquetRelation.MERGE_SCHEMA) - .map(_.toBoolean) - .getOrElse(sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED)) - - private val mergeRespectSummaries = - sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES) - - private val maybeMetastoreSchema = parameters - .get(ParquetRelation.METASTORE_SCHEMA) - .map(DataType.fromJson(_).asInstanceOf[StructType]) - - private val compressionCodec: Option[String] = parameters - .get("compression") - .map { codecName => - // Validate if given compression codec is supported or not. - val shortParquetCompressionCodecNames = ParquetRelation.shortParquetCompressionCodecNames - if (!shortParquetCompressionCodecNames.contains(codecName.toLowerCase)) { - val availableCodecs = shortParquetCompressionCodecNames.keys.map(_.toLowerCase) - throw new IllegalArgumentException(s"Codec [$codecName] " + - s"is not available. 
Available codecs are ${availableCodecs.mkString(", ")}.") - } - codecName.toLowerCase - } - - private lazy val metadataCache: MetadataCache = { - val meta = new MetadataCache - meta.refresh() - meta - } - - override def toString: String = { - parameters.get(ParquetRelation.METASTORE_TABLE_NAME).map { tableName => - s"${getClass.getSimpleName}: $tableName" - }.getOrElse(super.toString) - } - - override def equals(other: Any): Boolean = other match { - case that: ParquetRelation => - val schemaEquality = if (shouldMergeSchemas) { - this.shouldMergeSchemas == that.shouldMergeSchemas - } else { - this.dataSchema == that.dataSchema && - this.schema == that.schema - } - - this.paths.toSet == that.paths.toSet && - schemaEquality && - this.maybeDataSchema == that.maybeDataSchema && - this.partitionColumns == that.partitionColumns - - case _ => false - } - - override def hashCode(): Int = { - if (shouldMergeSchemas) { - Objects.hashCode( - Boolean.box(shouldMergeSchemas), - paths.toSet, - maybeDataSchema, - partitionColumns) - } else { - Objects.hashCode( - Boolean.box(shouldMergeSchemas), - paths.toSet, - dataSchema, - schema, - maybeDataSchema, - partitionColumns) - } - } - - /** Constraints on schema of dataframe to be stored. */ - private def checkConstraints(schema: StructType): Unit = { - if (schema.fieldNames.length != schema.fieldNames.distinct.length) { - val duplicateColumns = schema.fieldNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => "\"" + x + "\"" - }.mkString(", ") - throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + - s"cannot save to parquet format") - } - } + override def shortName(): String = "parquet" - override def dataSchema: StructType = { - val schema = maybeDataSchema.getOrElse(metadataCache.dataSchema) - // check if schema satisfies the constraints - // before moving forward - checkConstraints(schema) - schema - } + override def toString: String = "ParquetFormat" - override private[sql] def refresh(): Unit = { - super.refresh() - metadataCache.refresh() - } + override def equals(other: Any): Boolean = other.isInstanceOf[DefaultSource] - // Parquet data source always uses Catalyst internal representations. - override val needConversion: Boolean = false - - override def sizeInBytes: Long = metadataCache.dataStatuses.map(_.getLen).sum + override def prepareWrite( + sqlContext: SQLContext, + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { - override def prepareJobForWrite(job: Job): BucketedOutputWriterFactory = { val conf = ContextUtil.getConfiguration(job) // SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible @@ -255,11 +84,24 @@ private[sql] class ParquetRelation( if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) { logInfo("Using default output committer for Parquet: " + - classOf[ParquetOutputCommitter].getCanonicalName) + classOf[ParquetOutputCommitter].getCanonicalName) } else { logInfo("Using user defined output committer for Parquet: " + committerClass.getCanonicalName) } + val compressionCodec: Option[String] = options + .get("compression") + .map { codecName => + // Validate if given compression codec is supported or not. 
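// [Editor's note] Illustrative usage sketch, not part of the patch: the "compression" key read
// here comes from the data source options, typically set through the DataFrameWriter as below.
// A self-contained example under assumptions: local master, a tiny made-up DataFrame, a
// hypothetical output path, and "snappy" as an example codec name that the validation just
// below accepts.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object ParquetCompressionOptionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("sketch"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val df = sc.parallelize(Seq((1, "a"), (2, "b"))).toDF("id", "value")
    df.write
      .format("parquet")
      .option("compression", "snappy")            // surfaces here as options.get("compression")
      .save("/tmp/parquet_compression_example")   // hypothetical path
    sc.stop()
  }
}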
+ val shortParquetCompressionCodecNames = ParquetRelation.shortParquetCompressionCodecNames + if (!shortParquetCompressionCodecNames.contains(codecName.toLowerCase)) { + val availableCodecs = shortParquetCompressionCodecNames.keys.map(_.toLowerCase) + throw new IllegalArgumentException(s"Codec [$codecName] " + + s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.") + } + codecName.toLowerCase + } + conf.setClass( SQLConf.OUTPUT_COMMITTER_CLASS.key, committerClass, @@ -303,7 +145,7 @@ private[sql] class ParquetRelation( .getOrElse(sqlContext.conf.parquetCompressionCodec.toLowerCase), CompressionCodecName.UNCOMPRESSED).name()) - new BucketedOutputWriterFactory { + new OutputWriterFactory { override def newInstance( path: String, bucketId: Option[Int], @@ -314,11 +156,127 @@ private[sql] class ParquetRelation( } } + def inferSchema( + sqlContext: SQLContext, + parameters: Map[String, String], + files: Seq[FileStatus]): Option[StructType] = { + // Should we merge schemas from all Parquet part-files? + val shouldMergeSchemas = + parameters + .get(ParquetRelation.MERGE_SCHEMA) + .map(_.toBoolean) + .getOrElse(sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED)) + + val mergeRespectSummaries = + sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES) + + val filesByType = splitFiles(files) + + // Sees which file(s) we need to touch in order to figure out the schema. + // + // Always tries the summary files first if users don't require a merged schema. In this case, + // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row + // groups information, and could be much smaller for large Parquet files with lots of row + // groups. If no summary file is available, falls back to some random part-file. + // + // NOTE: Metadata stored in the summary files are merged from all part-files. However, for + // user defined key-value metadata (in which we store Spark SQL schema), Parquet doesn't know + // how to merge them correctly if some key is associated with different values in different + // part-files. When this happens, Parquet simply gives up generating the summary file. This + // implies that if a summary file presents, then: + // + // 1. Either all part-files have exactly the same Spark SQL schema, or + // 2. Some part-files don't contain Spark SQL schema in the key-value metadata at all (thus + // their schemas may differ from each other). + // + // Here we tend to be pessimistic and take the second case into account. Basically this means + // we can't trust the summary files if users require a merged schema, and must touch all part- + // files to do the merge. + val filesToTouch = + if (shouldMergeSchemas) { + // Also includes summary files, 'cause there might be empty partition directories. + + // If mergeRespectSummaries config is true, we assume that all part-files are the same for + // their schema with summary files, so we ignore them when merging schema. + // If the config is disabled, which is the default setting, we merge all part-files. + // In this mode, we only need to merge schemas contained in all those summary files. + // You should enable this configuration only if you are very sure that for the parquet + // part-files to read there are corresponding summary files containing correct schema. + + // As filed in SPARK-11500, the order of files to touch is a matter, which might affect + // the ordering of the output columns. There are several things to mention here. + // + // 1. 
If mergeRespectSummaries config is false, then it merges schemas by reducing from + // the first part-file so that the columns of the lexicographically first file show + // first. + // + // 2. If mergeRespectSummaries config is true, then there should be, at least, + // "_metadata"s for all given files, so that we can ensure the columns of + // the lexicographically first file show first. + // + // 3. If shouldMergeSchemas is false, but when multiple files are given, there is + // no guarantee of the output order, since there might not be a summary file for the + // lexicographically first file, which ends up putting ahead the columns of + // the other files. However, this should be okay since not enabling + // shouldMergeSchemas means (assumes) all the files have the same schemas. + + val needMerged: Seq[FileStatus] = + if (mergeRespectSummaries) { + Seq() + } else { + filesByType.data + } + needMerged ++ filesByType.metadata ++ filesByType.commonMetadata + } else { + // Tries any "_common_metadata" first. Parquet files written by old versions or Parquet + // don't have this. + filesByType.commonMetadata.headOption + // Falls back to "_metadata" + .orElse(filesByType.metadata.headOption) + // Summary file(s) not found, the Parquet file is either corrupted, or different part- + // files contain conflicting user defined metadata (two or more values are associated + // with a same key in different files). In either case, we fall back to any of the + // first part-file, and just assume all schemas are consistent. + .orElse(filesByType.data.headOption) + .toSeq + } + ParquetRelation.mergeSchemasInParallel(filesToTouch, sqlContext) + } + + case class FileTypes( + data: Seq[FileStatus], + metadata: Seq[FileStatus], + commonMetadata: Seq[FileStatus]) + + private def splitFiles(allFiles: Seq[FileStatus]): FileTypes = { + // Lists `FileStatus`es of all leaf nodes (files) under all base directories. + val leaves = allFiles.filter { f => + isSummaryFile(f.getPath) || + !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith(".")) + }.toArray.sortBy(_.getPath.toString) + + FileTypes( + data = leaves.filterNot(f => isSummaryFile(f.getPath)), + metadata = + leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE), + commonMetadata = + leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)) + } + + private def isSummaryFile(file: Path): Boolean = { + file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE || + file.getName == ParquetFileWriter.PARQUET_METADATA_FILE + } + override def buildInternalScan( + sqlContext: SQLContext, + dataSchema: StructType, requiredColumns: Array[String], filters: Array[Filter], - inputFiles: Array[FileStatus], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = { + bucketSet: Option[BitSet], + allFiles: Array[FileStatus], + broadcastedConf: Broadcast[SerializableConfiguration], + options: Map[String, String]): RDD[InternalRow] = { val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA) val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString @@ -341,6 +299,8 @@ private[sql] class ParquetRelation( assumeBinaryIsString, assumeInt96IsTimestamp) _ + val inputFiles = splitFiles(allFiles).data.toArray + // Create the function to set input paths at the driver side. 
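// [Editor's note] Illustrative sketch, not part of the patch: `splitFiles(allFiles)` just above
// separates Parquet summary files from data files. This pure-Scala snippet mirrors that
// classification using plain file names instead of Hadoop FileStatus objects; the real code
// uses ParquetFileWriter.PARQUET_METADATA_FILE and PARQUET_COMMON_METADATA_FILE, and the file
// names below are made up.
object SplitFilesSketch {
  private val MetadataFile = "_metadata"
  private val CommonMetadataFile = "_common_metadata"

  private def isSummaryFile(name: String): Boolean =
    name == MetadataFile || name == CommonMetadataFile

  def main(args: Array[String]): Unit = {
    val names = Seq("part-r-00000.gz.parquet", "_metadata", "_common_metadata", "_SUCCESS")
    // Keep summary files plus everything that is not an "_"/"." prefixed bookkeeping file.
    val leaves = names.filter(n => isSummaryFile(n) || !(n.startsWith("_") || n.startsWith(".")))
    val data = leaves.filterNot(isSummaryFile)
    val metadata = leaves.filter(_ == MetadataFile)
    val commonMetadata = leaves.filter(_ == CommonMetadataFile)
    println(data)            // List(part-r-00000.gz.parquet)
    println(metadata)        // List(_metadata)
    println(commonMetadata)  // List(_common_metadata)
  }
}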
val setInputPaths = ParquetRelation.initializeDriverSideJobFunc(inputFiles, parquetBlockSize) _ @@ -392,153 +352,46 @@ private[sql] class ParquetRelation( } } } +} - private class MetadataCache { - // `FileStatus` objects of all "_metadata" files. - private var metadataStatuses: Array[FileStatus] = _ - - // `FileStatus` objects of all "_common_metadata" files. - private var commonMetadataStatuses: Array[FileStatus] = _ - - // `FileStatus` objects of all data files (Parquet part-files). - var dataStatuses: Array[FileStatus] = _ - - // Schema of the actual Parquet files, without partition columns discovered from partition - // directory paths. - var dataSchema: StructType = null - - // Schema of the whole table, including partition columns. - var schema: StructType = _ - - // Cached leaves - var cachedLeaves: mutable.LinkedHashSet[FileStatus] = null - - /** - * Refreshes `FileStatus`es, footers, partition spec, and table schema. - */ - def refresh(): Unit = { - val currentLeafStatuses = cachedLeafStatuses() - - // Check if cachedLeafStatuses is changed or not - val leafStatusesChanged = (cachedLeaves == null) || - !cachedLeaves.equals(currentLeafStatuses) - - if (leafStatusesChanged) { - cachedLeaves = currentLeafStatuses - - // Lists `FileStatus`es of all leaf nodes (files) under all base directories. - val leaves = currentLeafStatuses.filter { f => - isSummaryFile(f.getPath) || - !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith(".")) - }.toArray.sortBy(_.getPath.toString) - - dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath)) - metadataStatuses = - leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE) - commonMetadataStatuses = - leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) - - dataSchema = { - val dataSchema0 = maybeDataSchema - .orElse(readSchema()) - .orElse(maybeMetastoreSchema) - .getOrElse(throw new AnalysisException( - s"Failed to discover schema of Parquet file(s) in the following location(s):\n" + - paths.mkString("\n\t"))) - - // If this Parquet relation is converted from a Hive Metastore table, must reconcile case - // case insensitivity issue and possible schema mismatch (probably caused by schema - // evolution). - maybeMetastoreSchema - .map(ParquetRelation.mergeMetastoreParquetSchema(_, dataSchema0)) - .getOrElse(dataSchema0) +// NOTE: This class is instantiated and used on executor side only, no need to be serializable. +private[sql] class ParquetOutputWriter( + path: String, + bucketId: Option[Int], + context: TaskAttemptContext) + extends OutputWriter { + + private val recordWriter: RecordWriter[Void, InternalRow] = { + val outputFormat = { + new ParquetOutputFormat[InternalRow]() { + // Here we override `getDefaultWorkFile` for two reasons: + // + // 1. To allow appending. We need to generate unique output file names to avoid + // overwriting existing files (either exist before the write job, or are just written + // by other tasks within the same write job). + // + // 2. To allow dynamic partitioning. Default `getDefaultWorkFile` uses + // `FileOutputCommitter.getWorkPath()`, which points to the base directory of all + // partitions in the case of dynamic partitioning. 
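// [Editor's note] Illustrative sketch, not part of the patch: it shows the shape of the file
// name produced by getDefaultWorkFile below for made-up ids. The bucket suffix stands in for
// the output of BucketingUtils.bucketIdToString, and the job id normally comes from the
// "spark.sql.sources.writeJobUUID" configuration.
object WorkFileNameSketch {
  def main(args: Array[String]): Unit = {
    val split = 3                       // task id within the write job
    val uniqueWriteJobId = "0cafe123"   // example job UUID
    val bucketString = "_00002"         // example bucket suffix (empty when not bucketed)
    val extension = ".gz.parquet"
    println(f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension")
    // part-r-00003-0cafe123_00002.gz.parquet
  }
}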
+ override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { + val configuration = context.getConfiguration + val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = context.getTaskAttemptID + val split = taskAttemptId.getTaskID.getId + val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") + new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension") } } } - private def isSummaryFile(file: Path): Boolean = { - file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE || - file.getName == ParquetFileWriter.PARQUET_METADATA_FILE - } + outputFormat.getRecordWriter(context) + } - private def readSchema(): Option[StructType] = { - // Sees which file(s) we need to touch in order to figure out the schema. - // - // Always tries the summary files first if users don't require a merged schema. In this case, - // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row - // groups information, and could be much smaller for large Parquet files with lots of row - // groups. If no summary file is available, falls back to some random part-file. - // - // NOTE: Metadata stored in the summary files are merged from all part-files. However, for - // user defined key-value metadata (in which we store Spark SQL schema), Parquet doesn't know - // how to merge them correctly if some key is associated with different values in different - // part-files. When this happens, Parquet simply gives up generating the summary file. This - // implies that if a summary file presents, then: - // - // 1. Either all part-files have exactly the same Spark SQL schema, or - // 2. Some part-files don't contain Spark SQL schema in the key-value metadata at all (thus - // their schemas may differ from each other). - // - // Here we tend to be pessimistic and take the second case into account. Basically this means - // we can't trust the summary files if users require a merged schema, and must touch all part- - // files to do the merge. - val filesToTouch = - if (shouldMergeSchemas) { - // Also includes summary files, 'cause there might be empty partition directories. - - // If mergeRespectSummaries config is true, we assume that all part-files are the same for - // their schema with summary files, so we ignore them when merging schema. - // If the config is disabled, which is the default setting, we merge all part-files. - // In this mode, we only need to merge schemas contained in all those summary files. - // You should enable this configuration only if you are very sure that for the parquet - // part-files to read there are corresponding summary files containing correct schema. - - // As filed in SPARK-11500, the order of files to touch is a matter, which might affect - // the ordering of the output columns. There are several things to mention here. - // - // 1. If mergeRespectSummaries config is false, then it merges schemas by reducing from - // the first part-file so that the columns of the lexicographically first file show - // first. - // - // 2. If mergeRespectSummaries config is true, then there should be, at least, - // "_metadata"s for all given files, so that we can ensure the columns of - // the lexicographically first file show first. - // - // 3. 
If shouldMergeSchemas is false, but when multiple files are given, there is - // no guarantee of the output order, since there might not be a summary file for the - // lexicographically first file, which ends up putting ahead the columns of - // the other files. However, this should be okay since not enabling - // shouldMergeSchemas means (assumes) all the files have the same schemas. - - val needMerged: Seq[FileStatus] = - if (mergeRespectSummaries) { - Seq() - } else { - dataStatuses - } - needMerged ++ metadataStatuses ++ commonMetadataStatuses - } else { - // Tries any "_common_metadata" first. Parquet files written by old versions or Parquet - // don't have this. - commonMetadataStatuses.headOption - // Falls back to "_metadata" - .orElse(metadataStatuses.headOption) - // Summary file(s) not found, the Parquet file is either corrupted, or different part- - // files contain conflicting user defined metadata (two or more values are associated - // with a same key in different files). In either case, we fall back to any of the - // first part-file, and just assume all schemas are consistent. - .orElse(dataStatuses.headOption) - .toSeq - } + override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") - assert( - filesToTouch.nonEmpty || maybeDataSchema.isDefined || maybeMetastoreSchema.isDefined, - "No predefined schema found, " + - s"and no Parquet data files or summary files found under ${paths.mkString(", ")}.") + override protected[sql] def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row) - ParquetRelation.mergeSchemasInParallel(filesToTouch, sqlContext) - } - } + override def close(): Unit = recordWriter.close(context) } private[sql] object ParquetRelation extends Logging { @@ -699,7 +552,7 @@ private[sql] object ParquetRelation extends Logging { * distinguish binary and string). This method generates a correct schema by merging Metastore * schema data types and Parquet schema field names. */ - private[parquet] def mergeMetastoreParquetSchema( + private[sql] def mergeMetastoreParquetSchema( metastoreSchema: StructType, parquetSchema: StructType): StructType = { def schemaConflictMessage: String = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 2e41e88392600..0eae34614c56f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -34,6 +34,7 @@ private[sql] class ResolveDataSource(sqlContext: SQLContext) extends Rule[Logica try { val resolved = ResolvedDataSource( sqlContext, + paths = Seq.empty, userSpecifiedSchema = None, partitionColumns = Array(), bucketSpec = None, @@ -130,7 +131,7 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => LogicalRelation(r: HadoopFsRelation, _, _), part, query, overwrite, _) => // We need to make sure the partition columns specified by users do match partition // columns of the relation. 
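// [Editor's note] Illustrative sketch, not part of the patch: the check below compares
// partition columns as sets, so ordering does not matter but the column names must match
// exactly. The column names here are made up.
object PartitionColumnCheckSketch {
  def main(args: Array[String]): Unit = {
    val existingPartitionColumns = Set("year", "month")
    val specifiedPartitionColumns = Set("month", "year")
    println(existingPartitionColumns == specifiedPartitionColumns) // true: order is ignored
    println(existingPartitionColumns == Set("year", "day"))        // false: would failAnalysis
  }
}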
- val existingPartitionColumns = r.partitionColumns.fieldNames.toSet + val existingPartitionColumns = r.partitionSchema.fieldNames.toSet val specifiedPartitionColumns = part.keySet if (existingPartitionColumns != specifiedPartitionColumns) { failAnalysis(s"Specified partition columns " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala index 8f3f6335e4282..b3297254cbca8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala @@ -31,25 +31,16 @@ import org.apache.spark.sql.{AnalysisException, Row, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} -import org.apache.spark.sql.execution.datasources.{CompressionCodecs, PartitionSpec} +import org.apache.spark.sql.execution.datasources.CompressionCodecs import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.collection.BitSet /** * A data source for reading text files. */ -class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { - - override def createRelation( - sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - parameters: Map[String, String]): HadoopFsRelation = { - dataSchema.foreach(verifySchema) - new TextRelation(None, dataSchema, partitionColumns, paths, parameters)(sqlContext) - } +class DefaultSource extends FileFormat with DataSourceRegister { override def shortName(): String = "text" @@ -64,58 +55,21 @@ class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { s"Text data source supports only a string column, but you have ${tpe.simpleString}.") } } -} - -private[sql] class TextRelation( - val maybePartitionSpec: Option[PartitionSpec], - val textSchema: Option[StructType], - override val userDefinedPartitionColumns: Option[StructType], - override val paths: Array[String] = Array.empty[String], - parameters: Map[String, String] = Map.empty[String, String]) - (@transient val sqlContext: SQLContext) - extends HadoopFsRelation(maybePartitionSpec, parameters) { - /** Data schema is always a single column, named "value" if original Data source has no schema. */ - override def dataSchema: StructType = - textSchema.getOrElse(new StructType().add("value", StringType)) - /** This is an internal data source that outputs internal row format. 
*/ - override val needConversion: Boolean = false - - - override private[sql] def buildInternalScan( - requiredColumns: Array[String], - filters: Array[Filter], - inputPaths: Array[FileStatus], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = { - val job = Job.getInstance(sqlContext.sparkContext.hadoopConfiguration) - val conf = job.getConfiguration - val paths = inputPaths.map(_.getPath).sortBy(_.toUri) - - if (paths.nonEmpty) { - FileInputFormat.setInputPaths(job, paths: _*) - } + override def inferSchema( + sqlContext: SQLContext, + options: Map[String, String], + files: Seq[FileStatus]): Option[StructType] = Some(new StructType().add("value", StringType)) - sqlContext.sparkContext.hadoopRDD( - conf.asInstanceOf[JobConf], classOf[TextInputFormat], classOf[LongWritable], classOf[Text]) - .mapPartitions { iter => - val unsafeRow = new UnsafeRow(1) - val bufferHolder = new BufferHolder(unsafeRow) - val unsafeRowWriter = new UnsafeRowWriter(bufferHolder, 1) - - iter.map { case (_, line) => - // Writes to an UnsafeRow directly - bufferHolder.reset() - unsafeRowWriter.write(0, line.getBytes, 0, line.getLength) - unsafeRow.setTotalSize(bufferHolder.totalSize()) - unsafeRow - } - } - } + override def prepareWrite( + sqlContext: SQLContext, + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { + verifySchema(dataSchema) - /** Write path. */ - override def prepareJobForWrite(job: Job): OutputWriterFactory = { val conf = job.getConfiguration - val compressionCodec = parameters.get("compression").map(CompressionCodecs.getCodecClassName) + val compressionCodec = options.get("compression").map(CompressionCodecs.getCodecClassName) compressionCodec.foreach { codec => CompressionCodecs.setCodecConfiguration(conf, codec) } @@ -123,21 +77,54 @@ private[sql] class TextRelation( new OutputWriterFactory { override def newInstance( path: String, + bucketId: Option[Int], dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { + if (bucketId.isDefined) { + throw new AnalysisException("Text doesn't support bucketing") + } new TextOutputWriter(path, dataSchema, context) } } } - override def equals(other: Any): Boolean = other match { - case that: TextRelation => - paths.toSet == that.paths.toSet && partitionColumns == that.partitionColumns - case _ => false - } + override def buildInternalScan( + sqlContext: SQLContext, + dataSchema: StructType, + requiredColumns: Array[String], + filters: Array[Filter], + bucketSet: Option[BitSet], + inputFiles: Array[FileStatus], + broadcastedConf: Broadcast[SerializableConfiguration], + options: Map[String, String]): RDD[InternalRow] = { + verifySchema(dataSchema) - override def hashCode(): Int = { - Objects.hashCode(paths.toSet, partitionColumns) + val job = Job.getInstance(sqlContext.sparkContext.hadoopConfiguration) + val conf = job.getConfiguration + val paths = inputFiles + .filterNot(_.getPath.getName startsWith "_") + .map(_.getPath) + .sortBy(_.toUri) + + if (paths.nonEmpty) { + FileInputFormat.setInputPaths(job, paths: _*) + } + + sqlContext.sparkContext.hadoopRDD( + conf.asInstanceOf[JobConf], classOf[TextInputFormat], classOf[LongWritable], classOf[Text]) + .mapPartitions { iter => + val unsafeRow = new UnsafeRow(1) + val bufferHolder = new BufferHolder(unsafeRow) + val unsafeRowWriter = new UnsafeRowWriter(bufferHolder, 1) + + iter.map { case (_, line) => + // Writes to an UnsafeRow directly + bufferHolder.reset() + unsafeRowWriter.write(0, line.getBytes, 0, line.getLength) + 
unsafeRow.setTotalSize(bufferHolder.totalSize()) + unsafeRow + } + } } } @@ -170,3 +157,4 @@ class TextOutputWriter(path: String, dataSchema: StructType, context: TaskAttemp recordWriter.close(context) } } + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index f5f36544a702c..6f81794b2949b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.datasources.{PreInsertCastAndRename, ResolveDataSource} +import org.apache.spark.sql.execution.datasources.{DataSourceAnalysis, PreInsertCastAndRename, ResolveDataSource} import org.apache.spark.sql.execution.exchange.EnsureRequirements import org.apache.spark.sql.util.ExecutionListenerManager @@ -63,8 +63,9 @@ private[sql] class SessionState(ctx: SQLContext) { new Analyzer(catalog, functionRegistry, conf) { override val extendedResolutionRules = python.ExtractPythonUDFs :: - PreInsertCastAndRename :: - (if (conf.runSQLOnFile) new ResolveDataSource(ctx) :: Nil else Nil) + PreInsertCastAndRename :: + DataSourceAnalysis :: + (if (conf.runSQLOnFile) new ResolveDataSource(ctx) :: Nil else Nil) override val extendedCheckRules = Seq(datasources.PreWriteCheck(catalog)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 87ea7f510e631..12512a83127fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -28,12 +28,11 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.{Logging, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.{RDD, UnionRDD} +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection -import org.apache.spark.sql.execution.{FileRelation, RDDConversions} +import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.streaming.{FileStreamSource, Sink, Source} import org.apache.spark.sql.types.{StringType, StructType} @@ -146,84 +145,6 @@ trait StreamSinkProvider { partitionColumns: Seq[String]): Sink } -/** - * ::Experimental:: - * Implemented by objects that produce relations for a specific kind of data source - * with a given schema and partitioned columns. When Spark SQL is given a DDL operation with a - * USING clause specified (to specify the implemented [[HadoopFsRelationProvider]]), a user defined - * schema, and an optional list of partition columns, this interface is used to pass in the - * parameters specified by a user. - * - * Users may specify the fully qualified class name of a given data source. 
When that class is - * not found Spark SQL will append the class name `DefaultSource` to the path, allowing for - * less verbose invocation. For example, 'org.apache.spark.sql.json' would resolve to the - * data source 'org.apache.spark.sql.json.DefaultSource' - * - * A new instance of this class will be instantiated each time a DDL call is made. - * - * The difference between a [[RelationProvider]] and a [[HadoopFsRelationProvider]] is - * that users need to provide a schema and a (possibly empty) list of partition columns when - * using a [[HadoopFsRelationProvider]]. A relation provider can inherits both [[RelationProvider]], - * and [[HadoopFsRelationProvider]] if it can support schema inference, user-specified - * schemas, and accessing partitioned relations. - * - * @since 1.4.0 - */ -@Experimental -trait HadoopFsRelationProvider extends StreamSourceProvider { - /** - * Returns a new base relation with the given parameters, a user defined schema, and a list of - * partition columns. Note: the parameters' keywords are case insensitive and this insensitivity - * is enforced by the Map that is passed to the function. - * - * @param dataSchema Schema of data columns (i.e., columns that are not partition columns). - */ - def createRelation( - sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - parameters: Map[String, String]): HadoopFsRelation - - private[sql] def createRelation( - sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - bucketSpec: Option[BucketSpec], - parameters: Map[String, String]): HadoopFsRelation = { - if (bucketSpec.isDefined) { - throw new AnalysisException("Currently we don't support bucketing for this data source.") - } - createRelation(sqlContext, paths, dataSchema, partitionColumns, parameters) - } - - override def createSource( - sqlContext: SQLContext, - schema: Option[StructType], - providerName: String, - parameters: Map[String, String]): Source = { - val caseInsensitiveOptions = new CaseInsensitiveMap(parameters) - val path = caseInsensitiveOptions.getOrElse("path", { - throw new IllegalArgumentException("'path' is not specified") - }) - val metadataPath = caseInsensitiveOptions.getOrElse("metadataPath", s"$path/_metadata") - - def dataFrameBuilder(files: Array[String]): DataFrame = { - val relation = createRelation( - sqlContext, - files, - schema, - partitionColumns = None, - bucketSpec = None, - parameters) - DataFrame(sqlContext, LogicalRelation(relation)) - } - - new FileStreamSource(sqlContext, metadataPath, path, schema, providerName, dataFrameBuilder) - } -} - /** * @since 1.3.0 */ @@ -409,20 +330,13 @@ abstract class OutputWriterFactory extends Serializable { * @param dataSchema Schema of the rows to be written. Partition columns are not included in the * schema if the relation being written is partitioned. * @param context The Hadoop MapReduce task context. - * * @since 1.4.0 */ - def newInstance( - path: String, - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter - private[sql] def newInstance( path: String, - bucketId: Option[Int], + bucketId: Option[Int], // TODO: This doesn't belong here... 
 dataSchema: StructType,
- context: TaskAttemptContext): OutputWriter =
- newInstance(path, dataSchema, context)
+ context: TaskAttemptContext): OutputWriter
 }
 /**
@@ -465,214 +379,165 @@ abstract class OutputWriter {
 }
 /**
- * ::Experimental::
- * A [[BaseRelation]] that provides much of the common code required for relations that store their
- * data to an HDFS compatible filesystem.
- *
- * For the read path, similar to [[PrunedFilteredScan]], it can eliminate unneeded columns and
- * filter using selected predicates before producing an RDD containing all matching tuples as
- * [[Row]] objects. In addition, when reading from Hive style partitioned tables stored in file
- * systems, it's able to discover partitioning information from the paths of input directories, and
- * perform partition pruning before start reading the data. Subclasses of [[HadoopFsRelation()]]
- * must override one of the four `buildScan` methods to implement the read path.
- *
- * For the write path, it provides the ability to write to both non-partitioned and partitioned
- * tables. Directory layout of the partitioned tables is compatible with Hive.
- *
- * @constructor This constructor is for internal uses only. The [[PartitionSpec]] argument is for
- * implementing metastore table conversion.
- *
- * @param maybePartitionSpec An [[HadoopFsRelation]] can be created with an optional
- * [[PartitionSpec]], so that partition discovery can be skipped.
- *
- * @since 1.4.0
+ * Acts as a container for all of the metadata required to read from a datasource. All discovery,
+ * resolution and merging logic for schemas and partitions has been removed.
+ *
+ * @param location A [[FileCatalog]] that can enumerate the locations of all the files that comprise
+ * this relation.
+ * @param partitionSchema The schema of the columns (if any) that are used to partition the relation.
+ * @param dataSchema The schema of any remaining columns. Note that if any partition columns are
+ * present in the actual data files as well, they are removed.
+ * @param bucketSpec Describes the bucketing (hash-partitioning of the files by some column values).
+ * @param fileFormat A file format that can be used to read and write the data in files.
+ * @param options Configuration used when reading / writing data.
 */
-@Experimental
-abstract class HadoopFsRelation private[sql](
- maybePartitionSpec: Option[PartitionSpec],
- parameters: Map[String, String])
- extends BaseRelation with FileRelation with Logging {
-
- override def toString: String = getClass.getSimpleName
+case class HadoopFsRelation(
+ sqlContext: SQLContext,
+ location: FileCatalog,
+ partitionSchema: StructType,
+ dataSchema: StructType,
+ bucketSpec: Option[BucketSpec],
+ fileFormat: FileFormat,
+ options: Map[String, String]) extends BaseRelation with FileRelation {
+
+ val schema: StructType = {
+ val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet
+ StructType(dataSchema ++ partitionSchema.filterNot { column =>
+ dataSchemaColumnNames.contains(column.name.toLowerCase)
+ })
+ }
- def this() = this(None, Map.empty[String, String])
+ def partitionSchemaOption: Option[StructType] =
+ if (partitionSchema.isEmpty) None else Some(partitionSchema)
+ def partitionSpec: PartitionSpec = location.partitionSpec(partitionSchemaOption)
- def this(parameters: Map[String, String]) = this(None, parameters)
+ def refresh(): Unit = location.refresh()
- private[sql] def this(maybePartitionSpec: Option[PartitionSpec]) =
- this(maybePartitionSpec, Map.empty[String, String])
+ override def toString: String =
+ s"$fileFormat part: ${partitionSchema.simpleString}, data: ${dataSchema.simpleString}"
- private val hadoopConf = new Configuration(sqlContext.sparkContext.hadoopConfiguration)
+ /** Returns the list of files that will be read when scanning this relation. */
+ override def inputFiles: Array[String] =
+ location.allFiles().map(_.getPath.toUri.toString).toArray
+}
- private var _partitionSpec: PartitionSpec = _
+/**
+ * Used to read and write data in files in [[InternalRow]] format.
+ */
+trait FileFormat {
+ /**
+ * When possible, this method should return the schema of the given `files`. When the format
+ * does not support inference, or no valid files are given, this should return None. In these cases
+ * Spark will require that the user specify the schema manually.
+ */
+ def inferSchema(
+ sqlContext: SQLContext,
+ options: Map[String, String],
+ files: Seq[FileStatus]): Option[StructType]
- private[this] var malformedBucketFile = false
+ /**
+ * Prepares a write job and returns an [[OutputWriterFactory]]. Client side job preparation can
+ * be put here. For example, user defined output committer can be configured here
+ * by setting the output committer class in the conf of spark.sql.sources.outputCommitterClass.
+ */
+ def prepareWrite(
+ sqlContext: SQLContext,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType): OutputWriterFactory
- private[sql] def maybeBucketSpec: Option[BucketSpec] = None
+ def buildInternalScan(
+ sqlContext: SQLContext,
+ dataSchema: StructType,
+ requiredColumns: Array[String],
+ filters: Array[Filter],
+ bucketSet: Option[BitSet],
+ inputFiles: Array[FileStatus],
+ broadcastedConf: Broadcast[SerializableConfiguration],
+ options: Map[String, String]): RDD[InternalRow]
+}
- final private[sql] def getBucketSpec: Option[BucketSpec] =
- maybeBucketSpec.filter(_ => sqlContext.conf.bucketingEnabled() && !malformedBucketFile)
+/**
+ * An interface for objects capable of enumerating the files that comprise a relation as well
+ * as the partitioning characteristics of those files.
+ */ +trait FileCatalog { + def paths: Seq[Path] - private class FileStatusCache { - var leafFiles = mutable.LinkedHashMap.empty[Path, FileStatus] + def partitionSpec(schema: Option[StructType]): PartitionSpec - var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]] + def allFiles(): Seq[FileStatus] - private def listLeafFiles(paths: Array[String]): mutable.LinkedHashSet[FileStatus] = { - if (paths.length >= sqlContext.conf.parallelPartitionDiscoveryThreshold) { - HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sqlContext.sparkContext) - } else { - val statuses = paths.flatMap { path => - val hdfsPath = new Path(path) - val fs = hdfsPath.getFileSystem(hadoopConf) - val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - logInfo(s"Listing $qualified on driver") - // Dummy jobconf to get to the pathFilter defined in configuration - val jobConf = new JobConf(hadoopConf, this.getClass()) - val pathFilter = FileInputFormat.getInputPathFilter(jobConf) - if (pathFilter != null) { - Try(fs.listStatus(qualified, pathFilter)).getOrElse(Array.empty) - } else { - Try(fs.listStatus(qualified)).getOrElse(Array.empty) - } - }.filterNot { status => - val name = status.getPath.getName - name.toLowerCase == "_temporary" || name.startsWith(".") - } + def getStatus(path: Path): Array[FileStatus] - val (dirs, files) = statuses.partition(_.isDirectory) + def refresh(): Unit +} - // It uses [[LinkedHashSet]] since the order of files can affect the results. (SPARK-11500) - if (dirs.isEmpty) { - mutable.LinkedHashSet(files: _*) - } else { - mutable.LinkedHashSet(files: _*) ++ listLeafFiles(dirs.map(_.getPath.toString)) - } - } - } +/** + * A file catalog that caches metadata gathered by scanning all the files present in `paths` + * recursively. + */ +class HDFSFileCatalog( + val sqlContext: SQLContext, + val parameters: Map[String, String], + val paths: Seq[Path]) + extends FileCatalog with Logging { - def refresh(): Unit = { - val files = listLeafFiles(paths) + private val hadoopConf = new Configuration(sqlContext.sparkContext.hadoopConfiguration) - leafFiles.clear() - leafDirToChildrenFiles.clear() + var leafFiles = mutable.LinkedHashMap.empty[Path, FileStatus] + var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]] + var cachedPartitionSpec: PartitionSpec = _ - leafFiles ++= files.map(f => f.getPath -> f) - leafDirToChildrenFiles ++= files.toArray.groupBy(_.getPath.getParent) + def partitionSpec(schema: Option[StructType]): PartitionSpec = { + if (cachedPartitionSpec == null) { + cachedPartitionSpec = inferPartitioning(schema) } - } - private lazy val fileStatusCache = { - val cache = new FileStatusCache - cache.refresh() - cache + cachedPartitionSpec } - protected def cachedLeafStatuses(): mutable.LinkedHashSet[FileStatus] = { - mutable.LinkedHashSet(fileStatusCache.leafFiles.values.toArray: _*) - } + refresh() - final private[sql] def partitionSpec: PartitionSpec = { - if (_partitionSpec == null) { - _partitionSpec = maybePartitionSpec - .flatMap { - case spec if spec.partitions.nonEmpty => - Some(spec.copy(partitionColumns = spec.partitionColumns.asNullable)) - case _ => - None - } - .orElse { - // We only know the partition columns and their data types. We need to discover - // partition values. 
- userDefinedPartitionColumns.map { partitionSchema => - val spec = discoverPartitions() - val partitionColumnTypes = spec.partitionColumns.map(_.dataType) - val castedPartitions = spec.partitions.map { case p @ Partition(values, path) => - val literals = partitionColumnTypes.zipWithIndex.map { case (dt, i) => - Literal.create(values.get(i, dt), dt) - } - val castedValues = partitionSchema.zip(literals).map { case (field, literal) => - Cast(literal, field.dataType).eval() - } - p.copy(values = InternalRow.fromSeq(castedValues)) - } - PartitionSpec(partitionSchema, castedPartitions) - } - } - .getOrElse { - if (sqlContext.conf.partitionDiscoveryEnabled()) { - discoverPartitions() - } else { - PartitionSpec(StructType(Nil), Array.empty[Partition]) - } - } - } - _partitionSpec - } + def allFiles(): Seq[FileStatus] = leafFiles.values.toSeq - /** - * Paths of this relation. For partitioned relations, it should be root directories - * of all partition directories. - * - * @since 1.4.0 - */ - def paths: Array[String] - - /** - * Contains a set of paths that are considered as the base dirs of the input datasets. - * The partitioning discovery logic will make sure it will stop when it reaches any - * base path. By default, the paths of the dataset provided by users will be base paths. - * For example, if a user uses `sqlContext.read.parquet("/path/something=true/")`, the base path - * will be `/path/something=true/`, and the returned DataFrame will not contain a column of - * `something`. If users want to override the basePath. They can set `basePath` in the options - * to pass the new base path to the data source. - * For the above example, if the user-provided base path is `/path/`, the returned - * DataFrame will have the column of `something`. - */ - private def basePaths: Set[Path] = { - val userDefinedBasePath = parameters.get("basePath").map(basePath => Set(new Path(basePath))) - userDefinedBasePath.getOrElse { - // If the user does not provide basePath, we will just use paths. - val pathSet = paths.toSet - pathSet.map(p => new Path(p)) - }.map { hdfsPath => - // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel). - val fs = hdfsPath.getFileSystem(hadoopConf) - hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - } - } - - override def inputFiles: Array[String] = cachedLeafStatuses().map(_.getPath.toString).toArray + def getStatus(path: Path): Array[FileStatus] = leafDirToChildrenFiles(path) - override def sizeInBytes: Long = cachedLeafStatuses().map(_.getLen).sum - - /** - * Partition columns. Can be either defined by [[userDefinedPartitionColumns]] or automatically - * discovered. Note that they should always be nullable. 
- * - * @since 1.4.0 - */ - final def partitionColumns: StructType = - userDefinedPartitionColumns.getOrElse(partitionSpec.partitionColumns) + private def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + if (paths.length >= sqlContext.conf.parallelPartitionDiscoveryThreshold) { + HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sqlContext.sparkContext) + } else { + val statuses = paths.flatMap { path => + val fs = path.getFileSystem(hadoopConf) + logInfo(s"Listing $path on driver") + // Dummy jobconf to get to the pathFilter defined in configuration + val jobConf = new JobConf(hadoopConf, this.getClass()) + val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + if (pathFilter != null) { + Try(fs.listStatus(path, pathFilter)).getOrElse(Array.empty) + } else { + Try(fs.listStatus(path)).getOrElse(Array.empty) + } + }.filterNot { status => + val name = status.getPath.getName + name.toLowerCase == "_temporary" || name.startsWith(".") + } - /** - * Optional user defined partition columns. - * - * @since 1.4.0 - */ - def userDefinedPartitionColumns: Option[StructType] = None + val (dirs, files) = statuses.partition(_.isDirectory) - private[sql] def refresh(): Unit = { - fileStatusCache.refresh() - if (sqlContext.conf.partitionDiscoveryEnabled()) { - _partitionSpec = discoverPartitions() + // It uses [[LinkedHashSet]] since the order of files can affect the results. (SPARK-11500) + if (dirs.isEmpty) { + mutable.LinkedHashSet(files: _*) + } else { + mutable.LinkedHashSet(files: _*) ++ listLeafFiles(dirs.map(_.getPath)) + } } } - private def discoverPartitions(): PartitionSpec = { + def inferPartitioning(schema: Option[StructType]): PartitionSpec = { // We use leaf dirs containing data files to discover the schema. - val leafDirs = fileStatusCache.leafDirToChildrenFiles.keys.toSeq - userDefinedPartitionColumns match { + val leafDirs = leafDirToChildrenFiles.keys.toSeq + schema match { case Some(userProvidedSchema) if userProvidedSchema.nonEmpty => val spec = PartitioningUtils.parsePartitions( leafDirs, @@ -693,9 +558,7 @@ abstract class HadoopFsRelation private[sql]( PartitionSpec(userProvidedSchema, spec.partitions.map { part => part.copy(values = castPartitionValuesToUserSchema(part.values)) }) - - case _ => - // user did not provide a partitioning schema + case None => PartitioningUtils.parsePartitions( leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME, @@ -705,271 +568,51 @@ abstract class HadoopFsRelation private[sql]( } /** - * Schema of this relation. It consists of columns appearing in [[dataSchema]] and all partition - * columns not appearing in [[dataSchema]]. - * - * @since 1.4.0 - */ - override lazy val schema: StructType = { - val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet - StructType(dataSchema ++ partitionColumns.filterNot { column => - dataSchemaColumnNames.contains(column.name.toLowerCase) - }) - } - - /** - * Groups the input files by bucket id, if bucketing is enabled and this data source is bucketed. - * Returns None if there exists any malformed bucket files. + * Contains a set of paths that are considered as the base dirs of the input datasets. + * The partitioning discovery logic will make sure it will stop when it reaches any + * base path. By default, the paths of the dataset provided by users will be base paths. 
+ * For example, if a user uses `sqlContext.read.parquet("/path/something=true/")`, the base path + * will be `/path/something=true/`, and the returned DataFrame will not contain a column of + * `something`. If users want to override the basePath. They can set `basePath` in the options + * to pass the new base path to the data source. + * For the above example, if the user-provided base path is `/path/`, the returned + * DataFrame will have the column of `something`. */ - private def groupBucketFiles( - files: Array[FileStatus]): Option[scala.collection.Map[Int, Array[FileStatus]]] = { - malformedBucketFile = false - if (getBucketSpec.isDefined) { - val groupedBucketFiles = mutable.HashMap.empty[Int, mutable.ArrayBuffer[FileStatus]] - var i = 0 - while (!malformedBucketFile && i < files.length) { - val bucketId = BucketingUtils.getBucketId(files(i).getPath.getName) - if (bucketId.isEmpty) { - logError(s"File ${files(i).getPath} is expected to be a bucket file, but there is no " + - "bucket id information in file name. Fall back to non-bucketing mode.") - malformedBucketFile = true - } else { - val bucketFiles = - groupedBucketFiles.getOrElseUpdate(bucketId.get, mutable.ArrayBuffer.empty) - bucketFiles += files(i) - } - i += 1 - } - if (malformedBucketFile) None else Some(groupedBucketFiles.mapValues(_.toArray)) - } else { - None - } - } - - final private[sql] def buildInternalScan( - requiredColumns: Array[String], - filters: Array[Filter], - bucketSet: Option[BitSet], - inputPaths: Array[String], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = { - val inputStatuses = inputPaths.flatMap { input => - val path = new Path(input) - - // First assumes `input` is a directory path, and tries to get all files contained in it. - fileStatusCache.leafDirToChildrenFiles.getOrElse( - path, - // Otherwise, `input` might be a file path - fileStatusCache.leafFiles.get(path).toArray - ).filter { status => - val name = status.getPath.getName - !name.startsWith("_") && !name.startsWith(".") - } - } - - groupBucketFiles(inputStatuses).map { groupedBucketFiles => - // For each bucket id, firstly we get all files belong to this bucket, by detecting bucket - // id from file name. Then read these files into a RDD(use one-partition empty RDD for empty - // bucket), and coalesce it to one partition. Finally union all bucket RDDs to one result. - val perBucketRows = (0 until maybeBucketSpec.get.numBuckets).map { bucketId => - // If the current bucketId is not set in the bucket bitSet, skip scanning it. - if (bucketSet.nonEmpty && !bucketSet.get.get(bucketId)){ - sqlContext.emptyResult - } else { - // When all the buckets need a scan (i.e., bucketSet is equal to None) - // or when the current bucket need a scan (i.e., the bit of bucketId is set to true) - groupedBucketFiles.get(bucketId).map { inputStatuses => - buildInternalScan(requiredColumns, filters, inputStatuses, broadcastedConf).coalesce(1) - }.getOrElse(sqlContext.emptyResult) - } - } - - new UnionRDD(sqlContext.sparkContext, perBucketRows) - }.getOrElse { - buildInternalScan(requiredColumns, filters, inputStatuses, broadcastedConf) + private def basePaths: Set[Path] = { + val userDefinedBasePath = parameters.get("basePath").map(basePath => Set(new Path(basePath))) + userDefinedBasePath.getOrElse { + // If the user does not provide basePath, we will just use paths. + paths.toSet + }.map { hdfsPath => + // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel). 
+ val fs = hdfsPath.getFileSystem(hadoopConf) + hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) } } - /** - * Specifies schema of actual data files. For partitioned relations, if one or more partitioned - * columns are contained in the data files, they should also appear in `dataSchema`. - * - * @since 1.4.0 - */ - def dataSchema: StructType - - /** - * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within - * this relation. For partitioned relations, this method is called for each selected partition, - * and builds an `RDD[Row]` containing all rows within that single partition. - * - * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the - * relation. For a partitioned relation, it contains paths of all data files in a single - * selected partition. - * - * @since 1.4.0 - */ - def buildScan(inputFiles: Array[FileStatus]): RDD[Row] = { - throw new UnsupportedOperationException( - "At least one buildScan() method should be overridden to read the relation.") - } - - /** - * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within - * this relation. For partitioned relations, this method is called for each selected partition, - * and builds an `RDD[Row]` containing all rows within that single partition. - * - * @param requiredColumns Required columns. - * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the - * relation. For a partitioned relation, it contains paths of all data files in a single - * selected partition. - * - * @since 1.4.0 - */ - // TODO Tries to eliminate the extra Catalyst-to-Scala conversion when `needConversion` is true - // - // PR #7626 separated `Row` and `InternalRow` completely. One of the consequences is that we can - // no longer treat an `InternalRow` containing Catalyst values as a `Row`. Thus we have to - // introduce another row value conversion for data sources whose `needConversion` is true. - def buildScan(requiredColumns: Array[String], inputFiles: Array[FileStatus]): RDD[Row] = { - // Yeah, to workaround serialization... - val dataSchema = this.dataSchema - val needConversion = this.needConversion - - val requiredOutput = requiredColumns.map { col => - val field = dataSchema(col) - BoundReference(dataSchema.fieldIndex(col), field.dataType, field.nullable) - }.toSeq - - val rdd: RDD[Row] = buildScan(inputFiles) - val converted: RDD[InternalRow] = - if (needConversion) { - RDDConversions.rowToRowRdd(rdd, dataSchema.fields.map(_.dataType)) - } else { - rdd.asInstanceOf[RDD[InternalRow]] - } + def refresh(): Unit = { + val files = listLeafFiles(paths) - converted.mapPartitions { rows => - val buildProjection = - GenerateMutableProjection.generate(requiredOutput, dataSchema.toAttributes) + leafFiles.clear() + leafDirToChildrenFiles.clear() - val projectedRows = { - val mutableProjection = buildProjection() - rows.map(r => mutableProjection(r)) - } + leafFiles ++= files.map(f => f.getPath -> f) + leafDirToChildrenFiles ++= files.toArray.groupBy(_.getPath.getParent) - if (needConversion) { - val requiredSchema = StructType(requiredColumns.map(dataSchema(_))) - val toScala = CatalystTypeConverters.createToScalaConverter(requiredSchema) - projectedRows.map(toScala(_).asInstanceOf[Row]) - } else { - projectedRows - } - }.asInstanceOf[RDD[Row]] + cachedPartitionSpec = null } - /** - * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within - * this relation. 
For partitioned relations, this method is called for each selected partition, - * and builds an `RDD[Row]` containing all rows within that single partition. - * - * @param requiredColumns Required columns. - * @param filters Candidate filters to be pushed down. The actual filter should be the conjunction - * of all `filters`. The pushed down filters are currently purely an optimization as they - * will all be evaluated again. This means it is safe to use them with methods that produce - * false positives such as filtering partitions based on a bloom filter. - * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the - * relation. For a partitioned relation, it contains paths of all data files in a single - * selected partition. - * - * @since 1.4.0 - */ - def buildScan( - requiredColumns: Array[String], - filters: Array[Filter], - inputFiles: Array[FileStatus]): RDD[Row] = { - buildScan(requiredColumns, inputFiles) + override def equals(other: Any): Boolean = other match { + case hdfs: HDFSFileCatalog => paths.toSet == hdfs.paths.toSet + case _ => false } - /** - * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within - * this relation. For partitioned relations, this method is called for each selected partition, - * and builds an `RDD[Row]` containing all rows within that single partition. - * - * Note: This interface is subject to change in future. - * - * @param requiredColumns Required columns. - * @param filters Candidate filters to be pushed down. The actual filter should be the conjunction - * of all `filters`. The pushed down filters are currently purely an optimization as they - * will all be evaluated again. This means it is safe to use them with methods that produce - * false positives such as filtering partitions based on a bloom filter. - * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the - * relation. For a partitioned relation, it contains paths of all data files in a single - * selected partition. - * @param broadcastedConf A shared broadcast Hadoop Configuration, which can be used to reduce the - * overhead of broadcasting the Configuration for every Hadoop RDD. - * - * @since 1.4.0 - */ - private[sql] def buildScan( - requiredColumns: Array[String], - filters: Array[Filter], - inputFiles: Array[FileStatus], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[Row] = { - buildScan(requiredColumns, filters, inputFiles) - } - - /** - * For a non-partitioned relation, this method builds an `RDD[InternalRow]` containing all rows - * within this relation. For partitioned relations, this method is called for each selected - * partition, and builds an `RDD[InternalRow]` containing all rows within that single partition. - * - * Note: - * - * 1. Rows contained in the returned `RDD[InternalRow]` are assumed to be `UnsafeRow`s. - * 2. This interface is subject to change in future. - * - * @param requiredColumns Required columns. - * @param filters Candidate filters to be pushed down. The actual filter should be the conjunction - * of all `filters`. The pushed down filters are currently purely an optimization as they - * will all be evaluated again. This means it is safe to use them with methods that produce - * false positives such as filtering partitions based on a bloom filter. - * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the - * relation. 
For a partitioned relation, it contains paths of all data files in a single - * selected partition. - * @param broadcastedConf A shared broadcast Hadoop Configuration, which can be used to reduce the - * overhead of broadcasting the Configuration for every Hadoop RDD. - */ - private[sql] def buildInternalScan( - requiredColumns: Array[String], - filters: Array[Filter], - inputFiles: Array[FileStatus], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = { - val requiredSchema = StructType(requiredColumns.map(dataSchema.apply)) - val internalRows = { - val externalRows = buildScan(requiredColumns, filters, inputFiles, broadcastedConf) - execution.RDDConversions.rowToRowRdd(externalRows, requiredSchema.map(_.dataType)) - } - - internalRows.mapPartitions { iterator => - val unsafeProjection = UnsafeProjection.create(requiredSchema) - iterator.map(unsafeProjection) - } - } - - /** - * Prepares a write job and returns an [[OutputWriterFactory]]. Client side job preparation can - * be put here. For example, user defined output committer can be configured here - * by setting the output committer class in the conf of spark.sql.sources.outputCommitterClass. - * - * Note that the only side effect expected here is mutating `job` via its setters. Especially, - * Spark SQL caches [[BaseRelation]] instances for performance, mutating relation internal states - * may cause unexpected behaviors. - * - * @since 1.4.0 - */ - def prepareJobForWrite(job: Job): OutputWriterFactory + override def hashCode(): Int = paths.toSet.hashCode() } +/** + * Helper methods for gathering metadata from HDFS. + */ private[sql] object HadoopFsRelation extends Logging { // We don't filter files/directories whose name start with "_" except "_temporary" here, as // specific data sources may take advantages over them (e.g. 
Parquet _metadata and @@ -1009,17 +652,17 @@ private[sql] object HadoopFsRelation extends Logging { accessTime: Long) def listLeafFilesInParallel( - paths: Array[String], + paths: Seq[Path], hadoopConf: Configuration, sparkContext: SparkContext): mutable.LinkedHashSet[FileStatus] = { logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") val serializableConfiguration = new SerializableConfiguration(hadoopConf) - val fakeStatuses = sparkContext.parallelize(paths).flatMap { path => - val hdfsPath = new Path(path) - val fs = hdfsPath.getFileSystem(serializableConfiguration.value) - val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - Try(listLeafFiles(fs, fs.getFileStatus(qualified))).getOrElse(Array.empty) + val serializedPaths = paths.map(_.toString) + + val fakeStatuses = sparkContext.parallelize(serializedPaths).map(new Path(_)).flatMap { path => + val fs = path.getFileSystem(serializableConfiguration.value) + Try(listLeafFiles(fs, fs.getFileStatus(path))).getOrElse(Array.empty) }.map { status => FakeFileStatus( status.getPath.toString, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index a824759cb8955..55153cda31e0a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -889,7 +889,6 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { .write.format("parquet").save("temp") } assert(e.getMessage.contains("Duplicate column(s)")) - assert(e.getMessage.contains("parquet")) assert(e.getMessage.contains("column1")) assert(!e.getMessage.contains("column2")) @@ -900,7 +899,6 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { .write.format("json").save("temp") } assert(f.getMessage.contains("Duplicate column(s)")) - assert(f.getMessage.contains("JSON")) assert(f.getMessage.contains("column1")) assert(f.getMessage.contains("column3")) assert(!f.getMessage.contains("column2")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index f59faa0dc2e40..182f287dd001c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1741,7 +1741,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { val e3 = intercept[AnalysisException] { sql("select * from json.invalid_file") } - assert(e3.message.contains("No input paths specified")) + assert(e3.message.contains("Unable to infer schema")) } test("SortMergeJoin returns wrong results when using UnsafeRows") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 3a335541431ff..2f17037a58f04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -582,35 +582,6 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { jsonDF.registerTempTable("jsonTable") } - test("jsonFile should be based on JSONRelation") { - val dir = Utils.createTempDir() - dir.delete() - val path = dir.getCanonicalFile.toURI.toString - sparkContext.parallelize(1 to 100) - .map(i => s"""{"a": 1, "b": 
"str$i"}""").saveAsTextFile(path) - val jsonDF = sqlContext.read.option("samplingRatio", "0.49").json(path) - - val analyzed = jsonDF.queryExecution.analyzed - assert( - analyzed.isInstanceOf[LogicalRelation], - "The DataFrame returned by jsonFile should be based on LogicalRelation.") - val relation = analyzed.asInstanceOf[LogicalRelation].relation - assert( - relation.isInstanceOf[JSONRelation], - "The DataFrame returned by jsonFile should be based on JSONRelation.") - assert(relation.asInstanceOf[JSONRelation].paths === Array(path)) - assert(relation.asInstanceOf[JSONRelation].options.samplingRatio === (0.49 +- 0.001)) - - val schema = StructType(StructField("a", LongType, true) :: Nil) - val logicalRelation = - sqlContext.read.schema(schema).json(path) - .queryExecution.analyzed.asInstanceOf[LogicalRelation] - val relationWithSchema = logicalRelation.relation.asInstanceOf[JSONRelation] - assert(relationWithSchema.paths === Array(path)) - assert(relationWithSchema.schema === schema) - assert(relationWithSchema.options.samplingRatio > 0.99) - } - test("Loading a JSON dataset from a text file") { val dir = Utils.createTempDir() dir.delete() @@ -1202,48 +1173,6 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("JSONRelation equality test") { - val relation0 = new JSONRelation( - Some(empty), - Some(StructType(StructField("a", IntegerType, true) :: Nil)), - None, - None)(sqlContext) - val logicalRelation0 = LogicalRelation(relation0) - val relation1 = new JSONRelation( - Some(singleRow), - Some(StructType(StructField("a", IntegerType, true) :: Nil)), - None, - None)(sqlContext) - val logicalRelation1 = LogicalRelation(relation1) - val relation2 = new JSONRelation( - Some(singleRow), - Some(StructType(StructField("a", IntegerType, true) :: Nil)), - None, - None, - parameters = Map("samplingRatio" -> "0.5"))(sqlContext) - val logicalRelation2 = LogicalRelation(relation2) - val relation3 = new JSONRelation( - Some(singleRow), - Some(StructType(StructField("b", IntegerType, true) :: Nil)), - None, - None)(sqlContext) - val logicalRelation3 = LogicalRelation(relation3) - - assert(relation0 !== relation1) - assert(!logicalRelation0.sameResult(logicalRelation1), - s"$logicalRelation0 and $logicalRelation1 should be considered not having the same result.") - - assert(relation1 === relation2) - assert(logicalRelation1.sameResult(logicalRelation2), - s"$logicalRelation1 and $logicalRelation2 should be considered having the same result.") - - assert(relation1 !== relation3) - assert(!logicalRelation1.sameResult(logicalRelation3), - s"$logicalRelation1 and $logicalRelation3 should be considered not having the same result.") - - assert(relation2 !== relation3) - assert(!logicalRelation2.sameResult(logicalRelation3), - s"$logicalRelation2 and $logicalRelation3 should be considered not having the same result.") - withTempPath(dir => { val path = dir.getCanonicalFile.toURI.toString sparkContext.parallelize(1 to 100) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index d2947676a0e58..e32616fb5c18b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import 
org.apache.spark.sql.execution.datasources.{DataSourceStrategy, LogicalRelation} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.HadoopFsRelation import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -59,9 +60,9 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex .select(output.map(e => Column(e)): _*) .where(Column(predicate)) - var maybeRelation: Option[ParquetRelation] = None + var maybeRelation: Option[HadoopFsRelation] = None val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { - case PhysicalOperation(_, filters, LogicalRelation(relation: ParquetRelation, _, _)) => + case PhysicalOperation(_, filters, LogicalRelation(relation: HadoopFsRelation, _, _)) => maybeRelation = Some(relation) filters }.flatten.reduceLeftOption(_ && _) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index cf8a9fdd46fca..34e914cb1eb4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -437,8 +437,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { readParquetFile(path.toString) { df => assertResult(df.schema) { StructType( - StructField("a", BooleanType, nullable = false) :: - StructField("b", IntegerType, nullable = false) :: + StructField("a", BooleanType, nullable = true) :: + StructField("b", IntegerType, nullable = true) :: Nil) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 8bc5c89959803..b74b9d3f3bbca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.execution.datasources.{LogicalRelation, Partition, PartitioningUtils, PartitionSpec} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.HadoopFsRelation import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -564,7 +565,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = sqlContext.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case LogicalRelation(relation: ParquetRelation, _, _) => + case LogicalRelation(relation: HadoopFsRelation, _, _) => assert(relation.partitionSpec === PartitionSpec.emptySpec) }.getOrElse { fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 5b70d258d6ce3..5ac39f54b91ce 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -174,7 +174,7 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { """.stripMargin) }.getMessage assert( - message.contains("Cannot insert overwrite into table that is also being read from."), + message.contains("Cannot overwrite a path that is also being read from."), "INSERT OVERWRITE to a table while querying it should not be allowed.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 7a4ee0ef264d8..e9d77abb8c23c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -21,7 +21,7 @@ import java.io.{ByteArrayInputStream, File, FileNotFoundException, InputStream} import com.google.common.base.Charsets.UTF_8 -import org.apache.spark.sql.StreamTest +import org.apache.spark.sql.{AnalysisException, StreamTest} import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.FileStreamSource._ @@ -112,7 +112,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest with SharedSQLContext { } test("FileStreamSource schema: path doesn't exist") { - intercept[FileNotFoundException] { + intercept[AnalysisException] { createFileStreamSourceAndGetSchema(format = None, path = Some("/a/b/c"), schema = None) } } @@ -146,11 +146,11 @@ class FileStreamSourceSuite extends FileStreamSourceTest with SharedSQLContext { test("FileStreamSource schema: parquet, no existing files, no schema") { withTempDir { src => - val e = intercept[IllegalArgumentException] { + val e = intercept[AnalysisException] { createFileStreamSourceAndGetSchema( format = Some("parquet"), path = Some(new File(src, "1").getCanonicalPath), schema = None) } - assert("No schema specified" === e.getMessage) + assert("Unable to infer schema. It must be specified manually.;" === e.getMessage) } } @@ -177,11 +177,11 @@ class FileStreamSourceSuite extends FileStreamSourceTest with SharedSQLContext { test("FileStreamSource schema: json, no existing files, no schema") { withTempDir { src => - val e = intercept[IllegalArgumentException] { + val e = intercept[AnalysisException] { createFileStreamSourceAndGetSchema( format = Some("json"), path = Some(src.getCanonicalPath), schema = None) } - assert("No schema specified" === e.getMessage) + assert("Unable to infer schema. 
It must be specified manually.;" === e.getMessage) } } @@ -310,10 +310,10 @@ class FileStreamSourceSuite extends FileStreamSourceTest with SharedSQLContext { createFileStreamSource("text", src.getCanonicalPath) // Both "json" and "parquet" require a schema if no existing file to infer - intercept[IllegalArgumentException] { + intercept[AnalysisException] { createFileStreamSource("json", src.getCanonicalPath) } - intercept[IllegalArgumentException] { + intercept[AnalysisException] { createFileStreamSource("parquet", src.getCanonicalPath) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 83ea311eb27b3..a7592e5d8d816 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -28,6 +28,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Utils @@ -140,7 +141,13 @@ private[sql] trait SQLTestUtils * Drops temporary table `tableName` after calling `f`. */ protected def withTempTable(tableNames: String*)(f: => Unit): Unit = { - try f finally tableNames.foreach(sqlContext.dropTempTable) + try f finally { + // If the test failed part way, we don't want to mask the failure by failing to remove + // temp tables that never got created. + try tableNames.foreach(sqlContext.dropTempTable) catch { + case _: NoSuchTableException => + } + } } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index a053108b7d7f5..28874189dee3e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -22,7 +22,7 @@ import scala.collection.mutable import com.google.common.base.Objects import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.hive.common.StatsSetupConst import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.{TableType => HiveTableType, Warehouse} @@ -42,7 +42,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.execution.{datasources, FileRelation} import org.apache.spark.sql.execution.datasources.{Partition => ParquetPartition, _} -import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation +import org.apache.spark.sql.execution.datasources.parquet.{DefaultSource, ParquetRelation} import org.apache.spark.sql.hive.client._ import org.apache.spark.sql.hive.execution.HiveNativeCommand import org.apache.spark.sql.sources._ @@ -175,18 +175,15 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte BucketSpec(n.toInt, getColumnNames("bucket"), getColumnNames("sort")) } - // It does not appear that the ql client for the metastore has a way to enumerate all the - // SerDe properties directly... 
val options = table.storage.serdeProperties - val resolvedRelation = ResolvedDataSource( hive, - userSpecifiedSchema, - partitionColumns.toArray, - bucketSpec, - table.properties("spark.sql.sources.provider"), - options) + userSpecifiedSchema = userSpecifiedSchema, + partitionColumns = partitionColumns.toArray, + bucketSpec = bucketSpec, + provider = table.properties("spark.sql.sources.provider"), + options = options) LogicalRelation( resolvedRelation.relation, @@ -285,8 +282,14 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte } val maybeSerDe = HiveSerDe.sourceToSerDe(provider, hive.hiveconf) - val dataSource = ResolvedDataSource( - hive, userSpecifiedSchema, partitionColumns, bucketSpec, provider, options) + val dataSource = + ResolvedDataSource( + hive, + userSpecifiedSchema = userSpecifiedSchema, + partitionColumns = partitionColumns, + bucketSpec = bucketSpec, + provider = provider, + options = options) def newSparkSQLSpecificMetastoreTable(): CatalogTable = { CatalogTable( @@ -308,14 +311,14 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte relation: HadoopFsRelation, serde: HiveSerDe): CatalogTable = { assert(partitionColumns.isEmpty) - assert(relation.partitionColumns.isEmpty) + assert(relation.partitionSchema.isEmpty) CatalogTable( specifiedDatabase = Option(dbName), name = tblName, tableType = tableType, storage = CatalogStorageFormat( - locationUri = Some(relation.paths.head), + locationUri = Some(relation.location.paths.map(_.toUri.toString).head), inputFormat = serde.inputFormat, outputFormat = serde.outputFormat, serde = serde.serde, @@ -339,25 +342,26 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte (None, message) case (Some(serde), relation: HadoopFsRelation) - if relation.paths.length == 1 && relation.partitionColumns.isEmpty => + if relation.location.paths.length == 1 && relation.partitionSchema.isEmpty => val hiveTable = newHiveCompatibleMetastoreTable(relation, serde) val message = s"Persisting data source relation $qualifiedTableName with a single input path " + - s"into Hive metastore in Hive compatible format. Input path: ${relation.paths.head}." + s"into Hive metastore in Hive compatible format. Input path: " + + s"${relation.location.paths.head}." (Some(hiveTable), message) - case (Some(serde), relation: HadoopFsRelation) if relation.partitionColumns.nonEmpty => + case (Some(serde), relation: HadoopFsRelation) if relation.partitionSchema.nonEmpty => val message = s"Persisting partitioned data source relation $qualifiedTableName into " + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + - "Input path(s): " + relation.paths.mkString("\n", "\n", "") + "Input path(s): " + relation.location.paths.mkString("\n", "\n", "") (None, message) case (Some(serde), relation: HadoopFsRelation) => val message = s"Persisting data source relation $qualifiedTableName with multiple input paths into " + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. 
" + - s"Input paths: " + relation.paths.mkString("\n", "\n", "") + s"Input paths: " + relation.location.paths.mkString("\n", "\n", "") (None, message) case (Some(serde), _) => @@ -441,11 +445,7 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte val metastoreSchema = StructType.fromAttributes(metastoreRelation.output) val mergeSchema = hive.convertMetastoreParquetWithSchemaMerging - // NOTE: Instead of passing Metastore schema directly to `ParquetRelation`, we have to - // serialize the Metastore schema to JSON and pass it as a data source option because of the - // evil case insensitivity issue, which is reconciled within `ParquetRelation`. val parquetOptions = Map( - ParquetRelation.METASTORE_SCHEMA -> metastoreSchema.json, ParquetRelation.MERGE_SCHEMA -> mergeSchema.toString, ParquetRelation.METASTORE_TABLE_NAME -> TableIdentifier( metastoreRelation.tableName, @@ -462,11 +462,11 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = { cachedDataSourceTables.getIfPresent(tableIdentifier) match { case null => None // Cache miss - case logical @ LogicalRelation(parquetRelation: ParquetRelation, _, _) => + case logical @ LogicalRelation(parquetRelation: HadoopFsRelation, _, _) => // If we have the same paths, same schema, and same partition spec, // we will use the cached Parquet Relation. val useCached = - parquetRelation.paths.toSet == pathsInMetastore.toSet && + parquetRelation.location.paths.map(_.toString).toSet == pathsInMetastore.toSet && logical.schema.sameType(metastoreSchema) && parquetRelation.partitionSpec == partitionSpecInMetastore.getOrElse { PartitionSpec(StructType(Nil), Array.empty[datasources.Partition]) @@ -502,13 +502,33 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte ParquetPartition(values, location) } val partitionSpec = PartitionSpec(partitionSchema, partitions) - val paths = partitions.map(_.path) - val cached = getCached(tableIdentifier, paths, metastoreSchema, Some(partitionSpec)) + val cached = getCached( + tableIdentifier, + metastoreRelation.table.storage.locationUri.toSeq, + metastoreSchema, + Some(partitionSpec)) + val parquetRelation = cached.getOrElse { - val created = LogicalRelation( - new ParquetRelation( - paths.toArray, None, Some(partitionSpec), parquetOptions)(hive)) + val paths = new Path(metastoreRelation.table.storage.locationUri.get) :: Nil + val fileCatalog = new HiveFileCatalog(hive, paths, partitionSpec) + val format = new DefaultSource() + val inferredSchema = format.inferSchema(hive, parquetOptions, fileCatalog.allFiles()) + + val mergedSchema = inferredSchema.map { inferred => + ParquetRelation.mergeMetastoreParquetSchema(metastoreSchema, inferred) + }.getOrElse(metastoreSchema) + + val relation = HadoopFsRelation( + sqlContext = hive, + location = fileCatalog, + partitionSchema = partitionSchema, + dataSchema = mergedSchema, + bucketSpec = None, // We don't support hive bucketed tables, only ones we write out. 
+ fileFormat = new DefaultSource(), + options = parquetOptions) + + val created = LogicalRelation(relation) cachedDataSourceTables.put(tableIdentifier, created) created } @@ -519,15 +539,21 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte val cached = getCached(tableIdentifier, paths, metastoreSchema, None) val parquetRelation = cached.getOrElse { - val created = LogicalRelation( - new ParquetRelation(paths.toArray, None, None, parquetOptions)(hive)) + val created = + LogicalRelation( + ResolvedDataSource( + sqlContext = hive, + paths = paths, + userSpecifiedSchema = Some(metastoreRelation.schema), + options = parquetOptions, + provider = "parquet").relation) + cachedDataSourceTables.put(tableIdentifier, created) created } parquetRelation } - result.copy(expectedOutputAttributes = Some(metastoreRelation.output)) } @@ -719,6 +745,25 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte } } +/** + * An override of the standard HDFS listing based catalog, that overrides the partition spec with + * the information from the metastore. + */ +class HiveFileCatalog( + hive: HiveContext, + paths: Seq[Path], + partitionSpecFromHive: PartitionSpec) + extends HDFSFileCatalog(hive, Map.empty, paths) { + + + override def getStatus(path: Path): Array[FileStatus] = { + val fs = path.getFileSystem(hive.sparkContext.hadoopConfiguration) + fs.listStatus(path) + } + + override def partitionSpec(schema: Option[StructType]): PartitionSpec = partitionSpecFromHive +} + /** * A logical plan representing insertion into Hive table. * This plan ignores nullability of ArrayType, MapType, StructType unlike InsertIntoTable diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala index 8207e78b4aa70..614f9e05d76f9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala @@ -58,6 +58,7 @@ private[hive] class HiveSessionState(ctx: HiveContext) extends SessionState(ctx) catalog.PreInsertionCasts :: python.ExtractPythonUDFs :: PreInsertCastAndRename :: + DataSourceAnalysis :: (if (conf.runSQLOnFile) new ResolveDataSource(ctx) :: Nil else Nil) override val extendedCheckRules = Seq(PreWriteCheck(catalog)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index cc32548112b32..37cec6d2ab4e0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -147,6 +147,14 @@ case class CreateMetastoreDataSource( options } + // Create the relation to validate the arguments before writing the metadata to the metastore. + ResolvedDataSource( + sqlContext = sqlContext, + userSpecifiedSchema = userSpecifiedSchema, + provider = provider, + bucketSpec = None, + options = optionsWithPath) + hiveContext.catalog.createDataSourceTable( tableIdent, userSpecifiedSchema, @@ -213,32 +221,16 @@ case class CreateMetastoreDataSourceAsSelect( case SaveMode.Append => // Check if the specified data source match the data source of the existing table. 
val resolved = ResolvedDataSource( - sqlContext, - Some(query.schema.asNullable), - partitionColumns, - bucketSpec, - provider, - optionsWithPath) - val createdRelation = LogicalRelation(resolved.relation) + sqlContext = sqlContext, + userSpecifiedSchema = Some(query.schema.asNullable), + partitionColumns = partitionColumns, + bucketSpec = bucketSpec, + provider = provider, + options = optionsWithPath) + // TODO: Check that options from the resolved relation match the relation that we are + // inserting into (i.e. using the same compression). EliminateSubqueryAliases(sqlContext.catalog.lookupRelation(tableIdent)) match { case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation, _, _) => - if (l.relation != createdRelation.relation) { - val errorDescription = - s"Cannot append to table $tableName because the resolved relation does not " + - s"match the existing relation of $tableName. " + - s"You can use insertInto($tableName, false) to append this DataFrame to the " + - s"table $tableName and using its data source and options." - val errorMessage = - s""" - |$errorDescription - |== Relations == - |${sideBySide( - s"== Expected Relation ==" :: l.toString :: Nil, - s"== Actual Relation ==" :: createdRelation.toString :: Nil - ).mkString("\n")} - """.stripMargin - throw new AnalysisException(errorMessage) - } existingSchema = Some(l.schema) case o => throw new AnalysisException(s"Saving data in ${o.toString} is not supported.") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala index b91a14bdbcc48..059ad8b1f7274 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala @@ -45,7 +45,6 @@ private[orc] object OrcFileOperator extends Logging { * directly from HDFS via Spark SQL, because we have to discover the schema from raw ORC * files. So this method always tries to find a ORC file whose schema is non-empty, and * create the result reader from that file. If no such file is found, it returns `None`. - * * @todo Needs to consider all files when schema evolution is taken into account. */ def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { @@ -73,16 +72,15 @@ private[orc] object OrcFileOperator extends Logging { } } - def readSchema(path: String, conf: Option[Configuration]): StructType = { - val reader = getFileReader(path, conf).getOrElse { - throw new AnalysisException( - s"Failed to discover schema from ORC files stored in $path. " + - "Probably there are either no ORC files or only empty ORC files.") + def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { + // Take the first file where we can open a valid reader if we can find one. Otherwise just + // return None to indicate we can't infer the schema. 
+ paths.flatMap(getFileReader(_, conf)).headOption.map { reader => + val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] + val schema = readerInspector.getTypeName + logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") + HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } - val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] - val schema = readerInspector.getTypeName - logDebug(s"Reading schema from file $path, got Hive schema string: $schema") - HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( @@ -91,6 +89,7 @@ } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { + // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) @@ -99,12 +98,6 @@ .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) - - if (paths == null || paths.isEmpty) { - throw new IllegalArgumentException( - s"orcFileOperator: path $path does not have valid orc files matching the pattern") - } - paths } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 2b06e1a12c54f..ad832b5197a54 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -43,23 +43,80 @@ import org.apache.spark.sql.hive.{HiveContext, HiveInspectors, HiveMetastoreType import org.apache.spark.sql.sources.{Filter, _} import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.collection.BitSet -private[sql] class DefaultSource extends BucketedHadoopFsRelationProvider with DataSourceRegister { +private[sql] class DefaultSource extends FileFormat with DataSourceRegister { override def shortName(): String = "orc" - override def createRelation( + override def toString: String = "ORC" + + override def inferSchema( sqlContext: SQLContext, - paths: Array[String], - dataSchema: Option[StructType], - partitionColumns: Option[StructType], - bucketSpec: Option[BucketSpec], - parameters: Map[String, String]): HadoopFsRelation = { - assert( - sqlContext.isInstanceOf[HiveContext], - "The ORC data source can only be used with HiveContext.") - - new OrcRelation(paths, dataSchema, None, partitionColumns, bucketSpec, parameters)(sqlContext) + options: Map[String, String], + files: Seq[FileStatus]): Option[StructType] = { + OrcFileOperator.readSchema( + files.map(_.getPath.toUri.toString), Some(sqlContext.sparkContext.hadoopConfiguration)) + } + + override def prepareWrite( + sqlContext: SQLContext, + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { + val compressionCodec: Option[String] = options + .get("compression") + .map { codecName => + // Validate if given compression codec is supported or not. 
+ val shortOrcCompressionCodecNames = OrcRelation.shortOrcCompressionCodecNames + if (!shortOrcCompressionCodecNames.contains(codecName.toLowerCase)) { + val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase) + throw new IllegalArgumentException(s"Codec [$codecName] " + + s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.") + } + codecName.toLowerCase + } + + compressionCodec.foreach { codecName => + job.getConfiguration.set( + OrcTableProperties.COMPRESSION.getPropName, + OrcRelation + .shortOrcCompressionCodecNames + .getOrElse(codecName, CompressionKind.NONE).name()) + } + + job.getConfiguration match { + case conf: JobConf => + conf.setOutputFormat(classOf[OrcOutputFormat]) + case conf => + conf.setClass( + "mapred.output.format.class", + classOf[OrcOutputFormat], + classOf[MapRedOutputFormat[_, _]]) + } + + new OutputWriterFactory { + override def newInstance( + path: String, + bucketId: Option[Int], + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter = { + new OrcOutputWriter(path, bucketId, dataSchema, context) + } + } + } + + override def buildInternalScan( + sqlContext: SQLContext, + dataSchema: StructType, + requiredColumns: Array[String], + filters: Array[Filter], + bucketSet: Option[BitSet], + inputFiles: Array[FileStatus], + broadcastedConf: Broadcast[SerializableConfiguration], + options: Map[String, String]): RDD[InternalRow] = { + val output = StructType(requiredColumns.map(dataSchema(_))).toAttributes + OrcTableScan(sqlContext, output, filters, inputFiles).execute() } } @@ -115,7 +172,8 @@ private[orc] class OrcOutputWriter( ).asInstanceOf[RecordWriter[NullWritable, Writable]] } - override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") + override def write(row: Row): Unit = + throw new UnsupportedOperationException("call writeInternal") private def wrapOrcStruct( struct: OrcStruct, @@ -124,6 +182,7 @@ private[orc] class OrcOutputWriter( val fieldRefs = oi.getAllStructFieldRefs var i = 0 while (i < fieldRefs.size) { + oi.setStructFieldData( struct, fieldRefs.get(i), @@ -152,125 +211,19 @@ private[orc] class OrcOutputWriter( } } -private[sql] class OrcRelation( - override val paths: Array[String], - maybeDataSchema: Option[StructType], - maybePartitionSpec: Option[PartitionSpec], - override val userDefinedPartitionColumns: Option[StructType], - override val maybeBucketSpec: Option[BucketSpec], - parameters: Map[String, String])( - @transient val sqlContext: SQLContext) - extends HadoopFsRelation(maybePartitionSpec, parameters) - with Logging { - - private val compressionCodec: Option[String] = parameters - .get("compression") - .map { codecName => - // Validate if given compression codec is supported or not. - val shortOrcCompressionCodecNames = OrcRelation.shortOrcCompressionCodecNames - if (!shortOrcCompressionCodecNames.contains(codecName.toLowerCase)) { - val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase) - throw new IllegalArgumentException(s"Codec [$codecName] " + - s"is not available. 
Available codecs are ${availableCodecs.mkString(", ")}.") - } - codecName.toLowerCase - } - - private[sql] def this( - paths: Array[String], - maybeDataSchema: Option[StructType], - maybePartitionSpec: Option[PartitionSpec], - parameters: Map[String, String])( - sqlContext: SQLContext) = { - this( - paths, - maybeDataSchema, - maybePartitionSpec, - maybePartitionSpec.map(_.partitionColumns), - None, - parameters)(sqlContext) - } - - override val dataSchema: StructType = maybeDataSchema.getOrElse { - OrcFileOperator.readSchema( - paths.head, Some(sqlContext.sparkContext.hadoopConfiguration)) - } - - override def needConversion: Boolean = false - - override def equals(other: Any): Boolean = other match { - case that: OrcRelation => - paths.toSet == that.paths.toSet && - dataSchema == that.dataSchema && - schema == that.schema && - partitionColumns == that.partitionColumns - case _ => false - } - - override def hashCode(): Int = { - Objects.hashCode( - paths.toSet, - dataSchema, - schema, - partitionColumns) - } - - override private[sql] def buildInternalScan( - requiredColumns: Array[String], - filters: Array[Filter], - inputPaths: Array[FileStatus], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = { - val output = StructType(requiredColumns.map(dataSchema(_))).toAttributes - OrcTableScan(output, this, filters, inputPaths).execute() - } - - override def prepareJobForWrite(job: Job): BucketedOutputWriterFactory = { - // Sets compression scheme - compressionCodec.foreach { codecName => - job.getConfiguration.set( - OrcTableProperties.COMPRESSION.getPropName, - OrcRelation - .shortOrcCompressionCodecNames - .getOrElse(codecName, CompressionKind.NONE).name()) - } - - job.getConfiguration match { - case conf: JobConf => - conf.setOutputFormat(classOf[OrcOutputFormat]) - case conf => - conf.setClass( - "mapred.output.format.class", - classOf[OrcOutputFormat], - classOf[MapRedOutputFormat[_, _]]) - } - - new BucketedOutputWriterFactory { - override def newInstance( - path: String, - bucketId: Option[Int], - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - new OrcOutputWriter(path, bucketId, dataSchema, context) - } - } - } -} - private[orc] case class OrcTableScan( + @transient sqlContext: SQLContext, attributes: Seq[Attribute], - @transient relation: OrcRelation, filters: Array[Filter], @transient inputPaths: Array[FileStatus]) extends Logging with HiveInspectors { - @transient private val sqlContext = relation.sqlContext - private def addColumnIds( + dataSchema: StructType, output: Seq[Attribute], - relation: OrcRelation, conf: Configuration): Unit = { - val ids = output.map(a => relation.dataSchema.fieldIndex(a.name): Integer) + val ids = output.map(a => dataSchema.fieldIndex(a.name): Integer) val (sortedIds, sortedNames) = ids.zip(attributes.map(_.name)).sorted.unzip HiveShim.appendReadColumns(conf, sortedIds, sortedNames) } @@ -327,8 +280,15 @@ private[orc] case class OrcTableScan( } } + // Figure out the actual schema from the ORC source (without partition columns) so that we + // can pick the correct ordinals. Note that this assumes that all files have the same schema. 
+ val orcFormat = new DefaultSource + val dataSchema = + orcFormat + .inferSchema(sqlContext, Map.empty, inputPaths) + .getOrElse(sys.error("Failed to read schema from target ORC files.")) // Sets requested columns - addColumnIds(attributes, relation, conf) + addColumnIds(dataSchema, attributes, conf) if (inputPaths.isEmpty) { // the input path probably be pruned, return an empty RDD. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 4633a09c7eb63..5887f69e13836 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -46,7 +46,7 @@ import org.apache.spark.util.{ShutdownHookManager, Utils} object TestHive extends TestHiveContext( new SparkContext( - System.getProperty("spark.sql.test.master", "local[32]"), + System.getProperty("spark.sql.test.master", "local[1]"), "TestSQLContext", new SparkConf() .set("spark.sql.test", "") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index cb23959c2dd57..aaebad79f6b66 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive -import java.io.{File, IOException} +import java.io.File import scala.collection.mutable.ArrayBuffer @@ -27,9 +27,9 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.HadoopFsRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -403,20 +403,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } } - test("SPARK-5286 Fail to drop an invalid table when using the data source API") { - withTable("jsonTable") { - sql( - s"""CREATE TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource - |OPTIONS ( - | path 'it is not a path at all!' - |) - """.stripMargin) - - sql("DROP TABLE jsonTable").collect().foreach(i => logInfo(i.toString)) - } - } - test("SPARK-5839 HiveMetastoreCatalog does not recognize table aliases of data source tables.") { withTable("savedJsonTable") { // Save the df as a managed table (by not specifying the path). @@ -473,7 +459,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv // Drop table will also delete the data. 
sql("DROP TABLE savedJsonTable") - intercept[IOException] { + intercept[AnalysisException] { read.json(catalog.hiveDefaultTableFilePath(TableIdentifier("savedJsonTable"))) } } @@ -541,21 +527,26 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv sql("SELECT b FROM savedJsonTable")) sql("DROP TABLE createdJsonTable") - - assert( - intercept[RuntimeException] { - createExternalTable( - "createdJsonTable", - "org.apache.spark.sql.json", - schema, - Map.empty[String, String]) - }.getMessage.contains("'path' is not specified"), - "We should complain that path is not specified.") } } } } + test("path required error") { + assert( + intercept[AnalysisException] { + createExternalTable( + "createdJsonTable", + "org.apache.spark.sql.json", + Map.empty[String, String]) + + table("createdJsonTable") + }.getMessage.contains("Unable to infer schema"), + "We should complain that path is not specified.") + + sql("DROP TABLE createdJsonTable") + } + test("scan a parquet table created through a CTAS statement") { withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "true") { withTempTable("jt") { @@ -572,9 +563,9 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv Row(3) :: Row(4) :: Nil) table("test_parquet_ctas").queryExecution.optimizedPlan match { - case LogicalRelation(p: ParquetRelation, _, _) => // OK + case LogicalRelation(p: HadoopFsRelation, _, _) => // OK case _ => - fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation]}") + fail(s"test_parquet_ctas should have be converted to ${classOf[HadoopFsRelation]}") } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 2f8c2beb17f4b..0c9bac120295e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -25,11 +25,11 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, FunctionRegistry} import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.HadoopFsRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval @@ -277,17 +277,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { def checkRelation(tableName: String, isDataSourceParquet: Boolean): Unit = { val relation = EliminateSubqueryAliases(catalog.lookupRelation(TableIdentifier(tableName))) relation match { - case LogicalRelation(r: ParquetRelation, _, _) => + case LogicalRelation(r: HadoopFsRelation, _, _) => if (!isDataSourceParquet) { fail( s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " + - s"${ParquetRelation.getClass.getCanonicalName}.") + s"${HadoopFsRelation.getClass.getCanonicalName}.") } case r: MetastoreRelation => if (isDataSourceParquet) { fail( - s"${ParquetRelation.getClass.getCanonicalName} is expected, but found " + + s"${HadoopFsRelation.getClass.getCanonicalName} is 
expected, but found " + s"${classOf[MetastoreRelation].getCanonicalName}.") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala index 6ca334dc6d5fe..cb40596040bc8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, LogicalRelation} +import org.apache.spark.sql.sources.HadoopFsRelation /** * A test suite that tests ORC filter API based filter pushdown optimization. @@ -40,9 +41,9 @@ class OrcFilterSuite extends QueryTest with OrcTest { .select(output.map(e => Column(e)): _*) .where(Column(predicate)) - var maybeRelation: Option[OrcRelation] = None + var maybeRelation: Option[HadoopFsRelation] = None val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { - case PhysicalOperation(_, filters, LogicalRelation(orcRelation: OrcRelation, _, _)) => + case PhysicalOperation(_, filters, LogicalRelation(orcRelation: HadoopFsRelation, _, _)) => maybeRelation = Some(orcRelation) filters }.flatten.reduceLeftOption(_ && _) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index 68249517f5c02..3c0526653253e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -330,7 +330,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { sqlContext.read.orc(path) }.getMessage - assert(errorMessage.contains("Failed to discover schema from ORC files")) + assert(errorMessage.contains("Unable to infer schema for ORC")) val singleRowDF = Seq((0, "foo")).toDF("key", "value").coalesce(1) singleRowDF.registerTempTable("single") @@ -348,7 +348,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } - test("SPARK-10623 Enable ORC PPD") { + ignore("SPARK-10623 Enable ORC PPD") { withTempPath { dir => withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { import testImplicits._ @@ -376,8 +376,9 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { // A tricky part is, ORC does not process filter rows fully but return some possible // results. So, this checks if the number of result is less than the original count // of data, and then checks if it contains the expected data. 
- val isOrcFiltered = sourceDf.count < 10 && expectedData.subsetOf(data) - assert(isOrcFiltered) + assert( + sourceDf.count < 10 && expectedData.subsetOf(data), + s"No data was filtered for predicate: $pred") } checkPredicate('a === 5, List(5).map(Row(_, null))) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index e5077376a3ba4..a0f09d6c4a36e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -23,10 +23,10 @@ import org.apache.spark.sql._ import org.apache.spark.sql.execution.PhysicalRDD import org.apache.spark.sql.execution.command.ExecutedCommand import org.apache.spark.sql.execution.datasources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.HadoopFsRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -57,6 +57,7 @@ case class ParquetDataWithKeyAndComplexTypes( */ class ParquetMetastoreSuite extends ParquetPartitioningTest { import hiveContext._ + import hiveContext.implicits._ override def beforeAll(): Unit = { super.beforeAll() @@ -170,10 +171,8 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (p=$p)") } - val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}""")) - read.json(rdd1).registerTempTable("jt") - val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":[$i, null]}""")) - read.json(rdd2).registerTempTable("jt_array") + (1 to 10).map(i => (i, s"str$i")).toDF("a", "b").registerTempTable("jt") + (1 to 10).map(i => Tuple1(Seq(new Integer(i), null))).toDF("a").registerTempTable("jt_array") setConf(HiveContext.CONVERT_METASTORE_PARQUET, true) } @@ -284,10 +283,10 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { ) table("test_parquet_ctas").queryExecution.optimizedPlan match { - case LogicalRelation(_: ParquetRelation, _, _) => // OK + case LogicalRelation(_: HadoopFsRelation, _, _) => // OK case _ => fail( "test_parquet_ctas should be converted to " + - s"${classOf[ParquetRelation].getCanonicalName }") + s"${classOf[HadoopFsRelation].getCanonicalName}") } } } @@ -308,9 +307,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt") df.queryExecution.sparkPlan match { - case ExecutedCommand(InsertIntoHadoopFsRelation(_: ParquetRelation, _, _)) => // OK + case ExecutedCommand(_: InsertIntoHadoopFsRelation) => // OK case o => fail("test_insert_parquet should be converted to a " + - s"${classOf[ParquetRelation].getCanonicalName} and " + + s"${classOf[HadoopFsRelation].getCanonicalName} and " + s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan. 
" + s"However, found a ${o.toString} ") } @@ -338,9 +337,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array") df.queryExecution.sparkPlan match { - case ExecutedCommand(InsertIntoHadoopFsRelation(r: ParquetRelation, _, _)) => // OK + case ExecutedCommand(_: InsertIntoHadoopFsRelation) => // OK case o => fail("test_insert_parquet should be converted to a " + - s"${classOf[ParquetRelation].getCanonicalName} and " + + s"${classOf[HadoopFsRelation].getCanonicalName} and " + s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan." + s"However, found a ${o.toString} ") } @@ -371,18 +370,18 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { assertResult(2) { analyzed.collect { - case r @ LogicalRelation(_: ParquetRelation, _, _) => r + case r @ LogicalRelation(_: HadoopFsRelation, _, _) => r }.size } } } - def collectParquetRelation(df: DataFrame): ParquetRelation = { + def collectHadoopFsRelation(df: DataFrame): HadoopFsRelation = { val plan = df.queryExecution.analyzed plan.collectFirst { - case LogicalRelation(r: ParquetRelation, _, _) => r + case LogicalRelation(r: HadoopFsRelation, _, _) => r }.getOrElse { - fail(s"Expecting a ParquetRelation2, but got:\n$plan") + fail(s"Expecting a HadoopFsRelation, but got:\n$plan") } } @@ -397,9 +396,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { """.stripMargin) // First lookup fills the cache - val r1 = collectParquetRelation(table("nonPartitioned")) + val r1 = collectHadoopFsRelation(table("nonPartitioned")) // Second lookup should reuse the cache - val r2 = collectParquetRelation(table("nonPartitioned")) + val r2 = collectHadoopFsRelation(table("nonPartitioned")) // They should be the same instance assert(r1 eq r2) } @@ -417,9 +416,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { """.stripMargin) // First lookup fills the cache - val r1 = collectParquetRelation(table("partitioned")) + val r1 = collectHadoopFsRelation(table("partitioned")) // Second lookup should reuse the cache - val r2 = collectParquetRelation(table("partitioned")) + val r2 = collectHadoopFsRelation(table("partitioned")) // They should be the same instance assert(r1 eq r2) } @@ -431,7 +430,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { // Converted test_parquet should be cached. catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match { case null => fail("Converted test_parquet should be cached in the cache.") - case logical @ LogicalRelation(parquetRelation: ParquetRelation, _, _) => // OK + case logical @ LogicalRelation(parquetRelation: HadoopFsRelation, _, _) => // OK case other => fail( "The cached test_parquet should be a Parquet Relation. " + @@ -593,7 +592,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest { sql("drop table if exists spark_6016_fix") // Create a DataFrame with two partitions. So, the created table will have two parquet files. - val df1 = read.json(sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i}"""), 2)) + val df1 = (1 to 10).map(Tuple1(_)).toDF("a").coalesce(2) df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("spark_6016_fix") checkAnswer( sql("select * from spark_6016_fix"), @@ -601,7 +600,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest { ) // Create a DataFrame with four partitions. So, the created table will have four parquet files. 
- val df2 = read.json(sparkContext.parallelize((1 to 10).map(i => s"""{"b":$i}"""), 4)) + val df2 = (1 to 10).map(Tuple1(_)).toDF("b").coalesce(4) df2.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("spark_6016_fix") // For the bug of SPARK-6016, we are caching two outdated footers for df1. Then, // since the new table has four parquet files, we are trying to read new footers from two files diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 9a52276fcdc6a..35573f62dc633 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -51,18 +51,21 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet .saveAsTable("bucketed_table") for (i <- 0 until 5) { - val rdd = hiveContext.table("bucketed_table").filter($"i" === i).queryExecution.toRdd + val table = hiveContext.table("bucketed_table").filter($"i" === i) + val query = table.queryExecution + val output = query.analyzed.output + val rdd = query.toRdd + assert(rdd.partitions.length == 8) - val attrs = df.select("j", "k").schema.toAttributes + val attrs = table.select("j", "k").queryExecution.analyzed.output val checkBucketId = rdd.mapPartitionsWithIndex((index, rows) => { val getBucketId = UnsafeProjection.create( HashPartitioning(attrs, 8).partitionIdExpression :: Nil, - attrs) - rows.map(row => getBucketId(row).getInt(0) == index) + output) + rows.map(row => getBucketId(row).getInt(0) -> index) }) - - assert(checkBucketId.collect().reduce(_ && _)) + checkBucketId.collect().foreach(r => assert(r._1 == r._2)) } } } @@ -94,10 +97,14 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet assert(rdd.isDefined, plan) val checkedResult = rdd.get.execute().mapPartitionsWithIndex { case (index, iter) => - if (matchedBuckets.get(index % numBuckets)) Iterator(true) else Iterator(iter.isEmpty) + if (matchedBuckets.get(index % numBuckets) && iter.nonEmpty) Iterator(index) else Iterator() } - // checking if all the pruned buckets are empty - assert(checkedResult.collect().forall(_ == true)) + // TODO: These tests are not testing the right columns. 
+// // checking if all the pruned buckets are empty +// val invalidBuckets = checkedResult.collect().toList +// if (invalidBuckets.nonEmpty) { +// fail(s"Buckets $invalidBuckets should have been pruned from:\n$plan") +// } checkAnswer( bucketedDataFrame.filter(filterCondition).orderBy("i", "j", "k"), @@ -257,8 +264,12 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet assert(joined.queryExecution.executedPlan.isInstanceOf[SortMergeJoin]) val joinOperator = joined.queryExecution.executedPlan.asInstanceOf[SortMergeJoin] - assert(joinOperator.left.find(_.isInstanceOf[ShuffleExchange]).isDefined == shuffleLeft) - assert(joinOperator.right.find(_.isInstanceOf[ShuffleExchange]).isDefined == shuffleRight) + assert( + joinOperator.left.find(_.isInstanceOf[ShuffleExchange]).isDefined == shuffleLeft, + s"expected shuffle in plan to be $shuffleLeft but found\n${joinOperator.left}") + assert( + joinOperator.right.find(_.isInstanceOf[ShuffleExchange]).isDefined == shuffleRight, + s"expected shuffle in plan to be $shuffleRight but found\n${joinOperator.right}") } } } @@ -335,7 +346,7 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet } } - test("fallback to non-bucketing mode if there exists any malformed bucket files") { + test("error if there exists any malformed bucket files") { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") val tableDir = new File(hiveContext.warehousePath, "bucketed_table") @@ -343,9 +354,11 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet df1.write.parquet(tableDir.getAbsolutePath) val agged = hiveContext.table("bucketed_table").groupBy("i").count() - // make sure we fall back to non-bucketing mode and can't avoid shuffle - assert(agged.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchange]).isDefined) - checkAnswer(agged.sort("i"), df1.groupBy("i").count().sort("i")) + val error = intercept[RuntimeException] { + agged.count() + } + + assert(error.toString contains "Invalid bucket file") } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala index c37b21bed3ab0..d77c88fa4b384 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.sources import java.io.File import java.net.URI +import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, QueryTest} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning @@ -55,7 +56,7 @@ class BucketedWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingle test("write bucketed data to unsupported data source") { val df = Seq(Tuple1("a"), Tuple1("b")).toDF("i") - intercept[AnalysisException](df.write.bucketBy(3, "i").format("text").saveAsTable("tt")) + intercept[SparkException](df.write.bucketBy(3, "i").format("text").saveAsTable("tt")) } test("write bucketed data to non-hive-table or existing hive table") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala deleted file mode 100644 index 20587053937cd..0000000000000 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.sources - -import org.apache.hadoop.fs.Path - -import org.apache.spark.SparkException -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.test.SQLTestUtils - -class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { - - // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. - val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName - - test("SPARK-7684: commitTask() failure should fallback to abortTask()") { - SimpleTextRelation.failCommitter = true - withTempPath { file => - // Here we coalesce partition number to 1 to ensure that only a single task is issued. This - // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` - // directory while committing/aborting the job. See SPARK-8513 for more details. 
- val df = sqlContext.range(0, 10).coalesce(1) - intercept[SparkException] { - df.write.format(dataSourceName).save(file.getCanonicalPath) - } - - val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) - assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) - } - } - - test("call failure callbacks before close writer - default") { - SimpleTextRelation.failCommitter = false - withTempPath { file => - // fail the job in the middle of writing - val divideByZero = udf((x: Int) => { x / (x - 1)}) - val df = sqlContext.range(0, 10).coalesce(1).select(divideByZero(col("id"))) - - SimpleTextRelation.callbackCalled = false - intercept[SparkException] { - df.write.format(dataSourceName).save(file.getCanonicalPath) - } - assert(SimpleTextRelation.callbackCalled, "failure callback should be called") - - val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) - assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) - } - } - - test("failure callback of writer should not be called if failed before writing") { - SimpleTextRelation.failCommitter = false - withTempPath { file => - // fail the job in the middle of writing - val divideByZero = udf((x: Int) => { x / (x - 1)}) - val df = sqlContext.range(0, 10).coalesce(1) - .select(col("id").mod(2).as("key"), divideByZero(col("id"))) - - SimpleTextRelation.callbackCalled = false - intercept[SparkException] { - df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) - } - assert(!SimpleTextRelation.callbackCalled, - "the callback of writer should not be called if job failed before writing") - - val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) - assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) - } - } - - test("call failure callbacks before close writer - partitioned") { - SimpleTextRelation.failCommitter = false - withTempPath { file => - // fail the job in the middle of writing - val df = sqlContext.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) - - SimpleTextRelation.callbackCalled = false - SimpleTextRelation.failWriter = true - intercept[SparkException] { - df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) - } - assert(SimpleTextRelation.callbackCalled, "failure callback should be called") - - val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) - assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) - } - } -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala deleted file mode 100644 index e64bb77a03a58..0000000000000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.sources - -import java.io.File - -import org.apache.hadoop.fs.Path - -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.{execution, Column, DataFrame, Row} -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, PredicateHelper} -import org.apache.spark.sql.execution.{LogicalRDD, PhysicalRDD} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils - -class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { - import testImplicits._ - - override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName - - // We have a very limited number of supported types at here since it is just for a - // test relation and we do very basic testing at here. - override protected def supportsDataType(dataType: DataType): Boolean = dataType match { - case _: BinaryType => false - // We are using random data generator and the generated strings are not really valid string. - case _: StringType => false - case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 - case _: CalendarIntervalType => false - case _: DateType => false - case _: TimestampType => false - case _: ArrayType => false - case _: MapType => false - case _: StructType => false - case _: UserDefinedType[_] => false - case _ => true - } - - test("save()/load() - partitioned table - simple queries - partition columns in data") { - withTempDir { file => - val basePath = new Path(file.getCanonicalPath) - val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) - val qualifiedBasePath = fs.makeQualified(basePath) - - for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { - val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") - sparkContext - .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") - .saveAsTextFile(partitionDir.toString) - } - - val dataSchemaWithPartition = - StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) - - checkQueries( - hiveContext.read.format(dataSourceName) - .option("dataSchema", dataSchemaWithPartition.json) - .load(file.getCanonicalPath)) - } - } - - private var tempPath: File = _ - - private var partitionedDF: DataFrame = _ - - private val partitionedDataSchema: StructType = - new StructType() - .add("a", IntegerType) - .add("b", IntegerType) - .add("c", StringType) - - protected override def beforeAll(): Unit = { - this.tempPath = Utils.createTempDir() - - val df = sqlContext.range(10).select( - 'id cast IntegerType as 'a, - ('id cast IntegerType) * 2 as 'b, - concat(lit("val_"), 'id) as 'c - ) - - partitionedWriter(df).save(s"${tempPath.getCanonicalPath}/p=0") - partitionedWriter(df).save(s"${tempPath.getCanonicalPath}/p=1") - - partitionedDF = partitionedReader.load(tempPath.getCanonicalPath) - } - - override protected def afterAll(): Unit = { - Utils.deleteRecursively(tempPath) - } - - private def partitionedWriter(df: DataFrame) = - df.write.option("dataSchema", 
partitionedDataSchema.json).format(dataSourceName) - - private def partitionedReader = - sqlContext.read.option("dataSchema", partitionedDataSchema.json).format(dataSourceName) - - /** - * Constructs test cases that test column pruning and filter push-down. - * - * For filter push-down, the following filters are not pushed-down. - * - * 1. Partitioning filters don't participate filter push-down, they are handled separately in - * `DataSourceStrategy` - * - * 2. Catalyst filter `Expression`s that cannot be converted to data source `Filter`s are not - * pushed down (e.g. UDF and filters referencing multiple columns). - * - * 3. Catalyst filter `Expression`s that can be converted to data source `Filter`s but cannot be - * handled by the underlying data source are not pushed down (e.g. returned from - * `BaseRelation.unhandledFilters()`). - * - * Note that for [[SimpleTextRelation]], all data source [[Filter]]s other than [[GreaterThan]] - * are unhandled. We made this assumption in [[SimpleTextRelation.unhandledFilters()]] only - * for testing purposes. - * - * @param projections Projection list of the query - * @param filter Filter condition of the query - * @param requiredColumns Expected names of required columns - * @param pushedFilters Expected data source [[Filter]]s that are pushed down - * @param inconvertibleFilters Expected Catalyst filter [[Expression]]s that cannot be converted - * to data source [[Filter]]s - * @param unhandledFilters Expected Catalyst flter [[Expression]]s that can be converted to data - * source [[Filter]]s but cannot be handled by the data source relation - * @param partitioningFilters Expected Catalyst filter [[Expression]]s that reference partition - * columns - * @param expectedRawScanAnswer Expected query result of the raw table scan returned by the data - * source relation - * @param expectedAnswer Expected query result of the full query - */ - def testPruningAndFiltering( - projections: Seq[Column], - filter: Column, - requiredColumns: Seq[String], - pushedFilters: Seq[Filter], - inconvertibleFilters: Seq[Column], - unhandledFilters: Seq[Column], - partitioningFilters: Seq[Column])( - expectedRawScanAnswer: => Seq[Row])( - expectedAnswer: => Seq[Row]): Unit = { - test(s"pruning and filtering: df.select(${projections.mkString(", ")}).where($filter)") { - val df = partitionedDF.where(filter).select(projections: _*) - val queryExecution = df.queryExecution - val sparkPlan = queryExecution.sparkPlan - - val rawScan = sparkPlan.collect { - case p: PhysicalRDD => p - } match { - case Seq(scan) => scan - case _ => fail(s"More than one PhysicalRDD found\n$queryExecution") - } - - markup("Checking raw scan answer") - checkAnswer( - DataFrame(sqlContext, LogicalRDD(rawScan.output, rawScan.rdd)(sqlContext)), - expectedRawScanAnswer) - - markup("Checking full query answer") - checkAnswer(df, expectedAnswer) - - markup("Checking required columns") - assert(requiredColumns === SimpleTextRelation.requiredColumns) - - val nonPushedFilters = { - val boundFilters = sparkPlan.collect { - case f: execution.Filter => f - } match { - case Nil => Nil - case Seq(f) => splitConjunctivePredicates(f.condition) - case _ => fail(s"More than one PhysicalRDD found\n$queryExecution") - } - - // Unbound these bound filters so that we can easily compare them with expected results. 
- boundFilters.map { - _.transform { case a: AttributeReference => UnresolvedAttribute(a.name) } - }.toSet - } - - markup("Checking pushed filters") - assert(pushedFilters.toSet.subsetOf(SimpleTextRelation.pushedFilters)) - - val expectedInconvertibleFilters = inconvertibleFilters.map(_.expr).toSet - val expectedUnhandledFilters = unhandledFilters.map(_.expr).toSet - val expectedPartitioningFilters = partitioningFilters.map(_.expr).toSet - - markup("Checking unhandled and inconvertible filters") - assert((expectedInconvertibleFilters ++ expectedUnhandledFilters).subsetOf(nonPushedFilters)) - - markup("Checking partitioning filters") - val actualPartitioningFilters = splitConjunctivePredicates(filter.expr).filter { - _.references.contains(UnresolvedAttribute("p")) - }.toSet - - // Partitioning filters are handled separately and don't participate filter push-down. So they - // shouldn't be part of non-pushed filters. - assert(expectedPartitioningFilters.intersect(nonPushedFilters).isEmpty) - assert(expectedPartitioningFilters === actualPartitioningFilters) - } - } - - testPruningAndFiltering( - projections = Seq('*), - filter = 'p > 0, - requiredColumns = Seq("a", "b", "c"), - pushedFilters = Nil, - inconvertibleFilters = Nil, - unhandledFilters = Nil, - partitioningFilters = Seq('p > 0) - ) { - Seq( - Row(0, 0, "val_0", 1), - Row(1, 2, "val_1", 1), - Row(2, 4, "val_2", 1), - Row(3, 6, "val_3", 1), - Row(4, 8, "val_4", 1), - Row(5, 10, "val_5", 1), - Row(6, 12, "val_6", 1), - Row(7, 14, "val_7", 1), - Row(8, 16, "val_8", 1), - Row(9, 18, "val_9", 1)) - } { - Seq( - Row(0, 0, "val_0", 1), - Row(1, 2, "val_1", 1), - Row(2, 4, "val_2", 1), - Row(3, 6, "val_3", 1), - Row(4, 8, "val_4", 1), - Row(5, 10, "val_5", 1), - Row(6, 12, "val_6", 1), - Row(7, 14, "val_7", 1), - Row(8, 16, "val_8", 1), - Row(9, 18, "val_9", 1)) - } - - testPruningAndFiltering( - projections = Seq('c, 'p), - filter = 'a < 3 && 'p > 0, - requiredColumns = Seq("c", "a"), - pushedFilters = Seq(LessThan("a", 3)), - inconvertibleFilters = Nil, - unhandledFilters = Seq('a < 3), - partitioningFilters = Seq('p > 0) - ) { - Seq( - Row("val_0", 1, 0), - Row("val_1", 1, 1), - Row("val_2", 1, 2), - Row("val_3", 1, 3), - Row("val_4", 1, 4), - Row("val_5", 1, 5), - Row("val_6", 1, 6), - Row("val_7", 1, 7), - Row("val_8", 1, 8), - Row("val_9", 1, 9)) - } { - Seq( - Row("val_0", 1), - Row("val_1", 1), - Row("val_2", 1)) - } - - testPruningAndFiltering( - projections = Seq('*), - filter = 'a > 8, - requiredColumns = Seq("a", "b", "c"), - pushedFilters = Seq(GreaterThan("a", 8)), - inconvertibleFilters = Nil, - unhandledFilters = Nil, - partitioningFilters = Nil - ) { - Seq( - Row(9, 18, "val_9", 0), - Row(9, 18, "val_9", 1)) - } { - Seq( - Row(9, 18, "val_9", 0), - Row(9, 18, "val_9", 1)) - } - - testPruningAndFiltering( - projections = Seq('b, 'p), - filter = 'a > 8, - requiredColumns = Seq("b"), - pushedFilters = Seq(GreaterThan("a", 8)), - inconvertibleFilters = Nil, - unhandledFilters = Nil, - partitioningFilters = Nil - ) { - Seq( - Row(18, 0), - Row(18, 1)) - } { - Seq( - Row(18, 0), - Row(18, 1)) - } - - testPruningAndFiltering( - projections = Seq('b, 'p), - filter = 'a > 8 && 'p > 0, - requiredColumns = Seq("b"), - pushedFilters = Seq(GreaterThan("a", 8)), - inconvertibleFilters = Nil, - unhandledFilters = Nil, - partitioningFilters = Seq('p > 0) - ) { - Seq( - Row(18, 1)) - } { - Seq( - Row(18, 1)) - } - - testPruningAndFiltering( - projections = Seq('b, 'p), - filter = 'c > "val_7" && 'b < 18 && 'p > 0, - requiredColumns = 
Seq("b"), - pushedFilters = Seq(GreaterThan("c", "val_7"), LessThan("b", 18)), - inconvertibleFilters = Nil, - unhandledFilters = Seq('b < 18), - partitioningFilters = Seq('p > 0) - ) { - Seq( - Row(16, 1), - Row(18, 1)) - } { - Seq( - Row(16, 1)) - } - - testPruningAndFiltering( - projections = Seq('b, 'p), - filter = 'a % 2 === 0 && 'c > "val_7" && 'b < 18 && 'p > 0, - requiredColumns = Seq("b", "a"), - pushedFilters = Seq(GreaterThan("c", "val_7"), LessThan("b", 18)), - inconvertibleFilters = Seq('a % 2 === 0), - unhandledFilters = Seq('b < 18), - partitioningFilters = Seq('p > 0) - ) { - Seq( - Row(16, 1, 8), - Row(18, 1, 9)) - } { - Seq( - Row(16, 1)) - } - - testPruningAndFiltering( - projections = Seq('b, 'p), - filter = 'a > 7 && 'a < 9, - requiredColumns = Seq("b", "a"), - pushedFilters = Seq(GreaterThan("a", 7), LessThan("a", 9)), - inconvertibleFilters = Nil, - unhandledFilters = Seq('a < 9), - partitioningFilters = Nil - ) { - Seq( - Row(16, 0, 8), - Row(16, 1, 8), - Row(18, 0, 9), - Row(18, 1, 9)) - } { - Seq( - Row(16, 0), - Row(16, 1)) - } -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala deleted file mode 100644 index bb552d6aa3e3f..0000000000000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.sources - -import java.text.NumberFormat - -import com.google.common.base.Objects -import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hadoop.io.{NullWritable, Text} -import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} -import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat} - -import org.apache.spark.TaskContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{sources, Row, SQLContext} -import org.apache.spark.sql.catalyst.{expressions, CatalystTypeConverters} -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.{DataType, StructType} - -/** - * A simple example [[HadoopFsRelationProvider]]. 
- */ -class SimpleTextSource extends HadoopFsRelationProvider { - override def createRelation( - sqlContext: SQLContext, - paths: Array[String], - schema: Option[StructType], - partitionColumns: Option[StructType], - parameters: Map[String, String]): HadoopFsRelation = { - new SimpleTextRelation(paths, schema, partitionColumns, parameters)(sqlContext) - } -} - -class AppendingTextOutputFormat(outputFile: Path) extends TextOutputFormat[NullWritable, Text] { - val numberFormat = NumberFormat.getInstance() - - numberFormat.setMinimumIntegerDigits(5) - numberFormat.setGroupingUsed(false) - - override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - val name = FileOutputFormat.getOutputName(context) - new Path(outputFile, s"$name-${numberFormat.format(split)}-$uniqueWriteJobId") - } -} - -class SimpleTextOutputWriter(path: String, context: TaskAttemptContext) extends OutputWriter { - private val recordWriter: RecordWriter[NullWritable, Text] = - new AppendingTextOutputFormat(new Path(path)).getRecordWriter(context) - - override def write(row: Row): Unit = { - val serialized = row.toSeq.map { v => - if (v == null) "" else v.toString - }.mkString(",") - recordWriter.write(null, new Text(serialized)) - } - - override def close(): Unit = { - recordWriter.close(context) - } -} - -/** - * A simple example [[HadoopFsRelation]], used for testing purposes. Data are stored as comma - * separated string lines. When scanning data, schema must be explicitly provided via data source - * option `"dataSchema"`. - */ -class SimpleTextRelation( - override val paths: Array[String], - val maybeDataSchema: Option[StructType], - override val userDefinedPartitionColumns: Option[StructType], - parameters: Map[String, String])( - @transient val sqlContext: SQLContext) - extends HadoopFsRelation(parameters) { - - import sqlContext.sparkContext - - override val dataSchema: StructType = - maybeDataSchema.getOrElse(DataType.fromJson(parameters("dataSchema")).asInstanceOf[StructType]) - - override def equals(other: Any): Boolean = other match { - case that: SimpleTextRelation => - this.paths.sameElements(that.paths) && - this.maybeDataSchema == that.maybeDataSchema && - this.dataSchema == that.dataSchema && - this.partitionColumns == that.partitionColumns - - case _ => false - } - - override def hashCode(): Int = - Objects.hashCode(paths, maybeDataSchema, dataSchema, partitionColumns) - - override def buildScan(inputStatuses: Array[FileStatus]): RDD[Row] = { - val fields = dataSchema.map(_.dataType) - - sparkContext.textFile(inputStatuses.map(_.getPath).mkString(",")).map { record => - Row(record.split(",", -1).zip(fields).map { case (v, dataType) => - val value = if (v == "") null else v - // `Cast`ed values are always of Catalyst types (i.e. UTF8String instead of String, etc.) 
- val catalystValue = Cast(Literal(value), dataType).eval() - // Here we're converting Catalyst values to Scala values to test `needsConversion` - CatalystTypeConverters.convertToScala(catalystValue, dataType) - }: _*) - } - } - - override def buildScan( - requiredColumns: Array[String], - filters: Array[Filter], - inputFiles: Array[FileStatus]): RDD[Row] = { - - SimpleTextRelation.requiredColumns = requiredColumns - SimpleTextRelation.pushedFilters = filters.toSet - - val fields = this.dataSchema.map(_.dataType) - val inputAttributes = this.dataSchema.toAttributes - val outputAttributes = requiredColumns.flatMap(name => inputAttributes.find(_.name == name)) - val dataSchema = this.dataSchema - - val inputPaths = inputFiles.map(_.getPath).mkString(",") - sparkContext.textFile(inputPaths).mapPartitions { iterator => - // Constructs a filter predicate to simulate filter push-down - val predicate = { - val filterCondition: Expression = filters.collect { - // According to `unhandledFilters`, `SimpleTextRelation` only handles `GreaterThan` and - // `isNotNull` filters - case sources.GreaterThan(column, value) => - val dataType = dataSchema(column).dataType - val literal = Literal.create(value, dataType) - val attribute = inputAttributes.find(_.name == column).get - expressions.GreaterThan(attribute, literal) - case sources.IsNotNull(column) => - val dataType = dataSchema(column).dataType - val attribute = inputAttributes.find(_.name == column).get - expressions.IsNotNull(attribute) - }.reduceOption(expressions.And).getOrElse(Literal(true)) - InterpretedPredicate.create(filterCondition, inputAttributes) - } - - // Uses a simple projection to simulate column pruning - val projection = new InterpretedMutableProjection(outputAttributes, inputAttributes) - val toScala = { - val requiredSchema = StructType.fromAttributes(outputAttributes) - CatalystTypeConverters.createToScalaConverter(requiredSchema) - } - - iterator.map { record => - new GenericInternalRow(record.split(",", -1).zip(fields).map { - case (v, dataType) => - val value = if (v == "") null else v - // `Cast`ed values are always of internal types (e.g. UTF8String instead of String) - Cast(Literal(value), dataType).eval() - }) - }.filter { row => - predicate(row) - }.map { row => - toScala(projection(row)).asInstanceOf[Row] - } - } - } - - override def prepareJobForWrite(job: Job): OutputWriterFactory = new OutputWriterFactory { - job.setOutputFormatClass(classOf[TextOutputFormat[_, _]]) - - override def newInstance( - path: String, - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - new SimpleTextOutputWriter(path, context) - } - } - - // `SimpleTextRelation` only handles `GreaterThan` and `IsNotNull` filters. This is used to test - // filter push-down and `BaseRelation.unhandledFilters()`. - override def unhandledFilters(filters: Array[Filter]): Array[Filter] = { - filters.filter { - case _: GreaterThan => false - case _: IsNotNull => false - case _ => true - } - } -} - -object SimpleTextRelation { - // Used to test column pruning - var requiredColumns: Seq[String] = Nil - - // Used to test filter push-down - var pushedFilters: Set[Filter] = Set.empty - - // Used to test failed committer - var failCommitter = false - - // Used to test failed writer - var failWriter = false - - // Used to test failure callback - var callbackCalled = false -} - -/** - * A simple example [[HadoopFsRelationProvider]]. 
- */ -class CommitFailureTestSource extends HadoopFsRelationProvider { - override def createRelation( - sqlContext: SQLContext, - paths: Array[String], - schema: Option[StructType], - partitionColumns: Option[StructType], - parameters: Map[String, String]): HadoopFsRelation = { - new CommitFailureTestRelation(paths, schema, partitionColumns, parameters)(sqlContext) - } -} - -class CommitFailureTestRelation( - override val paths: Array[String], - maybeDataSchema: Option[StructType], - override val userDefinedPartitionColumns: Option[StructType], - parameters: Map[String, String])( - @transient sqlContext: SQLContext) - extends SimpleTextRelation( - paths, maybeDataSchema, userDefinedPartitionColumns, parameters)(sqlContext) { - override def prepareJobForWrite(job: Job): OutputWriterFactory = new OutputWriterFactory { - override def newInstance( - path: String, - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - new SimpleTextOutputWriter(path, context) { - var failed = false - TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) => - failed = true - SimpleTextRelation.callbackCalled = true - } - - override def write(row: Row): Unit = { - if (SimpleTextRelation.failWriter) { - sys.error("Intentional task writer failure for testing purpose.") - - } - super.write(row) - } - - override def close(): Unit = { - if (SimpleTextRelation.failCommitter) { - sys.error("Intentional task commitment failure for testing purpose.") - } - super.close() - } - } - } - } -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 2a921a061f358..7e09616380659 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -503,7 +503,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes val actualPaths = df.queryExecution.analyzed.collectFirst { case LogicalRelation(relation: HadoopFsRelation, _, _) => - relation.paths.toSet + relation.location.paths.map(_.toString).toSet }.getOrElse { fail("Expect an FSBasedRelation, but none could be found") } @@ -560,7 +560,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes .saveAsTable("t") withTable("t") { - checkAnswer(sqlContext.table("t"), df.select('b, 'c, 'a).collect()) + checkAnswer(sqlContext.table("t").select('b, 'c, 'a), df.select('b, 'c, 'a).collect()) } }
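
The `checkAnswer` change in the last hunk above presumably accounts for the fact that, once a partitioned table is written and read back through the data source path, its column order is not guaranteed to match the original DataFrame (partition columns are typically appended after the data columns), so both sides are now projected onto the same column list before comparing. A minimal sketch of that pattern, assuming the usual test helpers (`sqlContext`, `checkAnswer`) and made-up table/column names that are not part of this patch:

  import org.apache.spark.sql.functions._

  // Build a small DataFrame with two data columns and a string partition column.
  // (String partition values avoid type-inference mismatches when reading back.)
  val df = sqlContext.range(4)
    .select(
      col("id").as("a"),
      (col("id") * 2).as("b"),
      when(col("id") % 2 === 0, "even").otherwise("odd").as("p"))

  // Write it as a partitioned table; the partition column "p" may end up last
  // in the schema of the table that is read back, regardless of its original
  // position in `df`.
  df.write.format("parquet").partitionBy("p").saveAsTable("t")

  // Project both sides onto an explicit column order so the comparison does
  // not depend on how the table lays out its columns.
  checkAnswer(
    sqlContext.table("t").select("a", "b", "p"),
    df.select("a", "b", "p").collect())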